/src/icu/icu4c/source/i18n/numparse_decimal.cpp

Source
// © 2018 and later: Unicode, Inc. and others.
// License & terms of use: http://www.unicode.org/copyright.html

#include "unicode/utypes.h"

#if !UCONFIG_NO_FORMATTING

// Allow implicit conversion from char16_t* to UnicodeString for this file:
// Helpful in toString methods and elsewhere.
#define UNISTR_FROM_STRING_EXPLICIT

#include "numparse_types.h"
#include "numparse_decimal.h"
#include "static_unicode_sets.h"
#include "numparse_utils.h"
#include "unicode/uchar.h"
#include "putilimp.h"
#include "number_decimalquantity.h"
#include "string_segment.h"

using namespace icu;
using namespace icu::numparse;
using namespace icu::numparse::impl;


// Builds a matcher for the digits-and-separators part of a number.
//
// symbols:    supplies the grouping/decimal separator strings (the monetary variants when
//             PARSE_FLAG_MONETARY_SEPARATORS is set), the zero digit code point, and the
//             per-digit strings.
// grouper:    supplies the primary and secondary grouping sizes checked by validateGroup().
// parseFlags: bit set of PARSE_FLAG_* options that is read here and cached in member fields.
//
// Separator equivalence sets are taken from the static unisets cache when the locale's
// separator belongs to one of them; otherwise a small frozen set holding the first code point
// of the separator is allocated and owned by the matching fLocal* member.
DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
                               parse_flags_t parseFlags) {
    if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
    } else {
        groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
        decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
    }
    bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);

    // Parsing is very lenient even in strict mode, almost any dot or comma is a
    // grouping separator. Parsing strings like "1.234" in French was treating '.'
    // like an ignorable grouping separator, and we want it to be excluded.
    // We keep the public behavior when strictParse is false, but when it is true
    // we restrict grouping separators to the smaller set of equivalents.
    unisets::Key groupingKey = unisets::chooseFrom(groupingSeparator,
            strictSeparators ? unisets::STRICT_COMMA : unisets::ALL_SEPARATORS,
            strictSeparators ? unisets::STRICT_PERIOD : unisets::ALL_SEPARATORS);
    if (groupingKey < 0) {
        // Not a comma/period equivalent: try the remaining known grouping separators.
        groupingKey = unisets::chooseFrom(
            groupingSeparator, unisets::OTHER_GROUPING_SEPARATORS);
    }
    if (groupingKey >= 0) {
        // Attempt to find separators in the static cache
        groupingUniSet = unisets::get(groupingKey);
    } else if (!groupingSeparator.isEmpty()) {
        // Unknown separator: fall back to a one-element set owned by this matcher.
        auto* set = new UnicodeSet();
        set->add(groupingSeparator.char32At(0));
        set->freeze();
        groupingUniSet = set;
        fLocalGroupingUniSet.adoptInstead(set);
    } else {
        groupingUniSet = unisets::get(unisets::EMPTY);
    }

    unisets::Key decimalKey = unisets::chooseFrom(
            decimalSeparator,
            strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
            strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
    if (decimalKey >= 0) {
        decimalUniSet = unisets::get(decimalKey);
    } else if (!decimalSeparator.isEmpty()) {
        // Unknown separator: fall back to a one-element set owned by this matcher.
        auto* set = new UnicodeSet();
        set->add(decimalSeparator.char32At(0));
        set->freeze();
        decimalUniSet = set;
        fLocalDecimalUniSet.adoptInstead(set);
    } else {
        decimalUniSet = unisets::get(unisets::EMPTY);
    }

    if (groupingKey >= 0 && decimalKey >= 0) {
        // Everything is available in the static cache
        separatorSet = groupingUniSet;
        // leadSet is the pre-filter used by smokeTest(), so it must be at least as wide as
        // what match() accepts: the lenient set in lenient mode, the strict set in strict mode.
        // NOTE(review): assumes the strict set covers every separator chosen above in strict
        // mode; confirm against the set definitions in static_unicode_sets.
        leadSet = unisets::get(
                strictSeparators ? unisets::DIGITS_OR_STRICT_ALL_SEPARATORS
                                 : unisets::DIGITS_OR_ALL_SEPARATORS);
    } else {
        // At least one separator set is custom: build the union locally and skip the
        // static lead set (smokeTest() falls back to separatorSet in that case).
        auto* set = new UnicodeSet();
        set->addAll(*groupingUniSet);
        set->addAll(*decimalUniSet);
        set->freeze();
        separatorSet = set;
        fLocalSeparatorSet.adoptInstead(set);
        leadSet = nullptr;
    }

    // Digit strings are only stored when the zero digit is not a code point that u_isdigit()
    // recognizes with decimal value 0; otherwise match() resolves digits by code point alone.
    UChar32 cpZero = symbols.getCodePointZero();
    if (cpZero == -1 || !u_isdigit(cpZero) || u_digit(cpZero, 10) != 0) {
        // Uncommon case: okay to allocate.
        auto* digitStrings = new UnicodeString[10];
        fLocalDigitStrings.adoptInstead(digitStrings);
        for (int32_t i = 0; i <= 9; i++) {
            digitStrings[i] = symbols.getConstDigitSymbol(i);
        }
    }

    requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
    groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
    integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
    grouping1 = grouper.getPrimary();
    grouping2 = grouper.getSecondary();

    // Fraction grouping parsing is disabled for now but could be enabled later.
    // See https://unicode-org.atlassian.net/browse/ICU-10794
    // fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
}

// Convenience overload: parses an ordinary number by forwarding to the four-argument
// overload with an exponent sign of zero.
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    // Zero means "not an exponent"; the four-argument overload then stores the digits
    // directly into the result instead of applying them as a magnitude shift.
    const int8_t noExponentSign = 0;
    return match(segment, result, noExponentSign, status);
}

// Consumes digits, grouping separators and at most one decimal separator from the front of
// `segment`.
//
// segment:      input being parsed; its offset is advanced past the accepted characters, or
//               restored to its value on entry when nothing usable was found.
// result:       receives the parsed value. When exponentSign is 0 the digits become
//               result.quantity; otherwise the digits are treated as an exponent and applied
//               to the existing result.quantity as a magnitude adjustment of
//               exponentSign * digits.
// exponentSign: 0 for an ordinary number; nonzero when parsing the digits of an exponent
//               (the overflow handling below treats -1 as a negative exponent).
// The UErrorCode parameter is unnamed and not used by this function.
//
// Returns whether the caller should ask for more characters: true when the whole segment was
// consumed or a partial match was seen, false otherwise.
bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
                           UErrorCode&) const {
    if (result.seenNumber() && exponentSign == 0) {
        // A number has already been consumed.
        return false;
    } else if (exponentSign != 0) {
        // scientific notation always comes after the number
        U_ASSERT(!result.quantity.bogus);
    }

    // Initial offset before any character consumption.
    int32_t initialOffset = segment.getOffset();

    // Return value: whether to ask for more characters.
    bool maybeMore = false;

    // All digits consumed so far.
    // Starts out bogus so that "no digit seen yet" can be told apart from a parsed zero.
    number::impl::DecimalQuantity digitsConsumed;
    digitsConsumed.bogus = true;

    // The total number of digits after the decimal place, used for scaling the result.
    int32_t digitsAfterDecimalPlace = 0;

    // The actual grouping and decimal separators used in the string.
    // If non-bogus, we have seen that token.
    UnicodeString actualGroupingString;
    UnicodeString actualDecimalString;
    actualGroupingString.setToBogus();
    actualDecimalString.setToBogus();

    // Information for two groups: the previous group and the current group.
    //
    // Each group has three pieces of information:
    //
    // Offset: the string position of the beginning of the group, including a leading separator
    // if there was a leading separator. This is needed in case we need to rewind the parse to
    // that position.
    //
    // Separator type:
    // 0 => beginning of string
    // 1 => lead separator is a grouping separator
    // 2 => lead separator is a decimal separator
    //
    // Count: the number of digits in the group. If -1, the group has been validated.
    //
    // NOTE(review): currGroupOffset starts at 0 rather than initialOffset, and it can reach
    // segment.setOffset() below without having been updated; confirm this is intended when the
    // segment's offset is nonzero on entry.
    int32_t currGroupOffset = 0;
    int32_t currGroupSepType = 0;
    int32_t currGroupCount = 0;
    int32_t prevGroupOffset = -1;
    int32_t prevGroupSepType = -1;
    int32_t prevGroupCount = -1;

    // Main loop: each iteration consumes either one digit or one separator, and stops at the
    // first character that is neither (or at a separator that must not be accepted).
    while (segment.length() > 0) {
        maybeMore = false;

        // Attempt to match a digit.
        int8_t digit = -1;

        // Try by code point digit value.
        UChar32 cp = segment.getCodePoint();
        if (u_isdigit(cp)) {
            segment.adjustOffset(U16_LENGTH(cp));
            digit = static_cast<int8_t>(u_digit(cp, 10));
        }

        // Try by digit string.
        // Only available when the constructor stored per-digit strings.
        if (digit == -1 && !fLocalDigitStrings.isNull()) {
            for (int32_t i = 0; i < 10; i++) {
                const UnicodeString& str = fLocalDigitStrings[i];
                if (str.isEmpty()) {
                    continue;
                }
                int32_t overlap = segment.getCommonPrefixLength(str);
                if (overlap == str.length()) {
                    segment.adjustOffset(overlap);
                    digit = static_cast<int8_t>(i);
                    break;
                }
                // The whole remaining input is a prefix of this digit string: more input
                // could complete it.
                maybeMore = maybeMore || (overlap == segment.length());
            }
        }

        if (digit >= 0) {
            // Digit was found.
            if (digitsConsumed.bogus) {
                digitsConsumed.bogus = false;
                digitsConsumed.clear();
            }
            digitsConsumed.appendDigit(digit, 0, true);
            currGroupCount++;
            if (!actualDecimalString.isBogus()) {
                digitsAfterDecimalPlace++;
            }
            continue;
        }

        // Attempt to match a literal grouping or decimal separator.
        bool isDecimal = false;
        bool isGrouping = false;

        // 1) Attempt the decimal separator string literal.
        // if (we have not seen a decimal separator yet) { ... }
        if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
            int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == decimalSeparator.length()) {
                isDecimal = true;
                actualDecimalString = decimalSeparator;
            }
        }

        // 2) Attempt to match the actual grouping string literal.
        // Once a grouping separator has been seen, only that exact string is accepted again.
        if (!actualGroupingString.isBogus()) {
            int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == actualGroupingString.length()) {
                isGrouping = true;
            }
        }

        // 2.5) Attempt to match a new grouping separator string literal.
        // if (we have not seen a grouping or decimal separator yet) { ... }
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
            !groupingSeparator.isEmpty()) {
            int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
            maybeMore = maybeMore || (overlap == segment.length());
            if (overlap == groupingSeparator.length()) {
                isGrouping = true;
                actualGroupingString = groupingSeparator;
            }
        }

        // 3) Attempt to match a decimal separator from the equivalence set.
        // if (we have not seen a decimal separator yet) { ... }
        // The !isGrouping is to confirm that we haven't yet matched the current character.
        if (!isGrouping && actualDecimalString.isBogus()) {
            if (decimalUniSet->contains(cp)) {
                isDecimal = true;
                actualDecimalString = UnicodeString(cp);
            }
        }

        // 4) Attempt to match a grouping separator from the equivalence set.
        // if (we have not seen a grouping or decimal separator yet) { ... }
        if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
            if (groupingUniSet->contains(cp)) {
                isGrouping = true;
                actualGroupingString = UnicodeString(cp);
            }
        }

        // Leave if we failed to match this as a separator.
        if (!isDecimal && !isGrouping) {
            break;
        }

        // Check for conditions when we don't want to accept the separator.
        if (isDecimal && integerOnly) {
            break;
        } else if (currGroupSepType == 2 && isGrouping) {
            // Fraction grouping
            break;
        }

        // Validate intermediate grouping sizes.
        bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
        bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
        if (!prevValidSecondary || (isDecimal && !currValidPrimary)) {
            // Invalid grouping sizes.
            if (isGrouping && currGroupCount == 0) {
                // Trailing grouping separators: these are taken care of below
                U_ASSERT(currGroupSepType == 1);
            } else if (requireGroupingMatch) {
                // Strict mode: reject the parse
                digitsConsumed.clear();
                digitsConsumed.bogus = true;
            }
            break;
        } else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
            // Strict grouping: a second grouping separator with no digits in between ends
            // the number here.
            break;
        } else {
            // Grouping sizes OK so far.
            prevGroupOffset = currGroupOffset;
            prevGroupCount = currGroupCount;
            if (isDecimal) {
                // Do not validate this group any more.
                prevGroupSepType = -1;
            } else {
                prevGroupSepType = currGroupSepType;
            }
        }

        // OK to accept the separator.
        // Special case: don't update currGroup if it is empty; this allows two grouping
        // separators in a row in lenient mode.
        if (currGroupCount != 0) {
            currGroupOffset = segment.getOffset();
        }
        currGroupSepType = isGrouping ? 1 : 2;
        currGroupCount = 0;
        if (isGrouping) {
            segment.adjustOffset(actualGroupingString.length());
        } else {
            segment.adjustOffset(actualDecimalString.length());
        }
    }

    // End of main loop.
    // Back up if there was a trailing grouping separator.
    // Shift prev -> curr so we can check it as a final group.
    if (currGroupSepType != 2 && currGroupCount == 0) {
        maybeMore = true;
        segment.setOffset(currGroupOffset);
        currGroupOffset = prevGroupOffset;
        currGroupSepType = prevGroupSepType;
        currGroupCount = prevGroupCount;
        // Placeholder "previous" group; validateGroup() accepts these values as a first
        // group in lenient mode, and in strict mode whenever grouping2 is at least 1.
        prevGroupOffset = -1;
        prevGroupSepType = 0;
        prevGroupCount = 1;
    }

    // Validate final grouping sizes.
    bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
    bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
    if (!requireGroupingMatch) {
        // The cases we need to handle here are lone digits.
        // Examples: "1,1"  "1,1,"  "1,1,1"  "1,1,1,"  ",1" (all parse as 1)
        // See more examples in numberformattestspecification.txt
        int32_t digitsToRemove = 0;
        if (!prevValidSecondary) {
            segment.setOffset(prevGroupOffset);
            digitsToRemove += prevGroupCount;
            digitsToRemove += currGroupCount;
        } else if (!currValidPrimary && (prevGroupSepType != 0 || prevGroupCount != 0)) {
            maybeMore = true;
            segment.setOffset(currGroupOffset);
            digitsToRemove += currGroupCount;
        }
        if (digitsToRemove != 0) {
            // Drop the rejected trailing digits: shift them below the decimal point, then
            // truncate them away.
            digitsConsumed.adjustMagnitude(-digitsToRemove);
            digitsConsumed.truncate();
        }
        // Lenient mode never fails on grouping; the offending groups were just cut off.
        prevValidSecondary = true;
        currValidPrimary = true;
    }
    if (currGroupSepType != 2 && (!prevValidSecondary || !currValidPrimary)) {
        // Grouping failure.
        digitsConsumed.bogus = true;
    }

    // Strings that start with a separator but have no digits,
    // or strings that failed a grouping size check.
    if (digitsConsumed.bogus) {
        maybeMore = maybeMore || (segment.length() == 0);
        segment.setOffset(initialOffset);
        return maybeMore;
    }

    // We passed all inspections. Start post-processing.

    // Adjust for fraction part.
    digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);

    // Set the digits, either normal or exponent.
    if (exponentSign != 0 && segment.getOffset() != initialOffset) {
        // Exponent mode: the digits just parsed scale the number already held in result.
        bool overflow = false;
        if (digitsConsumed.fitsInLong()) {
            int64_t exponentLong = digitsConsumed.toLong(false);
            U_ASSERT(exponentLong >= 0);
            if (exponentLong <= INT32_MAX) {
                auto exponentInt = static_cast<int32_t>(exponentLong);
                // A true return from adjustMagnitude is treated as overflow.
                if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
                    overflow = true;
                }
            } else {
                overflow = true;
            }
        } else {
            overflow = true;
        }
        if (overflow) {
            if (exponentSign == -1) {
                // Set to zero
                result.quantity.clear();
            } else {
                // Set to infinity
                result.quantity.bogus = true;
                result.flags |= FLAG_INFINITY;
            }
        }
    } else {
        result.quantity = digitsConsumed;
    }

    // Set other information into the result and return.
    if (!actualDecimalString.isBogus()) {
        result.flags |= FLAG_HAS_DECIMAL_SEPARATOR;
    }
    result.setCharsConsumed(segment);
    return segment.length() == 0 || maybeMore;
}

// Checks whether a digit group has an acceptable size.
//
// sepType:   separator that introduced the group: -1 no such group, 0 start of the string,
//            1 grouping separator, 2 decimal separator.
// count:     number of digits in the group.
// isPrimary: true to check against the primary grouping size (grouping1), false to check
//            against the secondary grouping size (grouping2).
//
// Returns true when the group is acceptable under the current grouping strictness.
bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
    if (!requireGroupingMatch) {
        // Lenient mode.
        // #11230: don't accept middle groups with only 1 digit.
        return sepType != 1 || count != 1;
    }

    // Strict mode: group sizes must agree with the configured grouping sizes.
    switch (sepType) {
        case -1:
            // No such group (prevGroup before first shift).
            return true;
        case 0:
            // First group. As the primary group it means there were no grouping separators,
            // which is OK; otherwise it must be non-empty and no longer than grouping2.
            return isPrimary || (count != 0 && count <= grouping2);
        case 1:
            // Middle group: the size must match exactly.
            return isPrimary ? (count == grouping1) : (count == grouping2);
        default:
            U_ASSERT(sepType == 2);
            // After the decimal separator.
            return true;
    }
}

bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
    // The common case uses a static leadSet for efficiency.
    if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
        return segment.startsWith(*leadSet);
    }
    if (segment.startsWith(*separatorSet) || u_isdigit(segment.getCodePoint())) {
        return true;
    }
    if (fLocalDigitStrings.isNull()) {
        return false;
    }
    for (int32_t i = 0; i < 10; i++) {
        if (segment.startsWith(fLocalDigitStrings[i])) {
            return true;
        }
    }
    return false;
}

// Returns a fixed label identifying this matcher.
UnicodeString DecimalMatcher::toString() const {
    return UnicodeString(u"<Decimal>");
}


#endif /* #if !UCONFIG_NO_FORMATTING */

Coverage Report

Created: 2026-06-23 06:26

Line	Count	Source
1		// © 2018 and later: Unicode, Inc. and others.
2		// License & terms of use: http://www.unicode.org/copyright.html
3
4		#include "unicode/utypes.h"
5
6		#if !UCONFIG_NO_FORMATTING
7
8		// Allow implicit conversion from char16_t* to UnicodeString for this file:
9		// Helpful in toString methods and elsewhere.
10		#define UNISTR_FROM_STRING_EXPLICIT
11
12		#include "numparse_types.h"
13		#include "numparse_decimal.h"
14		#include "static_unicode_sets.h"
15		#include "numparse_utils.h"
16		#include "unicode/uchar.h"
17		#include "putilimp.h"
18		#include "number_decimalquantity.h"
19		#include "string_segment.h"
20
21		using namespace icu;
22		using namespace icu::numparse;
23		using namespace icu::numparse::impl;
24
25
26		DecimalMatcher::DecimalMatcher(const DecimalFormatSymbols& symbols, const Grouper& grouper,
27	0	parse_flags_t parseFlags) {
28	0	if (0 != (parseFlags & PARSE_FLAG_MONETARY_SEPARATORS)) {
29	0	groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetaryGroupingSeparatorSymbol);
30	0	decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kMonetarySeparatorSymbol);
31	0	} else {
32	0	groupingSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kGroupingSeparatorSymbol);
33	0	decimalSeparator = symbols.getConstSymbol(DecimalFormatSymbols::kDecimalSeparatorSymbol);
34	0	}
35	0	bool strictSeparators = 0 != (parseFlags & PARSE_FLAG_STRICT_SEPARATORS);
36
37		// Parsing is very lenient even in strict mode, almost any dot or comma is a
38		// grouping separator. Parsing strings like "1.234" in French was treating '.'
39		// like an ignorable grouping separator, and we want it to be excluded.
40		// We keep the public behavior when strictParse is false, but when it is true
41		// we restrict grouping separators to the smaller set of equivalents.
42	0	unisets::Key groupingKey = unisets::chooseFrom(groupingSeparator,
43	0	strictSeparators ? unisets::STRICT_COMMA : unisets::ALL_SEPARATORS,
44	0	strictSeparators ? unisets::STRICT_PERIOD : unisets::ALL_SEPARATORS);
45	0	if (groupingKey < 0) {
46	0	groupingKey = unisets::chooseFrom(
47	0	groupingSeparator, unisets::OTHER_GROUPING_SEPARATORS);
48	0	}
49	0	if (groupingKey >= 0) {
50		// Attempt to find separators in the static cache
51	0	groupingUniSet = unisets::get(groupingKey);
52	0	} else if (!groupingSeparator.isEmpty()) {
53	0	auto* set = new UnicodeSet();
54	0	set->add(groupingSeparator.char32At(0));
55	0	set->freeze();
56	0	groupingUniSet = set;
57	0	fLocalGroupingUniSet.adoptInstead(set);
58	0	} else {
59	0	groupingUniSet = unisets::get(unisets::EMPTY);
60	0	}
61
62	0	unisets::Key decimalKey = unisets::chooseFrom(
63	0	decimalSeparator,
64	0	strictSeparators ? unisets::STRICT_COMMA : unisets::COMMA,
65	0	strictSeparators ? unisets::STRICT_PERIOD : unisets::PERIOD);
66	0	if (decimalKey >= 0) {
67	0	decimalUniSet = unisets::get(decimalKey);
68	0	} else if (!decimalSeparator.isEmpty()) {
69	0	auto* set = new UnicodeSet();
70	0	set->add(decimalSeparator.char32At(0));
71	0	set->freeze();
72	0	decimalUniSet = set;
73	0	fLocalDecimalUniSet.adoptInstead(set);
74	0	} else {
75	0	decimalUniSet = unisets::get(unisets::EMPTY);
76	0	}
77
78	0	if (groupingKey >= 0 && decimalKey >= 0) {
79		// Everything is available in the static cache
80	0	separatorSet = groupingUniSet;
81	0	leadSet = unisets::get(
82	0	strictSeparators ? unisets::DIGITS_OR_ALL_SEPARATORS
83	0	: unisets::DIGITS_OR_STRICT_ALL_SEPARATORS);
84	0	} else {
85	0	auto* set = new UnicodeSet();
86	0	set->addAll(*groupingUniSet);
87	0	set->addAll(*decimalUniSet);
88	0	set->freeze();
89	0	separatorSet = set;
90	0	fLocalSeparatorSet.adoptInstead(set);
91	0	leadSet = nullptr;
92	0	}
93
94	0	UChar32 cpZero = symbols.getCodePointZero();
95	0	if (cpZero == -1 \|\| !u_isdigit(cpZero) \|\| u_digit(cpZero, 10) != 0) {
96		// Uncommon case: okay to allocate.
97	0	auto* digitStrings = new UnicodeString[10];
98	0	fLocalDigitStrings.adoptInstead(digitStrings);
99	0	for (int32_t i = 0; i <= 9; i++) {
100	0	digitStrings[i] = symbols.getConstDigitSymbol(i);
101	0	}
102	0	}
103
104	0	requireGroupingMatch = 0 != (parseFlags & PARSE_FLAG_STRICT_GROUPING_SIZE);
105	0	groupingDisabled = 0 != (parseFlags & PARSE_FLAG_GROUPING_DISABLED);
106	0	integerOnly = 0 != (parseFlags & PARSE_FLAG_INTEGER_ONLY);
107	0	grouping1 = grouper.getPrimary();
108	0	grouping2 = grouper.getSecondary();
109
110		// Fraction grouping parsing is disabled for now but could be enabled later.
111		// See https://unicode-org.atlassian.net/browse/ICU-10794
112		// fractionGrouping = 0 != (parseFlags & PARSE_FLAG_FRACTION_GROUPING_ENABLED);
113	0	}
114
115	0	bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
116	0	return match(segment, result, 0, status);
117	0	}
118
119		bool DecimalMatcher::match(StringSegment& segment, ParsedNumber& result, int8_t exponentSign,
120	0	UErrorCode&) const {
121	0	if (result.seenNumber() && exponentSign == 0) {
122		// A number has already been consumed.
123	0	return false;
124	0	} else if (exponentSign != 0) {
125		// scientific notation always comes after the number
126	0	U_ASSERT(!result.quantity.bogus);
127	0	}
128
129		// Initial offset before any character consumption.
130	0	int32_t initialOffset = segment.getOffset();
131
132		// Return value: whether to ask for more characters.
133	0	bool maybeMore = false;
134
135		// All digits consumed so far.
136	0	number::impl::DecimalQuantity digitsConsumed;
137	0	digitsConsumed.bogus = true;
138
139		// The total number of digits after the decimal place, used for scaling the result.
140	0	int32_t digitsAfterDecimalPlace = 0;
141
142		// The actual grouping and decimal separators used in the string.
143		// If non-null, we have seen that token.
144	0	UnicodeString actualGroupingString;
145	0	UnicodeString actualDecimalString;
146	0	actualGroupingString.setToBogus();
147	0	actualDecimalString.setToBogus();
148
149		// Information for two groups: the previous group and the current group.
150		//
151		// Each group has three pieces of information:
152		//
153		// Offset: the string position of the beginning of the group, including a leading separator
154		// if there was a leading separator. This is needed in case we need to rewind the parse to
155		// that position.
156		//
157		// Separator type:
158		// 0 => beginning of string
159		// 1 => lead separator is a grouping separator
160		// 2 => lead separator is a decimal separator
161		//
162		// Count: the number of digits in the group. If -1, the group has been validated.
163	0	int32_t currGroupOffset = 0;
164	0	int32_t currGroupSepType = 0;
165	0	int32_t currGroupCount = 0;
166	0	int32_t prevGroupOffset = -1;
167	0	int32_t prevGroupSepType = -1;
168	0	int32_t prevGroupCount = -1;
169
170	0	while (segment.length() > 0) {
171	0	maybeMore = false;
172
173		// Attempt to match a digit.
174	0	int8_t digit = -1;
175
176		// Try by code point digit value.
177	0	UChar32 cp = segment.getCodePoint();
178	0	if (u_isdigit(cp)) {
179	0	segment.adjustOffset(U16_LENGTH(cp));
180	0	digit = static_cast<int8_t>(u_digit(cp, 10));
181	0	}
182
183		// Try by digit string.
184	0	if (digit == -1 && !fLocalDigitStrings.isNull()) {
185	0	for (int32_t i = 0; i < 10; i++) {
186	0	const UnicodeString& str = fLocalDigitStrings[i];
187	0	if (str.isEmpty()) {
188	0	continue;
189	0	}
190	0	int32_t overlap = segment.getCommonPrefixLength(str);
191	0	if (overlap == str.length()) {
192	0	segment.adjustOffset(overlap);
193	0	digit = static_cast<int8_t>(i);
194	0	break;
195	0	}
196	0	maybeMore = maybeMore \|\| (overlap == segment.length());
197	0	}
198	0	}
199
200	0	if (digit >= 0) {
201		// Digit was found.
202	0	if (digitsConsumed.bogus) {
203	0	digitsConsumed.bogus = false;
204	0	digitsConsumed.clear();
205	0	}
206	0	digitsConsumed.appendDigit(digit, 0, true);
207	0	currGroupCount++;
208	0	if (!actualDecimalString.isBogus()) {
209	0	digitsAfterDecimalPlace++;
210	0	}
211	0	continue;
212	0	}
213
214		// Attempt to match a literal grouping or decimal separator.
215	0	bool isDecimal = false;
216	0	bool isGrouping = false;
217
218		// 1) Attempt the decimal separator string literal.
219		// if (we have not seen a decimal separator yet) { ... }
220	0	if (actualDecimalString.isBogus() && !decimalSeparator.isEmpty()) {
221	0	int32_t overlap = segment.getCommonPrefixLength(decimalSeparator);
222	0	maybeMore = maybeMore \|\| (overlap == segment.length());
223	0	if (overlap == decimalSeparator.length()) {
224	0	isDecimal = true;
225	0	actualDecimalString = decimalSeparator;
226	0	}
227	0	}
228
229		// 2) Attempt to match the actual grouping string literal.
230	0	if (!actualGroupingString.isBogus()) {
231	0	int32_t overlap = segment.getCommonPrefixLength(actualGroupingString);
232	0	maybeMore = maybeMore \|\| (overlap == segment.length());
233	0	if (overlap == actualGroupingString.length()) {
234	0	isGrouping = true;
235	0	}
236	0	}
237
238		// 2.5) Attempt to match a new the grouping separator string literal.
239		// if (we have not seen a grouping or decimal separator yet) { ... }
240	0	if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus() &&
241	0	!groupingSeparator.isEmpty()) {
242	0	int32_t overlap = segment.getCommonPrefixLength(groupingSeparator);
243	0	maybeMore = maybeMore \|\| (overlap == segment.length());
244	0	if (overlap == groupingSeparator.length()) {
245	0	isGrouping = true;
246	0	actualGroupingString = groupingSeparator;
247	0	}
248	0	}
249
250		// 3) Attempt to match a decimal separator from the equivalence set.
251		// if (we have not seen a decimal separator yet) { ... }
252		// The !isGrouping is to confirm that we haven't yet matched the current character.
253	0	if (!isGrouping && actualDecimalString.isBogus()) {
254	0	if (decimalUniSet->contains(cp)) {
255	0	isDecimal = true;
256	0	actualDecimalString = UnicodeString(cp);
257	0	}
258	0	}
259
260		// 4) Attempt to match a grouping separator from the equivalence set.
261		// if (we have not seen a grouping or decimal separator yet) { ... }
262	0	if (!groupingDisabled && actualGroupingString.isBogus() && actualDecimalString.isBogus()) {
263	0	if (groupingUniSet->contains(cp)) {
264	0	isGrouping = true;
265	0	actualGroupingString = UnicodeString(cp);
266	0	}
267	0	}
268
269		// Leave if we failed to match this as a separator.
270	0	if (!isDecimal && !isGrouping) {
271	0	break;
272	0	}
273
274		// Check for conditions when we don't want to accept the separator.
275	0	if (isDecimal && integerOnly) {
276	0	break;
277	0	} else if (currGroupSepType == 2 && isGrouping) {
278		// Fraction grouping
279	0	break;
280	0	}
281
282		// Validate intermediate grouping sizes.
283	0	bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
284	0	bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
285	0	if (!prevValidSecondary \|\| (isDecimal && !currValidPrimary)) {
286		// Invalid grouping sizes.
287	0	if (isGrouping && currGroupCount == 0) {
288		// Trailing grouping separators: these are taken care of below
289	0	U_ASSERT(currGroupSepType == 1);
290	0	} else if (requireGroupingMatch) {
291		// Strict mode: reject the parse
292	0	digitsConsumed.clear();
293	0	digitsConsumed.bogus = true;
294	0	}
295	0	break;
296	0	} else if (requireGroupingMatch && currGroupCount == 0 && currGroupSepType == 1) {
297	0	break;
298	0	} else {
299		// Grouping sizes OK so far.
300	0	prevGroupOffset = currGroupOffset;
301	0	prevGroupCount = currGroupCount;
302	0	if (isDecimal) {
303		// Do not validate this group any more.
304	0	prevGroupSepType = -1;
305	0	} else {
306	0	prevGroupSepType = currGroupSepType;
307	0	}
308	0	}
309
310		// OK to accept the separator.
311		// Special case: don't update currGroup if it is empty; this allows two grouping
312		// separators in a row in lenient mode.
313	0	if (currGroupCount != 0) {
314	0	currGroupOffset = segment.getOffset();
315	0	}
316	0	currGroupSepType = isGrouping ? 1 : 2;
317	0	currGroupCount = 0;
318	0	if (isGrouping) {
319	0	segment.adjustOffset(actualGroupingString.length());
320	0	} else {
321	0	segment.adjustOffset(actualDecimalString.length());
322	0	}
323	0	}
324
325		// End of main loop.
326		// Back up if there was a trailing grouping separator.
327		// Shift prev -> curr so we can check it as a final group.
328	0	if (currGroupSepType != 2 && currGroupCount == 0) {
329	0	maybeMore = true;
330	0	segment.setOffset(currGroupOffset);
331	0	currGroupOffset = prevGroupOffset;
332	0	currGroupSepType = prevGroupSepType;
333	0	currGroupCount = prevGroupCount;
334	0	prevGroupOffset = -1;
335	0	prevGroupSepType = 0;
336	0	prevGroupCount = 1;
337	0	}
338
339		// Validate final grouping sizes.
340	0	bool prevValidSecondary = validateGroup(prevGroupSepType, prevGroupCount, false);
341	0	bool currValidPrimary = validateGroup(currGroupSepType, currGroupCount, true);
342	0	if (!requireGroupingMatch) {
343		// The cases we need to handle here are lone digits.
344		// Examples: "1,1" "1,1," "1,1,1" "1,1,1," ",1" (all parse as 1)
345		// See more examples in numberformattestspecification.txt
346	0	int32_t digitsToRemove = 0;
347	0	if (!prevValidSecondary) {
348	0	segment.setOffset(prevGroupOffset);
349	0	digitsToRemove += prevGroupCount;
350	0	digitsToRemove += currGroupCount;
351	0	} else if (!currValidPrimary && (prevGroupSepType != 0 \|\| prevGroupCount != 0)) {
352	0	maybeMore = true;
353	0	segment.setOffset(currGroupOffset);
354	0	digitsToRemove += currGroupCount;
355	0	}
356	0	if (digitsToRemove != 0) {
357	0	digitsConsumed.adjustMagnitude(-digitsToRemove);
358	0	digitsConsumed.truncate();
359	0	}
360	0	prevValidSecondary = true;
361	0	currValidPrimary = true;
362	0	}
363	0	if (currGroupSepType != 2 && (!prevValidSecondary \|\| !currValidPrimary)) {
364		// Grouping failure.
365	0	digitsConsumed.bogus = true;
366	0	}
367
368		// Strings that start with a separator but have no digits,
369		// or strings that failed a grouping size check.
370	0	if (digitsConsumed.bogus) {
371	0	maybeMore = maybeMore \|\| (segment.length() == 0);
372	0	segment.setOffset(initialOffset);
373	0	return maybeMore;
374	0	}
375
376		// We passed all inspections. Start post-processing.
377
378		// Adjust for fraction part.
379	0	digitsConsumed.adjustMagnitude(-digitsAfterDecimalPlace);
380
381		// Set the digits, either normal or exponent.
382	0	if (exponentSign != 0 && segment.getOffset() != initialOffset) {
383	0	bool overflow = false;
384	0	if (digitsConsumed.fitsInLong()) {
385	0	int64_t exponentLong = digitsConsumed.toLong(false);
386	0	U_ASSERT(exponentLong >= 0);
387	0	if (exponentLong <= INT32_MAX) {
388	0	auto exponentInt = static_cast<int32_t>(exponentLong);
389	0	if (result.quantity.adjustMagnitude(exponentSign * exponentInt)) {
390	0	overflow = true;
391	0	}
392	0	} else {
393	0	overflow = true;
394	0	}
395	0	} else {
396	0	overflow = true;
397	0	}
398	0	if (overflow) {
399	0	if (exponentSign == -1) {
400		// Set to zero
401	0	result.quantity.clear();
402	0	} else {
403		// Set to infinity
404	0	result.quantity.bogus = true;
405	0	result.flags \|= FLAG_INFINITY;
406	0	}
407	0	}
408	0	} else {
409	0	result.quantity = digitsConsumed;
410	0	}
411
412		// Set other information into the result and return.
413	0	if (!actualDecimalString.isBogus()) {
414	0	result.flags \|= FLAG_HAS_DECIMAL_SEPARATOR;
415	0	}
416	0	result.setCharsConsumed(segment);
417	0	return segment.length() == 0 \|\| maybeMore;
418	0	}
419
420	0	bool DecimalMatcher::validateGroup(int32_t sepType, int32_t count, bool isPrimary) const {
421	0	if (requireGroupingMatch) {
422	0	if (sepType == -1) {
423		// No such group (prevGroup before first shift).
424	0	return true;
425	0	} else if (sepType == 0) {
426		// First group.
427	0	if (isPrimary) {
428		// No grouping separators is OK.
429	0	return true;
430	0	} else {
431	0	return count != 0 && count <= grouping2;
432	0	}
433	0	} else if (sepType == 1) {
434		// Middle group.
435	0	if (isPrimary) {
436	0	return count == grouping1;
437	0	} else {
438	0	return count == grouping2;
439	0	}
440	0	} else {
441	0	U_ASSERT(sepType == 2);
442		// After the decimal separator.
443	0	return true;
444	0	}
445	0	} else {
446	0	if (sepType == 1) {
447		// #11230: don't accept middle groups with only 1 digit.
448	0	return count != 1;
449	0	} else {
450	0	return true;
451	0	}
452	0	}
453	0	}
454
455	0	bool DecimalMatcher::smokeTest(const StringSegment& segment) const {
456		// The common case uses a static leadSet for efficiency.
457	0	if (fLocalDigitStrings.isNull() && leadSet != nullptr) {
458	0	return segment.startsWith(*leadSet);
459	0	}
460	0	if (segment.startsWith(*separatorSet) \|\| u_isdigit(segment.getCodePoint())) {
461	0	return true;
462	0	}
463	0	if (fLocalDigitStrings.isNull()) {
464	0	return false;
465	0	}
466	0	for (int32_t i = 0; i < 10; i++) {
467	0	if (segment.startsWith(fLocalDigitStrings[i])) {
468	0	return true;
469	0	}
470	0	}
471	0	return false;
472	0	}
473
474	0	UnicodeString DecimalMatcher::toString() const {
475	0	return u"<Decimal>";
476	0	}
477
478
479		#endif /* #if !UCONFIG_NO_FORMATTING */