Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/i18n/numparse_affixes.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2018 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
4
#include "unicode/utypes.h"
5
6
#if !UCONFIG_NO_FORMATTING
7
8
// Allow implicit conversion from char16_t* to UnicodeString for this file:
9
// Helpful in toString methods and elsewhere.
10
#define UNISTR_FROM_STRING_EXPLICIT
11
12
#include "numparse_types.h"
13
#include "numparse_affixes.h"
14
#include "numparse_utils.h"
15
#include "number_utils.h"
16
17
using namespace icu;
18
using namespace icu::numparse;
19
using namespace icu::numparse::impl;
20
using namespace icu::number;
21
using namespace icu::number::impl;
22
23
24
namespace {
25
26
/**
27
 * Helper method to return whether the given AffixPatternMatcher equals the given pattern string.
28
 * Either both arguments must be null or the pattern string inside the AffixPatternMatcher must equal
29
 * the given pattern string.
30
 */
31
0
static bool matched(const AffixPatternMatcher* affix, const UnicodeString& patternString) {
32
0
    return (affix == nullptr && patternString.isBogus()) ||
33
0
           (affix != nullptr && affix->getPattern() == patternString);
34
0
}
35
36
/**
37
 * Helper method to return the length of the given AffixPatternMatcher. Returns 0 for null.
38
 */
39
0
static int32_t length(const AffixPatternMatcher* matcher) {
40
0
    return matcher == nullptr ? 0 : matcher->getPattern().length();
41
0
}
42
43
/**
44
 * Helper method to return whether (1) both lhs and rhs are null/invalid, or (2) if they are both
45
 * valid, whether they are equal according to operator==.  Similar to Java Objects.equals()
46
 */
47
0
static bool equals(const AffixPatternMatcher* lhs, const AffixPatternMatcher* rhs) {
48
0
    if (lhs == nullptr && rhs == nullptr) {
49
0
        return true;
50
0
    }
51
0
    if (lhs == nullptr || rhs == nullptr) {
52
0
        return false;
53
0
    }
54
0
    return *lhs == *rhs;
55
0
}
56
57
}
58
59
60
/**
 * Creates a builder with no matchers collected yet.
 *
 * @param pattern    The affix pattern string; handed to the AffixPatternMatcher in build().
 * @param warehouse  Source of the token matchers added by consumeToken().
 * @param ignorables Matcher inserted between tokens; when null, no ignorables matchers are added.
 */
AffixPatternMatcherBuilder::AffixPatternMatcherBuilder(
        const UnicodeString& pattern,
        AffixTokenMatcherWarehouse& warehouse,
        IgnorablesMatcher* ignorables)
        : fMatchersLen(0), fLastTypeOrCp(0), fPattern(pattern), fWarehouse(warehouse), fIgnorables(ignorables) {
}
68
69
0
/**
 * Token callback: appends the matcher(s) for one affix-pattern token.
 *
 * @param type   Token type; TYPE_CODEPOINT means the token is the literal code point cp.
 * @param cp     The literal code point (stored as the "last token" only for TYPE_CODEPOINT).
 * @param status Forwarded to the warehouse when a currency matcher is created.
 */
void AffixPatternMatcherBuilder::consumeToken(AffixPatternType type, UChar32 cp, UErrorCode& status) {
    // Invoked for every token by AffixUtils::iterateWithConsumer().

    // Put an ignorables matcher in front of this token, unless nothing has been added yet or the
    // previous token was a literal contained in the ignorables set (that would be redundant).
    // A negative fLastTypeOrCp is never looked up in the set.
    if (fIgnorables != nullptr && fMatchersLen > 0) {
        const bool previousWasIgnorable =
                fLastTypeOrCp >= 0 && fIgnorables->getSet()->contains(fLastTypeOrCp);
        if (!previousWasIgnorable) {
            addMatcher(*fIgnorables);
        }
    }

    if (type == TYPE_CODEPOINT) {
        // Literal token. An ignorable literal needs no matcher of its own; any other literal
        // gets a dedicated code point matcher from the warehouse.
        const bool literalIsIgnorable =
                fIgnorables != nullptr && fIgnorables->getSet()->contains(cp);
        if (!literalIsIgnorable) {
            addMatcher(fWarehouse.nextCodePointMatcher(cp));
        }
        fLastTypeOrCp = cp;
        return;
    }

    // Symbol token: map the type onto the corresponding warehouse matcher.
    switch (type) {
        case TYPE_MINUS_SIGN:
            addMatcher(fWarehouse.minusSign());
            break;
        case TYPE_PLUS_SIGN:
            addMatcher(fWarehouse.plusSign());
            break;
        case TYPE_PERCENT:
            addMatcher(fWarehouse.percent());
            break;
        case TYPE_PERMILLE:
            addMatcher(fWarehouse.permille());
            break;
        case TYPE_CURRENCY_SINGLE:
        case TYPE_CURRENCY_DOUBLE:
        case TYPE_CURRENCY_TRIPLE:
        case TYPE_CURRENCY_QUAD:
        case TYPE_CURRENCY_QUINT:
            // Every currency width shares one matcher.
            addMatcher(fWarehouse.currency(status));
            break;
        default:
            U_ASSERT(FALSE);
    }
    fLastTypeOrCp = type;
}
116
117
0
/**
 * Appends a pointer to the given matcher to fMatchers, doubling the array first when it is full.
 * Only the address is stored; the matcher itself is not copied.
 */
void AffixPatternMatcherBuilder::addMatcher(NumberParseMatcher& matcher) {
    const bool arrayIsFull = fMatchersLen >= fMatchers.getCapacity();
    if (arrayIsFull) {
        // Second argument: number of existing entries to carry over.
        fMatchers.resize(fMatchersLen * 2, fMatchersLen);
    }
    fMatchers[fMatchersLen] = &matcher;
    fMatchersLen++;
}
123
124
0
/**
 * Produces an AffixPatternMatcher from the matchers collected so far and the pattern string
 * this builder was constructed with.
 */
AffixPatternMatcher AffixPatternMatcherBuilder::build() {
    AffixPatternMatcher assembled(fMatchers, fMatchersLen, fPattern);
    return assembled;
}
127
128
129
/** Creates an empty warehouse: no code point matchers handed out and no overflow batches. */
CodePointMatcherWarehouse::CodePointMatcherWarehouse()
        : codePointCount(0),
          codePointNumBatches(0) {
}
131
132
0
/** Releases every overflow batch that nextCodePointMatcher() allocated with new[]. */
CodePointMatcherWarehouse::~CodePointMatcherWarehouse() {
    // The number of batches varies; only the first codePointNumBatches entries are live.
    int32_t batch = 0;
    while (batch < codePointNumBatches) {
        delete[] codePointsOverflow[batch];
        ++batch;
    }
}
138
139
/**
 * Move constructor: takes over the source's matchers, overflow batches and counters.
 *
 * The source is left with zero counts so that its destructor, which delete[]s the first
 * codePointNumBatches overflow entries, does not release batches that now belong to this
 * object. Without that reset the same batch pointers could be deleted twice.
 */
CodePointMatcherWarehouse::CodePointMatcherWarehouse(CodePointMatcherWarehouse&& src) U_NOEXCEPT
        : codePoints(std::move(src.codePoints)),
          codePointsOverflow(std::move(src.codePointsOverflow)),
          codePointCount(src.codePointCount),
          codePointNumBatches(src.codePointNumBatches) {
    // Ownership of the heap batches has been transferred; make the source forget them.
    src.codePointCount = 0;
    src.codePointNumBatches = 0;
}
144
145
/**
 * Move assignment: replaces this warehouse's contents with those of src.
 *
 * Batches currently owned by this object are released first (they would otherwise leak when
 * the pointers are overwritten), and the source is left with zero counts so that its
 * destructor does not delete[] batches that have been transferred here. Self-assignment is a
 * no-op.
 */
CodePointMatcherWarehouse&
CodePointMatcherWarehouse::operator=(CodePointMatcherWarehouse&& src) U_NOEXCEPT {
    if (this == &src) {
        return *this;
    }

    // Free the overflow batches this object owns before taking over the source's.
    for (int32_t i = 0; i < codePointNumBatches; i++) {
        delete[] codePointsOverflow[i];
    }

    codePoints = std::move(src.codePoints);
    codePointsOverflow = std::move(src.codePointsOverflow);
    codePointCount = src.codePointCount;
    codePointNumBatches = src.codePointNumBatches;

    // Ownership of the heap batches has been transferred; make the source forget them.
    src.codePointCount = 0;
    src.codePointNumBatches = 0;
    return *this;
}
153
154
0
/**
 * Hands out a matcher for the code point cp, stored inside this warehouse.
 *
 * The first CODE_POINT_STACK_CAPACITY matchers live in the codePoints member; later ones are
 * placed in batches of CODE_POINT_BATCH_SIZE allocated with new[] and tracked in
 * codePointsOverflow.
 *
 * @return Reference to the newly assigned matcher slot.
 */
NumberParseMatcher& CodePointMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
    if (codePointCount < CODE_POINT_STACK_CAPACITY) {
        auto& inlineSlot = codePoints[codePointCount++];
        inlineSlot = {cp};
        return inlineSlot;
    }

    const int32_t capacitySoFar =
            CODE_POINT_STACK_CAPACITY + codePointNumBatches * CODE_POINT_BATCH_SIZE;
    if (codePointCount >= capacitySoFar) {
        // All existing batches are full: allocate another one.
        auto* freshBatch = new CodePointMatcher[CODE_POINT_BATCH_SIZE];
        if (codePointNumBatches >= codePointsOverflow.getCapacity()) {
            // The array of batch pointers is full as well; double it, keeping existing entries.
            codePointsOverflow.resize(codePointNumBatches * 2, codePointNumBatches);
        }
        codePointsOverflow[codePointNumBatches++] = freshBatch;
    }

    // Position inside the most recent batch, based on the count before this insertion.
    const int32_t indexInBatch =
            (codePointCount - CODE_POINT_STACK_CAPACITY) % CODE_POINT_BATCH_SIZE;
    codePointCount++;
    auto& overflowSlot = codePointsOverflow[codePointNumBatches - 1][indexInBatch];
    overflowSlot = {cp};
    return overflowSlot;
}
171
172
173
/**
 * Stores the pointer to the setup data that the token matcher accessors read from.
 * The pointer is kept as-is; nothing is copied here.
 */
AffixTokenMatcherWarehouse::AffixTokenMatcherWarehouse(const AffixTokenMatcherSetupData* setupData)
        : fSetupData(setupData) {
}
175
176
0
/**
 * (Re)assigns the fMinusSign member from the setup data's dfs, with `true` as the second
 * constructor argument, and returns a reference to that member.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::minusSign() {
    fMinusSign = {fSetupData->dfs, true};
    return fMinusSign;
}
179
180
0
/**
 * (Re)assigns the fPlusSign member from the setup data's dfs, with `true` as the second
 * constructor argument, and returns a reference to that member.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::plusSign() {
    fPlusSign = {fSetupData->dfs, true};
    return fPlusSign;
}
183
184
0
/**
 * (Re)assigns the fPercent member from the setup data's dfs and returns a reference to it.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::percent() {
    fPercent = {fSetupData->dfs};
    return fPercent;
}
187
188
0
/**
 * (Re)assigns the fPermille member from the setup data's dfs and returns a reference to it.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::permille() {
    fPermille = {fSetupData->dfs};
    return fPermille;
}
191
192
0
/**
 * (Re)assigns the fCurrency member from the setup data's currency symbols, dfs and parse
 * flags, and returns a reference to it.
 *
 * @param status Passed through to the currency matcher's construction.
 */
NumberParseMatcher& AffixTokenMatcherWarehouse::currency(UErrorCode& status) {
    fCurrency = {
            fSetupData->currencySymbols,
            fSetupData->dfs,
            fSetupData->parseFlags,
            status};
    return fCurrency;
}
195
196
0
/** Returns the ignorables matcher referenced by the setup data (not a copy). */
IgnorablesMatcher& AffixTokenMatcherWarehouse::ignorables() {
    IgnorablesMatcher& fromSetup = fSetupData->ignorables;
    return fromSetup;
}
199
200
0
/** Delegates to the embedded code point warehouse to obtain a matcher for cp. */
NumberParseMatcher& AffixTokenMatcherWarehouse::nextCodePointMatcher(UChar32 cp) {
    NumberParseMatcher& created = fCodePoints.nextCodePointMatcher(cp);
    return created;
}
203
204
205
/** Creates a matcher for the single code point cp. */
CodePointMatcher::CodePointMatcher(UChar32 cp) : fCp(cp) {
}
207
208
0
/**
 * If the segment starts with this matcher's code point, advances the segment past it and
 * records the consumed characters in result.
 *
 * @return Always false, whether or not the code point was consumed.
 */
bool CodePointMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode&) const {
    if (!segment.startsWith(fCp)) {
        return false;
    }
    segment.adjustOffsetByCodePoint();
    result.setCharsConsumed(segment);
    return false;
}
215
216
0
bool CodePointMatcher::smokeTest(const StringSegment& segment) const {
217
0
    return segment.startsWith(fCp);
218
0
}
219
220
0
/** Returns a fixed debug label; the matched code point itself is not included. */
UnicodeString CodePointMatcher::toString() const {
    return UnicodeString(u"<CodePoint>");
}
223
224
225
/**
 * Builds an AffixPatternMatcher for the given affix pattern.
 *
 * @param affixPattern   Pattern to tokenize; an empty pattern yields a default-constructed
 *                       matcher and *success == false.
 * @param tokenWarehouse Supplies the token matchers and, unless exact-affix mode is on, the
 *                       ignorables matcher.
 * @param parseFlags     When PARSE_FLAG_EXACT_AFFIX is set, no ignorables matcher is used.
 * @param success        Output: false for an empty pattern, true otherwise. Must not be null.
 * @param status         Passed to AffixUtils::iterateWithConsumer().
 */
AffixPatternMatcher AffixPatternMatcher::fromAffixPattern(const UnicodeString& affixPattern,
                                                          AffixTokenMatcherWarehouse& tokenWarehouse,
                                                          parse_flags_t parseFlags, bool* success,
                                                          UErrorCode& status) {
    *success = !affixPattern.isEmpty();
    if (!*success) {
        return {};
    }

    // Exact-affix mode disables the ignorables matcher entirely.
    const bool exactAffix = 0 != (parseFlags & PARSE_FLAG_EXACT_AFFIX);
    IgnorablesMatcher* ignorables = exactAffix ? nullptr : &tokenWarehouse.ignorables();

    // The builder receives one consumeToken() call per token of the pattern.
    AffixPatternMatcherBuilder builder(affixPattern, tokenWarehouse, ignorables);
    AffixUtils::iterateWithConsumer(affixPattern, builder, status);
    return builder.build();
}
246
247
/**
 * Initializes the ArraySeriesMatcher base with the given matcher array and length, and stores
 * the pattern string in fPattern.
 */
AffixPatternMatcher::AffixPatternMatcher(MatcherArray& matchers, int32_t matchersLen,
                                         const UnicodeString& pattern)
        : ArraySeriesMatcher(matchers, matchersLen),
          fPattern(pattern) {
}
250
251
0
/**
 * Returns the stored pattern as a UnicodeString obtained from
 * fPattern.toAliasedUnicodeString().
 * NOTE(review): presumably the result aliases this object's buffer rather than copying it,
 * so it should not outlive the matcher; confirm against the fPattern type.
 */
UnicodeString AffixPatternMatcher::getPattern() const {
    UnicodeString patternView = fPattern.toAliasedUnicodeString();
    return patternView;
}
254
255
0
/** Two AffixPatternMatchers are equal when their stored patterns compare equal. */
bool AffixPatternMatcher::operator==(const AffixPatternMatcher& other) const {
    const bool samePattern = (fPattern == other.fPattern);
    return samePattern;
}
258
259
260
/**
 * Stores the pointer to the token warehouse that createAffixMatchers() later dereferences.
 */
AffixMatcherWarehouse::AffixMatcherWarehouse(AffixTokenMatcherWarehouse* tokenWarehouse)
        : fTokenWarehouse(tokenWarehouse) {}
263
264
bool AffixMatcherWarehouse::isInteresting(const AffixPatternProvider& patternInfo,
265
                                          const IgnorablesMatcher& ignorables, parse_flags_t parseFlags,
266
0
                                          UErrorCode& status) {
267
0
    UnicodeString posPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_PREFIX);
268
0
    UnicodeString posSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_POS_SUFFIX);
269
0
    UnicodeString negPrefixString;
270
0
    UnicodeString negSuffixString;
271
0
    if (patternInfo.hasNegativeSubpattern()) {
272
0
        negPrefixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_PREFIX);
273
0
        negSuffixString = patternInfo.getString(AffixPatternProvider::AFFIX_NEG_SUFFIX);
274
0
    }
275
0
276
0
    if (0 == (parseFlags & PARSE_FLAG_USE_FULL_AFFIXES) &&
277
0
        AffixUtils::containsOnlySymbolsAndIgnorables(posPrefixString, *ignorables.getSet(), status) &&
278
0
        AffixUtils::containsOnlySymbolsAndIgnorables(posSuffixString, *ignorables.getSet(), status) &&
279
0
        AffixUtils::containsOnlySymbolsAndIgnorables(negPrefixString, *ignorables.getSet(), status) &&
280
0
        AffixUtils::containsOnlySymbolsAndIgnorables(negSuffixString, *ignorables.getSet(), status)
281
0
        // HACK: Plus and minus sign are a special case: we accept them trailing only if they are
282
0
        // trailing in the pattern string.
283
0
        && !AffixUtils::containsType(posSuffixString, TYPE_PLUS_SIGN, status) &&
284
0
        !AffixUtils::containsType(posSuffixString, TYPE_MINUS_SIGN, status) &&
285
0
        !AffixUtils::containsType(negSuffixString, TYPE_PLUS_SIGN, status) &&
286
0
        !AffixUtils::containsType(negSuffixString, TYPE_MINUS_SIGN, status)) {
287
0
        // The affixes contain only symbols and ignorables.
288
0
        // No need to generate affix matchers.
289
0
        return false;
290
0
    }
291
0
    return true;
292
0
}
293
294
void AffixMatcherWarehouse::createAffixMatchers(const AffixPatternProvider& patternInfo,
295
                                                MutableMatcherCollection& output,
296
                                                const IgnorablesMatcher& ignorables,
297
0
                                                parse_flags_t parseFlags, UErrorCode& status) {
298
0
    if (!isInteresting(patternInfo, ignorables, parseFlags, status)) {
299
0
        return;
300
0
    }
301
0
302
0
    // The affixes have interesting characters, or we are in strict mode.
303
0
    // Use initial capacity of 6, the highest possible number of AffixMatchers.
304
0
    UnicodeString sb;
305
0
    bool includeUnpaired = 0 != (parseFlags & PARSE_FLAG_INCLUDE_UNPAIRED_AFFIXES);
306
0
    UNumberSignDisplay signDisplay = (0 != (parseFlags & PARSE_FLAG_PLUS_SIGN_ALLOWED)) ? UNUM_SIGN_ALWAYS
307
0
                                                                                        : UNUM_SIGN_AUTO;
308
0
309
0
    int32_t numAffixMatchers = 0;
310
0
    int32_t numAffixPatternMatchers = 0;
311
0
312
0
    AffixPatternMatcher* posPrefix = nullptr;
313
0
    AffixPatternMatcher* posSuffix = nullptr;
314
0
315
0
    // Pre-process the affix strings to resolve LDML rules like sign display.
316
0
    for (int8_t signum = 1; signum >= -1; signum--) {
317
0
        // Generate Prefix
318
0
        bool hasPrefix = false;
319
0
        PatternStringUtils::patternInfoToStringBuilder(
320
0
                patternInfo, true, signum, signDisplay, StandardPlural::OTHER, false, sb);
321
0
        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
322
0
                sb, *fTokenWarehouse, parseFlags, &hasPrefix, status);
323
0
        AffixPatternMatcher* prefix = hasPrefix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
324
0
                                                : nullptr;
325
0
326
0
        // Generate Suffix
327
0
        bool hasSuffix = false;
328
0
        PatternStringUtils::patternInfoToStringBuilder(
329
0
                patternInfo, false, signum, signDisplay, StandardPlural::OTHER, false, sb);
330
0
        fAffixPatternMatchers[numAffixPatternMatchers] = AffixPatternMatcher::fromAffixPattern(
331
0
                sb, *fTokenWarehouse, parseFlags, &hasSuffix, status);
332
0
        AffixPatternMatcher* suffix = hasSuffix ? &fAffixPatternMatchers[numAffixPatternMatchers++]
333
0
                                                : nullptr;
334
0
335
0
        if (signum == 1) {
336
0
            posPrefix = prefix;
337
0
            posSuffix = suffix;
338
0
        } else if (equals(prefix, posPrefix) && equals(suffix, posSuffix)) {
339
0
            // Skip adding these matchers (we already have equivalents)
340
0
            continue;
341
0
        }
342
0
343
0
        // Flags for setting in the ParsedNumber; the token matchers may add more.
344
0
        int flags = (signum == -1) ? FLAG_NEGATIVE : 0;
345
0
346
0
        // Note: it is indeed possible for posPrefix and posSuffix to both be null.
347
0
        // We still need to add that matcher for strict mode to work.
348
0
        fAffixMatchers[numAffixMatchers++] = {prefix, suffix, flags};
349
0
        if (includeUnpaired && prefix != nullptr && suffix != nullptr) {
350
0
            // The following if statements are designed to prevent adding two identical matchers.
351
0
            if (signum == 1 || !equals(prefix, posPrefix)) {
352
0
                fAffixMatchers[numAffixMatchers++] = {prefix, nullptr, flags};
353
0
            }
354
0
            if (signum == 1 || !equals(suffix, posSuffix)) {
355
0
                fAffixMatchers[numAffixMatchers++] = {nullptr, suffix, flags};
356
0
            }
357
0
        }
358
0
    }
359
0
360
0
    // Put the AffixMatchers in order, and then add them to the output.
361
0
    // Since there are at most 9 elements, do a simple-to-implement bubble sort.
362
0
    bool madeChanges;
363
0
    do {
364
0
        madeChanges = false;
365
0
        for (int32_t i = 1; i < numAffixMatchers; i++) {
366
0
            if (fAffixMatchers[i - 1].compareTo(fAffixMatchers[i]) > 0) {
367
0
                madeChanges = true;
368
0
                AffixMatcher temp = std::move(fAffixMatchers[i - 1]);
369
0
                fAffixMatchers[i - 1] = std::move(fAffixMatchers[i]);
370
0
                fAffixMatchers[i] = std::move(temp);
371
0
            }
372
0
        }
373
0
    } while (madeChanges);
374
0
375
0
    for (int32_t i = 0; i < numAffixMatchers; i++) {
376
0
        // Enable the following line to debug affixes
377
0
        //std::cout << "Adding affix matcher: " << CStr(fAffixMatchers[i].toString())() << std::endl;
378
0
        output.addMatcher(fAffixMatchers[i]);
379
0
    }
380
0
}
381
382
383
/**
 * Creates a matcher for a prefix/suffix pair.
 *
 * @param prefix Prefix pattern matcher, or null for "no prefix". The pointer is stored as-is.
 * @param suffix Suffix pattern matcher, or null for "no suffix". The pointer is stored as-is.
 * @param flags  Flags OR-ed into the result by postProcess() when this affix pair matched.
 */
AffixMatcher::AffixMatcher(AffixPatternMatcher* prefix, AffixPatternMatcher* suffix, result_flags_t flags)
        : fPrefix(prefix),
          fSuffix(suffix),
          fFlags(flags) {
}
385
386
0
/**
 * Tries to match this affix pair's prefix (before a number has been seen) or suffix (after).
 *
 * When the attempted affix moves the segment offset, its pattern is recorded in
 * result.prefix or result.suffix respectively.
 *
 * @return The value returned by the underlying prefix/suffix matcher, or false when no
 *         attempt was made.
 */
bool AffixMatcher::match(StringSegment& segment, ParsedNumber& result, UErrorCode& status) const {
    if (!result.seenNumber()) {
        // Prefix side. Skip when a prefix has already been recorded (result.prefix is not
        // bogus) or when this AffixMatcher has no prefix.
        const bool prefixAlreadySeen = !result.prefix.isBogus();
        if (prefixAlreadySeen || fPrefix == nullptr) {
            return false;
        }

        const int offsetBefore = segment.getOffset();
        const bool maybeMore = fPrefix->match(segment, result, status);
        if (segment.getOffset() != offsetBefore) {
            // Something was consumed: remember which prefix it was.
            result.prefix = fPrefix->getPattern();
        }
        return maybeMore;
    }

    // Suffix side. Skip when a suffix has already been recorded, when this AffixMatcher has
    // no suffix, or when the recorded prefix is not this AffixMatcher's prefix.
    const bool suffixAlreadySeen = !result.suffix.isBogus();
    if (suffixAlreadySeen || fSuffix == nullptr || !matched(fPrefix, result.prefix)) {
        return false;
    }

    const int offsetBefore = segment.getOffset();
    const bool maybeMore = fSuffix->match(segment, result, status);
    if (segment.getOffset() != offsetBefore) {
        // Something was consumed: remember which suffix it was.
        result.suffix = fSuffix->getPattern();
    }
    return maybeMore;
}
423
424
0
bool AffixMatcher::smokeTest(const StringSegment& segment) const {
425
0
    return (fPrefix != nullptr && fPrefix->smokeTest(segment)) ||
426
0
           (fSuffix != nullptr && fSuffix->smokeTest(segment));
427
0
}
428
429
0
/**
 * If the prefix and suffix recorded in result are exactly this AffixMatcher's pair, applies
 * this matcher's flags to the result and lets the prefix/suffix matchers post-process it.
 * Otherwise the result is left untouched.
 */
void AffixMatcher::postProcess(ParsedNumber& result) const {
    // Only the affix pair that was actually matched may contribute.
    if (!matched(fPrefix, result.prefix) || !matched(fSuffix, result.suffix)) {
        return;
    }

    // Replace bogus (unset) affixes with empty strings. Strict mode uses this to tell that a
    // complete affix pair was matched.
    if (result.prefix.isBogus()) {
        result.prefix = UnicodeString();
    }
    if (result.suffix.isBogus()) {
        result.suffix = UnicodeString();
    }

    result.flags |= fFlags;

    if (fPrefix != nullptr) {
        fPrefix->postProcess(result);
    }
    if (fSuffix != nullptr) {
        fSuffix->postProcess(result);
    }
}
449
450
0
/**
 * Ordering used when sorting AffixMatchers: longer prefixes sort first; for equal prefix
 * lengths, longer suffixes sort first. A null affix counts as length 0.
 *
 * @return -1 if this matcher sorts before rhs, 1 if after, 0 if both lengths are equal.
 */
int8_t AffixMatcher::compareTo(const AffixMatcher& rhs) const {
    const int32_t ownPrefixLen = length(fPrefix);
    const int32_t otherPrefixLen = length(rhs.fPrefix);
    if (ownPrefixLen != otherPrefixLen) {
        return ownPrefixLen > otherPrefixLen ? -1 : 1;
    }

    const int32_t ownSuffixLen = length(fSuffix);
    const int32_t otherSuffixLen = length(rhs.fSuffix);
    if (ownSuffixLen != otherSuffixLen) {
        return ownSuffixLen > otherSuffixLen ? -1 : 1;
    }

    return 0;
}
460
461
0
/**
 * Debug representation of the form "<Affix PREFIX#SUFFIX>", with ":negative" inserted after
 * "<Affix" when FLAG_NEGATIVE is set and "null" standing in for a missing prefix or suffix.
 */
UnicodeString AffixMatcher::toString() const {
    const bool isNegative = 0 != (fFlags & FLAG_NEGATIVE);
    const char16_t* signTag = isNegative ? u":negative " : u" ";
    const UnicodeString prefixText = fPrefix ? fPrefix->getPattern() : u"null";
    const UnicodeString suffixText = fSuffix ? fSuffix->getPattern() : u"null";
    return UnicodeString(u"<Affix") + signTag + prefixText + u"#" + suffixText + u">";
}
468
469
470
#endif /* #if !UCONFIG_NO_FORMATTING */
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495