Coverage Report

Created: 2022-11-20 06:14

/src/icu/icu4c/source/common/dictbe.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/**
4
 *******************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 *******************************************************************************
8
 */
9
10
#include <utility>
11
12
#include "unicode/utypes.h"
13
14
#if !UCONFIG_NO_BREAK_ITERATION
15
16
#include "brkeng.h"
17
#include "dictbe.h"
18
#include "unicode/uniset.h"
19
#include "unicode/chariter.h"
20
#include "unicode/resbund.h"
21
#include "unicode/ubrk.h"
22
#include "unicode/usetiter.h"
23
#include "ubrkimpl.h"
24
#include "utracimp.h"
25
#include "uvectr32.h"
26
#include "uvector.h"
27
#include "uassert.h"
28
#include "unicode/normlzr.h"
29
#include "cmemory.h"
30
#include "dictionarydata.h"
31
32
U_NAMESPACE_BEGIN
33
34
/*
35
 ******************************************************************
36
 */
37
38
5
DictionaryBreakEngine::DictionaryBreakEngine() {
39
5
}
40
41
0
DictionaryBreakEngine::~DictionaryBreakEngine() {
42
0
}
43
44
UBool
45
2.09M
DictionaryBreakEngine::handles(UChar32 c) const {
46
2.09M
    return fSet.contains(c);
47
2.09M
}
48
49
int32_t
50
DictionaryBreakEngine::findBreaks( UText *text,
51
                                 int32_t startPos,
52
                                 int32_t endPos,
53
                                 UVector32 &foundBreaks,
54
                                 UBool isPhraseBreaking,
55
526k
                                 UErrorCode& status) const {
56
526k
    if (U_FAILURE(status)) return 0;
57
526k
    (void)startPos;            // TODO: remove this param?
58
526k
    int32_t result = 0;
59
60
    // Find the span of characters included in the set.
61
    //   The span to break begins at the current position in the text, and
62
    //   extends towards the start or end of the text, depending on 'reverse'.
63
64
526k
    int32_t start = (int32_t)utext_getNativeIndex(text);
65
526k
    int32_t current;
66
526k
    int32_t rangeStart;
67
526k
    int32_t rangeEnd;
68
526k
    UChar32 c = utext_current32(text);
69
23.5M
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
70
23.0M
        utext_next32(text);         // TODO:  recast loop for postincrement
71
23.0M
        c = utext_current32(text);
72
23.0M
    }
73
526k
    rangeStart = start;
74
526k
    rangeEnd = current;
75
526k
    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, isPhraseBreaking, status);
76
526k
    utext_setNativeIndex(text, current);
77
    
78
526k
    return result;
79
526k
}
80
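For orientation, the loop in findBreaks() above simply spans the run of code points that belong to the engine's UnicodeSet, starting at the current UText position, and hands that [start, current) range to divideUpDictionaryRange(). The following minimal, self-contained sketch (not part of dictbe.cpp; the pattern and sample string are illustrative assumptions) shows the same scan using only public ICU APIs:

// Sketch: span the run of set members starting at the current UText position,
// mirroring the scan loop in DictionaryBreakEngine::findBreaks().
#include <unicode/uniset.h>
#include <unicode/unistr.h>
#include <unicode/utext.h>
#include <cstdio>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    // Same pattern the ThaiBreakEngine constructor uses for its word set.
    icu::UnicodeSet thaiSet(icu::UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
    icu::UnicodeString text(u"\u0E2A\u0E27\u0E31\u0E2A\u0E14\u0E35 hello");  // Thai word, then ASCII
    UText *ut = utext_openUnicodeString(nullptr, &text, &status);
    if (U_FAILURE(status)) return 1;

    int32_t start = (int32_t)utext_getNativeIndex(ut);
    int32_t current = start;
    UChar32 c = utext_current32(ut);
    // Advance while the next code point is in the set, as findBreaks() does.
    while (c != U_SENTINEL && thaiSet.contains(c)) {
        utext_next32(ut);
        current = (int32_t)utext_getNativeIndex(ut);
        c = utext_current32(ut);
    }
    printf("dictionary range: [%d, %d)\n", (int)start, (int)current);
    utext_close(ut);
    return 0;
}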
81
void
82
5
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
83
5
    fSet = set;
84
    // Compact for caching
85
5
    fSet.compact();
86
5
}
87
88
/*
89
 ******************************************************************
90
 * PossibleWord
91
 */
92
93
// Helper class for improving readability of the Thai/Lao/Khmer word break
94
// algorithm. The implementation is completely inline.
95
96
// List size, limited by the maximum number of words in the dictionary
97
// that form a nested sequence.
98
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
99
100
class PossibleWord {
101
private:
102
    // list of word candidate lengths, in increasing length order
103
    // TODO: bytes would be sufficient for word lengths.
104
    int32_t   count;      // Count of candidates
105
    int32_t   prefix;     // The longest match with a dictionary word
106
    int32_t   offset;     // Offset in the text of these candidates
107
    int32_t   mark;       // The preferred candidate's offset
108
    int32_t   current;    // The candidate we're currently looking at
109
    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
110
    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
111
112
public:
113
422k
    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
114
422k
    ~PossibleWord() {}
115
  
116
    // Fill the list of candidates if needed, select the longest, and return the number found
117
    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
118
  
119
    // Select the currently marked candidate, point after it in the text, and invalidate self
120
    int32_t   acceptMarked( UText *text );
121
  
122
    // Back up from the current candidate to the next shorter one; return true if that exists
123
    // and point the text after it
124
    UBool     backUp( UText *text );
125
  
126
    // Return the longest prefix this candidate location shares with a dictionary word
127
    // Return value is in code points.
128
1.84M
    int32_t   longestPrefix() { return prefix; }
129
  
130
    // Mark the current candidate as the one we like
131
317k
    void      markCurrent() { mark = current; }
132
    
133
    // Get length in code points of the marked word.
134
2.30M
    int32_t   markedCPLength() { return cpLengths[mark]; }
135
};
136
137
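The candidates()/markCurrent()/backUp()/acceptMarked() protocol above is easier to see in isolation. The standalone sketch below is an illustrative analogue, not ICU code: it keeps match lengths in increasing order, tries the longest first, and backs up to shorter candidates when the lookahead fails, which is the same shape the Thai/Lao/Burmese/Khmer loops below drive through words[wordsFound % LOOKAHEAD]. The Candidates struct and the sample lengths are hypothetical.

// Simplified analogue of PossibleWord: candidate lengths in increasing order,
// a cursor that starts at the longest, and a mark for the preferred choice.
#include <vector>
#include <cstdio>

struct Candidates {
    std::vector<int> lengths;   // match lengths, shortest first
    int current = -1;           // index of the candidate being tried
    int mark = -1;              // index of the preferred candidate

    int fill(std::vector<int> matches) {    // like candidates(): load list, start at longest
        lengths = std::move(matches);
        current = (int)lengths.size() - 1;
        mark = current;
        return (int)lengths.size();
    }
    void markCurrent() { mark = current; }  // remember this candidate as "good"
    bool backUp() {                         // move to the next shorter candidate, if any
        if (current > 0) { --current; return true; }
        return false;
    }
    int acceptMarked() const { return lengths[mark]; }
};

int main() {
    Candidates w;
    w.fill({2, 3, 5});                      // dictionary matched prefixes of length 2, 3 and 5
    do {
        int len = w.lengths[w.current];
        bool nextWordFound = (len == 3);    // pretend only the length-3 split is followed by a word
        if (nextWordFound) { w.markCurrent(); break; }
    } while (w.backUp());
    printf("accepted word length: %d\n", w.acceptMarked());  // prints 3
    return 0;
}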
138
11.9M
int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
139
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
140
11.9M
    int32_t start = (int32_t)utext_getNativeIndex(text);
141
11.9M
    if (start != offset) {
142
5.19M
        offset = start;
143
5.19M
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, nullptr, &prefix);
144
        // Dictionary leaves text after longest prefix, not longest word. Back up.
145
5.19M
        if (count <= 0) {
146
2.70M
            utext_setNativeIndex(text, start);
147
2.70M
        }
148
5.19M
    }
149
11.9M
    if (count > 0) {
150
5.26M
        utext_setNativeIndex(text, start+cuLengths[count-1]);
151
5.26M
    }
152
11.9M
    current = count-1;
153
11.9M
    mark = current;
154
11.9M
    return count;
155
11.9M
}
156
157
int32_t
158
2.30M
PossibleWord::acceptMarked( UText *text ) {
159
2.30M
    utext_setNativeIndex(text, offset + cuLengths[mark]);
160
2.30M
    return cuLengths[mark];
161
2.30M
}
162
163
164
UBool
165
750k
PossibleWord::backUp( UText *text ) {
166
750k
    if (current > 0) {
167
506k
        utext_setNativeIndex(text, offset + cuLengths[--current]);
168
506k
        return true;
169
506k
    }
170
243k
    return false;
171
750k
}
172
173
/*
174
 ******************************************************************
175
 * ThaiBreakEngine
176
 */
177
178
// How many words in a row are "good enough"?
179
static const int32_t THAI_LOOKAHEAD = 3;
180
181
// Will not combine a non-word with a preceding dictionary word longer than this
182
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
183
184
// Will not combine a non-word that shares at least this much prefix with a
185
// dictionary word, with a preceding word
186
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
187
188
// Elision character
189
static const int32_t THAI_PAIYANNOI = 0x0E2F;
190
191
// Repeat character
192
static const int32_t THAI_MAIYAMOK = 0x0E46;
193
194
// Minimum word size
195
static const int32_t THAI_MIN_WORD = 2;
196
197
// Minimum number of characters for two words
198
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
199
200
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
201
    : DictionaryBreakEngine(),
202
      fDictionary(adoptDictionary)
203
1
{
204
1
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
205
1
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
206
1
    UnicodeSet thaiWordSet(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]]"), status);
207
1
    if (U_SUCCESS(status)) {
208
1
        setCharacters(thaiWordSet);
209
1
    }
210
1
    fMarkSet.applyPattern(UnicodeString(u"[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
211
1
    fMarkSet.add(0x0020);
212
1
    fEndWordSet = thaiWordSet;
213
1
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
214
1
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
215
1
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
216
1
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
217
1
    fSuffixSet.add(THAI_PAIYANNOI);
218
1
    fSuffixSet.add(THAI_MAIYAMOK);
219
220
    // Compact for caching.
221
1
    fMarkSet.compact();
222
1
    fEndWordSet.compact();
223
1
    fBeginWordSet.compact();
224
1
    fSuffixSet.compact();
225
1
    UTRACE_EXIT_STATUS(status);
226
1
}
227
228
0
ThaiBreakEngine::~ThaiBreakEngine() {
229
0
    delete fDictionary;
230
0
}
231
232
int32_t
233
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
234
                                                int32_t rangeStart,
235
                                                int32_t rangeEnd,
236
                                                UVector32 &foundBreaks,
237
                                                UBool /* isPhraseBreaking */,
238
99.9k
                                                UErrorCode& status) const {
239
99.9k
    if (U_FAILURE(status)) return 0;
240
99.9k
    utext_setNativeIndex(text, rangeStart);
241
99.9k
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
242
99.9k
    if (utext_getNativeIndex(text) >= rangeEnd) {
243
50.4k
        return 0;       // Not enough characters for two words
244
50.4k
    }
245
49.4k
    utext_setNativeIndex(text, rangeStart);
246
247
248
49.4k
    uint32_t wordsFound = 0;
249
49.4k
    int32_t cpWordLength = 0;    // Word Length in Code Points.
250
49.4k
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
251
49.4k
    int32_t current;
252
49.4k
    PossibleWord words[THAI_LOOKAHEAD];
253
    
254
49.4k
    utext_setNativeIndex(text, rangeStart);
255
    
256
968k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
257
918k
        cpWordLength = 0;
258
918k
        cuWordLength = 0;
259
260
        // Look for candidate words at the current position
261
918k
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
262
        
263
        // If we found exactly one, use that
264
918k
        if (candidates == 1) {
265
394k
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
266
394k
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
267
394k
            wordsFound += 1;
268
394k
        }
269
        // If there was more than one, see which one can take us forward the most words
270
523k
        else if (candidates > 1) {
271
            // If we're already at the end of the range, we're done
272
89.7k
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
273
8.27k
                goto foundBest;
274
8.27k
            }
275
172k
            do {
276
172k
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
277
                    // Followed by another dictionary word; mark first word as a good candidate
278
19.0k
                    words[wordsFound%THAI_LOOKAHEAD].markCurrent();
279
                    
280
                    // If we're already at the end of the range, we're done
281
19.0k
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
282
3.33k
                        goto foundBest;
283
3.33k
                    }
284
                    
285
                    // See if any of the possible second words is followed by a third word
286
24.3k
                    do {
287
                        // If we find a third word, stop right away
288
24.3k
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
289
10.5k
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
290
10.5k
                            goto foundBest;
291
10.5k
                        }
292
24.3k
                    }
293
15.7k
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
294
15.7k
                }
295
172k
            }
296
158k
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
297
89.7k
foundBest:
298
            // Set UText position to after the accepted word.
299
89.7k
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
300
89.7k
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
301
89.7k
            wordsFound += 1;
302
89.7k
        }
303
        
304
        // We come here after having either found a word or not. We look ahead to the
305
        // next word. If it's not a dictionary word, we will combine it with the word we
306
        // just found (if there is one), but only if the preceding word does not exceed
307
        // the threshold.
308
        // The text iterator should now be positioned at the end of the word we found.
309
        
310
918k
        UChar32 uc = 0;
311
918k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
312
            // if it is a dictionary word, do nothing. If it isn't, then if there is
313
            // no preceding word, or the non-word shares less than the minimum threshold
314
            // of characters with a dictionary word, then scan to resynchronize
315
839k
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
316
839k
                  && (cuWordLength == 0
317
816k
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
318
                // Look for a plausible word boundary
319
439k
                int32_t remaining = rangeEnd - (current+cuWordLength);
320
439k
                UChar32 pc;
321
439k
                int32_t chars = 0;
322
490k
                for (;;) {
323
490k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
324
490k
                    pc = utext_next32(text);
325
490k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
326
490k
                    chars += pcSize;
327
490k
                    remaining -= pcSize;
328
490k
                    if (remaining <= 0) {
329
18.7k
                        break;
330
18.7k
                    }
331
472k
                    uc = utext_current32(text);
332
472k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
333
                        // Maybe. See if it's in the dictionary.
334
                        // NOTE: In the original Apple code, checked that the next
335
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
336
                        // checking the dictionary. That is just a performance filter,
337
                        // but it's not clear it's faster than checking the trie.
338
429k
                        int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
339
429k
                        utext_setNativeIndex(text, current + cuWordLength + chars);
340
429k
                        if (num_candidates > 0) {
341
420k
                            break;
342
420k
                        }
343
429k
                    }
344
472k
                }
345
                
346
                // Bump the word count if there wasn't already one
347
439k
                if (cuWordLength <= 0) {
348
434k
                    wordsFound += 1;
349
434k
                }
350
                
351
                // Update the length with the passed-over characters
352
439k
                cuWordLength += chars;
353
439k
            }
354
399k
            else {
355
                // Back up to where we were for next iteration
356
399k
                utext_setNativeIndex(text, current+cuWordLength);
357
399k
            }
358
839k
        }
359
        
360
        // Never stop before a combining mark.
361
918k
        int32_t currPos;
362
923k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
363
5.07k
            utext_next32(text);
364
5.07k
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
365
5.07k
        }
366
        
367
        // Look ahead for possible suffixes if a dictionary word does not follow.
368
        // We do this in code rather than using a rule so that the heuristic
369
        // resynch continues to function. For example, one of the suffix characters
370
        // could be a typo in the middle of a word.
371
918k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
372
887k
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
373
887k
                && fSuffixSet.contains(uc = utext_current32(text))) {
374
51.0k
                if (uc == THAI_PAIYANNOI) {
375
34.0k
                    if (!fSuffixSet.contains(utext_previous32(text))) {
376
                        // Skip over previous end and PAIYANNOI
377
29.8k
                        utext_next32(text);
378
29.8k
                        int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
379
29.8k
                        utext_next32(text);
380
29.8k
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
381
29.8k
                        uc = utext_current32(text);     // Fetch next character
382
29.8k
                    }
383
4.17k
                    else {
384
                        // Restore prior position
385
4.17k
                        utext_next32(text);
386
4.17k
                    }
387
34.0k
                }
388
51.0k
                if (uc == THAI_MAIYAMOK) {
389
35.1k
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
390
                        // Skip over previous end and MAIYAMOK
391
34.5k
                        utext_next32(text);
392
34.5k
                        int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
393
34.5k
                        utext_next32(text);
394
34.5k
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
395
34.5k
                    }
396
578
                    else {
397
                        // Restore prior position
398
578
                        utext_next32(text);
399
578
                    }
400
35.1k
                }
401
51.0k
            }
402
836k
            else {
403
836k
                utext_setNativeIndex(text, current+cuWordLength);
404
836k
            }
405
887k
        }
406
407
        // Did we find a word on this iteration? If so, push it on the break stack
408
918k
        if (cuWordLength > 0) {
409
918k
            foundBreaks.push((current+cuWordLength), status);
410
918k
        }
411
918k
    }
412
413
    // Don't return a break for the end of the dictionary range if there is one there.
414
49.4k
    if (foundBreaks.peeki() >= rangeEnd) {
415
49.4k
        (void) foundBreaks.popi();
416
49.4k
        wordsFound -= 1;
417
49.4k
    }
418
419
49.4k
    return wordsFound;
420
49.4k
}
421
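divideUpDictionaryRange() is internal; applications reach this Thai logic through the ordinary break-iterator API. As a usage sketch (the sample string is an arbitrary assumption), word boundaries for Thai text can be listed like this, and the boundaries between Thai characters come from the dictionary-driven code above:

// Word-segment a short Thai string with the public BreakIterator API.
#include <unicode/brkiter.h>
#include <unicode/locid.h>
#include <unicode/unistr.h>
#include <cstdio>
#include <memory>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    std::unique_ptr<icu::BreakIterator> bi(
        icu::BreakIterator::createWordInstance(icu::Locale("th"), status));
    if (U_FAILURE(status)) return 1;

    icu::UnicodeString text(u"\u0E20\u0E32\u0E29\u0E32\u0E44\u0E17\u0E22");  // Thai sample text
    bi->setText(text);
    for (int32_t b = bi->first(); b != icu::BreakIterator::DONE; b = bi->next()) {
        printf("boundary at %d\n", (int)b);   // code-unit offsets of word boundaries
    }
    return 0;
}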
422
/*
423
 ******************************************************************
424
 * LaoBreakEngine
425
 */
426
427
// How many words in a row are "good enough"?
428
static const int32_t LAO_LOOKAHEAD = 3;
429
430
// Will not combine a non-word with a preceding dictionary word longer than this
431
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
432
433
// Will not combine a non-word that shares at least this much prefix with a
434
// dictionary word, with a preceding word
435
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
436
437
// Minimum word size
438
static const int32_t LAO_MIN_WORD = 2;
439
440
// Minimum number of characters for two words
441
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
442
443
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
444
    : DictionaryBreakEngine(),
445
      fDictionary(adoptDictionary)
446
1
{
447
1
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
448
1
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
449
1
    UnicodeSet laoWordSet(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]]"), status);
450
1
    if (U_SUCCESS(status)) {
451
1
        setCharacters(laoWordSet);
452
1
    }
453
1
    fMarkSet.applyPattern(UnicodeString(u"[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
454
1
    fMarkSet.add(0x0020);
455
1
    fEndWordSet = laoWordSet;
456
1
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
457
1
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
458
1
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
459
1
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
460
461
    // Compact for caching.
462
1
    fMarkSet.compact();
463
1
    fEndWordSet.compact();
464
1
    fBeginWordSet.compact();
465
1
    UTRACE_EXIT_STATUS(status);
466
1
}
467
468
0
LaoBreakEngine::~LaoBreakEngine() {
469
0
    delete fDictionary;
470
0
}
471
472
int32_t
473
LaoBreakEngine::divideUpDictionaryRange( UText *text,
474
                                                int32_t rangeStart,
475
                                                int32_t rangeEnd,
476
                                                UVector32 &foundBreaks,
477
                                                UBool /* isPhraseBreaking */,
478
33.2k
                                                UErrorCode& status) const {
479
33.2k
    if (U_FAILURE(status)) return 0;
480
33.2k
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
481
7.73k
        return 0;       // Not enough characters for two words
482
7.73k
    }
483
484
25.4k
    uint32_t wordsFound = 0;
485
25.4k
    int32_t cpWordLength = 0;
486
25.4k
    int32_t cuWordLength = 0;
487
25.4k
    int32_t current;
488
25.4k
    PossibleWord words[LAO_LOOKAHEAD];
489
490
25.4k
    utext_setNativeIndex(text, rangeStart);
491
492
396k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
493
370k
        cuWordLength = 0;
494
370k
        cpWordLength = 0;
495
496
        // Look for candidate words at the current position
497
370k
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
498
        
499
        // If we found exactly one, use that
500
370k
        if (candidates == 1) {
501
176k
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
502
176k
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
503
176k
            wordsFound += 1;
504
176k
        }
505
        // If there was more than one, see which one can take us forward the most words
506
194k
        else if (candidates > 1) {
507
            // If we're already at the end of the range, we're done
508
76.6k
            if (utext_getNativeIndex(text) >= rangeEnd) {
509
5.17k
                goto foundBest;
510
5.17k
            }
511
145k
            do {
512
145k
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
513
                    // Followed by another dictionary word; mark first word as a good candidate
514
63.2k
                    words[wordsFound%LAO_LOOKAHEAD].markCurrent();
515
                    
516
                    // If we're already at the end of the range, we're done
517
63.2k
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
518
6.36k
                        goto foundBest;
519
6.36k
                    }
520
                    
521
                    // See if any of the possible second words is followed by a third word
522
110k
                    do {
523
                        // If we find a third word, stop right away
524
110k
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
525
50.5k
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
526
50.5k
                            goto foundBest;
527
50.5k
                        }
528
110k
                    }
529
60.0k
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
530
56.8k
                }
531
145k
            }
532
88.5k
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
533
76.6k
foundBest:
534
76.6k
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
535
76.6k
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
536
76.6k
            wordsFound += 1;
537
76.6k
        }
538
        
539
        // We come here after having either found a word or not. We look ahead to the
540
        // next word. If it's not a dictionary word, we will combine it with the word we
541
        // just found (if there is one), but only if the preceding word does not exceed
542
        // the threshold.
543
        // The text iterator should now be positioned at the end of the word we found.
544
370k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
545
            // if it is a dictionary word, do nothing. If it isn't, then if there is
546
            // no preceding word, or the non-word shares less than the minimum threshold
547
            // of characters with a dictionary word, then scan to resynchronize
548
346k
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
549
346k
                  && (cuWordLength == 0
550
228k
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
551
                // Look for a plausible word boundary
552
134k
                int32_t remaining = rangeEnd - (current + cuWordLength);
553
134k
                UChar32 pc;
554
134k
                UChar32 uc;
555
134k
                int32_t chars = 0;
556
165k
                for (;;) {
557
165k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
558
165k
                    pc = utext_next32(text);
559
165k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
560
165k
                    chars += pcSize;
561
165k
                    remaining -= pcSize;
562
165k
                    if (remaining <= 0) {
563
10.6k
                        break;
564
10.6k
                    }
565
155k
                    uc = utext_current32(text);
566
155k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
567
                        // Maybe. See if it's in the dictionary.
568
                        // TODO: this looks iffy; compare with old code.
569
142k
                        int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
570
142k
                        utext_setNativeIndex(text, current + cuWordLength + chars);
571
142k
                        if (num_candidates > 0) {
572
123k
                            break;
573
123k
                        }
574
142k
                    }
575
155k
                }
576
                
577
                // Bump the word count if there wasn't already one
578
134k
                if (cuWordLength <= 0) {
579
118k
                    wordsFound += 1;
580
118k
                }
581
                
582
                // Update the length with the passed-over characters
583
134k
                cuWordLength += chars;
584
134k
            }
585
212k
            else {
586
                // Back up to where we were for next iteration
587
212k
                utext_setNativeIndex(text, current + cuWordLength);
588
212k
            }
589
346k
        }
590
        
591
        // Never stop before a combining mark.
592
370k
        int32_t currPos;
593
371k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
594
252
            utext_next32(text);
595
252
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
596
252
        }
597
        
598
        // Look ahead for possible suffixes if a dictionary word does not follow.
599
        // We do this in code rather than using a rule so that the heuristic
600
        // resynch continues to function. For example, one of the suffix characters
601
        // could be a typo in the middle of a word.
602
        // NOT CURRENTLY APPLICABLE TO LAO
603
604
        // Did we find a word on this iteration? If so, push it on the break stack
605
370k
        if (cuWordLength > 0) {
606
370k
            foundBreaks.push((current+cuWordLength), status);
607
370k
        }
608
370k
    }
609
610
    // Don't return a break for the end of the dictionary range if there is one there.
611
25.4k
    if (foundBreaks.peeki() >= rangeEnd) {
612
25.4k
        (void) foundBreaks.popi();
613
25.4k
        wordsFound -= 1;
614
25.4k
    }
615
616
25.4k
    return wordsFound;
617
25.4k
}
618
619
/*
620
 ******************************************************************
621
 * BurmeseBreakEngine
622
 */
623
624
// How many words in a row are "good enough"?
625
static const int32_t BURMESE_LOOKAHEAD = 3;
626
627
// Will not combine a non-word with a preceding dictionary word longer than this
628
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
629
630
// Will not combine a non-word that shares at least this much prefix with a
631
// dictionary word, with a preceding word
632
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
633
634
// Minimum word size
635
static const int32_t BURMESE_MIN_WORD = 2;
636
637
// Minimum number of characters for two words
638
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
639
640
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
641
    : DictionaryBreakEngine(),
642
      fDictionary(adoptDictionary)
643
1
{
644
1
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
645
1
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
646
1
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
647
1
    fEndWordSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]]"), status);
648
1
    fMarkSet.applyPattern(UnicodeString(u"[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
649
1
    fMarkSet.add(0x0020);
650
1
    if (U_SUCCESS(status)) {
651
1
        setCharacters(fEndWordSet);
652
1
    }
653
654
    // Compact for caching.
655
1
    fMarkSet.compact();
656
1
    fEndWordSet.compact();
657
1
    fBeginWordSet.compact();
658
1
    UTRACE_EXIT_STATUS(status);
659
1
}
660
661
0
BurmeseBreakEngine::~BurmeseBreakEngine() {
662
0
    delete fDictionary;
663
0
}
664
665
int32_t
666
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
667
                                                int32_t rangeStart,
668
                                                int32_t rangeEnd,
669
                                                UVector32 &foundBreaks,
670
                                                UBool /* isPhraseBreaking */,
671
47.9k
                                                UErrorCode& status ) const {
672
47.9k
    if (U_FAILURE(status)) return 0;
673
47.9k
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
674
15.9k
        return 0;       // Not enough characters for two words
675
15.9k
    }
676
677
32.0k
    uint32_t wordsFound = 0;
678
32.0k
    int32_t cpWordLength = 0;
679
32.0k
    int32_t cuWordLength = 0;
680
32.0k
    int32_t current;
681
32.0k
    PossibleWord words[BURMESE_LOOKAHEAD];
682
683
32.0k
    utext_setNativeIndex(text, rangeStart);
684
685
266k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
686
234k
        cuWordLength = 0;
687
234k
        cpWordLength = 0;
688
689
        // Look for candidate words at the current position
690
234k
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
691
        
692
        // If we found exactly one, use that
693
234k
        if (candidates == 1) {
694
98.7k
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
695
98.7k
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
696
98.7k
            wordsFound += 1;
697
98.7k
        }
698
        // If there was more than one, see which one can take us forward the most words
699
135k
        else if (candidates > 1) {
700
            // If we're already at the end of the range, we're done
701
128k
            if (utext_getNativeIndex(text) >= rangeEnd) {
702
9.47k
                goto foundBest;
703
9.47k
            }
704
260k
            do {
705
260k
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
706
                    // Followed by another dictionary word; mark first word as a good candidate
707
111k
                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
708
                    
709
                    // If we're already at the end of the range, we're done
710
111k
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
711
6.38k
                        goto foundBest;
712
6.38k
                    }
713
                    
714
                    // See if any of the possible second words is followed by a third word
715
223k
                    do {
716
                        // If we find a third word, stop right away
717
223k
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
718
40.6k
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
719
40.6k
                            goto foundBest;
720
40.6k
                        }
721
223k
                    }
722
183k
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
723
105k
                }
724
260k
            }
725
213k
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
726
128k
foundBest:
727
128k
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
728
128k
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
729
128k
            wordsFound += 1;
730
128k
        }
731
        
732
        // We come here after having either found a word or not. We look ahead to the
733
        // next word. If it's not a dictionary word, we will combine it with the word we
734
        // just found (if there is one), but only if the preceding word does not exceed
735
        // the threshold.
736
        // The text iterator should now be positioned at the end of the word we found.
737
234k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
738
            // if it is a dictionary word, do nothing. If it isn't, then if there is
739
            // no preceding word, or the non-word shares less than the minimum threshold
740
            // of characters with a dictionary word, then scan to resynchronize
741
212k
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
742
212k
                  && (cuWordLength == 0
743
54.1k
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
744
                // Look for a plausible word boundary
745
47.8k
                int32_t remaining = rangeEnd - (current + cuWordLength);
746
47.8k
                UChar32 pc;
747
47.8k
                UChar32 uc;
748
47.8k
                int32_t chars = 0;
749
57.7k
                for (;;) {
750
57.7k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
751
57.7k
                    pc = utext_next32(text);
752
57.7k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
753
57.7k
                    chars += pcSize;
754
57.7k
                    remaining -= pcSize;
755
57.7k
                    if (remaining <= 0) {
756
12.2k
                        break;
757
12.2k
                    }
758
45.4k
                    uc = utext_current32(text);
759
45.4k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
760
                        // Maybe. See if it's in the dictionary.
761
                        // TODO: this looks iffy; compare with old code.
762
35.8k
                        int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
763
35.8k
                        utext_setNativeIndex(text, current + cuWordLength + chars);
764
35.8k
                        if (num_candidates > 0) {
765
35.5k
                            break;
766
35.5k
                        }
767
35.8k
                    }
768
45.4k
                }
769
                
770
                // Bump the word count if there wasn't already one
771
47.8k
                if (cuWordLength <= 0) {
772
6.69k
                    wordsFound += 1;
773
6.69k
                }
774
                
775
                // Update the length with the passed-over characters
776
47.8k
                cuWordLength += chars;
777
47.8k
            }
778
164k
            else {
779
                // Back up to where we were for next iteration
780
164k
                utext_setNativeIndex(text, current + cuWordLength);
781
164k
            }
782
212k
        }
783
        
784
        // Never stop before a combining mark.
785
234k
        int32_t currPos;
786
242k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
787
8.63k
            utext_next32(text);
788
8.63k
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
789
8.63k
        }
790
        
791
        // Look ahead for possible suffixes if a dictionary word does not follow.
792
        // We do this in code rather than using a rule so that the heuristic
793
        // resynch continues to function. For example, one of the suffix characters
794
        // could be a typo in the middle of a word.
795
        // NOT CURRENTLY APPLICABLE TO BURMESE
796
797
        // Did we find a word on this iteration? If so, push it on the break stack
798
234k
        if (cuWordLength > 0) {
799
234k
            foundBreaks.push((current+cuWordLength), status);
800
234k
        }
801
234k
    }
802
803
    // Don't return a break for the end of the dictionary range if there is one there.
804
32.0k
    if (foundBreaks.peeki() >= rangeEnd) {
805
32.0k
        (void) foundBreaks.popi();
806
32.0k
        wordsFound -= 1;
807
32.0k
    }
808
809
32.0k
    return wordsFound;
810
32.0k
}
811
812
/*
813
 ******************************************************************
814
 * KhmerBreakEngine
815
 */
816
817
// How many words in a row are "good enough"?
818
static const int32_t KHMER_LOOKAHEAD = 3;
819
820
// Will not combine a non-word with a preceding dictionary word longer than this
821
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
822
823
// Will not combine a non-word that shares at least this much prefix with a
824
// dictionary word, with a preceding word
825
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
826
827
// Minimum word size
828
static const int32_t KHMER_MIN_WORD = 2;
829
830
// Minimum number of characters for two words
831
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
832
833
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
834
    : DictionaryBreakEngine(),
835
      fDictionary(adoptDictionary)
836
1
{
837
1
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
838
1
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
839
1
    UnicodeSet khmerWordSet(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]]"), status);
840
1
    if (U_SUCCESS(status)) {
841
1
        setCharacters(khmerWordSet);
842
1
    }
843
1
    fMarkSet.applyPattern(UnicodeString(u"[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
844
1
    fMarkSet.add(0x0020);
845
1
    fEndWordSet = khmerWordSet;
846
1
    fBeginWordSet.add(0x1780, 0x17B3);
847
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
848
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
849
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
850
1
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
851
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
852
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
853
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
854
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
855
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
856
//    fSuffixSet.add(THAI_PAIYANNOI);
857
//    fSuffixSet.add(THAI_MAIYAMOK);
858
859
    // Compact for caching.
860
1
    fMarkSet.compact();
861
1
    fEndWordSet.compact();
862
1
    fBeginWordSet.compact();
863
//    fSuffixSet.compact();
864
1
    UTRACE_EXIT_STATUS(status);
865
1
}
866
867
0
KhmerBreakEngine::~KhmerBreakEngine() {
868
0
    delete fDictionary;
869
0
}
870
871
int32_t
872
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
873
                                                int32_t rangeStart,
874
                                                int32_t rangeEnd,
875
                                                UVector32 &foundBreaks,
876
                                                UBool /* isPhraseBreaking */,
877
54.9k
                                                UErrorCode& status ) const {
878
54.9k
    if (U_FAILURE(status)) return 0;
879
54.9k
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
880
21.1k
        return 0;       // Not enough characters for two words
881
21.1k
    }
882
883
33.8k
    uint32_t wordsFound = 0;
884
33.8k
    int32_t cpWordLength = 0;
885
33.8k
    int32_t cuWordLength = 0;
886
33.8k
    int32_t current;
887
33.8k
    PossibleWord words[KHMER_LOOKAHEAD];
888
889
33.8k
    utext_setNativeIndex(text, rangeStart);
890
891
2.67M
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
892
2.64M
        cuWordLength = 0;
893
2.64M
        cpWordLength = 0;
894
895
        // Look for candidate words at the current position
896
2.64M
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
897
898
        // If we found exactly one, use that
899
2.64M
        if (candidates == 1) {
900
1.31M
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
901
1.31M
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
902
1.31M
            wordsFound += 1;
903
1.31M
        }
904
905
        // If there was more than one, see which one can take us forward the most words
906
1.32M
        else if (candidates > 1) {
907
            // If we're already at the end of the range, we're done
908
22.5k
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
909
4.97k
                goto foundBest;
910
4.97k
            }
911
29.4k
            do {
912
29.4k
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
913
                    // Followed by another dictionary word; mark first word as a good candidate
914
15.7k
                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
915
916
                    // If we're already at the end of the range, we're done
917
15.7k
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
918
4.11k
                        goto foundBest;
919
4.11k
                    }
920
921
                    // See if any of the possible second words is followed by a third word
922
19.5k
                    do {
923
                        // If we find a third word, stop right away
924
19.5k
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
925
6.21k
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
926
6.21k
                            goto foundBest;
927
6.21k
                        }
928
19.5k
                    }
929
13.2k
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
930
11.5k
                }
931
29.4k
            }
932
19.1k
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
933
22.5k
foundBest:
934
22.5k
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
935
22.5k
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
936
22.5k
            wordsFound += 1;
937
22.5k
        }
938
939
        // We come here after having either found a word or not. We look ahead to the
940
        // next word. If it's not a dictionary word, we will combine it with the word we
941
        // just found (if there is one), but only if the preceding word does not exceed
942
        // the threshold.
943
        // The text iterator should now be positioned at the end of the word we found.
944
2.64M
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
945
            // if it is a dictionary word, do nothing. If it isn't, then if there is
946
            // no preceding word, or the non-word shares less than the minimum threshold
947
            // of characters with a dictionary word, then scan to resynchronize
948
2.61M
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
949
2.61M
                  && (cuWordLength == 0
950
2.60M
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
951
                // Look for a plausible word boundary
952
1.31M
                int32_t remaining = rangeEnd - (current+cuWordLength);
953
1.31M
                UChar32 pc;
954
1.31M
                UChar32 uc;
955
1.31M
                int32_t chars = 0;
956
1.40M
                for (;;) {
957
1.40M
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
958
1.40M
                    pc = utext_next32(text);
959
1.40M
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
960
1.40M
                    chars += pcSize;
961
1.40M
                    remaining -= pcSize;
962
1.40M
                    if (remaining <= 0) {
963
16.1k
                        break;
964
16.1k
                    }
965
1.39M
                    uc = utext_current32(text);
966
1.39M
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
967
                        // Maybe. See if it's in the dictionary.
968
1.33M
                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
969
1.33M
                        utext_setNativeIndex(text, current+cuWordLength+chars);
970
1.33M
                        if (num_candidates > 0) {
971
1.30M
                            break;
972
1.30M
                        }
973
1.33M
                    }
974
1.39M
                }
975
976
                // Bump the word count if there wasn't already one
977
1.31M
                if (cuWordLength <= 0) {
978
1.29M
                    wordsFound += 1;
979
1.29M
                }
980
981
                // Update the length with the passed-over characters
982
1.31M
                cuWordLength += chars;
983
1.31M
            }
984
1.30M
            else {
985
                // Back up to where we were for next iteration
986
1.30M
                utext_setNativeIndex(text, current+cuWordLength);
987
1.30M
            }
988
2.61M
        }
989
990
        // Never stop before a combining mark.
991
2.64M
        int32_t currPos;
992
2.64M
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
993
1.25k
            utext_next32(text);
994
1.25k
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
995
1.25k
        }
996
997
        // Look ahead for possible suffixes if a dictionary word does not follow.
998
        // We do this in code rather than using a rule so that the heuristic
999
        // resynch continues to function. For example, one of the suffix characters
1000
        // could be a typo in the middle of a word.
1001
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
1002
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
1003
//                && fSuffixSet.contains(uc = utext_current32(text))) {
1004
//                if (uc == KHMER_PAIYANNOI) {
1005
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
1006
//                        // Skip over previous end and PAIYANNOI
1007
//                        utext_next32(text);
1008
//                        utext_next32(text);
1009
//                        wordLength += 1;            // Add PAIYANNOI to word
1010
//                        uc = utext_current32(text);     // Fetch next character
1011
//                    }
1012
//                    else {
1013
//                        // Restore prior position
1014
//                        utext_next32(text);
1015
//                    }
1016
//                }
1017
//                if (uc == KHMER_MAIYAMOK) {
1018
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1019
//                        // Skip over previous end and MAIYAMOK
1020
//                        utext_next32(text);
1021
//                        utext_next32(text);
1022
//                        wordLength += 1;            // Add MAIYAMOK to word
1023
//                    }
1024
//                    else {
1025
//                        // Restore prior position
1026
//                        utext_next32(text);
1027
//                    }
1028
//                }
1029
//            }
1030
//            else {
1031
//                utext_setNativeIndex(text, current+wordLength);
1032
//            }
1033
//        }
1034
1035
        // Did we find a word on this iteration? If so, push it on the break stack
1036
2.64M
        if (cuWordLength > 0) {
1037
2.64M
            foundBreaks.push((current+cuWordLength), status);
1038
2.64M
        }
1039
2.64M
    }
1040
    
1041
    // Don't return a break for the end of the dictionary range if there is one there.
1042
33.8k
    if (foundBreaks.peeki() >= rangeEnd) {
1043
33.8k
        (void) foundBreaks.popi();
1044
33.8k
        wordsFound -= 1;
1045
33.8k
    }
1046
1047
33.8k
    return wordsFound;
1048
33.8k
}
1049
1050
#if !UCONFIG_NO_NORMALIZATION
1051
/*
1052
 ******************************************************************
1053
 * CjkBreakEngine
1054
 */
1055
static const uint32_t kuint32max = 0xFFFFFFFF;
1056
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1057
1
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1058
1
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1059
1
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1060
1
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1061
    // Korean dictionary only includes Hangul syllables
1062
1
    fHangulWordSet.applyPattern(UnicodeString(u"[\\uac00-\\ud7a3]"), status);
1063
1
    fHangulWordSet.compact();
1064
    // Digits, open punctuation and Alphabetic characters.
1065
1
    fDigitOrOpenPunctuationOrAlphabetSet.applyPattern(
1066
1
        UnicodeString(u"[[:Nd:][:Pi:][:Ps:][:Alphabetic:]]"), status);
1067
1
    fDigitOrOpenPunctuationOrAlphabetSet.compact();
1068
1
    fClosePunctuationSet.applyPattern(UnicodeString(u"[[:Pc:][:Pd:][:Pe:][:Pf:][:Po:]]"), status);
1069
1
    fClosePunctuationSet.compact();
1070
1071
    // handle Korean and Japanese/Chinese using different dictionaries
1072
1
    if (type == kKorean) {
1073
0
        if (U_SUCCESS(status)) {
1074
0
            setCharacters(fHangulWordSet);
1075
0
        }
1076
1
    } else { //Chinese and Japanese
1077
1
        UnicodeSet cjSet(UnicodeString(u"[[:Han:][:Hiragana:][:Katakana:]\\u30fc\\uff70\\uff9e\\uff9f]"), status);
1078
1
        if (U_SUCCESS(status)) {
1079
1
            setCharacters(cjSet);
1080
1
            initJapanesePhraseParameter(status);
1081
1
        }
1082
1
    }
1083
1
    UTRACE_EXIT_STATUS(status);
1084
1
}
1085
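The constructor above fetches the shared NFKC Normalizer2 instance because divideUpDictionaryRange() below normalizes its input before dictionary lookup. A minimal sketch of that public API (the sample string is an illustrative assumption: fullwidth letters that NFKC folds to ASCII):

// Check whether a string is NFKC-normalized and normalize it if not.
#include <unicode/normalizer2.h>
#include <unicode/unistr.h>
#include <cstdio>

int main() {
    UErrorCode status = U_ZERO_ERROR;
    const icu::Normalizer2 *nfkc = icu::Normalizer2::getNFKCInstance(status);
    if (U_FAILURE(status)) return 1;

    icu::UnicodeString input(u"\uFF21\uFF22\uFF23");   // fullwidth "ABC"
    if (!nfkc->isNormalized(input, status)) {
        icu::UnicodeString normalized = nfkc->normalize(input, status);
        printf("normalized length: %d\n", (int)normalized.length());  // 3, now plain ASCII
    }
    return 0;
}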
1086
0
CjkBreakEngine::~CjkBreakEngine(){
1087
0
    delete fDictionary;
1088
0
}
1089
1090
// The katakanaCost values below are based on the length frequencies of all
1091
// katakana phrases in the dictionary
1092
static const int32_t kMaxKatakanaLength = 8;
1093
static const int32_t kMaxKatakanaGroupLength = 20;
1094
static const uint32_t maxSnlp = 255;
1095
1096
209k
static inline uint32_t getKatakanaCost(int32_t wordLength){
1097
    //TODO: fill array with actual values from dictionary!
1098
209k
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1099
209k
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1100
209k
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1101
209k
}
1102
1103
75.0M
static inline bool isKatakana(UChar32 value) {
1104
75.0M
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1105
75.0M
            (value >= 0xFF66 && value <= 0xFF9f);
1106
75.0M
}
1107
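To make the cost table above concrete, the snippet below (illustrative only; the real engine applies these costs inside its best-path search over the whole CJK range, which lies beyond this listing) measures a Katakana run with the same character test and looks up the heuristic penalty for treating that run as one word. The helper isKatakanaCp and the sample string are hypothetical.

// Count a run of Katakana code points and look up its heuristic cost.
#include <unicode/unistr.h>
#include <cstdint>
#include <cstdio>

static bool isKatakanaCp(UChar32 c) {           // same ranges as isKatakana() above
    return (c >= 0x30A1 && c <= 0x30FE && c != 0x30FB) ||
           (c >= 0xFF66 && c <= 0xFF9F);
}

int main() {
    icu::UnicodeString s(u"\u30C6\u30B9\u30C8");  // three Katakana code points
    int32_t run = 0;
    for (int32_t i = 0; i < s.length(); i = s.moveIndex32(i, 1)) {
        if (isKatakanaCp(s.char32At(i))) ++run;
    }
    // Same table and clamp as getKatakanaCost() above.
    static const uint32_t katakanaCost[9] = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
    uint32_t cost = (run > 8) ? 8192 : katakanaCost[run];
    printf("katakana run of %d -> cost %u\n", (int)run, cost);
    return 0;
}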
1108
// Function for accessing internal utext flags.
1109
//   Replicates an internal UText function.
1110
1111
290k
static inline int32_t utext_i32_flag(int32_t bitIndex) {
1112
290k
    return (int32_t)1 << bitIndex;
1113
290k
}
1114
       
1115
/*
1116
 * @param text A UText representing the text
1117
 * @param rangeStart The start of the range of dictionary characters
1118
 * @param rangeEnd The end of the range of dictionary characters
1119
 * @param foundBreaks vector<int32> to receive the break positions
1120
 * @return The number of breaks found
1121
 */
1122
int32_t 
1123
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1124
        int32_t rangeStart,
1125
        int32_t rangeEnd,
1126
        UVector32 &foundBreaks,
1127
        UBool isPhraseBreaking,
1128
290k
        UErrorCode& status) const {
1129
290k
    if (U_FAILURE(status)) return 0;
1130
290k
    if (rangeStart >= rangeEnd) {
1131
0
        return 0;
1132
0
    }
1133
1134
    // UnicodeString version of input UText, NFKC normalized if necessary.
1135
290k
    UnicodeString inString;
1136
1137
    // inputMap[inStringIndex] = corresponding native index from UText inText.
1138
    // If nullptr then mapping is 1:1
1139
290k
    LocalPointer<UVector32>     inputMap;
1140
1141
    // if UText has the input string as one contiguous UTF-16 chunk
1142
290k
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1143
290k
         inText->chunkNativeStart <= rangeStart &&
1144
290k
         inText->chunkNativeLimit >= rangeEnd   &&
1145
290k
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1146
1147
        // Input UText is in one contiguous UTF-16 chunk.
1148
        // Use Read-only aliasing UnicodeString.
1149
0
        inString.setTo(false,
1150
0
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
1151
0
                       rangeEnd - rangeStart);
1152
290k
    } else {
1153
        // Copy the text from the original inText (UText) to inString (UnicodeString).
1154
        // Create a map from UnicodeString indices -> UText offsets.
1155
290k
        utext_setNativeIndex(inText, rangeStart);
1156
290k
        int32_t limit = rangeEnd;
1157
290k
        U_ASSERT(limit <= utext_nativeLength(inText));
1158
290k
        if (limit > utext_nativeLength(inText)) {
1159
0
            limit = (int32_t)utext_nativeLength(inText);
1160
0
        }
1161
290k
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1162
290k
        if (U_FAILURE(status)) {
1163
0
            return 0;
1164
0
        }
1165
17.8M
        while (utext_getNativeIndex(inText) < limit) {
1166
17.5M
            int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1167
17.5M
            UChar32 c = utext_next32(inText);
1168
17.5M
            U_ASSERT(c != U_SENTINEL);
1169
17.5M
            inString.append(c);
1170
39.7M
            while (inputMap->size() < inString.length()) {
1171
22.1M
                inputMap->addElement(nativePosition, status);
1172
22.1M
            }
1173
17.5M
        }
1174
290k
        inputMap->addElement(limit, status);
1175
290k
    }
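// Illustrative sketch, not part of the original dictbe.cpp: the loop above
// copies the range into a UTF-16 string and records, for every code unit it
// appends, the native position at which that code point started, plus one
// trailing entry for the end of the range. The same idea in plain C++, under
// the simplifying assumption that the "native" text is itself UTF-16; all
// names are hypothetical.
#include <cstdint>
#include <string>
#include <vector>

struct MappedCopySketch {
    std::u16string copy;                 // plays the role of inString
    std::vector<std::int32_t> toNative;  // plays the role of inputMap
};

static MappedCopySketch buildMappedCopySketch(const std::u16string &native,
                                              std::int32_t rangeStart,
                                              std::int32_t rangeEnd) {
    MappedCopySketch out;
    for (std::int32_t i = rangeStart; i < rangeEnd; ) {
        std::int32_t cpStart = i;
        char16_t lead = native[i++];
        out.copy.push_back(lead);
        // A supplementary code point contributes two code units; both map to cpStart.
        if (lead >= 0xD800 && lead <= 0xDBFF && i < rangeEnd) {
            out.copy.push_back(native[i++]);
        }
        while ((std::int32_t)out.toNative.size() < (std::int32_t)out.copy.size()) {
            out.toNative.push_back(cpStart);
        }
    }
    out.toNative.push_back(rangeEnd);    // sentinel entry for the end of the range
    return out;
}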
1176
1177
1178
290k
    if (!nfkcNorm2->isNormalized(inString, status)) {
1179
230k
        UnicodeString normalizedInput;
1180
        //  normalizedMap[normalizedInput position] ==  original UText position.
1181
230k
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1182
230k
        if (U_FAILURE(status)) {
1183
0
            return 0;
1184
0
        }
1185
        
1186
230k
        UnicodeString fragment;
1187
230k
        UnicodeString normalizedFragment;
1188
12.6M
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1189
12.3M
            fragment.remove();
1190
12.3M
            int32_t fragmentStartI = srcI;
1191
12.3M
            UChar32 c = inString.char32At(srcI);
1192
16.1M
            for (;;) {
1193
16.1M
                fragment.append(c);
1194
16.1M
                srcI = inString.moveIndex32(srcI, 1);
1195
16.1M
                if (srcI == inString.length()) {
1196
230k
                    break;
1197
230k
                }
1198
15.8M
                c = inString.char32At(srcI);
1199
15.8M
                if (nfkcNorm2->hasBoundaryBefore(c)) {
1200
12.1M
                    break;
1201
12.1M
                }
1202
15.8M
            }
1203
12.3M
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
1204
12.3M
            normalizedInput.append(normalizedFragment);
1205
1206
            // Map every position in the normalized chunk to the start of the chunk
1207
            //   in the original input.
1208
12.3M
            int32_t fragmentOriginalStart = inputMap.isValid() ?
1209
12.3M
                    inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1210
87.0M
            while (normalizedMap->size() < normalizedInput.length()) {
1211
74.6M
                normalizedMap->addElement(fragmentOriginalStart, status);
1212
74.6M
                if (U_FAILURE(status)) {
1213
0
                    break;
1214
0
                }
1215
74.6M
            }
1216
12.3M
        }
1217
230k
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
1218
230k
        int32_t nativeEnd = inputMap.isValid() ?
1219
230k
                inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1220
230k
        normalizedMap->addElement(nativeEnd, status);
1221
1222
230k
        inputMap = std::move(normalizedMap);
1223
230k
        inString = std::move(normalizedInput);
1224
230k
    }
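// Illustrative sketch, not part of the original dictbe.cpp: the block above
// splits the text into chunks at normalization boundaries, normalizes each
// chunk on its own, and maps every position of the normalized output back to
// the start of its chunk in the original text. A simplified plain-C++ version
// (it walks code units rather than code points, which is enough to show the
// mapping); the boundary test and the normalizer are hypothetical callables.
#include <cstdint>
#include <functional>
#include <string>
#include <vector>

static void normalizeWithMapSketch(
        const std::u16string &input,
        const std::function<bool(std::size_t)> &hasBoundaryBefore,
        const std::function<std::u16string(const std::u16string &)> &normalizeChunk,
        std::u16string &normalized,
        std::vector<std::int32_t> &toOriginal) {
    for (std::size_t src = 0; src < input.size(); ) {
        std::size_t chunkStart = src;
        do {
            ++src;                       // a chunk always contains at least one unit
        } while (src < input.size() && !hasBoundaryBefore(src));
        normalized += normalizeChunk(input.substr(chunkStart, src - chunkStart));
        // Every position produced for this chunk points back at the chunk start.
        while (toOriginal.size() < normalized.size()) {
            toOriginal.push_back((std::int32_t)chunkStart);
        }
    }
    toOriginal.push_back((std::int32_t)input.size());   // sentinel for the end
}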
1225
1226
290k
    int32_t numCodePts = inString.countChar32();
1227
290k
    if (numCodePts != inString.length()) {
1228
        // There are supplementary characters in the input.
1229
        // The dictionary will produce boundary positions in terms of code point indexes,
1230
        //   not in terms of code unit string indexes.
1231
        // Use the inputMap mechanism to take care of this in addition to indexing differences
1232
        //    from normalization and/or UTF-8 input.
1233
57.0k
        UBool hadExistingMap = inputMap.isValid();
1234
57.0k
        if (!hadExistingMap) {
1235
0
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1236
0
            if (U_FAILURE(status)) {
1237
0
                return 0;
1238
0
            }
1239
0
        }
1240
57.0k
        int32_t cpIdx = 0;
1241
18.2M
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1242
18.2M
            U_ASSERT(cuIdx >= cpIdx);
1243
18.2M
            if (hadExistingMap) {
1244
18.2M
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1245
18.2M
            } else {
1246
0
                inputMap->addElement(cuIdx+rangeStart, status);
1247
0
            }
1248
18.2M
            cpIdx++;
1249
18.2M
            if (cuIdx == inString.length()) {
1250
57.0k
               break;
1251
57.0k
            }
1252
18.2M
        }
1253
57.0k
    }
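// Illustrative sketch, not part of the original dictbe.cpp: the block above
// compacts the map in place so that it is indexed by code point rather than by
// UTF-16 code unit, because the dictionary reports boundaries as code point
// counts. A plain-C++ version, assuming the map holds one entry per code unit
// plus a trailing end-of-range sentinel; names are hypothetical.
#include <cstdint>
#include <string>
#include <vector>

static void compactMapToCodePointsSketch(const std::u16string &text,
                                         std::vector<std::int32_t> &map) {
    std::size_t cpIdx = 0;
    for (std::size_t cuIdx = 0; ; ) {
        map[cpIdx++] = map[cuIdx];       // keep one entry per code point (cpIdx <= cuIdx)
        if (cuIdx >= text.size()) {
            break;                       // the last entry copied is the end sentinel
        }
        char16_t lead = text[cuIdx];
        cuIdx += (lead >= 0xD800 && lead <= 0xDBFF && cuIdx + 1 < text.size()) ? 2 : 1;
    }
    map.resize(cpIdx);                   // drop the now-unused tail entries
}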
1254
                
1255
    // bestSnlp[i] is the snlp (accumulated negative-log-probability cost) of the best
1256
    // segmentation of the first i code points in the range to be matched.
1257
290k
    UVector32 bestSnlp(numCodePts + 1, status);
1258
290k
    bestSnlp.addElement(0, status);
1259
73.2M
    for(int32_t i = 1; i <= numCodePts; i++) {
1260
72.9M
        bestSnlp.addElement(kuint32max, status);
1261
72.9M
    }
1262
1263
1264
    // prev[i] is the index at which the last word begins in the best
1265
    // segmentation of the first i code points.
1266
290k
    UVector32 prev(numCodePts + 1, status);
1267
73.5M
    for(int32_t i = 0; i <= numCodePts; i++){
1268
73.2M
        prev.addElement(-1, status);
1269
73.2M
    }
1270
1271
290k
    const int32_t maxWordSize = 20;
1272
290k
    UVector32 values(numCodePts, status);
1273
290k
    values.setSize(numCodePts);
1274
290k
    UVector32 lengths(numCodePts, status);
1275
290k
    lengths.setSize(numCodePts);
1276
1277
290k
    UText fu = UTEXT_INITIALIZER;
1278
290k
    utext_openUnicodeString(&fu, &inString, &status);
1279
1280
    // Dynamic programming to find the best segmentation.
1281
1282
    // In outer loop, i  is the code point index,
1283
    //                ix is the corresponding string (code unit) index.
1284
    //    They differ when the string contains supplementary characters.
1285
290k
    int32_t ix = 0;
1286
290k
    bool is_prev_katakana = false;
1287
73.2M
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1288
72.9M
        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1289
0
            continue;
1290
0
        }
1291
1292
72.9M
        int32_t count;
1293
72.9M
        utext_setNativeIndex(&fu, ix);
1294
72.9M
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1295
72.9M
                             nullptr, lengths.getBuffer(), values.getBuffer(), nullptr);
1296
                             // Note: lengths is filled with code point lengths
1297
                             //       The nullptr parameter is the ignored code unit lengths.
1298
1299
        // If there are no single-character matches found in the dictionary
1300
        // starting with this character, treat the character as a 1-character word
1301
        // with the highest value possible, i.e. the least likely to occur.
1302
        // Exclude Korean characters from this treatment, as they should be left
1303
        // together by default.
1304
72.9M
        if ((count == 0 || lengths.elementAti(0) != 1) &&
1305
72.9M
                !fHangulWordSet.contains(inString.char32At(ix))) {
1306
54.8M
            values.setElementAt(maxSnlp, count);   // 255
1307
54.8M
            lengths.setElementAt(1, count++);
1308
54.8M
        }
1309
1310
190M
        for (int32_t j = 0; j < count; j++) {
1311
117M
            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1312
117M
            int32_t ln_j_i = lengths.elementAti(j) + i;
1313
117M
            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1314
79.3M
                bestSnlp.setElementAt(newSnlp, ln_j_i);
1315
79.3M
                prev.setElementAt(i, ln_j_i);
1316
79.3M
            }
1317
117M
        }
1318
1319
        // In Japanese,
1320
        // a Katakana word consisting of a single character is pretty rare. So we apply
1321
        // the following heuristic to Katakana: any continuous run of Katakana
1322
        // characters is considered a candidate word, with a default cost
1323
        // taken from the katakanaCost table according to its length.
1324
1325
72.9M
        bool is_katakana = isKatakana(inString.char32At(ix));
1326
72.9M
        int32_t katakanaRunLength = 1;
1327
72.9M
        if (!is_prev_katakana && is_katakana) {
1328
282k
            int32_t j = inString.moveIndex32(ix, 1);
1329
            // Find the end of the continuous run of Katakana characters
1330
2.39M
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1331
2.39M
                    isKatakana(inString.char32At(j))) {
1332
2.10M
                j = inString.moveIndex32(j, 1);
1333
2.10M
                katakanaRunLength++;
1334
2.10M
            }
1335
282k
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
1336
209k
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1337
209k
                if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1338
63.7k
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1339
63.7k
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1340
63.7k
                }
1341
209k
            }
1342
282k
        }
1343
72.9M
        is_prev_katakana = is_katakana;
1344
72.9M
    }
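// Illustrative sketch, not part of the original dictbe.cpp: the loop above is a
// Viterbi-style dynamic program. The sketch below abstracts the dictionary and
// the Katakana heuristic into a single per-position candidate list; given
// (wordLength, cost) candidates for each starting code point, it fills the
// best-cost and predecessor tables and backtracks the boundaries. In the code
// above every position stays reachable because a one-character fallback
// candidate is injected when the dictionary has none. All names here are
// hypothetical.
#include <cstdint>
#include <functional>
#include <limits>
#include <utility>
#include <vector>

static std::vector<int> segmentSketch(
        int numCodePts,
        const std::function<std::vector<std::pair<int, std::uint32_t>>(int)> &candidatesAt) {
    const std::uint32_t kInfinity = std::numeric_limits<std::uint32_t>::max();
    std::vector<std::uint32_t> bestCost(numCodePts + 1, kInfinity);
    std::vector<int> prev(numCodePts + 1, -1);
    bestCost[0] = 0;
    for (int i = 0; i < numCodePts; ++i) {
        if (bestCost[i] == kInfinity) {
            continue;                                    // position not reachable
        }
        for (const auto &cand : candidatesAt(i)) {       // cand = (length, cost)
            int end = i + cand.first;
            if (end > numCodePts) {
                continue;
            }
            std::uint32_t cost = bestCost[i] + cand.second;
            if (cost < bestCost[end]) {                  // found a cheaper segmentation
                bestCost[end] = cost;
                prev[end] = i;
            }
        }
    }
    // Backtrack from the end, then reverse to get boundaries in ascending order.
    std::vector<int> boundaries;
    for (int i = numCodePts; i > 0; i = prev[i]) {
        boundaries.push_back(i);
    }
    boundaries.push_back(0);
    return std::vector<int>(boundaries.rbegin(), boundaries.rend());
}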
1345
290k
    utext_close(&fu);
1346
1347
    // Start pushing the optimal offset index into t_boundary (t for tentative).
1348
    // prev[numCodePts] is guaranteed to be meaningful.
1349
    // We'll first push in the reverse order, i.e.,
1350
    // t_boundary[0] = numCodePts, and afterwards read it back in reverse order.
1351
290k
    UVector32 t_boundary(numCodePts+1, status);
1352
1353
290k
    int32_t numBreaks = 0;
1354
    // No segmentation found, set boundary to end of range
1355
290k
    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1356
0
        t_boundary.addElement(numCodePts, status);
1357
0
        numBreaks++;
1358
290k
    } else if (isPhraseBreaking) {
1359
0
        t_boundary.addElement(numCodePts, status);
1360
0
        if(U_SUCCESS(status)) {
1361
0
            numBreaks++;
1362
0
            int32_t prevIdx = numCodePts;
1363
1364
0
            int32_t codeUnitIdx = -1;
1365
0
            int32_t prevCodeUnitIdx = -1;
1366
0
            int32_t length = -1;
1367
0
            for (int32_t i = prev.elementAti(numCodePts); i > 0; i = prev.elementAti(i)) {
1368
0
                codeUnitIdx = inString.moveIndex32(0, i);
1369
0
                prevCodeUnitIdx = inString.moveIndex32(0, prevIdx);
1370
                // Calculate the length by using the code unit.
1371
0
                length = prevCodeUnitIdx - codeUnitIdx;
1372
0
                prevIdx = i;
1373
                // Keep the breakpoint only if the candidate word is not in fSkipSet and the
1374
                // break does not fall between two consecutive Katakana characters.
1375
0
                if (!fSkipSet.containsKey(inString.tempSubString(codeUnitIdx, length))
1376
0
                    && (!isKatakana(inString.char32At(inString.moveIndex32(codeUnitIdx, -1)))
1377
0
                           || !isKatakana(inString.char32At(codeUnitIdx)))) {
1378
0
                    t_boundary.addElement(i, status);
1379
0
                    numBreaks++;
1380
0
                }
1381
0
            }
1382
0
        }
1383
290k
    } else {
1384
34.9M
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1385
34.6M
            t_boundary.addElement(i, status);
1386
34.6M
            numBreaks++;
1387
34.6M
        }
1388
290k
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1389
290k
    }
1390
1391
    // Add a break for the start of the dictionary range if there is not one
1392
    // there already.
1393
290k
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1394
290k
        t_boundary.addElement(0, status);
1395
290k
        numBreaks++;
1396
290k
    }
1397
1398
    // Now that we're done, convert positions in t_boundary[] (indices in 
1399
    // the normalized input string) back to indices in the original input UText
1400
    // while reversing t_boundary and pushing values to foundBreaks.
1401
290k
    int32_t prevCPPos = -1;
1402
290k
    int32_t prevUTextPos = -1;
1403
290k
    int32_t correctedNumBreaks = 0;
1404
35.1M
    for (int32_t i = numBreaks - 1; i >= 0; i--) {
1405
34.9M
        int32_t cpPos = t_boundary.elementAti(i);
1406
34.9M
        U_ASSERT(cpPos > prevCPPos);
1407
34.9M
        int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1408
34.9M
        U_ASSERT(utextPos >= prevUTextPos);
1409
34.9M
        if (utextPos > prevUTextPos) {
1410
            // Boundaries are added to foundBreaks output in ascending order.
1411
14.1M
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1412
            // In phrase breaking, there has to be a breakpoint between a CJ character and a
1413
            // closing punctuation mark.
1414
            // E.g. [携帯電話]正しい選択 -> [携帯▁電話]▁正しい▁選択 -> breakpoint between ] and 正
1415
14.1M
            if (utextPos != rangeStart
1416
14.1M
                || (isPhraseBreaking && utextPos > 0
1417
13.8M
                       && fClosePunctuationSet.contains(utext_char32At(inText, utextPos - 1)))) {
1418
13.8M
                foundBreaks.push(utextPos, status);
1419
13.8M
                correctedNumBreaks++;
1420
13.8M
            }
1421
20.7M
        } else {
1422
            // Normalization expanded the input text, and the dictionary found a boundary
1423
            // within the expansion, giving two boundaries with the same index in the
1424
            // original text. Ignore the second. See ticket #12918.
1425
20.7M
            --numBreaks;
1426
20.7M
        }
1427
34.9M
        prevCPPos = cpPos;
1428
34.9M
        prevUTextPos = utextPos;
1429
34.9M
    }
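// Illustrative sketch, not part of the original dictbe.cpp: the loop above maps
// tentative boundaries (code point indices into the possibly-normalized working
// string) back to positions in the original text, and emits a boundary only
// once when normalization expansion makes several of them collapse onto the
// same original position. Plain C++, names hypothetical; cpBoundaries is
// assumed to be in ascending order.
#include <cstdint>
#include <vector>

static std::vector<std::int32_t> mapBoundariesBackSketch(
        const std::vector<std::int32_t> &cpBoundaries,
        const std::vector<std::int32_t> *toOriginal,   // nullptr means identity plus rangeStart
        std::int32_t rangeStart) {
    std::vector<std::int32_t> out;
    std::int32_t prevOriginal = -1;
    for (std::int32_t cp : cpBoundaries) {
        std::int32_t original = toOriginal ? (*toOriginal)[cp] : cp + rangeStart;
        if (original > prevOriginal) {                 // skip duplicates from expansion
            out.push_back(original);
        }
        prevOriginal = original;
    }
    return out;
}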
1430
290k
    (void)prevCPPos; // suppress compiler warnings about unused variable
1431
1432
290k
    UChar32 nextChar = utext_char32At(inText, rangeEnd);
1433
290k
    if (!foundBreaks.isEmpty() && foundBreaks.peeki() == rangeEnd) {
1434
        // In phrase breaking, there has to be a breakpoint between a CJ character and a
1435
        // following digit, opening punctuation mark, or alphabetic character.
1436
        // E.g. る文字「そうだ、京都」-> る▁文字▁「そうだ、▁京都」-> breakpoint between 字 and 「
1437
        // E.g. 乗車率90%程度だろうか -> 乗車▁率▁90%▁程度だろうか -> breakpoint between 率 and 9
1438
        // E.g. しかもロゴがUnicode! -> しかも▁ロゴが▁Unicode! -> breakpoint between が and U
1439
290k
        if (isPhraseBreaking) {
1440
0
            if (!fDigitOrOpenPunctuationOrAlphabetSet.contains(nextChar)) {
1441
0
                foundBreaks.popi();
1442
0
                correctedNumBreaks--;
1443
0
            }
1444
290k
        } else {
1445
290k
            foundBreaks.popi();
1446
290k
            correctedNumBreaks--;
1447
290k
        }
1448
290k
    }
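// Illustrative sketch, not part of the original dictbe.cpp: the block above
// decides what to do with a boundary that lands exactly at rangeEnd. In plain
// word breaking it is always dropped (the caller supplies the range end); in
// phrase breaking it is kept only when the character just after the range is a
// digit, opening punctuation, or alphabetic character. Names are hypothetical.
static bool keepBoundaryAtRangeEndSketch(bool isPhraseBreaking,
                                         bool nextIsDigitOpenPunctOrAlphabet) {
    return isPhraseBreaking && nextIsDigitOpenPunctOrAlphabet;
}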
1449
1450
    // inString goes out of scope
1451
    // inputMap goes out of scope
1452
290k
    return correctedNumBreaks;
1453
290k
}
1454
1455
1
void CjkBreakEngine::initJapanesePhraseParameter(UErrorCode& error) {
1456
1
    loadJapaneseExtensions(error);
1457
1
    loadHiragana(error);
1458
1
}
1459
1460
1
void CjkBreakEngine::loadJapaneseExtensions(UErrorCode& error) {
1461
1
    const char* tag = "extensions";
1462
1
    ResourceBundle ja(U_ICUDATA_BRKITR, "ja", error);
1463
1
    if (U_SUCCESS(error)) {
1464
1
        ResourceBundle bundle = ja.get(tag, error);
1465
225
        while (U_SUCCESS(error) && bundle.hasNext()) {
1466
224
            fSkipSet.puti(bundle.getNextString(error), 1, error);
1467
224
        }
1468
1
    }
1469
1
}
1470
1471
1
void CjkBreakEngine::loadHiragana(UErrorCode& error) {
1472
1
    UnicodeSet hiraganaWordSet(UnicodeString(u"[:Hiragana:]"), error);
1473
1
    hiraganaWordSet.compact();
1474
1
    UnicodeSetIterator iterator(hiraganaWordSet);
1475
382
    while (iterator.next()) {
1476
381
        fSkipSet.puti(UnicodeString(iterator.getCodepoint()), 1, error);
1477
381
    }
1478
1
}
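// Illustrative sketch, not part of the original dictbe.cpp: the skip set filled
// by the two loaders above (the "extensions" list from the ja break-iterator
// bundle plus every Hiragana code point) is consulted during phrase breaking; a
// tentative boundary is kept only if the word that starts there is not in the
// set. A plain-C++ stand-in using std::unordered_set, with hypothetical names:
#include <string>
#include <unordered_set>

static bool keepPhraseBoundarySketch(const std::unordered_set<std::u16string> &skipSet,
                                     const std::u16string &candidateWord) {
    // Skipped words (single Hiragana characters, listed extensions) do not start
    // a new phrase; they attach to the preceding one.
    return skipSet.find(candidateWord) == skipSet.end();
}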
1479
#endif
1480
1481
U_NAMESPACE_END
1482
1483
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1484