Coverage Report

Created: 2021-08-22 09:07

/src/skia/third_party/externals/icu/source/common/dictbe.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/**
4
 *******************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 *******************************************************************************
8
 */
9
10
#include <utility>
11
12
#include "unicode/utypes.h"
13
14
#if !UCONFIG_NO_BREAK_ITERATION
15
16
#include "brkeng.h"
17
#include "dictbe.h"
18
#include "unicode/uniset.h"
19
#include "unicode/chariter.h"
20
#include "unicode/ubrk.h"
21
#include "utracimp.h"
22
#include "uvectr32.h"
23
#include "uvector.h"
24
#include "uassert.h"
25
#include "unicode/normlzr.h"
26
#include "cmemory.h"
27
#include "dictionarydata.h"
28
29
U_NAMESPACE_BEGIN
30
31
/*
32
 ******************************************************************
33
 */
34
35
8
DictionaryBreakEngine::DictionaryBreakEngine() {
36
8
}
37
38
0
DictionaryBreakEngine::~DictionaryBreakEngine() {
39
0
}
40
41
UBool
42
101k
DictionaryBreakEngine::handles(UChar32 c) const {
43
101k
    return fSet.contains(c);
44
101k
}
45
46
int32_t
47
DictionaryBreakEngine::findBreaks( UText *text,
48
                                 int32_t startPos,
49
                                 int32_t endPos,
50
49.0k
                                 UVector32 &foundBreaks ) const {
51
49.0k
    (void)startPos;            // TODO: remove this param?
52
49.0k
    int32_t result = 0;
53
54
    // Find the span of characters included in the set.
55
    //   The span to break begins at the current position in the text, and
56
    //   extends towards the start or end of the text, depending on 'reverse'.
57
58
49.0k
    int32_t start = (int32_t)utext_getNativeIndex(text);
59
49.0k
    int32_t current;
60
49.0k
    int32_t rangeStart;
61
49.0k
    int32_t rangeEnd;
62
49.0k
    UChar32 c = utext_current32(text);
63
166k
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
64
117k
        utext_next32(text);         // TODO:  recast loop for postincrement
65
117k
        c = utext_current32(text);
66
117k
    }
67
49.0k
    rangeStart = start;
68
49.0k
    rangeEnd = current;
69
49.0k
    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
70
49.0k
    utext_setNativeIndex(text, current);
71
    
72
49.0k
    return result;
73
49.0k
}
74
75
void
76
8
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
77
8
    fSet = set;
78
    // Compact for caching
79
8
    fSet.compact();
80
8
}
81
82
/*
83
 ******************************************************************
84
 * PossibleWord
85
 */
86
87
// Helper class for improving readability of the Thai/Lao/Khmer word break
88
// algorithm. The implementation is completely inline.
89
90
// List size, limited by the maximum number of words in the dictionary
91
// that form a nested sequence.
92
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
93
94
class PossibleWord {
95
private:
96
    // list of word candidate lengths, in increasing length order
97
    // TODO: bytes would be sufficient for word lengths.
98
    int32_t   count;      // Count of candidates
99
    int32_t   prefix;     // The longest match with a dictionary word
100
    int32_t   offset;     // Offset in the text of these candidates
101
    int32_t   mark;       // The preferred candidate's offset
102
    int32_t   current;    // The candidate we're currently looking at
103
    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
104
    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
105
106
public:
107
35.3k
    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
108
35.3k
    ~PossibleWord() {}
109
  
110
    // Fill the list of candidates if needed, select the longest, and return the number found
111
    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
112
  
113
    // Select the currently marked candidate, point after it in the text, and invalidate self
114
    int32_t   acceptMarked( UText *text );
115
  
116
    // Back up from the current candidate to the next shorter one; return TRUE if that exists
117
    // and point the text after it
118
    UBool     backUp( UText *text );
119
  
120
    // Return the longest prefix this candidate location shares with a dictionary word
121
    // Return value is in code points.
122
6.51k
    int32_t   longestPrefix() { return prefix; }
123
  
124
    // Mark the current candidate as the one we like
125
9.11k
    void      markCurrent() { mark = current; }
126
    
127
    // Get length in code points of the marked word.
128
32.7k
    int32_t   markedCPLength() { return cpLengths[mark]; }
129
};
130
131
132
108k
int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
133
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
134
108k
    int32_t start = (int32_t)utext_getNativeIndex(text);
135
108k
    if (start != offset) {
136
71.0k
        offset = start;
137
71.0k
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
138
        // Dictionary leaves text after longest prefix, not longest word. Back up.
139
71.0k
        if (count <= 0) {
140
32.9k
            utext_setNativeIndex(text, start);
141
32.9k
        }
142
71.0k
    }
143
108k
    if (count > 0) {
144
68.2k
        utext_setNativeIndex(text, start+cuLengths[count-1]);
145
68.2k
    }
146
108k
    current = count-1;
147
108k
    mark = current;
148
108k
    return count;
149
108k
}
150
151
int32_t
152
32.7k
PossibleWord::acceptMarked( UText *text ) {
153
32.7k
    utext_setNativeIndex(text, offset + cuLengths[mark]);
154
32.7k
    return cuLengths[mark];
155
32.7k
}
156
157
158
UBool
159
18.0k
PossibleWord::backUp( UText *text ) {
160
18.0k
    if (current > 0) {
161
11.2k
        utext_setNativeIndex(text, offset + cuLengths[--current]);
162
11.2k
        return TRUE;
163
11.2k
    }
164
6.79k
    return FALSE;
165
6.79k
}
166
167
/*
168
 ******************************************************************
169
 * ThaiBreakEngine
170
 */
171
172
// How many words in a row are "good enough"?
173
static const int32_t THAI_LOOKAHEAD = 3;
174
175
// Will not combine a non-word with a preceding dictionary word longer than this
176
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
177
178
// Will not combine a non-word that shares at least this much prefix with a
179
// dictionary word, with a preceding word
180
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
181
182
// Ellision character
183
static const int32_t THAI_PAIYANNOI = 0x0E2F;
184
185
// Repeat character
186
static const int32_t THAI_MAIYAMOK = 0x0E46;
187
188
// Minimum word size
189
static const int32_t THAI_MIN_WORD = 2;
190
191
// Minimum number of characters for two words
192
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
193
194
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
195
    : DictionaryBreakEngine(),
196
      fDictionary(adoptDictionary)
197
2
{
198
2
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
199
2
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
200
2
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
201
2
    if (U_SUCCESS(status)) {
202
2
        setCharacters(fThaiWordSet);
203
2
    }
204
2
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
205
2
    fMarkSet.add(0x0020);
206
2
    fEndWordSet = fThaiWordSet;
207
2
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
208
2
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
209
2
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
210
2
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
211
2
    fSuffixSet.add(THAI_PAIYANNOI);
212
2
    fSuffixSet.add(THAI_MAIYAMOK);
213
214
    // Compact for caching.
215
2
    fMarkSet.compact();
216
2
    fEndWordSet.compact();
217
2
    fBeginWordSet.compact();
218
2
    fSuffixSet.compact();
219
2
    UTRACE_EXIT_STATUS(status);
220
2
}
221
222
0
ThaiBreakEngine::~ThaiBreakEngine() {
223
0
    delete fDictionary;
224
0
}
225
226
int32_t
227
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
228
                                                int32_t rangeStart,
229
                                                int32_t rangeEnd,
230
20.7k
                                                UVector32 &foundBreaks ) const {
231
20.7k
    utext_setNativeIndex(text, rangeStart);
232
20.7k
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
233
20.7k
    if (utext_getNativeIndex(text) >= rangeEnd) {
234
19.2k
        return 0;       // Not enough characters for two words
235
19.2k
    }
236
1.54k
    utext_setNativeIndex(text, rangeStart);
237
238
239
1.54k
    uint32_t wordsFound = 0;
240
1.54k
    int32_t cpWordLength = 0;    // Word Length in Code Points.
241
1.54k
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
242
1.54k
    int32_t current;
243
1.54k
    UErrorCode status = U_ZERO_ERROR;
244
1.54k
    PossibleWord words[THAI_LOOKAHEAD];
245
    
246
1.54k
    utext_setNativeIndex(text, rangeStart);
247
    
248
7.91k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
249
6.37k
        cpWordLength = 0;
250
6.37k
        cuWordLength = 0;
251
252
        // Look for candidate words at the current position
253
6.37k
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
254
        
255
        // If we found exactly one, use that
256
6.37k
        if (candidates == 1) {
257
1.13k
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
258
1.13k
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
259
1.13k
            wordsFound += 1;
260
1.13k
        }
261
        // If there was more than one, see which one can take us forward the most words
262
5.23k
        else if (candidates > 1) {
263
            // If we're already at the end of the range, we're done
264
3.44k
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
265
327
                goto foundBest;
266
327
            }
267
6.13k
            do {
268
6.13k
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
269
                    // Followed by another dictionary word; mark first word as a good candidate
270
1.95k
                    words[wordsFound%THAI_LOOKAHEAD].markCurrent();
271
                    
272
                    // If we're already at the end of the range, we're done
273
1.95k
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
274
433
                        goto foundBest;
275
433
                    }
276
                    
277
                    // See if any of the possible second words is followed by a third word
278
2.81k
                    do {
279
                        // If we find a third word, stop right away
280
2.81k
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
281
1.08k
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
282
1.08k
                            goto foundBest;
283
1.08k
                        }
284
1.72k
                    }
285
1.72k
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
286
1.51k
                }
287
6.13k
            }
288
4.62k
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
289
3.44k
foundBest:
290
            // Set UText position to after the accepted word.
291
3.44k
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
292
3.44k
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
293
3.44k
            wordsFound += 1;
294
3.44k
        }
295
        
296
        // We come here after having either found a word or not. We look ahead to the
297
        // next word. If it's not a dictionary word, we will combine it with the word we
298
        // just found (if there is one), but only if the preceding word does not exceed
299
        // the threshold.
300
        // The text iterator should now be positioned at the end of the word we found.
301
        
302
6.37k
        UChar32 uc = 0;
303
6.37k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
304
            // if it is a dictionary word, do nothing. If it isn't, then if there is
305
            // no preceding word, or the non-word shares less than the minimum threshold
306
            // of characters with a dictionary word, then scan to resynchronize
307
3.76k
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
308
2.12k
                  && (cuWordLength == 0
309
2.03k
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
310
                // Look for a plausible word boundary
311
2.03k
                int32_t remaining = rangeEnd - (current+cuWordLength);
312
2.03k
                UChar32 pc;
313
2.03k
                int32_t chars = 0;
314
4.87k
                for (;;) {
315
4.87k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
316
4.87k
                    pc = utext_next32(text);
317
4.87k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
318
4.87k
                    chars += pcSize;
319
4.87k
                    remaining -= pcSize;
320
4.87k
                    if (remaining <= 0) {
321
714
                        break;
322
714
                    }
323
4.15k
                    uc = utext_current32(text);
324
4.15k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
325
                        // Maybe. See if it's in the dictionary.
326
                        // NOTE: In the original Apple code, checked that the next
327
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
328
                        // checking the dictionary. That is just a performance filter,
329
                        // but it's not clear it's faster than checking the trie.
330
3.35k
                        int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
331
3.35k
                        utext_setNativeIndex(text, current + cuWordLength + chars);
332
3.35k
                        if (num_candidates > 0) {
333
1.32k
                            break;
334
1.32k
                        }
335
3.35k
                    }
336
4.15k
                }
337
                
338
                // Bump the word count if there wasn't already one
339
2.03k
                if (cuWordLength <= 0) {
340
1.78k
                    wordsFound += 1;
341
1.78k
                }
342
                
343
                // Update the length with the passed-over characters
344
2.03k
                cuWordLength += chars;
345
2.03k
            }
346
1.73k
            else {
347
                // Back up to where we were for next iteration
348
1.73k
                utext_setNativeIndex(text, current+cuWordLength);
349
1.73k
            }
350
3.76k
        }
351
        
352
        // Never stop before a combining mark.
353
6.37k
        int32_t currPos;
354
6.51k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
355
143
            utext_next32(text);
356
143
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
357
143
        }
358
        
359
        // Look ahead for possible suffixes if a dictionary word does not follow.
360
        // We do this in code rather than using a rule so that the heuristic
361
        // resynch continues to function. For example, one of the suffix characters
362
        // could be a typo in the middle of a word.
363
6.37k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
364
4.98k
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
365
1.50k
                && fSuffixSet.contains(uc = utext_current32(text))) {
366
686
                if (uc == THAI_PAIYANNOI) {
367
238
                    if (!fSuffixSet.contains(utext_previous32(text))) {
368
                        // Skip over previous end and PAIYANNOI
369
171
                        utext_next32(text);
370
171
                        int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
371
171
                        utext_next32(text);
372
171
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
373
171
                        uc = utext_current32(text);     // Fetch next character
374
171
                    }
375
67
                    else {
376
                        // Restore prior position
377
67
                        utext_next32(text);
378
67
                    }
379
238
                }
380
686
                if (uc == THAI_MAIYAMOK) {
381
452
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
382
                        // Skip over previous end and MAIYAMOK
383
356
                        utext_next32(text);
384
356
                        int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
385
356
                        utext_next32(text);
386
356
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
387
356
                    }
388
96
                    else {
389
                        // Restore prior position
390
96
                        utext_next32(text);
391
96
                    }
392
452
                }
393
686
            }
394
4.29k
            else {
395
4.29k
                utext_setNativeIndex(text, current+cuWordLength);
396
4.29k
            }
397
4.98k
        }
398
399
        // Did we find a word on this iteration? If so, push it on the break stack
400
6.37k
        if (cuWordLength > 0) {
401
6.37k
            foundBreaks.push((current+cuWordLength), status);
402
6.37k
        }
403
6.37k
    }
404
405
    // Don't return a break for the end of the dictionary range if there is one there.
406
1.54k
    if (foundBreaks.peeki() >= rangeEnd) {
407
1.54k
        (void) foundBreaks.popi();
408
1.54k
        wordsFound -= 1;
409
1.54k
    }
410
411
1.54k
    return wordsFound;
412
1.54k
}
413
414
/*
415
 ******************************************************************
416
 * LaoBreakEngine
417
 */
418
419
// How many words in a row are "good enough"?
420
static const int32_t LAO_LOOKAHEAD = 3;
421
422
// Will not combine a non-word with a preceding dictionary word longer than this
423
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
424
425
// Will not combine a non-word that shares at least this much prefix with a
426
// dictionary word, with a preceding word
427
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
428
429
// Minimum word size
430
static const int32_t LAO_MIN_WORD = 2;
431
432
// Minimum number of characters for two words
433
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
434
435
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
436
    : DictionaryBreakEngine(),
437
      fDictionary(adoptDictionary)
438
2
{
439
2
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
440
2
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
441
2
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
442
2
    if (U_SUCCESS(status)) {
443
2
        setCharacters(fLaoWordSet);
444
2
    }
445
2
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
446
2
    fMarkSet.add(0x0020);
447
2
    fEndWordSet = fLaoWordSet;
448
2
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
449
2
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
450
2
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
451
2
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
452
453
    // Compact for caching.
454
2
    fMarkSet.compact();
455
2
    fEndWordSet.compact();
456
2
    fBeginWordSet.compact();
457
2
    UTRACE_EXIT_STATUS(status);
458
2
}
459
460
0
LaoBreakEngine::~LaoBreakEngine() {
461
0
    delete fDictionary;
462
0
}
463
464
int32_t
465
LaoBreakEngine::divideUpDictionaryRange( UText *text,
466
                                                int32_t rangeStart,
467
                                                int32_t rangeEnd,
468
5.37k
                                                UVector32 &foundBreaks ) const {
469
5.37k
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
470
3.92k
        return 0;       // Not enough characters for two words
471
3.92k
    }
472
473
1.45k
    uint32_t wordsFound = 0;
474
1.45k
    int32_t cpWordLength = 0;
475
1.45k
    int32_t cuWordLength = 0;
476
1.45k
    int32_t current;
477
1.45k
    UErrorCode status = U_ZERO_ERROR;
478
1.45k
    PossibleWord words[LAO_LOOKAHEAD];
479
    
480
1.45k
    utext_setNativeIndex(text, rangeStart);
481
    
482
4.69k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
483
3.23k
        cuWordLength = 0;
484
3.23k
        cpWordLength = 0;
485
486
        // Look for candidate words at the current position
487
3.23k
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
488
        
489
        // If we found exactly one, use that
490
3.23k
        if (candidates == 1) {
491
1.73k
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
492
1.73k
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
493
1.73k
            wordsFound += 1;
494
1.73k
        }
495
        // If there was more than one, see which one can take us forward the most words
496
1.50k
        else if (candidates > 1) {
497
            // If we're already at the end of the range, we're done
498
907
            if (utext_getNativeIndex(text) >= rangeEnd) {
499
127
                goto foundBest;
500
127
            }
501
1.31k
            do {
502
1.31k
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
503
                    // Followed by another dictionary word; mark first word as a good candidate
504
703
                    words[wordsFound%LAO_LOOKAHEAD].markCurrent();
505
                    
506
                    // If we're already at the end of the range, we're done
507
703
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
508
72
                        goto foundBest;
509
72
                    }
510
                    
511
                    // See if any of the possible second words is followed by a third word
512
769
                    do {
513
                        // If we find a third word, stop right away
514
769
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
515
271
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
516
271
                            goto foundBest;
517
271
                        }
518
498
                    }
519
498
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
520
631
                }
521
1.31k
            }
522
971
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
523
907
foundBest:
524
907
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
525
907
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
526
907
            wordsFound += 1;
527
907
        }
528
        
529
        // We come here after having either found a word or not. We look ahead to the
530
        // next word. If it's not a dictionary word, we will combine it withe the word we
531
        // just found (if there is one), but only if the preceding word does not exceed
532
        // the threshold.
533
        // The text iterator should now be positioned at the end of the word we found.
534
3.23k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
535
            // if it is a dictionary word, do nothing. If it isn't, then if there is
536
            // no preceding word, or the non-word shares less than the minimum threshold
537
            // of characters with a dictionary word, then scan to resynchronize
538
1.99k
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
539
1.37k
                  && (cuWordLength == 0
540
1.29k
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
541
                // Look for a plausible word boundary
542
1.29k
                int32_t remaining = rangeEnd - (current + cuWordLength);
543
1.29k
                UChar32 pc;
544
1.29k
                UChar32 uc;
545
1.29k
                int32_t chars = 0;
546
2.92k
                for (;;) {
547
2.92k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
548
2.92k
                    pc = utext_next32(text);
549
2.92k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
550
2.92k
                    chars += pcSize;
551
2.92k
                    remaining -= pcSize;
552
2.92k
                    if (remaining <= 0) {
553
615
                        break;
554
615
                    }
555
2.30k
                    uc = utext_current32(text);
556
2.30k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
557
                        // Maybe. See if it's in the dictionary.
558
                        // TODO: this looks iffy; compare with old code.
559
1.21k
                        int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
560
1.21k
                        utext_setNativeIndex(text, current + cuWordLength + chars);
561
1.21k
                        if (num_candidates > 0) {
562
676
                            break;
563
676
                        }
564
1.21k
                    }
565
2.30k
                }
566
                
567
                // Bump the word count if there wasn't already one
568
1.29k
                if (cuWordLength <= 0) {
569
598
                    wordsFound += 1;
570
598
                }
571
                
572
                // Update the length with the passed-over characters
573
1.29k
                cuWordLength += chars;
574
1.29k
            }
575
702
            else {
576
                // Back up to where we were for next iteration
577
702
                utext_setNativeIndex(text, current + cuWordLength);
578
702
            }
579
1.99k
        }
580
        
581
        // Never stop before a combining mark.
582
3.23k
        int32_t currPos;
583
3.95k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
584
715
            utext_next32(text);
585
715
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
586
715
        }
587
        
588
        // Look ahead for possible suffixes if a dictionary word does not follow.
589
        // We do this in code rather than using a rule so that the heuristic
590
        // resynch continues to function. For example, one of the suffix characters
591
        // could be a typo in the middle of a word.
592
        // NOT CURRENTLY APPLICABLE TO LAO
593
594
        // Did we find a word on this iteration? If so, push it on the break stack
595
3.23k
        if (cuWordLength > 0) {
596
3.23k
            foundBreaks.push((current+cuWordLength), status);
597
3.23k
        }
598
3.23k
    }
599
600
    // Don't return a break for the end of the dictionary range if there is one there.
601
1.45k
    if (foundBreaks.peeki() >= rangeEnd) {
602
1.45k
        (void) foundBreaks.popi();
603
1.45k
        wordsFound -= 1;
604
1.45k
    }
605
606
1.45k
    return wordsFound;
607
1.45k
}
608
609
/*
610
 ******************************************************************
611
 * BurmeseBreakEngine
612
 */
613
614
// How many words in a row are "good enough"?
615
static const int32_t BURMESE_LOOKAHEAD = 3;
616
617
// Will not combine a non-word with a preceding dictionary word longer than this
618
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
619
620
// Will not combine a non-word that shares at least this much prefix with a
621
// dictionary word, with a preceding word
622
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
623
624
// Minimum word size
625
static const int32_t BURMESE_MIN_WORD = 2;
626
627
// Minimum number of characters for two words
628
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
629
630
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
631
    : DictionaryBreakEngine(),
632
      fDictionary(adoptDictionary)
633
2
{
634
2
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
635
2
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
636
2
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
637
2
    if (U_SUCCESS(status)) {
638
2
        setCharacters(fBurmeseWordSet);
639
2
    }
640
2
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
641
2
    fMarkSet.add(0x0020);
642
2
    fEndWordSet = fBurmeseWordSet;
643
2
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
644
645
    // Compact for caching.
646
2
    fMarkSet.compact();
647
2
    fEndWordSet.compact();
648
2
    fBeginWordSet.compact();
649
2
    UTRACE_EXIT_STATUS(status);
650
2
}
651
652
0
BurmeseBreakEngine::~BurmeseBreakEngine() {
653
0
    delete fDictionary;
654
0
}
655
656
int32_t
657
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
658
                                                int32_t rangeStart,
659
                                                int32_t rangeEnd,
660
14.2k
                                                UVector32 &foundBreaks ) const {
661
14.2k
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
662
7.74k
        return 0;       // Not enough characters for two words
663
7.74k
    }
664
665
6.45k
    uint32_t wordsFound = 0;
666
6.45k
    int32_t cpWordLength = 0;
667
6.45k
    int32_t cuWordLength = 0;
668
6.45k
    int32_t current;
669
6.45k
    UErrorCode status = U_ZERO_ERROR;
670
6.45k
    PossibleWord words[BURMESE_LOOKAHEAD];
671
    
672
6.45k
    utext_setNativeIndex(text, rangeStart);
673
    
674
30.6k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
675
24.2k
        cuWordLength = 0;
676
24.2k
        cpWordLength = 0;
677
678
        // Look for candidate words at the current position
679
24.2k
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
680
        
681
        // If we found exactly one, use that
682
24.2k
        if (candidates == 1) {
683
18.6k
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
684
18.6k
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
685
18.6k
            wordsFound += 1;
686
18.6k
        }
687
        // If there was more than one, see which one can take us forward the most words
688
5.52k
        else if (candidates > 1) {
689
            // If we're already at the end of the range, we're done
690
4.42k
            if (utext_getNativeIndex(text) >= rangeEnd) {
691
972
                goto foundBest;
692
972
            }
693
7.04k
            do {
694
7.04k
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
695
                    // Followed by another dictionary word; mark first word as a good candidate
696
2.62k
                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
697
                    
698
                    // If we're already at the end of the range, we're done
699
2.62k
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
700
204
                        goto foundBest;
701
204
                    }
702
                    
703
                    // See if any of the possible second words is followed by a third word
704
4.43k
                    do {
705
                        // If we find a third word, stop right away
706
4.43k
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
707
1.22k
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
708
1.22k
                            goto foundBest;
709
1.22k
                        }
710
3.21k
                    }
711
3.21k
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
712
2.41k
                }
713
7.04k
            }
714
5.61k
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
715
4.42k
foundBest:
716
4.42k
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
717
4.42k
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
718
4.42k
            wordsFound += 1;
719
4.42k
        }
720
        
721
        // We come here after having either found a word or not. We look ahead to the
722
        // next word. If it's not a dictionary word, we will combine it withe the word we
723
        // just found (if there is one), but only if the preceding word does not exceed
724
        // the threshold.
725
        // The text iterator should now be positioned at the end of the word we found.
726
24.2k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
727
            // if it is a dictionary word, do nothing. If it isn't, then if there is
728
            // no preceding word, or the non-word shares less than the minimum threshold
729
            // of characters with a dictionary word, then scan to resynchronize
730
20.5k
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
731
5.82k
                  && (cuWordLength == 0
732
5.33k
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
733
                // Look for a plausible word boundary
734
5.33k
                int32_t remaining = rangeEnd - (current + cuWordLength);
735
5.33k
                UChar32 pc;
736
5.33k
                UChar32 uc;
737
5.33k
                int32_t chars = 0;
738
7.69k
                for (;;) {
739
7.69k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
740
7.69k
                    pc = utext_next32(text);
741
7.69k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
742
7.69k
                    chars += pcSize;
743
7.69k
                    remaining -= pcSize;
744
7.69k
                    if (remaining <= 0) {
745
3.38k
                        break;
746
3.38k
                    }
747
4.31k
                    uc = utext_current32(text);
748
4.31k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
749
                        // Maybe. See if it's in the dictionary.
750
                        // TODO: this looks iffy; compare with old code.
751
2.25k
                        int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
752
2.25k
                        utext_setNativeIndex(text, current + cuWordLength + chars);
753
2.25k
                        if (num_candidates > 0) {
754
1.94k
                            break;
755
1.94k
                        }
756
2.25k
                    }
757
4.31k
                }
758
                
759
                // Bump the word count if there wasn't already one
760
5.33k
                if (cuWordLength <= 0) {
761
1.10k
                    wordsFound += 1;
762
1.10k
                }
763
                
764
                // Update the length with the passed-over characters
765
5.33k
                cuWordLength += chars;
766
5.33k
            }
767
15.2k
            else {
768
                // Back up to where we were for next iteration
769
15.2k
                utext_setNativeIndex(text, current + cuWordLength);
770
15.2k
            }
771
20.5k
        }
772
        
773
        // Never stop before a combining mark.
774
24.2k
        int32_t currPos;
775
25.3k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
776
1.11k
            utext_next32(text);
777
1.11k
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
778
1.11k
        }
779
        
780
        // Look ahead for possible suffixes if a dictionary word does not follow.
781
        // We do this in code rather than using a rule so that the heuristic
782
        // resynch continues to function. For example, one of the suffix characters
783
        // could be a typo in the middle of a word.
784
        // NOT CURRENTLY APPLICABLE TO BURMESE
785
786
        // Did we find a word on this iteration? If so, push it on the break stack
787
24.2k
        if (cuWordLength > 0) {
788
24.2k
            foundBreaks.push((current+cuWordLength), status);
789
24.2k
        }
790
24.2k
    }
791
792
    // Don't return a break for the end of the dictionary range if there is one there.
793
6.45k
    if (foundBreaks.peeki() >= rangeEnd) {
794
6.45k
        (void) foundBreaks.popi();
795
6.45k
        wordsFound -= 1;
796
6.45k
    }
797
798
6.45k
    return wordsFound;
799
6.45k
}
800
801
/*
802
 ******************************************************************
803
 * KhmerBreakEngine
804
 */
805
806
// How many words in a row are "good enough"?
807
static const int32_t KHMER_LOOKAHEAD = 3;
808
809
// Will not combine a non-word with a preceding dictionary word longer than this
810
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 10;
811
812
// Will not combine a non-word that shares at least this much prefix with a
813
// dictionary word, with a preceding word
814
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 5;
815
816
// Minimum word size
817
static const int32_t KHMER_MIN_WORD = 2;
818
819
// Minimum number of characters for two words
820
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
821
822
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
823
    : DictionaryBreakEngine(),
824
      fDictionary(adoptDictionary)
825
2
{
826
2
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
827
2
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
828
2
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
829
2
    if (U_SUCCESS(status)) {
830
2
        setCharacters(fKhmerWordSet);
831
2
    }
832
2
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
833
2
    fMarkSet.add(0x0020);
834
2
    fEndWordSet = fKhmerWordSet;
835
2
    fBeginWordSet.add(0x1780, 0x17B3);
836
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
837
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
838
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
839
2
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
840
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
841
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
842
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
843
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
844
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
845
//    fSuffixSet.add(THAI_PAIYANNOI);
846
//    fSuffixSet.add(THAI_MAIYAMOK);
847
848
    // Compact for caching.
849
2
    fMarkSet.compact();
850
2
    fEndWordSet.compact();
851
2
    fBeginWordSet.compact();
852
//    fSuffixSet.compact();
853
2
    UTRACE_EXIT_STATUS(status);
854
2
}
855
856
0
KhmerBreakEngine::~KhmerBreakEngine() {
857
0
    delete fDictionary;
858
0
}
859
860
int32_t
861
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
862
                                                int32_t rangeStart,
863
                                                int32_t rangeEnd,
864
8.66k
                                                UVector32 &foundBreaks ) const {
865
8.66k
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
866
6.34k
        return 0;       // Not enough characters for two words
867
6.34k
    }
868
869
2.32k
    uint32_t wordsFound = 0;
870
2.32k
    int32_t cpWordLength = 0;
871
2.32k
    int32_t cuWordLength = 0;
872
2.32k
    int32_t current;
873
2.32k
    UErrorCode status = U_ZERO_ERROR;
874
2.32k
    PossibleWord words[KHMER_LOOKAHEAD];
875
876
2.32k
    utext_setNativeIndex(text, rangeStart);
877
878
6.70k
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
879
4.38k
        cuWordLength = 0;
880
4.38k
        cpWordLength = 0;
881
882
        // Look for candidate words at the current position
883
4.38k
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
884
885
        // If we found exactly one, use that
886
4.38k
        if (candidates == 1) {
887
1.31k
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
888
1.31k
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
889
1.31k
            wordsFound += 1;
890
1.31k
        }
891
892
        // If there was more than one, see which one can take us forward the most words
893
3.06k
        else if (candidates > 1) {
894
            // If we're already at the end of the range, we're done
895
1.10k
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
896
80
                goto foundBest;
897
80
            }
898
1.49k
            do {
899
1.49k
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
900
                    // Followed by another dictionary word; mark first word as a good candidate
901
822
                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
902
903
                    // If we're already at the end of the range, we're done
904
822
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
905
104
                        goto foundBest;
906
104
                    }
907
908
                    // See if any of the possible second words is followed by a third word
909
866
                    do {
910
                        // If we find a third word, stop right away
911
866
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
912
444
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
913
444
                            goto foundBest;
914
444
                        }
915
422
                    }
916
422
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
917
718
                }
918
1.49k
            }
919
947
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
920
1.10k
foundBest:
921
1.10k
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
922
1.10k
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
923
1.10k
            wordsFound += 1;
924
1.10k
        }
925
926
        // We come here after having either found a word or not. We look ahead to the
927
        // next word. If it's not a dictionary word, we will combine it with the word we
928
        // just found (if there is one), but only if the preceding word does not exceed
929
        // the threshold.
930
        // The text iterator should now be positioned at the end of the word we found.
931
4.38k
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
932
            // if it is a dictionary word, do nothing. If it isn't, then if there is
933
            // no preceding word, or the non-word shares less than the minimum threshold
934
            // of characters with a dictionary word, then scan to resynchronize
935
3.81k
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
936
2.65k
                  && (cuWordLength == 0
937
2.52k
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
938
                // Look for a plausible word boundary
939
2.52k
                int32_t remaining = rangeEnd - (current+cuWordLength);
940
2.52k
                UChar32 pc;
941
2.52k
                UChar32 uc;
942
2.52k
                int32_t chars = 0;
943
6.88k
                for (;;) {
944
6.88k
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
945
6.88k
                    pc = utext_next32(text);
946
6.88k
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
947
6.88k
                    chars += pcSize;
948
6.88k
                    remaining -= pcSize;
949
6.88k
                    if (remaining <= 0) {
950
1.75k
                        break;
951
1.75k
                    }
952
5.12k
                    uc = utext_current32(text);
953
5.12k
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
954
                        // Maybe. See if it's in the dictionary.
955
3.48k
                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
956
3.48k
                        utext_setNativeIndex(text, current+cuWordLength+chars);
957
3.48k
                        if (num_candidates > 0) {
958
773
                            break;
959
773
                        }
960
3.48k
                    }
961
5.12k
                }
962
963
                // Bump the word count if there wasn't already one
964
2.52k
                if (cuWordLength <= 0) {
965
1.96k
                    wordsFound += 1;
966
1.96k
                }
967
968
                // Update the length with the passed-over characters
969
2.52k
                cuWordLength += chars;
970
2.52k
            }
971
1.28k
            else {
972
                // Back up to where we were for next iteration
973
1.28k
                utext_setNativeIndex(text, current+cuWordLength);
974
1.28k
            }
975
3.81k
        }
976
977
        // Never stop before a combining mark.
978
4.38k
        int32_t currPos;
979
4.38k
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
980
0
            utext_next32(text);
981
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
982
0
        }
983
984
        // Look ahead for possible suffixes if a dictionary word does not follow.
985
        // We do this in code rather than using a rule so that the heuristic
986
        // resynch continues to function. For example, one of the suffix characters
987
        // could be a typo in the middle of a word.
988
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
989
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
990
//                && fSuffixSet.contains(uc = utext_current32(text))) {
991
//                if (uc == KHMER_PAIYANNOI) {
992
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
993
//                        // Skip over previous end and PAIYANNOI
994
//                        utext_next32(text);
995
//                        utext_next32(text);
996
//                        wordLength += 1;            // Add PAIYANNOI to word
997
//                        uc = utext_current32(text);     // Fetch next character
998
//                    }
999
//                    else {
1000
//                        // Restore prior position
1001
//                        utext_next32(text);
1002
//                    }
1003
//                }
1004
//                if (uc == KHMER_MAIYAMOK) {
1005
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1006
//                        // Skip over previous end and MAIYAMOK
1007
//                        utext_next32(text);
1008
//                        utext_next32(text);
1009
//                        wordLength += 1;            // Add MAIYAMOK to word
1010
//                    }
1011
//                    else {
1012
//                        // Restore prior position
1013
//                        utext_next32(text);
1014
//                    }
1015
//                }
1016
//            }
1017
//            else {
1018
//                utext_setNativeIndex(text, current+wordLength);
1019
//            }
1020
//        }
1021
1022
        // Did we find a word on this iteration? If so, push it on the break stack
1023
4.38k
        if (cuWordLength > 0) {
1024
4.38k
            foundBreaks.push((current+cuWordLength), status);
1025
4.38k
        }
1026
4.38k
    }
1027
    
1028
    // Don't return a break for the end of the dictionary range if there is one there.
1029
2.32k
    if (foundBreaks.peeki() >= rangeEnd) {
1030
2.32k
        (void) foundBreaks.popi();
1031
2.32k
        wordsFound -= 1;
1032
2.32k
    }
1033
1034
2.32k
    return wordsFound;
1035
2.32k
}
1036
1037
#if !UCONFIG_NO_NORMALIZATION
1038
/*
1039
 ******************************************************************
1040
 * CjkBreakEngine
1041
 */
1042
static const uint32_t kuint32max = 0xFFFFFFFF;
1043
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1044
0
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1045
0
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1046
0
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1047
    // Korean dictionary only includes Hangul syllables
1048
0
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1049
0
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1050
0
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1051
0
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1052
0
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1053
1054
0
    if (U_SUCCESS(status)) {
1055
        // handle Korean and Japanese/Chinese using different dictionaries
1056
0
        if (type == kKorean) {
1057
0
            setCharacters(fHangulWordSet);
1058
0
        } else { //Chinese and Japanese
1059
0
            UnicodeSet cjSet;
1060
0
            cjSet.addAll(fHanWordSet);
1061
0
            cjSet.addAll(fKatakanaWordSet);
1062
0
            cjSet.addAll(fHiraganaWordSet);
1063
0
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1064
0
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1065
0
            setCharacters(cjSet);
1066
0
        }
1067
0
    }
1068
0
    UTRACE_EXIT_STATUS(status);
1069
0
}
1070
1071
0
CjkBreakEngine::~CjkBreakEngine(){
1072
0
    delete fDictionary;
1073
0
}
1074
1075
// The katakanaCost values below are based on the length frequencies of all
1076
// katakana phrases in the dictionary
1077
static const int32_t kMaxKatakanaLength = 8;
1078
static const int32_t kMaxKatakanaGroupLength = 20;
1079
static const uint32_t maxSnlp = 255;
1080
1081
0
static inline uint32_t getKatakanaCost(int32_t wordLength){
1082
    //TODO: fill array with actual values from dictionary!
1083
0
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1084
0
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1085
0
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1086
0
}
1087
1088
0
static inline bool isKatakana(UChar32 value) {
1089
0
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1090
0
            (value >= 0xFF66 && value <= 0xFF9f);
1091
0
}
1092
1093
1094
// Function for accessing internal utext flags.
1095
//   Replicates an internal UText function.
1096
1097
0
static inline int32_t utext_i32_flag(int32_t bitIndex) {
1098
0
    return (int32_t)1 << bitIndex;
1099
0
}
1100
1101
       
1102
/*
1103
 * @param text A UText representing the text
1104
 * @param rangeStart The start of the range of dictionary characters
1105
 * @param rangeEnd The end of the range of dictionary characters
1106
 * @param foundBreaks vector<int32> to receive the break positions
1107
 * @return The number of breaks found
1108
 */
1109
int32_t 
1110
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1111
        int32_t rangeStart,
1112
        int32_t rangeEnd,
1113
0
        UVector32 &foundBreaks ) const {
1114
0
    if (rangeStart >= rangeEnd) {
1115
0
        return 0;
1116
0
    }
1117
1118
    // UnicodeString version of input UText, NFKC normalized if necessary.
1119
0
    UnicodeString inString;
1120
1121
    // inputMap[inStringIndex] = corresponding native index from UText inText.
1122
    // If NULL then mapping is 1:1
1123
0
    LocalPointer<UVector32>     inputMap;
1124
1125
0
    UErrorCode     status      = U_ZERO_ERROR;
1126
1127
1128
    // if UText has the input string as one contiguous UTF-16 chunk
1129
0
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1130
0
         inText->chunkNativeStart <= rangeStart &&
1131
0
         inText->chunkNativeLimit >= rangeEnd   &&
1132
0
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1133
1134
        // Input UText is in one contiguous UTF-16 chunk.
1135
        // Use Read-only aliasing UnicodeString.
1136
0
        inString.setTo(FALSE,
1137
0
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
1138
0
                       rangeEnd - rangeStart);
1139
0
    } else {
1140
        // Copy the text from the original inText (UText) to inString (UnicodeString).
1141
        // Create a map from UnicodeString indices -> UText offsets.
1142
0
        utext_setNativeIndex(inText, rangeStart);
1143
0
        int32_t limit = rangeEnd;
1144
0
        U_ASSERT(limit <= utext_nativeLength(inText));
1145
0
        if (limit > utext_nativeLength(inText)) {
1146
0
            limit = (int32_t)utext_nativeLength(inText);
1147
0
        }
1148
0
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1149
0
        if (U_FAILURE(status)) {
1150
0
            return 0;
1151
0
        }
1152
0
        while (utext_getNativeIndex(inText) < limit) {
1153
0
            int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1154
0
            UChar32 c = utext_next32(inText);
1155
0
            U_ASSERT(c != U_SENTINEL);
1156
0
            inString.append(c);
1157
0
            while (inputMap->size() < inString.length()) {
1158
0
                inputMap->addElement(nativePosition, status);
1159
0
            }
1160
0
        }
1161
0
        inputMap->addElement(limit, status);
1162
0
    }
1163
1164
1165
0
    if (!nfkcNorm2->isNormalized(inString, status)) {
1166
0
        UnicodeString normalizedInput;
1167
        //  normalizedMap[normalizedInput position] ==  original UText position.
1168
0
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1169
0
        if (U_FAILURE(status)) {
1170
0
            return 0;
1171
0
        }
1172
        
1173
0
        UnicodeString fragment;
1174
0
        UnicodeString normalizedFragment;
1175
0
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1176
0
            fragment.remove();
1177
0
            int32_t fragmentStartI = srcI;
1178
0
            UChar32 c = inString.char32At(srcI);
1179
0
            for (;;) {
1180
0
                fragment.append(c);
1181
0
                srcI = inString.moveIndex32(srcI, 1);
1182
0
                if (srcI == inString.length()) {
1183
0
                    break;
1184
0
                }
1185
0
                c = inString.char32At(srcI);
1186
0
                if (nfkcNorm2->hasBoundaryBefore(c)) {
1187
0
                    break;
1188
0
                }
1189
0
            }
1190
0
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
1191
0
            normalizedInput.append(normalizedFragment);
1192
1193
            // Map every position in the normalized chunk to the start of the chunk
1194
            //   in the original input.
1195
0
            int32_t fragmentOriginalStart = inputMap.isValid() ?
1196
0
                    inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1197
0
            while (normalizedMap->size() < normalizedInput.length()) {
1198
0
                normalizedMap->addElement(fragmentOriginalStart, status);
1199
0
                if (U_FAILURE(status)) {
1200
0
                    break;
1201
0
                }
1202
0
            }
1203
0
        }
1204
0
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
1205
0
        int32_t nativeEnd = inputMap.isValid() ?
1206
0
                inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1207
0
        normalizedMap->addElement(nativeEnd, status);
1208
1209
0
        inputMap = std::move(normalizedMap);
1210
0
        inString = std::move(normalizedInput);
1211
0
    }
1212
1213
0
    int32_t numCodePts = inString.countChar32();
1214
0
    if (numCodePts != inString.length()) {
1215
        // There are supplementary characters in the input.
1216
        // The dictionary will produce boundary positions in terms of code point indexes,
1217
        //   not in terms of code unit string indexes.
1218
        // Use the inputMap mechanism to take care of this in addition to indexing differences
1219
        //    from normalization and/or UTF-8 input.
1220
0
        UBool hadExistingMap = inputMap.isValid();
1221
0
        if (!hadExistingMap) {
1222
0
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1223
0
            if (U_FAILURE(status)) {
1224
0
                return 0;
1225
0
            }
1226
0
        }
1227
0
        int32_t cpIdx = 0;
1228
0
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1229
0
            U_ASSERT(cuIdx >= cpIdx);
1230
0
            if (hadExistingMap) {
1231
0
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1232
0
            } else {
1233
0
                inputMap->addElement(cuIdx+rangeStart, status);
1234
0
            }
1235
0
            cpIdx++;
1236
0
            if (cuIdx == inString.length()) {
1237
0
               break;
1238
0
            }
1239
0
        }
1240
0
    }
1241
                
1242
    // bestSnlp[i] is the snlp of the best segmentation of the first i
1243
    // code points in the range to be matched.
1244
0
    UVector32 bestSnlp(numCodePts + 1, status);
1245
0
    bestSnlp.addElement(0, status);
1246
0
    for(int32_t i = 1; i <= numCodePts; i++) {
1247
0
        bestSnlp.addElement(kuint32max, status);
1248
0
    }
1249
1250
1251
    // prev[i] is the index of the last CJK code point in the previous word in 
1252
    // the best segmentation of the first i characters.
1253
0
    UVector32 prev(numCodePts + 1, status);
1254
0
    for(int32_t i = 0; i <= numCodePts; i++){
1255
0
        prev.addElement(-1, status);
1256
0
    }
1257
1258
0
    const int32_t maxWordSize = 20;
1259
0
    UVector32 values(numCodePts, status);
1260
0
    values.setSize(numCodePts);
1261
0
    UVector32 lengths(numCodePts, status);
1262
0
    lengths.setSize(numCodePts);
1263
1264
0
    UText fu = UTEXT_INITIALIZER;
1265
0
    utext_openUnicodeString(&fu, &inString, &status);
1266
1267
    // Dynamic programming to find the best segmentation.
1268
1269
    // In outer loop, i  is the code point index,
1270
    //                ix is the corresponding string (code unit) index.
1271
    //    They differ when the string contains supplementary characters.
1272
0
    int32_t ix = 0;
1273
0
    bool is_prev_katakana = false;
1274
0
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1275
0
        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1276
0
            continue;
1277
0
        }
1278
1279
0
        int32_t count;
1280
0
        utext_setNativeIndex(&fu, ix);
1281
0
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1282
0
                             NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1283
                             // Note: lengths is filled with code point lengths
1284
                             //       The NULL parameter is the ignored code unit lengths.
1285
1286
        // if there are no single character matches found in the dictionary 
1287
        // starting with this character, treat character as a 1-character word 
1288
        // with the highest value possible, i.e. the least likely to occur.
1289
        // Exclude Korean characters from this treatment, as they should be left
1290
        // together by default.
1291
0
        if ((count == 0 || lengths.elementAti(0) != 1) &&
1292
0
                !fHangulWordSet.contains(inString.char32At(ix))) {
1293
0
            values.setElementAt(maxSnlp, count);   // 255
1294
0
            lengths.setElementAt(1, count++);
1295
0
        }
1296
1297
0
        for (int32_t j = 0; j < count; j++) {
1298
0
            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1299
0
            int32_t ln_j_i = lengths.elementAti(j) + i;
1300
0
            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1301
0
                bestSnlp.setElementAt(newSnlp, ln_j_i);
1302
0
                prev.setElementAt(i, ln_j_i);
1303
0
            }
1304
0
        }
1305
1306
        // In Japanese,
1307
        // Katakana word in single character is pretty rare. So we apply
1308
        // the following heuristic to Katakana: any continuous run of Katakana
1309
        // characters is considered a candidate word with a default cost
1310
        // specified in the katakanaCost table according to its length.
1311
1312
0
        bool is_katakana = isKatakana(inString.char32At(ix));
1313
0
        int32_t katakanaRunLength = 1;
1314
0
        if (!is_prev_katakana && is_katakana) {
1315
0
            int32_t j = inString.moveIndex32(ix, 1);
1316
            // Find the end of the continuous run of Katakana characters
1317
0
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1318
0
                    isKatakana(inString.char32At(j))) {
1319
0
                j = inString.moveIndex32(j, 1);
1320
0
                katakanaRunLength++;
1321
0
            }
1322
0
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
1323
0
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1324
0
                if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1325
0
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1326
0
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1327
0
                }
1328
0
            }
1329
0
        }
1330
0
        is_prev_katakana = is_katakana;
1331
0
    }
1332
0
    utext_close(&fu);
1333
1334
    // Start pushing the optimal offset index into t_boundary (t for tentative).
1335
    // prev[numCodePts] is guaranteed to be meaningful.
1336
    // We'll first push in the reverse order, i.e.,
1337
    // t_boundary[0] = numCodePts, and afterwards do a swap.
1338
0
    UVector32 t_boundary(numCodePts+1, status);
1339
1340
0
    int32_t numBreaks = 0;
1341
    // No segmentation found, set boundary to end of range
1342
0
    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1343
0
        t_boundary.addElement(numCodePts, status);
1344
0
        numBreaks++;
1345
0
    } else {
1346
0
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1347
0
            t_boundary.addElement(i, status);
1348
0
            numBreaks++;
1349
0
        }
1350
0
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1351
0
    }
1352
1353
    // Add a break for the start of the dictionary range if there is not one
1354
    // there already.
1355
0
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1356
0
        t_boundary.addElement(0, status);
1357
0
        numBreaks++;
1358
0
    }
1359
1360
    // Now that we're done, convert positions in t_boundary[] (indices in 
1361
    // the normalized input string) back to indices in the original input UText
1362
    // while reversing t_boundary and pushing values to foundBreaks.
1363
0
    int32_t prevCPPos = -1;
1364
0
    int32_t prevUTextPos = -1;
1365
0
    for (int32_t i = numBreaks-1; i >= 0; i--) {
1366
0
        int32_t cpPos = t_boundary.elementAti(i);
1367
0
        U_ASSERT(cpPos > prevCPPos);
1368
0
        int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1369
0
        U_ASSERT(utextPos >= prevUTextPos);
1370
0
        if (utextPos > prevUTextPos) {
1371
            // Boundaries are added to foundBreaks output in ascending order.
1372
0
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1373
0
            foundBreaks.push(utextPos, status);
1374
0
        } else {
1375
            // Normalization expanded the input text, the dictionary found a boundary
1376
            // within the expansion, giving two boundaries with the same index in the
1377
            // original text. Ignore the second. See ticket #12918.
1378
0
            --numBreaks;
1379
0
        }
1380
0
        prevCPPos = cpPos;
1381
0
        prevUTextPos = utextPos;
1382
0
    }
1383
0
    (void)prevCPPos; // suppress compiler warnings about unused variable
1384
1385
    // inString goes out of scope
1386
    // inputMap goes out of scope
1387
0
    return numBreaks;
1388
0
}
1389
#endif
1390
1391
U_NAMESPACE_END
1392
1393
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1394