Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/common/dictbe.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/**
4
 *******************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 *******************************************************************************
8
 */
9
10
#include <utility>
11
12
#include "unicode/utypes.h"
13
14
#if !UCONFIG_NO_BREAK_ITERATION
15
16
#include "brkeng.h"
17
#include "dictbe.h"
18
#include "unicode/uniset.h"
19
#include "unicode/chariter.h"
20
#include "unicode/ubrk.h"
21
#include "utracimp.h"
22
#include "uvectr32.h"
23
#include "uvector.h"
24
#include "uassert.h"
25
#include "unicode/normlzr.h"
26
#include "cmemory.h"
27
#include "dictionarydata.h"
28
29
U_NAMESPACE_BEGIN
30
31
/*
32
 ******************************************************************
33
 */
34
35
0
DictionaryBreakEngine::DictionaryBreakEngine() {
36
0
}
37
38
0
DictionaryBreakEngine::~DictionaryBreakEngine() {
39
0
}
40
41
UBool
42
0
DictionaryBreakEngine::handles(UChar32 c) const {
43
0
    return fSet.contains(c);
44
0
}
45
46
int32_t
47
DictionaryBreakEngine::findBreaks( UText *text,
48
                                 int32_t startPos,
49
                                 int32_t endPos,
50
                                 UVector32 &foundBreaks,
51
0
                                 UErrorCode& status) const {
52
0
    if (U_FAILURE(status)) return 0;
53
0
    (void)startPos;            // TODO: remove this param?
54
0
    int32_t result = 0;
55
56
    // Find the span of characters included in the set.
57
    //   The span to break begins at the current position in the text, and
58
    //   extends towards the start or end of the text, depending on 'reverse'.
59
60
0
    int32_t start = (int32_t)utext_getNativeIndex(text);
61
0
    int32_t current;
62
0
    int32_t rangeStart;
63
0
    int32_t rangeEnd;
64
0
    UChar32 c = utext_current32(text);
65
0
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
66
0
        utext_next32(text);         // TODO:  recast loop for postincrement
67
0
        c = utext_current32(text);
68
0
    }
69
0
    rangeStart = start;
70
0
    rangeEnd = current;
71
0
    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);
72
0
    utext_setNativeIndex(text, current);
73
    
74
0
    return result;
75
0
}
76
77
void
78
0
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
79
0
    fSet = set;
80
    // Compact for caching
81
0
    fSet.compact();
82
0
}
83
84
/*
85
 ******************************************************************
86
 * PossibleWord
87
 */
88
89
// Helper class for improving readability of the Thai/Lao/Khmer word break
90
// algorithm. The implementation is completely inline.
91
92
// List size, limited by the maximum number of words in the dictionary
93
// that form a nested sequence.
94
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
95
96
class PossibleWord {
97
private:
98
    // list of word candidate lengths, in increasing length order
99
    // TODO: bytes would be sufficient for word lengths.
100
    int32_t   count;      // Count of candidates
101
    int32_t   prefix;     // The longest match with a dictionary word
102
    int32_t   offset;     // Offset in the text of these candidates
103
    int32_t   mark;       // The preferred candidate's offset
104
    int32_t   current;    // The candidate we're currently looking at
105
    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
106
    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
107
108
public:
109
0
    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {}
110
0
    ~PossibleWord() {}
111
  
112
    // Fill the list of candidates if needed, select the longest, and return the number found
113
    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
114
  
115
    // Select the currently marked candidate, point after it in the text, and invalidate self
116
    int32_t   acceptMarked( UText *text );
117
  
118
    // Back up from the current candidate to the next shorter one; return TRUE if that exists
119
    // and point the text after it
120
    UBool     backUp( UText *text );
121
  
122
    // Return the longest prefix this candidate location shares with a dictionary word
123
    // Return value is in code points.
124
0
    int32_t   longestPrefix() { return prefix; }
125
  
126
    // Mark the current candidate as the one we like
127
0
    void      markCurrent() { mark = current; }
128
    
129
    // Get length in code points of the marked word.
130
0
    int32_t   markedCPLength() { return cpLengths[mark]; }
131
};
132
133
134
0
int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
135
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
136
0
    int32_t start = (int32_t)utext_getNativeIndex(text);
137
0
    if (start != offset) {
138
0
        offset = start;
139
0
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
140
        // Dictionary leaves text after longest prefix, not longest word. Back up.
141
0
        if (count <= 0) {
142
0
            utext_setNativeIndex(text, start);
143
0
        }
144
0
    }
145
0
    if (count > 0) {
146
0
        utext_setNativeIndex(text, start+cuLengths[count-1]);
147
0
    }
148
0
    current = count-1;
149
0
    mark = current;
150
0
    return count;
151
0
}
152
153
int32_t
154
0
PossibleWord::acceptMarked( UText *text ) {
155
0
    utext_setNativeIndex(text, offset + cuLengths[mark]);
156
0
    return cuLengths[mark];
157
0
}
158
159
160
UBool
161
0
PossibleWord::backUp( UText *text ) {
162
0
    if (current > 0) {
163
0
        utext_setNativeIndex(text, offset + cuLengths[--current]);
164
0
        return TRUE;
165
0
    }
166
0
    return FALSE;
167
0
}
168
169
/*
170
 ******************************************************************
171
 * ThaiBreakEngine
172
 */
173
174
// How many words in a row are "good enough"?
175
static const int32_t THAI_LOOKAHEAD = 3;
176
177
// Will not combine a non-word with a preceding dictionary word longer than this
178
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
179
180
// Will not combine a non-word that shares at least this much prefix with a
181
// dictionary word, with a preceding word
182
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
183
184
// Elision character
185
static const int32_t THAI_PAIYANNOI = 0x0E2F;
186
187
// Repeat character
188
static const int32_t THAI_MAIYAMOK = 0x0E46;
189
190
// Minimum word size
191
static const int32_t THAI_MIN_WORD = 2;
192
193
// Minimum number of characters for two words
194
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
195
196
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
197
0
    : DictionaryBreakEngine(),
198
0
      fDictionary(adoptDictionary)
199
0
{
200
0
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
201
0
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");
202
0
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
203
0
    if (U_SUCCESS(status)) {
204
0
        setCharacters(fThaiWordSet);
205
0
    }
206
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
207
0
    fMarkSet.add(0x0020);
208
0
    fEndWordSet = fThaiWordSet;
209
0
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
210
0
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
211
0
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
212
0
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
213
0
    fSuffixSet.add(THAI_PAIYANNOI);
214
0
    fSuffixSet.add(THAI_MAIYAMOK);
215
216
    // Compact for caching.
217
0
    fMarkSet.compact();
218
0
    fEndWordSet.compact();
219
0
    fBeginWordSet.compact();
220
0
    fSuffixSet.compact();
221
0
    UTRACE_EXIT_STATUS(status);
222
0
}
223
224
0
ThaiBreakEngine::~ThaiBreakEngine() {
225
0
    delete fDictionary;
226
0
}
227
228
int32_t
229
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
230
                                                int32_t rangeStart,
231
                                                int32_t rangeEnd,
232
                                                UVector32 &foundBreaks,
233
0
                                                UErrorCode& status) const {
234
0
    if (U_FAILURE(status)) return 0;
235
0
    utext_setNativeIndex(text, rangeStart);
236
0
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
237
0
    if (utext_getNativeIndex(text) >= rangeEnd) {
238
0
        return 0;       // Not enough characters for two words
239
0
    }
240
0
    utext_setNativeIndex(text, rangeStart);
241
242
243
0
    uint32_t wordsFound = 0;
244
0
    int32_t cpWordLength = 0;    // Word Length in Code Points.
245
0
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
246
0
    int32_t current;
247
0
    PossibleWord words[THAI_LOOKAHEAD];
248
    
249
0
    utext_setNativeIndex(text, rangeStart);
250
    
251
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
252
0
        cpWordLength = 0;
253
0
        cuWordLength = 0;
254
255
        // Look for candidate words at the current position
256
0
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
257
        
258
        // If we found exactly one, use that
259
0
        if (candidates == 1) {
260
0
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
261
0
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
262
0
            wordsFound += 1;
263
0
        }
264
        // If there was more than one, see which one can take us forward the most words
265
0
        else if (candidates > 1) {
266
            // If we're already at the end of the range, we're done
267
0
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
268
0
                goto foundBest;
269
0
            }
270
0
            do {
271
0
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
272
                    // Followed by another dictionary word; mark first word as a good candidate
273
0
                    words[wordsFound%THAI_LOOKAHEAD].markCurrent();
274
                    
275
                    // If we're already at the end of the range, we're done
276
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
277
0
                        goto foundBest;
278
0
                    }
279
                    
280
                    // See if any of the possible second words is followed by a third word
281
0
                    do {
282
                        // If we find a third word, stop right away
283
0
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
284
0
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
285
0
                            goto foundBest;
286
0
                        }
287
0
                    }
288
0
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
289
0
                }
290
0
            }
291
0
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
292
0
foundBest:
293
            // Set UText position to after the accepted word.
294
0
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
295
0
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
296
0
            wordsFound += 1;
297
0
        }
298
        
299
        // We come here after having either found a word or not. We look ahead to the
300
        // next word. If it's not a dictionary word, we will combine it with the word we
301
        // just found (if there is one), but only if the preceding word does not exceed
302
        // the threshold.
303
        // The text iterator should now be positioned at the end of the word we found.
304
        
305
0
        UChar32 uc = 0;
306
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
307
            // if it is a dictionary word, do nothing. If it isn't, then if there is
308
            // no preceding word, or the non-word shares less than the minimum threshold
309
            // of characters with a dictionary word, then scan to resynchronize
310
0
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
311
0
                  && (cuWordLength == 0
312
0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
313
                // Look for a plausible word boundary
314
0
                int32_t remaining = rangeEnd - (current+cuWordLength);
315
0
                UChar32 pc;
316
0
                int32_t chars = 0;
317
0
                for (;;) {
318
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
319
0
                    pc = utext_next32(text);
320
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
321
0
                    chars += pcSize;
322
0
                    remaining -= pcSize;
323
0
                    if (remaining <= 0) {
324
0
                        break;
325
0
                    }
326
0
                    uc = utext_current32(text);
327
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
328
                        // Maybe. See if it's in the dictionary.
329
                        // NOTE: In the original Apple code, checked that the next
330
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
331
                        // checking the dictionary. That is just a performance filter,
332
                        // but it's not clear it's faster than checking the trie.
333
0
                        int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
334
0
                        utext_setNativeIndex(text, current + cuWordLength + chars);
335
0
                        if (num_candidates > 0) {
336
0
                            break;
337
0
                        }
338
0
                    }
339
0
                }
340
                
341
                // Bump the word count if there wasn't already one
342
0
                if (cuWordLength <= 0) {
343
0
                    wordsFound += 1;
344
0
                }
345
                
346
                // Update the length with the passed-over characters
347
0
                cuWordLength += chars;
348
0
            }
349
0
            else {
350
                // Back up to where we were for next iteration
351
0
                utext_setNativeIndex(text, current+cuWordLength);
352
0
            }
353
0
        }
354
        
355
        // Never stop before a combining mark.
356
0
        int32_t currPos;
357
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
358
0
            utext_next32(text);
359
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
360
0
        }
361
        
362
        // Look ahead for possible suffixes if a dictionary word does not follow.
363
        // We do this in code rather than using a rule so that the heuristic
364
        // resynch continues to function. For example, one of the suffix characters
365
        // could be a typo in the middle of a word.
366
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
367
0
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
368
0
                && fSuffixSet.contains(uc = utext_current32(text))) {
369
0
                if (uc == THAI_PAIYANNOI) {
370
0
                    if (!fSuffixSet.contains(utext_previous32(text))) {
371
                        // Skip over previous end and PAIYANNOI
372
0
                        utext_next32(text);
373
0
                        int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
374
0
                        utext_next32(text);
375
0
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
376
0
                        uc = utext_current32(text);     // Fetch next character
377
0
                    }
378
0
                    else {
379
                        // Restore prior position
380
0
                        utext_next32(text);
381
0
                    }
382
0
                }
383
0
                if (uc == THAI_MAIYAMOK) {
384
0
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
385
                        // Skip over previous end and MAIYAMOK
386
0
                        utext_next32(text);
387
0
                        int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
388
0
                        utext_next32(text);
389
0
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
390
0
                    }
391
0
                    else {
392
                        // Restore prior position
393
0
                        utext_next32(text);
394
0
                    }
395
0
                }
396
0
            }
397
0
            else {
398
0
                utext_setNativeIndex(text, current+cuWordLength);
399
0
            }
400
0
        }
401
402
        // Did we find a word on this iteration? If so, push it on the break stack
403
0
        if (cuWordLength > 0) {
404
0
            foundBreaks.push((current+cuWordLength), status);
405
0
        }
406
0
    }
407
408
    // Don't return a break for the end of the dictionary range if there is one there.
409
0
    if (foundBreaks.peeki() >= rangeEnd) {
410
0
        (void) foundBreaks.popi();
411
0
        wordsFound -= 1;
412
0
    }
413
414
0
    return wordsFound;
415
0
}
416
417
/*
418
 ******************************************************************
419
 * LaoBreakEngine
420
 */
421
422
// How many words in a row are "good enough"?
423
static const int32_t LAO_LOOKAHEAD = 3;
424
425
// Will not combine a non-word with a preceding dictionary word longer than this
426
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
427
428
// Will not combine a non-word that shares at least this much prefix with a
429
// dictionary word, with a preceding word
430
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
431
432
// Minimum word size
433
static const int32_t LAO_MIN_WORD = 2;
434
435
// Minimum number of characters for two words
436
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
437
438
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
439
0
    : DictionaryBreakEngine(),
440
0
      fDictionary(adoptDictionary)
441
0
{
442
0
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
443
0
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");
444
0
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
445
0
    if (U_SUCCESS(status)) {
446
0
        setCharacters(fLaoWordSet);
447
0
    }
448
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
449
0
    fMarkSet.add(0x0020);
450
0
    fEndWordSet = fLaoWordSet;
451
0
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
452
0
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
453
0
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
454
0
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
455
456
    // Compact for caching.
457
0
    fMarkSet.compact();
458
0
    fEndWordSet.compact();
459
0
    fBeginWordSet.compact();
460
0
    UTRACE_EXIT_STATUS(status);
461
0
}
462
463
0
LaoBreakEngine::~LaoBreakEngine() {
464
0
    delete fDictionary;
465
0
}
466
467
int32_t
468
LaoBreakEngine::divideUpDictionaryRange( UText *text,
469
                                                int32_t rangeStart,
470
                                                int32_t rangeEnd,
471
                                                UVector32 &foundBreaks,
472
0
                                                UErrorCode& status) const {
473
0
    if (U_FAILURE(status)) return 0;
474
0
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
475
0
        return 0;       // Not enough characters for two words
476
0
    }
477
478
0
    uint32_t wordsFound = 0;
479
0
    int32_t cpWordLength = 0;
480
0
    int32_t cuWordLength = 0;
481
0
    int32_t current;
482
0
    PossibleWord words[LAO_LOOKAHEAD];
483
484
0
    utext_setNativeIndex(text, rangeStart);
485
486
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
487
0
        cuWordLength = 0;
488
0
        cpWordLength = 0;
489
490
        // Look for candidate words at the current position
491
0
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
492
        
493
        // If we found exactly one, use that
494
0
        if (candidates == 1) {
495
0
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
496
0
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
497
0
            wordsFound += 1;
498
0
        }
499
        // If there was more than one, see which one can take us forward the most words
500
0
        else if (candidates > 1) {
501
            // If we're already at the end of the range, we're done
502
0
            if (utext_getNativeIndex(text) >= rangeEnd) {
503
0
                goto foundBest;
504
0
            }
505
0
            do {
506
0
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
507
                    // Followed by another dictionary word; mark first word as a good candidate
508
0
                    words[wordsFound%LAO_LOOKAHEAD].markCurrent();
509
                    
510
                    // If we're already at the end of the range, we're done
511
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
512
0
                        goto foundBest;
513
0
                    }
514
                    
515
                    // See if any of the possible second words is followed by a third word
516
0
                    do {
517
                        // If we find a third word, stop right away
518
0
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
519
0
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
520
0
                            goto foundBest;
521
0
                        }
522
0
                    }
523
0
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
524
0
                }
525
0
            }
526
0
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
527
0
foundBest:
528
0
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
529
0
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
530
0
            wordsFound += 1;
531
0
        }
532
        
533
        // We come here after having either found a word or not. We look ahead to the
534
        // next word. If it's not a dictionary word, we will combine it with the word we
535
        // just found (if there is one), but only if the preceding word does not exceed
536
        // the threshold.
537
        // The text iterator should now be positioned at the end of the word we found.
538
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
539
            // if it is a dictionary word, do nothing. If it isn't, then if there is
540
            // no preceding word, or the non-word shares less than the minimum threshold
541
            // of characters with a dictionary word, then scan to resynchronize
542
0
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
543
0
                  && (cuWordLength == 0
544
0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
545
                // Look for a plausible word boundary
546
0
                int32_t remaining = rangeEnd - (current + cuWordLength);
547
0
                UChar32 pc;
548
0
                UChar32 uc;
549
0
                int32_t chars = 0;
550
0
                for (;;) {
551
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
552
0
                    pc = utext_next32(text);
553
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
554
0
                    chars += pcSize;
555
0
                    remaining -= pcSize;
556
0
                    if (remaining <= 0) {
557
0
                        break;
558
0
                    }
559
0
                    uc = utext_current32(text);
560
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
561
                        // Maybe. See if it's in the dictionary.
562
                        // TODO: this looks iffy; compare with old code.
563
0
                        int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
564
0
                        utext_setNativeIndex(text, current + cuWordLength + chars);
565
0
                        if (num_candidates > 0) {
566
0
                            break;
567
0
                        }
568
0
                    }
569
0
                }
570
                
571
                // Bump the word count if there wasn't already one
572
0
                if (cuWordLength <= 0) {
573
0
                    wordsFound += 1;
574
0
                }
575
                
576
                // Update the length with the passed-over characters
577
0
                cuWordLength += chars;
578
0
            }
579
0
            else {
580
                // Back up to where we were for next iteration
581
0
                utext_setNativeIndex(text, current + cuWordLength);
582
0
            }
583
0
        }
584
        
585
        // Never stop before a combining mark.
586
0
        int32_t currPos;
587
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
588
0
            utext_next32(text);
589
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
590
0
        }
591
        
592
        // Look ahead for possible suffixes if a dictionary word does not follow.
593
        // We do this in code rather than using a rule so that the heuristic
594
        // resynch continues to function. For example, one of the suffix characters
595
        // could be a typo in the middle of a word.
596
        // NOT CURRENTLY APPLICABLE TO LAO
597
598
        // Did we find a word on this iteration? If so, push it on the break stack
599
0
        if (cuWordLength > 0) {
600
0
            foundBreaks.push((current+cuWordLength), status);
601
0
        }
602
0
    }
603
604
    // Don't return a break for the end of the dictionary range if there is one there.
605
0
    if (foundBreaks.peeki() >= rangeEnd) {
606
0
        (void) foundBreaks.popi();
607
0
        wordsFound -= 1;
608
0
    }
609
610
0
    return wordsFound;
611
0
}
612
613
/*
614
 ******************************************************************
615
 * BurmeseBreakEngine
616
 */
617
618
// How many words in a row are "good enough"?
619
static const int32_t BURMESE_LOOKAHEAD = 3;
620
621
// Will not combine a non-word with a preceding dictionary word longer than this
622
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
623
624
// Will not combine a non-word that shares at least this much prefix with a
625
// dictionary word, with a preceding word
626
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
627
628
// Minimum word size
629
static const int32_t BURMESE_MIN_WORD = 2;
630
631
// Minimum number of characters for two words
632
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
633
634
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
635
0
    : DictionaryBreakEngine(),
636
0
      fDictionary(adoptDictionary)
637
0
{
638
0
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
639
0
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");
640
0
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
641
0
    if (U_SUCCESS(status)) {
642
0
        setCharacters(fBurmeseWordSet);
643
0
    }
644
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
645
0
    fMarkSet.add(0x0020);
646
0
    fEndWordSet = fBurmeseWordSet;
647
0
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
648
649
    // Compact for caching.
650
0
    fMarkSet.compact();
651
0
    fEndWordSet.compact();
652
0
    fBeginWordSet.compact();
653
0
    UTRACE_EXIT_STATUS(status);
654
0
}
655
656
0
BurmeseBreakEngine::~BurmeseBreakEngine() {
657
0
    delete fDictionary;
658
0
}
659
660
int32_t
661
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
662
                                                int32_t rangeStart,
663
                                                int32_t rangeEnd,
664
                                                UVector32 &foundBreaks,
665
0
                                                UErrorCode& status ) const {
666
0
    if (U_FAILURE(status)) return 0;
667
0
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
668
0
        return 0;       // Not enough characters for two words
669
0
    }
670
671
0
    uint32_t wordsFound = 0;
672
0
    int32_t cpWordLength = 0;
673
0
    int32_t cuWordLength = 0;
674
0
    int32_t current;
675
0
    PossibleWord words[BURMESE_LOOKAHEAD];
676
677
0
    utext_setNativeIndex(text, rangeStart);
678
679
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
680
0
        cuWordLength = 0;
681
0
        cpWordLength = 0;
682
683
        // Look for candidate words at the current position
684
0
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
685
        
686
        // If we found exactly one, use that
687
0
        if (candidates == 1) {
688
0
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
689
0
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
690
0
            wordsFound += 1;
691
0
        }
692
        // If there was more than one, see which one can take us forward the most words
693
0
        else if (candidates > 1) {
694
            // If we're already at the end of the range, we're done
695
0
            if (utext_getNativeIndex(text) >= rangeEnd) {
696
0
                goto foundBest;
697
0
            }
698
0
            do {
699
0
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
700
                    // Followed by another dictionary word; mark first word as a good candidate
701
0
                    words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
702
                    
703
                    // If we're already at the end of the range, we're done
704
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
705
0
                        goto foundBest;
706
0
                    }
707
                    
708
                    // See if any of the possible second words is followed by a third word
709
0
                    do {
710
                        // If we find a third word, stop right away
711
0
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
712
0
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
713
0
                            goto foundBest;
714
0
                        }
715
0
                    }
716
0
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
717
0
                }
718
0
            }
719
0
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
720
0
foundBest:
721
0
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
722
0
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
723
0
            wordsFound += 1;
724
0
        }
725
        
726
        // We come here after having either found a word or not. We look ahead to the
727
        // next word. If it's not a dictionary word, we will combine it with the word we
728
        // just found (if there is one), but only if the preceding word does not exceed
729
        // the threshold.
730
        // The text iterator should now be positioned at the end of the word we found.
731
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
732
            // if it is a dictionary word, do nothing. If it isn't, then if there is
733
            // no preceding word, or the non-word shares less than the minimum threshold
734
            // of characters with a dictionary word, then scan to resynchronize
735
0
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
736
0
                  && (cuWordLength == 0
737
0
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
738
                // Look for a plausible word boundary
739
0
                int32_t remaining = rangeEnd - (current + cuWordLength);
740
0
                UChar32 pc;
741
0
                UChar32 uc;
742
0
                int32_t chars = 0;
743
0
                for (;;) {
744
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
745
0
                    pc = utext_next32(text);
746
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
747
0
                    chars += pcSize;
748
0
                    remaining -= pcSize;
749
0
                    if (remaining <= 0) {
750
0
                        break;
751
0
                    }
752
0
                    uc = utext_current32(text);
753
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
754
                        // Maybe. See if it's in the dictionary.
755
                        // TODO: this looks iffy; compare with old code.
756
0
                        int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
757
0
                        utext_setNativeIndex(text, current + cuWordLength + chars);
758
0
                        if (num_candidates > 0) {
759
0
                            break;
760
0
                        }
761
0
                    }
762
0
                }
763
                
764
                // Bump the word count if there wasn't already one
765
0
                if (cuWordLength <= 0) {
766
0
                    wordsFound += 1;
767
0
                }
768
                
769
                // Update the length with the passed-over characters
770
0
                cuWordLength += chars;
771
0
            }
772
0
            else {
773
                // Back up to where we were for next iteration
774
0
                utext_setNativeIndex(text, current + cuWordLength);
775
0
            }
776
0
        }
777
        
778
        // Never stop before a combining mark.
779
0
        int32_t currPos;
780
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
781
0
            utext_next32(text);
782
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
783
0
        }
784
        
785
        // Look ahead for possible suffixes if a dictionary word does not follow.
786
        // We do this in code rather than using a rule so that the heuristic
787
        // resynch continues to function. For example, one of the suffix characters
788
        // could be a typo in the middle of a word.
789
        // NOT CURRENTLY APPLICABLE TO BURMESE
790
791
        // Did we find a word on this iteration? If so, push it on the break stack
792
0
        if (cuWordLength > 0) {
793
0
            foundBreaks.push((current+cuWordLength), status);
794
0
        }
795
0
    }
796
797
    // Don't return a break for the end of the dictionary range if there is one there.
798
0
    if (foundBreaks.peeki() >= rangeEnd) {
799
0
        (void) foundBreaks.popi();
800
0
        wordsFound -= 1;
801
0
    }
802
803
0
    return wordsFound;
804
0
}
805
806
/*
807
 ******************************************************************
808
 * KhmerBreakEngine
809
 */
810
811
// How many words in a row are "good enough"?
812
static const int32_t KHMER_LOOKAHEAD = 3;
813
814
// Will not combine a non-word with a preceding dictionary word longer than this
815
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
816
817
// Will not combine a non-word that shares at least this much prefix with a
818
// dictionary word, with a preceding word
819
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
820
821
// Minimum word size
822
static const int32_t KHMER_MIN_WORD = 2;
823
824
// Minimum number of characters for two words
825
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
826
827
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
828
0
    : DictionaryBreakEngine(),
829
0
      fDictionary(adoptDictionary)
830
0
{
831
0
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
832
0
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");
833
0
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
834
0
    if (U_SUCCESS(status)) {
835
0
        setCharacters(fKhmerWordSet);
836
0
    }
837
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
838
0
    fMarkSet.add(0x0020);
839
0
    fEndWordSet = fKhmerWordSet;
840
0
    fBeginWordSet.add(0x1780, 0x17B3);
841
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
842
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
843
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
844
0
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
845
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
846
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
847
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
848
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
849
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
850
//    fSuffixSet.add(THAI_PAIYANNOI);
851
//    fSuffixSet.add(THAI_MAIYAMOK);
852
853
    // Compact for caching.
854
0
    fMarkSet.compact();
855
0
    fEndWordSet.compact();
856
0
    fBeginWordSet.compact();
857
//    fSuffixSet.compact();
858
0
    UTRACE_EXIT_STATUS(status);
859
0
}
860
861
0
KhmerBreakEngine::~KhmerBreakEngine() {
862
0
    delete fDictionary;
863
0
}
864
865
int32_t
866
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
867
                                                int32_t rangeStart,
868
                                                int32_t rangeEnd,
869
                                                UVector32 &foundBreaks,
870
0
                                                UErrorCode& status ) const {
871
0
    if (U_FAILURE(status)) return 0;
872
0
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
873
0
        return 0;       // Not enough characters for two words
874
0
    }
875
876
0
    uint32_t wordsFound = 0;
877
0
    int32_t cpWordLength = 0;
878
0
    int32_t cuWordLength = 0;
879
0
    int32_t current;
880
0
    PossibleWord words[KHMER_LOOKAHEAD];
881
882
0
    utext_setNativeIndex(text, rangeStart);
883
884
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
885
0
        cuWordLength = 0;
886
0
        cpWordLength = 0;
887
888
        // Look for candidate words at the current position
889
0
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
890
891
        // If we found exactly one, use that
892
0
        if (candidates == 1) {
893
0
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
894
0
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
895
0
            wordsFound += 1;
896
0
        }
897
898
        // If there was more than one, see which one can take us forward the most words
899
0
        else if (candidates > 1) {
900
            // If we're already at the end of the range, we're done
901
0
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
902
0
                goto foundBest;
903
0
            }
904
0
            do {
905
0
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
906
                    // Followed by another dictionary word; mark first word as a good candidate
907
0
                    words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
908
909
                    // If we're already at the end of the range, we're done
910
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
911
0
                        goto foundBest;
912
0
                    }
913
914
                    // See if any of the possible second words is followed by a third word
915
0
                    do {
916
                        // If we find a third word, stop right away
917
0
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
918
0
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
919
0
                            goto foundBest;
920
0
                        }
921
0
                    }
922
0
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
923
0
                }
924
0
            }
925
0
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
926
0
foundBest:
927
0
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
928
0
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
929
0
            wordsFound += 1;
930
0
        }
931
932
        // We come here after having either found a word or not. We look ahead to the
933
        // next word. If it's not a dictionary word, we will combine it with the word we
934
        // just found (if there is one), but only if the preceding word does not exceed
935
        // the threshold.
936
        // The text iterator should now be positioned at the end of the word we found.
937
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
938
            // if it is a dictionary word, do nothing. If it isn't, then if there is
939
            // no preceding word, or the non-word shares less than the minimum threshold
940
            // of characters with a dictionary word, then scan to resynchronize
941
0
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
942
0
                  && (cuWordLength == 0
943
0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
944
                // Look for a plausible word boundary
945
0
                int32_t remaining = rangeEnd - (current+cuWordLength);
946
0
                UChar32 pc;
947
0
                UChar32 uc;
948
0
                int32_t chars = 0;
949
0
                for (;;) {
950
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
951
0
                    pc = utext_next32(text);
952
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
953
0
                    chars += pcSize;
954
0
                    remaining -= pcSize;
955
0
                    if (remaining <= 0) {
956
0
                        break;
957
0
                    }
958
0
                    uc = utext_current32(text);
959
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
960
                        // Maybe. See if it's in the dictionary.
961
0
                        int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
962
0
                        utext_setNativeIndex(text, current+cuWordLength+chars);
963
0
                        if (num_candidates > 0) {
964
0
                            break;
965
0
                        }
966
0
                    }
967
0
                }
968
969
                // Bump the word count if there wasn't already one
970
0
                if (cuWordLength <= 0) {
971
0
                    wordsFound += 1;
972
0
                }
973
974
                // Update the length with the passed-over characters
975
0
                cuWordLength += chars;
976
0
            }
977
0
            else {
978
                // Back up to where we were for next iteration
979
0
                utext_setNativeIndex(text, current+cuWordLength);
980
0
            }
981
0
        }
982
983
        // Never stop before a combining mark.
984
0
        int32_t currPos;
985
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
986
0
            utext_next32(text);
987
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
988
0
        }
989
990
        // Look ahead for possible suffixes if a dictionary word does not follow.
991
        // We do this in code rather than using a rule so that the heuristic
992
        // resynch continues to function. For example, one of the suffix characters
993
        // could be a typo in the middle of a word.
994
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
995
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
996
//                && fSuffixSet.contains(uc = utext_current32(text))) {
997
//                if (uc == KHMER_PAIYANNOI) {
998
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
999
//                        // Skip over previous end and PAIYANNOI
1000
//                        utext_next32(text);
1001
//                        utext_next32(text);
1002
//                        wordLength += 1;            // Add PAIYANNOI to word
1003
//                        uc = utext_current32(text);     // Fetch next character
1004
//                    }
1005
//                    else {
1006
//                        // Restore prior position
1007
//                        utext_next32(text);
1008
//                    }
1009
//                }
1010
//                if (uc == KHMER_MAIYAMOK) {
1011
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1012
//                        // Skip over previous end and MAIYAMOK
1013
//                        utext_next32(text);
1014
//                        utext_next32(text);
1015
//                        wordLength += 1;            // Add MAIYAMOK to word
1016
//                    }
1017
//                    else {
1018
//                        // Restore prior position
1019
//                        utext_next32(text);
1020
//                    }
1021
//                }
1022
//            }
1023
//            else {
1024
//                utext_setNativeIndex(text, current+wordLength);
1025
//            }
1026
//        }
1027
1028
        // Did we find a word on this iteration? If so, push it on the break stack
1029
0
        if (cuWordLength > 0) {
1030
0
            foundBreaks.push((current+cuWordLength), status);
1031
0
        }
1032
0
    }
1033
    
1034
    // Don't return a break for the end of the dictionary range if there is one there.
1035
0
    if (foundBreaks.peeki() >= rangeEnd) {
1036
0
        (void) foundBreaks.popi();
1037
0
        wordsFound -= 1;
1038
0
    }
1039
1040
0
    return wordsFound;
1041
0
}
1042
1043
#if !UCONFIG_NO_NORMALIZATION
1044
/*
1045
 ******************************************************************
1046
 * CjkBreakEngine
1047
 */
1048
static const uint32_t kuint32max = 0xFFFFFFFF;
1049
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1050
0
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1051
0
    UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);
1052
0
    UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");
1053
    // Korean dictionary only includes Hangul syllables
1054
0
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1055
0
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1056
0
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1057
0
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1058
0
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1059
1060
0
    if (U_SUCCESS(status)) {
1061
        // handle Korean and Japanese/Chinese using different dictionaries
1062
0
        if (type == kKorean) {
1063
0
            setCharacters(fHangulWordSet);
1064
0
        } else { //Chinese and Japanese
1065
0
            UnicodeSet cjSet;
1066
0
            cjSet.addAll(fHanWordSet);
1067
0
            cjSet.addAll(fKatakanaWordSet);
1068
0
            cjSet.addAll(fHiraganaWordSet);
1069
0
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1070
0
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1071
0
            setCharacters(cjSet);
1072
0
        }
1073
0
    }
1074
0
    UTRACE_EXIT_STATUS(status);
1075
0
}
1076
1077
0
CjkBreakEngine::~CjkBreakEngine(){
1078
0
    delete fDictionary;
1079
0
}
1080
1081
// The katakanaCost values below are based on the length frequencies of all
1082
// katakana phrases in the dictionary
1083
static const int32_t kMaxKatakanaLength = 8;
1084
static const int32_t kMaxKatakanaGroupLength = 20;
1085
static const uint32_t maxSnlp = 255;
1086
1087
0
static inline uint32_t getKatakanaCost(int32_t wordLength){
1088
    //TODO: fill array with actual values from dictionary!
1089
0
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1090
0
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1091
0
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1092
0
}
1093
1094
0
static inline bool isKatakana(UChar32 value) {
1095
0
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1096
0
            (value >= 0xFF66 && value <= 0xFF9f);
1097
0
}
1098
1099
1100
// Function for accessing internal utext flags.
1101
//   Replicates an internal UText function.
1102
1103
0
static inline int32_t utext_i32_flag(int32_t bitIndex) {
1104
0
    return (int32_t)1 << bitIndex;
1105
0
}
1106
1107
       
1108
/*
1109
 * @param text A UText representing the text
1110
 * @param rangeStart The start of the range of dictionary characters
1111
 * @param rangeEnd The end of the range of dictionary characters
1112
 * @param foundBreaks vector<int32> to receive the break positions
1113
 * @return The number of breaks found
1114
 */
1115
int32_t 
1116
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1117
        int32_t rangeStart,
1118
        int32_t rangeEnd,
1119
        UVector32 &foundBreaks,
1120
0
        UErrorCode& status) const {
1121
0
    if (U_FAILURE(status)) return 0;
1122
0
    if (rangeStart >= rangeEnd) {
1123
0
        return 0;
1124
0
    }
1125
1126
    // UnicodeString version of input UText, NFKC normalized if necessary.
1127
0
    UnicodeString inString;
1128
1129
    // inputMap[inStringIndex] = corresponding native index from UText inText.
1130
    // If NULL then mapping is 1:1
1131
0
    LocalPointer<UVector32>     inputMap;
1132
1133
    // if UText has the input string as one contiguous UTF-16 chunk
1134
0
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1135
0
         inText->chunkNativeStart <= rangeStart &&
1136
0
         inText->chunkNativeLimit >= rangeEnd   &&
1137
0
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1138
1139
        // Input UText is in one contiguous UTF-16 chunk.
1140
        // Use Read-only aliasing UnicodeString.
1141
0
        inString.setTo(FALSE,
1142
0
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
1143
0
                       rangeEnd - rangeStart);
1144
0
    } else {
1145
        // Copy the text from the original inText (UText) to inString (UnicodeString).
1146
        // Create a map from UnicodeString indices -> UText offsets.
1147
0
        utext_setNativeIndex(inText, rangeStart);
1148
0
        int32_t limit = rangeEnd;
1149
0
        U_ASSERT(limit <= utext_nativeLength(inText));
1150
0
        if (limit > utext_nativeLength(inText)) {
1151
0
            limit = (int32_t)utext_nativeLength(inText);
1152
0
        }
1153
0
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1154
0
        if (U_FAILURE(status)) {
1155
0
            return 0;
1156
0
        }
1157
0
        while (utext_getNativeIndex(inText) < limit) {
1158
0
            int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1159
0
            UChar32 c = utext_next32(inText);
1160
0
            U_ASSERT(c != U_SENTINEL);
1161
0
            inString.append(c);
1162
0
            while (inputMap->size() < inString.length()) {
1163
0
                inputMap->addElement(nativePosition, status);
1164
0
            }
1165
0
        }
1166
0
        inputMap->addElement(limit, status);
1167
0
    }
1168
1169
1170
0
    if (!nfkcNorm2->isNormalized(inString, status)) {
1171
0
        UnicodeString normalizedInput;
1172
        //  normalizedMap[normalizedInput position] ==  original UText position.
1173
0
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1174
0
        if (U_FAILURE(status)) {
1175
0
            return 0;
1176
0
        }
1177
        
1178
0
        UnicodeString fragment;
1179
0
        UnicodeString normalizedFragment;
1180
0
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1181
0
            fragment.remove();
1182
0
            int32_t fragmentStartI = srcI;
1183
0
            UChar32 c = inString.char32At(srcI);
1184
0
            for (;;) {
1185
0
                fragment.append(c);
1186
0
                srcI = inString.moveIndex32(srcI, 1);
1187
0
                if (srcI == inString.length()) {
1188
0
                    break;
1189
0
                }
1190
0
                c = inString.char32At(srcI);
1191
0
                if (nfkcNorm2->hasBoundaryBefore(c)) {
1192
0
                    break;
1193
0
                }
1194
0
            }
1195
0
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
1196
0
            normalizedInput.append(normalizedFragment);
1197
1198
            // Map every position in the normalized chunk to the start of the chunk
1199
            //   in the original input.
1200
0
            int32_t fragmentOriginalStart = inputMap.isValid() ?
1201
0
                    inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1202
0
            while (normalizedMap->size() < normalizedInput.length()) {
1203
0
                normalizedMap->addElement(fragmentOriginalStart, status);
1204
0
                if (U_FAILURE(status)) {
1205
0
                    break;
1206
0
                }
1207
0
            }
1208
0
        }
1209
0
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
1210
0
        int32_t nativeEnd = inputMap.isValid() ?
1211
0
                inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1212
0
        normalizedMap->addElement(nativeEnd, status);
1213
1214
0
        inputMap = std::move(normalizedMap);
1215
0
        inString = std::move(normalizedInput);
1216
0
    }
1217
1218
0
    int32_t numCodePts = inString.countChar32();
1219
0
    if (numCodePts != inString.length()) {
1220
        // There are supplementary characters in the input.
1221
        // The dictionary will produce boundary positions in terms of code point indexes,
1222
        //   not in terms of code unit string indexes.
1223
        // Use the inputMap mechanism to take care of this in addition to indexing differences
1224
        //    from normalization and/or UTF-8 input.
1225
0
        UBool hadExistingMap = inputMap.isValid();
1226
0
        if (!hadExistingMap) {
1227
0
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1228
0
            if (U_FAILURE(status)) {
1229
0
                return 0;
1230
0
            }
1231
0
        }
1232
0
        int32_t cpIdx = 0;
1233
0
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1234
0
            U_ASSERT(cuIdx >= cpIdx);
1235
0
            if (hadExistingMap) {
1236
0
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1237
0
            } else {
1238
0
                inputMap->addElement(cuIdx+rangeStart, status);
1239
0
            }
1240
0
            cpIdx++;
1241
0
            if (cuIdx == inString.length()) {
1242
0
               break;
1243
0
            }
1244
0
        }
1245
0
    }
1246
                
1247
    // bestSnlp[i] is the snlp of the best segmentation of the first i
1248
    // code points in the range to be matched.
1249
0
    UVector32 bestSnlp(numCodePts + 1, status);
1250
0
    bestSnlp.addElement(0, status);
1251
0
    for(int32_t i = 1; i <= numCodePts; i++) {
1252
0
        bestSnlp.addElement(kuint32max, status);
1253
0
    }
1254
1255
1256
    // prev[i] is the index of the last CJK code point in the previous word in 
1257
    // the best segmentation of the first i characters.
1258
0
    UVector32 prev(numCodePts + 1, status);
1259
0
    for(int32_t i = 0; i <= numCodePts; i++){
1260
0
        prev.addElement(-1, status);
1261
0
    }
1262
1263
0
    const int32_t maxWordSize = 20;
1264
0
    UVector32 values(numCodePts, status);
1265
0
    values.setSize(numCodePts);
1266
0
    UVector32 lengths(numCodePts, status);
1267
0
    lengths.setSize(numCodePts);
1268
1269
0
    UText fu = UTEXT_INITIALIZER;
1270
0
    utext_openUnicodeString(&fu, &inString, &status);
1271
1272
    // Dynamic programming to find the best segmentation.
1273
1274
    // In outer loop, i  is the code point index,
1275
    //                ix is the corresponding string (code unit) index.
1276
    //    They differ when the string contains supplementary characters.
1277
0
    int32_t ix = 0;
1278
0
    bool is_prev_katakana = false;
1279
0
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1280
0
        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1281
0
            continue;
1282
0
        }
1283
1284
0
        int32_t count;
1285
0
        utext_setNativeIndex(&fu, ix);
1286
0
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1287
0
                             NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1288
                             // Note: lengths is filled with code point lengths
1289
                             //       The NULL parameter is the ignored code unit lengths.
1290
1291
        // if there are no single character matches found in the dictionary 
1292
        // starting with this character, treat character as a 1-character word 
1293
        // with the highest value possible, i.e. the least likely to occur.
1294
        // Exclude Korean characters from this treatment, as they should be left
1295
        // together by default.
1296
0
        if ((count == 0 || lengths.elementAti(0) != 1) &&
1297
0
                !fHangulWordSet.contains(inString.char32At(ix))) {
1298
0
            values.setElementAt(maxSnlp, count);   // 255
1299
0
            lengths.setElementAt(1, count++);
1300
0
        }
1301
1302
0
        for (int32_t j = 0; j < count; j++) {
1303
0
            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1304
0
            int32_t ln_j_i = lengths.elementAti(j) + i;
1305
0
            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1306
0
                bestSnlp.setElementAt(newSnlp, ln_j_i);
1307
0
                prev.setElementAt(i, ln_j_i);
1308
0
            }
1309
0
        }
1310
1311
        // In Japanese,
1312
        // Katakana word in single character is pretty rare. So we apply
1313
        // the following heuristic to Katakana: any continuous run of Katakana
1314
        // characters is considered a candidate word with a default cost
1315
        // specified in the katakanaCost table according to its length.
1316
1317
0
        bool is_katakana = isKatakana(inString.char32At(ix));
1318
0
        int32_t katakanaRunLength = 1;
1319
0
        if (!is_prev_katakana && is_katakana) {
1320
0
            int32_t j = inString.moveIndex32(ix, 1);
1321
            // Find the end of the continuous run of Katakana characters
1322
0
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1323
0
                    isKatakana(inString.char32At(j))) {
1324
0
                j = inString.moveIndex32(j, 1);
1325
0
                katakanaRunLength++;
1326
0
            }
1327
0
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
1328
0
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1329
0
                if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1330
0
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1331
0
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1332
0
                }
1333
0
            }
1334
0
        }
1335
0
        is_prev_katakana = is_katakana;
1336
0
    }
1337
0
    utext_close(&fu);
1338
1339
    // Start pushing the optimal offset index into t_boundary (t for tentative).
1340
    // prev[numCodePts] is guaranteed to be meaningful.
1341
    // We'll first push in the reverse order, i.e.,
1342
    // t_boundary[0] = numCodePts, and afterwards do a swap.
1343
0
    UVector32 t_boundary(numCodePts+1, status);
1344
1345
0
    int32_t numBreaks = 0;
1346
    // No segmentation found, set boundary to end of range
1347
0
    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1348
0
        t_boundary.addElement(numCodePts, status);
1349
0
        numBreaks++;
1350
0
    } else {
1351
0
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1352
0
            t_boundary.addElement(i, status);
1353
0
            numBreaks++;
1354
0
        }
1355
0
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1356
0
    }
1357
1358
    // Add a break for the start of the dictionary range if there is not one
1359
    // there already.
1360
0
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1361
0
        t_boundary.addElement(0, status);
1362
0
        numBreaks++;
1363
0
    }
1364
1365
    // Now that we're done, convert positions in t_boundary[] (indices in 
1366
    // the normalized input string) back to indices in the original input UText
1367
    // while reversing t_boundary and pushing values to foundBreaks.
1368
0
    int32_t prevCPPos = -1;
1369
0
    int32_t prevUTextPos = -1;
1370
0
    for (int32_t i = numBreaks-1; i >= 0; i--) {
1371
0
        int32_t cpPos = t_boundary.elementAti(i);
1372
0
        U_ASSERT(cpPos > prevCPPos);
1373
0
        int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1374
0
        U_ASSERT(utextPos >= prevUTextPos);
1375
0
        if (utextPos > prevUTextPos) {
1376
            // Boundaries are added to foundBreaks output in ascending order.
1377
0
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1378
0
            foundBreaks.push(utextPos, status);
1379
0
        } else {
1380
            // Normalization expanded the input text, the dictionary found a boundary
1381
            // within the expansion, giving two boundaries with the same index in the
1382
            // original text. Ignore the second. See ticket #12918.
1383
0
            --numBreaks;
1384
0
        }
1385
0
        prevCPPos = cpPos;
1386
0
        prevUTextPos = utextPos;
1387
0
    }
1388
0
    (void)prevCPPos; // suppress compiler warnings about unused variable
1389
1390
    // inString goes out of scope
1391
    // inputMap goes out of scope
1392
0
    return numBreaks;
1393
0
}
1394
#endif
1395
1396
U_NAMESPACE_END
1397
1398
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1399