Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/dictbe.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/**
4
 *******************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 *******************************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_BREAK_ITERATION
13
14
#include "brkeng.h"
15
#include "dictbe.h"
16
#include "unicode/uniset.h"
17
#include "unicode/chariter.h"
18
#include "unicode/ubrk.h"
19
#include "uvectr32.h"
20
#include "uvector.h"
21
#include "uassert.h"
22
#include "unicode/normlzr.h"
23
#include "cmemory.h"
24
#include "dictionarydata.h"
25
26
U_NAMESPACE_BEGIN
27
28
/*
29
 ******************************************************************
30
 */
31
32
0
DictionaryBreakEngine::DictionaryBreakEngine() {
33
0
}
34
35
0
DictionaryBreakEngine::~DictionaryBreakEngine() {
36
0
}
37
38
UBool
39
0
DictionaryBreakEngine::handles(UChar32 c) const {
40
0
    return fSet.contains(c);
41
0
}
42
43
int32_t
44
DictionaryBreakEngine::findBreaks( UText *text,
45
                                 int32_t startPos,
46
                                 int32_t endPos,
47
0
                                 UVector32 &foundBreaks ) const {
48
0
    (void)startPos;            // TODO: remove this param?
49
0
    int32_t result = 0;
50
0
51
0
    // Find the span of characters included in the set.
52
0
    //   The span to break begins at the current position in the text, and
53
0
    //   extends towards the start or end of the text, depending on 'reverse'.
54
0
55
0
    int32_t start = (int32_t)utext_getNativeIndex(text);
56
0
    int32_t current;
57
0
    int32_t rangeStart;
58
0
    int32_t rangeEnd;
59
0
    UChar32 c = utext_current32(text);
60
0
    while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) {
61
0
        utext_next32(text);         // TODO:  recast loop for postincrement
62
0
        c = utext_current32(text);
63
0
    }
64
0
    rangeStart = start;
65
0
    rangeEnd = current;
66
0
    result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks);
67
0
    utext_setNativeIndex(text, current);
68
0
    
69
0
    return result;
70
0
}
71
72
void
73
0
DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) {
74
0
    fSet = set;
75
0
    // Compact for caching
76
0
    fSet.compact();
77
0
}
78
79
/*
80
 ******************************************************************
81
 * PossibleWord
82
 */
83
84
// Helper class for improving readability of the Thai/Lao/Khmer word break
85
// algorithm. The implementation is completely inline.
86
87
// List size, limited by the maximum number of words in the dictionary
88
// that form a nested sequence.
89
static const int32_t POSSIBLE_WORD_LIST_MAX = 20;
90
91
class PossibleWord {
92
private:
93
    // list of word candidate lengths, in increasing length order
94
    // TODO: bytes would be sufficient for word lengths.
95
    int32_t   count;      // Count of candidates
96
    int32_t   prefix;     // The longest match with a dictionary word
97
    int32_t   offset;     // Offset in the text of these candidates
98
    int32_t   mark;       // The preferred candidate's offset
99
    int32_t   current;    // The candidate we're currently looking at
100
    int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.
101
    int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.
102
103
public:
104
0
    PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {};
105
0
    ~PossibleWord() {};
106
  
107
    // Fill the list of candidates if needed, select the longest, and return the number found
108
    int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );
109
  
110
    // Select the currently marked candidate, point after it in the text, and invalidate self
111
    int32_t   acceptMarked( UText *text );
112
  
113
    // Back up from the current candidate to the next shorter one; return TRUE if that exists
114
    // and point the text after it
115
    UBool     backUp( UText *text );
116
  
117
    // Return the longest prefix this candidate location shares with a dictionary word
118
    // Return value is in code points.
119
0
    int32_t   longestPrefix() { return prefix; };
120
  
121
    // Mark the current candidate as the one we like
122
0
    void      markCurrent() { mark = current; };
123
    
124
    // Get length in code points of the marked word.
125
0
    int32_t   markedCPLength() { return cpLengths[mark]; };
126
};
127
128
129
0
int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) {
130
0
    // TODO: If getIndex is too slow, use offset < 0 and add discardAll()
131
0
    int32_t start = (int32_t)utext_getNativeIndex(text);
132
0
    if (start != offset) {
133
0
        offset = start;
134
0
        count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);
135
0
        // Dictionary leaves text after longest prefix, not longest word. Back up.
136
0
        if (count <= 0) {
137
0
            utext_setNativeIndex(text, start);
138
0
        }
139
0
    }
140
0
    if (count > 0) {
141
0
        utext_setNativeIndex(text, start+cuLengths[count-1]);
142
0
    }
143
0
    current = count-1;
144
0
    mark = current;
145
0
    return count;
146
0
}
147
148
int32_t
149
0
PossibleWord::acceptMarked( UText *text ) {
150
0
    utext_setNativeIndex(text, offset + cuLengths[mark]);
151
0
    return cuLengths[mark];
152
0
}
153
154
155
UBool
156
0
PossibleWord::backUp( UText *text ) {
157
0
    if (current > 0) {
158
0
        utext_setNativeIndex(text, offset + cuLengths[--current]);
159
0
        return TRUE;
160
0
    }
161
0
    return FALSE;
162
0
}
163
164
/*
165
 ******************************************************************
166
 * ThaiBreakEngine
167
 */
168
169
// How many words in a row are "good enough"?
170
static const int32_t THAI_LOOKAHEAD = 3;
171
172
// Will not combine a non-word with a preceding dictionary word longer than this
173
static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;
174
175
// Will not combine a non-word that shares at least this much prefix with a
176
// dictionary word, with a preceding word
177
static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;
178
179
// Ellision character
180
static const int32_t THAI_PAIYANNOI = 0x0E2F;
181
182
// Repeat character
183
static const int32_t THAI_MAIYAMOK = 0x0E46;
184
185
// Minimum word size
186
static const int32_t THAI_MIN_WORD = 2;
187
188
// Minimum number of characters for two words
189
static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;
190
191
ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
192
    : DictionaryBreakEngine(),
193
      fDictionary(adoptDictionary)
194
0
{
195
0
    fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status);
196
0
    if (U_SUCCESS(status)) {
197
0
        setCharacters(fThaiWordSet);
198
0
    }
199
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status);
200
0
    fMarkSet.add(0x0020);
201
0
    fEndWordSet = fThaiWordSet;
202
0
    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
203
0
    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
204
0
    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
205
0
    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
206
0
    fSuffixSet.add(THAI_PAIYANNOI);
207
0
    fSuffixSet.add(THAI_MAIYAMOK);
208
0
209
0
    // Compact for caching.
210
0
    fMarkSet.compact();
211
0
    fEndWordSet.compact();
212
0
    fBeginWordSet.compact();
213
0
    fSuffixSet.compact();
214
0
}
215
216
0
ThaiBreakEngine::~ThaiBreakEngine() {
217
0
    delete fDictionary;
218
0
}
219
220
int32_t
221
ThaiBreakEngine::divideUpDictionaryRange( UText *text,
222
                                                int32_t rangeStart,
223
                                                int32_t rangeEnd,
224
0
                                                UVector32 &foundBreaks ) const {
225
0
    utext_setNativeIndex(text, rangeStart);
226
0
    utext_moveIndex32(text, THAI_MIN_WORD_SPAN);
227
0
    if (utext_getNativeIndex(text) >= rangeEnd) {
228
0
        return 0;       // Not enough characters for two words
229
0
    }
230
0
    utext_setNativeIndex(text, rangeStart);
231
0
232
0
233
0
    uint32_t wordsFound = 0;
234
0
    int32_t cpWordLength = 0;    // Word Length in Code Points.
235
0
    int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)
236
0
    int32_t current;
237
0
    UErrorCode status = U_ZERO_ERROR;
238
0
    PossibleWord words[THAI_LOOKAHEAD];
239
0
    
240
0
    utext_setNativeIndex(text, rangeStart);
241
0
    
242
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
243
0
        cpWordLength = 0;
244
0
        cuWordLength = 0;
245
0
246
0
        // Look for candidate words at the current position
247
0
        int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
248
0
        
249
0
        // If we found exactly one, use that
250
0
        if (candidates == 1) {
251
0
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
252
0
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
253
0
            wordsFound += 1;
254
0
        }
255
0
        // If there was more than one, see which one can take us forward the most words
256
0
        else if (candidates > 1) {
257
0
            // If we're already at the end of the range, we're done
258
0
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
259
0
                goto foundBest;
260
0
            }
261
0
            do {
262
0
                int32_t wordsMatched = 1;
263
0
                if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
264
0
                    if (wordsMatched < 2) {
265
0
                        // Followed by another dictionary word; mark first word as a good candidate
266
0
                        words[wordsFound%THAI_LOOKAHEAD].markCurrent();
267
0
                        wordsMatched = 2;
268
0
                    }
269
0
                    
270
0
                    // If we're already at the end of the range, we're done
271
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
272
0
                        goto foundBest;
273
0
                    }
274
0
                    
275
0
                    // See if any of the possible second words is followed by a third word
276
0
                    do {
277
0
                        // If we find a third word, stop right away
278
0
                        if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
279
0
                            words[wordsFound % THAI_LOOKAHEAD].markCurrent();
280
0
                            goto foundBest;
281
0
                        }
282
0
                    }
283
0
                    while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));
284
0
                }
285
0
            }
286
0
            while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));
287
0
foundBest:
288
0
            // Set UText position to after the accepted word.
289
0
            cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);
290
0
            cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();
291
0
            wordsFound += 1;
292
0
        }
293
0
        
294
0
        // We come here after having either found a word or not. We look ahead to the
295
0
        // next word. If it's not a dictionary word, we will combine it with the word we
296
0
        // just found (if there is one), but only if the preceding word does not exceed
297
0
        // the threshold.
298
0
        // The text iterator should now be positioned at the end of the word we found.
299
0
        
300
0
        UChar32 uc = 0;
301
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) {
302
0
            // if it is a dictionary word, do nothing. If it isn't, then if there is
303
0
            // no preceding word, or the non-word shares less than the minimum threshold
304
0
            // of characters with a dictionary word, then scan to resynchronize
305
0
            if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
306
0
                  && (cuWordLength == 0
307
0
                      || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) {
308
0
                // Look for a plausible word boundary
309
0
                int32_t remaining = rangeEnd - (current+cuWordLength);
310
0
                UChar32 pc;
311
0
                int32_t chars = 0;
312
0
                for (;;) {
313
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
314
0
                    pc = utext_next32(text);
315
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
316
0
                    chars += pcSize;
317
0
                    remaining -= pcSize;
318
0
                    if (remaining <= 0) {
319
0
                        break;
320
0
                    }
321
0
                    uc = utext_current32(text);
322
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
323
0
                        // Maybe. See if it's in the dictionary.
324
0
                        // NOTE: In the original Apple code, checked that the next
325
0
                        // two characters after uc were not 0x0E4C THANTHAKHAT before
326
0
                        // checking the dictionary. That is just a performance filter,
327
0
                        // but it's not clear it's faster than checking the trie.
328
0
                        int32_t candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
329
0
                        utext_setNativeIndex(text, current + cuWordLength + chars);
330
0
                        if (candidates > 0) {
331
0
                            break;
332
0
                        }
333
0
                    }
334
0
                }
335
0
                
336
0
                // Bump the word count if there wasn't already one
337
0
                if (cuWordLength <= 0) {
338
0
                    wordsFound += 1;
339
0
                }
340
0
                
341
0
                // Update the length with the passed-over characters
342
0
                cuWordLength += chars;
343
0
            }
344
0
            else {
345
0
                // Back up to where we were for next iteration
346
0
                utext_setNativeIndex(text, current+cuWordLength);
347
0
            }
348
0
        }
349
0
        
350
0
        // Never stop before a combining mark.
351
0
        int32_t currPos;
352
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
353
0
            utext_next32(text);
354
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
355
0
        }
356
0
        
357
0
        // Look ahead for possible suffixes if a dictionary word does not follow.
358
0
        // We do this in code rather than using a rule so that the heuristic
359
0
        // resynch continues to function. For example, one of the suffix characters
360
0
        // could be a typo in the middle of a word.
361
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) {
362
0
            if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
363
0
                && fSuffixSet.contains(uc = utext_current32(text))) {
364
0
                if (uc == THAI_PAIYANNOI) {
365
0
                    if (!fSuffixSet.contains(utext_previous32(text))) {
366
0
                        // Skip over previous end and PAIYANNOI
367
0
                        utext_next32(text);
368
0
                        int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);
369
0
                        utext_next32(text);
370
0
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word
371
0
                        uc = utext_current32(text);     // Fetch next character
372
0
                    }
373
0
                    else {
374
0
                        // Restore prior position
375
0
                        utext_next32(text);
376
0
                    }
377
0
                }
378
0
                if (uc == THAI_MAIYAMOK) {
379
0
                    if (utext_previous32(text) != THAI_MAIYAMOK) {
380
0
                        // Skip over previous end and MAIYAMOK
381
0
                        utext_next32(text);
382
0
                        int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);
383
0
                        utext_next32(text);
384
0
                        cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word
385
0
                    }
386
0
                    else {
387
0
                        // Restore prior position
388
0
                        utext_next32(text);
389
0
                    }
390
0
                }
391
0
            }
392
0
            else {
393
0
                utext_setNativeIndex(text, current+cuWordLength);
394
0
            }
395
0
        }
396
0
397
0
        // Did we find a word on this iteration? If so, push it on the break stack
398
0
        if (cuWordLength > 0) {
399
0
            foundBreaks.push((current+cuWordLength), status);
400
0
        }
401
0
    }
402
0
403
0
    // Don't return a break for the end of the dictionary range if there is one there.
404
0
    if (foundBreaks.peeki() >= rangeEnd) {
405
0
        (void) foundBreaks.popi();
406
0
        wordsFound -= 1;
407
0
    }
408
0
409
0
    return wordsFound;
410
0
}
411
412
/*
413
 ******************************************************************
414
 * LaoBreakEngine
415
 */
416
417
// How many words in a row are "good enough"?
418
static const int32_t LAO_LOOKAHEAD = 3;
419
420
// Will not combine a non-word with a preceding dictionary word longer than this
421
static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;
422
423
// Will not combine a non-word that shares at least this much prefix with a
424
// dictionary word, with a preceding word
425
static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;
426
427
// Minimum word size
428
static const int32_t LAO_MIN_WORD = 2;
429
430
// Minimum number of characters for two words
431
static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;
432
433
LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
434
    : DictionaryBreakEngine(),
435
      fDictionary(adoptDictionary)
436
0
{
437
0
    fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status);
438
0
    if (U_SUCCESS(status)) {
439
0
        setCharacters(fLaoWordSet);
440
0
    }
441
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status);
442
0
    fMarkSet.add(0x0020);
443
0
    fEndWordSet = fLaoWordSet;
444
0
    fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels
445
0
    fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)
446
0
    fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)
447
0
    fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels
448
0
449
0
    // Compact for caching.
450
0
    fMarkSet.compact();
451
0
    fEndWordSet.compact();
452
0
    fBeginWordSet.compact();
453
0
}
454
455
0
LaoBreakEngine::~LaoBreakEngine() {
456
0
    delete fDictionary;
457
0
}
458
459
int32_t
460
LaoBreakEngine::divideUpDictionaryRange( UText *text,
461
                                                int32_t rangeStart,
462
                                                int32_t rangeEnd,
463
0
                                                UVector32 &foundBreaks ) const {
464
0
    if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) {
465
0
        return 0;       // Not enough characters for two words
466
0
    }
467
0
468
0
    uint32_t wordsFound = 0;
469
0
    int32_t cpWordLength = 0;
470
0
    int32_t cuWordLength = 0;
471
0
    int32_t current;
472
0
    UErrorCode status = U_ZERO_ERROR;
473
0
    PossibleWord words[LAO_LOOKAHEAD];
474
0
    
475
0
    utext_setNativeIndex(text, rangeStart);
476
0
    
477
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
478
0
        cuWordLength = 0;
479
0
        cpWordLength = 0;
480
0
481
0
        // Look for candidate words at the current position
482
0
        int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
483
0
        
484
0
        // If we found exactly one, use that
485
0
        if (candidates == 1) {
486
0
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
487
0
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
488
0
            wordsFound += 1;
489
0
        }
490
0
        // If there was more than one, see which one can take us forward the most words
491
0
        else if (candidates > 1) {
492
0
            // If we're already at the end of the range, we're done
493
0
            if (utext_getNativeIndex(text) >= rangeEnd) {
494
0
                goto foundBest;
495
0
            }
496
0
            do {
497
0
                int32_t wordsMatched = 1;
498
0
                if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
499
0
                    if (wordsMatched < 2) {
500
0
                        // Followed by another dictionary word; mark first word as a good candidate
501
0
                        words[wordsFound%LAO_LOOKAHEAD].markCurrent();
502
0
                        wordsMatched = 2;
503
0
                    }
504
0
                    
505
0
                    // If we're already at the end of the range, we're done
506
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
507
0
                        goto foundBest;
508
0
                    }
509
0
                    
510
0
                    // See if any of the possible second words is followed by a third word
511
0
                    do {
512
0
                        // If we find a third word, stop right away
513
0
                        if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
514
0
                            words[wordsFound % LAO_LOOKAHEAD].markCurrent();
515
0
                            goto foundBest;
516
0
                        }
517
0
                    }
518
0
                    while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));
519
0
                }
520
0
            }
521
0
            while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));
522
0
foundBest:
523
0
            cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);
524
0
            cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();
525
0
            wordsFound += 1;
526
0
        }
527
0
        
528
0
        // We come here after having either found a word or not. We look ahead to the
529
0
        // next word. If it's not a dictionary word, we will combine it withe the word we
530
0
        // just found (if there is one), but only if the preceding word does not exceed
531
0
        // the threshold.
532
0
        // The text iterator should now be positioned at the end of the word we found.
533
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) {
534
0
            // if it is a dictionary word, do nothing. If it isn't, then if there is
535
0
            // no preceding word, or the non-word shares less than the minimum threshold
536
0
            // of characters with a dictionary word, then scan to resynchronize
537
0
            if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
538
0
                  && (cuWordLength == 0
539
0
                      || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) {
540
0
                // Look for a plausible word boundary
541
0
                int32_t remaining = rangeEnd - (current + cuWordLength);
542
0
                UChar32 pc;
543
0
                UChar32 uc;
544
0
                int32_t chars = 0;
545
0
                for (;;) {
546
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
547
0
                    pc = utext_next32(text);
548
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
549
0
                    chars += pcSize;
550
0
                    remaining -= pcSize;
551
0
                    if (remaining <= 0) {
552
0
                        break;
553
0
                    }
554
0
                    uc = utext_current32(text);
555
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
556
0
                        // Maybe. See if it's in the dictionary.
557
0
                        // TODO: this looks iffy; compare with old code.
558
0
                        int32_t candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
559
0
                        utext_setNativeIndex(text, current + cuWordLength + chars);
560
0
                        if (candidates > 0) {
561
0
                            break;
562
0
                        }
563
0
                    }
564
0
                }
565
0
                
566
0
                // Bump the word count if there wasn't already one
567
0
                if (cuWordLength <= 0) {
568
0
                    wordsFound += 1;
569
0
                }
570
0
                
571
0
                // Update the length with the passed-over characters
572
0
                cuWordLength += chars;
573
0
            }
574
0
            else {
575
0
                // Back up to where we were for next iteration
576
0
                utext_setNativeIndex(text, current + cuWordLength);
577
0
            }
578
0
        }
579
0
        
580
0
        // Never stop before a combining mark.
581
0
        int32_t currPos;
582
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
583
0
            utext_next32(text);
584
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
585
0
        }
586
0
        
587
0
        // Look ahead for possible suffixes if a dictionary word does not follow.
588
0
        // We do this in code rather than using a rule so that the heuristic
589
0
        // resynch continues to function. For example, one of the suffix characters
590
0
        // could be a typo in the middle of a word.
591
0
        // NOT CURRENTLY APPLICABLE TO LAO
592
0
593
0
        // Did we find a word on this iteration? If so, push it on the break stack
594
0
        if (cuWordLength > 0) {
595
0
            foundBreaks.push((current+cuWordLength), status);
596
0
        }
597
0
    }
598
0
599
0
    // Don't return a break for the end of the dictionary range if there is one there.
600
0
    if (foundBreaks.peeki() >= rangeEnd) {
601
0
        (void) foundBreaks.popi();
602
0
        wordsFound -= 1;
603
0
    }
604
0
605
0
    return wordsFound;
606
0
}
607
608
/*
609
 ******************************************************************
610
 * BurmeseBreakEngine
611
 */
612
613
// How many words in a row are "good enough"?
614
static const int32_t BURMESE_LOOKAHEAD = 3;
615
616
// Will not combine a non-word with a preceding dictionary word longer than this
617
static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;
618
619
// Will not combine a non-word that shares at least this much prefix with a
620
// dictionary word, with a preceding word
621
static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;
622
623
// Minimum word size
624
static const int32_t BURMESE_MIN_WORD = 2;
625
626
// Minimum number of characters for two words
627
static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;
628
629
BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
630
    : DictionaryBreakEngine(),
631
      fDictionary(adoptDictionary)
632
0
{
633
0
    fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status);
634
0
    if (U_SUCCESS(status)) {
635
0
        setCharacters(fBurmeseWordSet);
636
0
    }
637
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status);
638
0
    fMarkSet.add(0x0020);
639
0
    fEndWordSet = fBurmeseWordSet;
640
0
    fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels
641
0
642
0
    // Compact for caching.
643
0
    fMarkSet.compact();
644
0
    fEndWordSet.compact();
645
0
    fBeginWordSet.compact();
646
0
}
647
648
0
BurmeseBreakEngine::~BurmeseBreakEngine() {
649
0
    delete fDictionary;
650
0
}
651
652
int32_t
653
BurmeseBreakEngine::divideUpDictionaryRange( UText *text,
654
                                                int32_t rangeStart,
655
                                                int32_t rangeEnd,
656
0
                                                UVector32 &foundBreaks ) const {
657
0
    if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) {
658
0
        return 0;       // Not enough characters for two words
659
0
    }
660
0
661
0
    uint32_t wordsFound = 0;
662
0
    int32_t cpWordLength = 0;
663
0
    int32_t cuWordLength = 0;
664
0
    int32_t current;
665
0
    UErrorCode status = U_ZERO_ERROR;
666
0
    PossibleWord words[BURMESE_LOOKAHEAD];
667
0
    
668
0
    utext_setNativeIndex(text, rangeStart);
669
0
    
670
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
671
0
        cuWordLength = 0;
672
0
        cpWordLength = 0;
673
0
674
0
        // Look for candidate words at the current position
675
0
        int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
676
0
        
677
0
        // If we found exactly one, use that
678
0
        if (candidates == 1) {
679
0
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
680
0
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
681
0
            wordsFound += 1;
682
0
        }
683
0
        // If there was more than one, see which one can take us forward the most words
684
0
        else if (candidates > 1) {
685
0
            // If we're already at the end of the range, we're done
686
0
            if (utext_getNativeIndex(text) >= rangeEnd) {
687
0
                goto foundBest;
688
0
            }
689
0
            do {
690
0
                int32_t wordsMatched = 1;
691
0
                if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
692
0
                    if (wordsMatched < 2) {
693
0
                        // Followed by another dictionary word; mark first word as a good candidate
694
0
                        words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();
695
0
                        wordsMatched = 2;
696
0
                    }
697
0
                    
698
0
                    // If we're already at the end of the range, we're done
699
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
700
0
                        goto foundBest;
701
0
                    }
702
0
                    
703
0
                    // See if any of the possible second words is followed by a third word
704
0
                    do {
705
0
                        // If we find a third word, stop right away
706
0
                        if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
707
0
                            words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();
708
0
                            goto foundBest;
709
0
                        }
710
0
                    }
711
0
                    while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));
712
0
                }
713
0
            }
714
0
            while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));
715
0
foundBest:
716
0
            cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);
717
0
            cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();
718
0
            wordsFound += 1;
719
0
        }
720
0
        
721
0
        // We come here after having either found a word or not. We look ahead to the
722
0
        // next word. If it's not a dictionary word, we will combine it withe the word we
723
0
        // just found (if there is one), but only if the preceding word does not exceed
724
0
        // the threshold.
725
0
        // The text iterator should now be positioned at the end of the word we found.
726
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) {
727
0
            // if it is a dictionary word, do nothing. If it isn't, then if there is
728
0
            // no preceding word, or the non-word shares less than the minimum threshold
729
0
            // of characters with a dictionary word, then scan to resynchronize
730
0
            if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
731
0
                  && (cuWordLength == 0
732
0
                      || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) {
733
0
                // Look for a plausible word boundary
734
0
                int32_t remaining = rangeEnd - (current + cuWordLength);
735
0
                UChar32 pc;
736
0
                UChar32 uc;
737
0
                int32_t chars = 0;
738
0
                for (;;) {
739
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
740
0
                    pc = utext_next32(text);
741
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
742
0
                    chars += pcSize;
743
0
                    remaining -= pcSize;
744
0
                    if (remaining <= 0) {
745
0
                        break;
746
0
                    }
747
0
                    uc = utext_current32(text);
748
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
749
0
                        // Maybe. See if it's in the dictionary.
750
0
                        // TODO: this looks iffy; compare with old code.
751
0
                        int32_t candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
752
0
                        utext_setNativeIndex(text, current + cuWordLength + chars);
753
0
                        if (candidates > 0) {
754
0
                            break;
755
0
                        }
756
0
                    }
757
0
                }
758
0
                
759
0
                // Bump the word count if there wasn't already one
760
0
                if (cuWordLength <= 0) {
761
0
                    wordsFound += 1;
762
0
                }
763
0
                
764
0
                // Update the length with the passed-over characters
765
0
                cuWordLength += chars;
766
0
            }
767
0
            else {
768
0
                // Back up to where we were for next iteration
769
0
                utext_setNativeIndex(text, current + cuWordLength);
770
0
            }
771
0
        }
772
0
        
773
0
        // Never stop before a combining mark.
774
0
        int32_t currPos;
775
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
776
0
            utext_next32(text);
777
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
778
0
        }
779
0
        
780
0
        // Look ahead for possible suffixes if a dictionary word does not follow.
781
0
        // We do this in code rather than using a rule so that the heuristic
782
0
        // resynch continues to function. For example, one of the suffix characters
783
0
        // could be a typo in the middle of a word.
784
0
        // NOT CURRENTLY APPLICABLE TO BURMESE
785
0
786
0
        // Did we find a word on this iteration? If so, push it on the break stack
787
0
        if (cuWordLength > 0) {
788
0
            foundBreaks.push((current+cuWordLength), status);
789
0
        }
790
0
    }
791
0
792
0
    // Don't return a break for the end of the dictionary range if there is one there.
793
0
    if (foundBreaks.peeki() >= rangeEnd) {
794
0
        (void) foundBreaks.popi();
795
0
        wordsFound -= 1;
796
0
    }
797
0
798
0
    return wordsFound;
799
0
}
800
801
/*
802
 ******************************************************************
803
 * KhmerBreakEngine
804
 */
805
806
// How many words in a row are "good enough"?
807
static const int32_t KHMER_LOOKAHEAD = 3;
808
809
// Will not combine a non-word with a preceding dictionary word longer than this
810
static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;
811
812
// Will not combine a non-word that shares at least this much prefix with a
813
// dictionary word, with a preceding word
814
static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;
815
816
// Minimum word size
817
static const int32_t KHMER_MIN_WORD = 2;
818
819
// Minimum number of characters for two words
820
static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;
821
822
KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)
823
    : DictionaryBreakEngine(),
824
      fDictionary(adoptDictionary)
825
0
{
826
0
    fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status);
827
0
    if (U_SUCCESS(status)) {
828
0
        setCharacters(fKhmerWordSet);
829
0
    }
830
0
    fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status);
831
0
    fMarkSet.add(0x0020);
832
0
    fEndWordSet = fKhmerWordSet;
833
0
    fBeginWordSet.add(0x1780, 0x17B3);
834
0
    //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels
835
0
    //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word
836
0
    //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word
837
0
    fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters
838
0
    //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels
839
0
//    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT
840
0
//    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI
841
0
//    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK
842
0
//    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI
843
0
//    fSuffixSet.add(THAI_PAIYANNOI);
844
0
//    fSuffixSet.add(THAI_MAIYAMOK);
845
0
846
0
    // Compact for caching.
847
0
    fMarkSet.compact();
848
0
    fEndWordSet.compact();
849
0
    fBeginWordSet.compact();
850
0
//    fSuffixSet.compact();
851
0
}
852
853
0
KhmerBreakEngine::~KhmerBreakEngine() {
854
0
    delete fDictionary;
855
0
}
856
857
int32_t
858
KhmerBreakEngine::divideUpDictionaryRange( UText *text,
859
                                                int32_t rangeStart,
860
                                                int32_t rangeEnd,
861
0
                                                UVector32 &foundBreaks ) const {
862
0
    if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) {
863
0
        return 0;       // Not enough characters for two words
864
0
    }
865
0
866
0
    uint32_t wordsFound = 0;
867
0
    int32_t cpWordLength = 0;
868
0
    int32_t cuWordLength = 0;
869
0
    int32_t current;
870
0
    UErrorCode status = U_ZERO_ERROR;
871
0
    PossibleWord words[KHMER_LOOKAHEAD];
872
0
873
0
    utext_setNativeIndex(text, rangeStart);
874
0
875
0
    while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) {
876
0
        cuWordLength = 0;
877
0
        cpWordLength = 0;
878
0
879
0
        // Look for candidate words at the current position
880
0
        int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
881
0
882
0
        // If we found exactly one, use that
883
0
        if (candidates == 1) {
884
0
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
885
0
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
886
0
            wordsFound += 1;
887
0
        }
888
0
889
0
        // If there was more than one, see which one can take us forward the most words
890
0
        else if (candidates > 1) {
891
0
            // If we're already at the end of the range, we're done
892
0
            if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
893
0
                goto foundBest;
894
0
            }
895
0
            do {
896
0
                int32_t wordsMatched = 1;
897
0
                if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) {
898
0
                    if (wordsMatched < 2) {
899
0
                        // Followed by another dictionary word; mark first word as a good candidate
900
0
                        words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
901
0
                        wordsMatched = 2;
902
0
                    }
903
0
904
0
                    // If we're already at the end of the range, we're done
905
0
                    if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) {
906
0
                        goto foundBest;
907
0
                    }
908
0
909
0
                    // See if any of the possible second words is followed by a third word
910
0
                    do {
911
0
                        // If we find a third word, stop right away
912
0
                        if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) {
913
0
                            words[wordsFound % KHMER_LOOKAHEAD].markCurrent();
914
0
                            goto foundBest;
915
0
                        }
916
0
                    }
917
0
                    while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));
918
0
                }
919
0
            }
920
0
            while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));
921
0
foundBest:
922
0
            cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);
923
0
            cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();
924
0
            wordsFound += 1;
925
0
        }
926
0
927
0
        // We come here after having either found a word or not. We look ahead to the
928
0
        // next word. If it's not a dictionary word, we will combine it with the word we
929
0
        // just found (if there is one), but only if the preceding word does not exceed
930
0
        // the threshold.
931
0
        // The text iterator should now be positioned at the end of the word we found.
932
0
        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) {
933
0
            // if it is a dictionary word, do nothing. If it isn't, then if there is
934
0
            // no preceding word, or the non-word shares less than the minimum threshold
935
0
            // of characters with a dictionary word, then scan to resynchronize
936
0
            if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
937
0
                  && (cuWordLength == 0
938
0
                      || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) {
939
0
                // Look for a plausible word boundary
940
0
                int32_t remaining = rangeEnd - (current+cuWordLength);
941
0
                UChar32 pc;
942
0
                UChar32 uc;
943
0
                int32_t chars = 0;
944
0
                for (;;) {
945
0
                    int32_t pcIndex = (int32_t)utext_getNativeIndex(text);
946
0
                    pc = utext_next32(text);
947
0
                    int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;
948
0
                    chars += pcSize;
949
0
                    remaining -= pcSize;
950
0
                    if (remaining <= 0) {
951
0
                        break;
952
0
                    }
953
0
                    uc = utext_current32(text);
954
0
                    if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) {
955
0
                        // Maybe. See if it's in the dictionary.
956
0
                        int32_t candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);
957
0
                        utext_setNativeIndex(text, current+cuWordLength+chars);
958
0
                        if (candidates > 0) {
959
0
                            break;
960
0
                        }
961
0
                    }
962
0
                }
963
0
964
0
                // Bump the word count if there wasn't already one
965
0
                if (cuWordLength <= 0) {
966
0
                    wordsFound += 1;
967
0
                }
968
0
969
0
                // Update the length with the passed-over characters
970
0
                cuWordLength += chars;
971
0
            }
972
0
            else {
973
0
                // Back up to where we were for next iteration
974
0
                utext_setNativeIndex(text, current+cuWordLength);
975
0
            }
976
0
        }
977
0
978
0
        // Never stop before a combining mark.
979
0
        int32_t currPos;
980
0
        while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) {
981
0
            utext_next32(text);
982
0
            cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;
983
0
        }
984
0
985
0
        // Look ahead for possible suffixes if a dictionary word does not follow.
986
0
        // We do this in code rather than using a rule so that the heuristic
987
0
        // resynch continues to function. For example, one of the suffix characters
988
0
        // could be a typo in the middle of a word.
989
0
//        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) {
990
0
//            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0
991
0
//                && fSuffixSet.contains(uc = utext_current32(text))) {
992
0
//                if (uc == KHMER_PAIYANNOI) {
993
0
//                    if (!fSuffixSet.contains(utext_previous32(text))) {
994
0
//                        // Skip over previous end and PAIYANNOI
995
0
//                        utext_next32(text);
996
0
//                        utext_next32(text);
997
0
//                        wordLength += 1;            // Add PAIYANNOI to word
998
0
//                        uc = utext_current32(text);     // Fetch next character
999
0
//                    }
1000
0
//                    else {
1001
0
//                        // Restore prior position
1002
0
//                        utext_next32(text);
1003
0
//                    }
1004
0
//                }
1005
0
//                if (uc == KHMER_MAIYAMOK) {
1006
0
//                    if (utext_previous32(text) != KHMER_MAIYAMOK) {
1007
0
//                        // Skip over previous end and MAIYAMOK
1008
0
//                        utext_next32(text);
1009
0
//                        utext_next32(text);
1010
0
//                        wordLength += 1;            // Add MAIYAMOK to word
1011
0
//                    }
1012
0
//                    else {
1013
0
//                        // Restore prior position
1014
0
//                        utext_next32(text);
1015
0
//                    }
1016
0
//                }
1017
0
//            }
1018
0
//            else {
1019
0
//                utext_setNativeIndex(text, current+wordLength);
1020
0
//            }
1021
0
//        }
1022
0
1023
0
        // Did we find a word on this iteration? If so, push it on the break stack
1024
0
        if (cuWordLength > 0) {
1025
0
            foundBreaks.push((current+cuWordLength), status);
1026
0
        }
1027
0
    }
1028
0
    
1029
0
    // Don't return a break for the end of the dictionary range if there is one there.
1030
0
    if (foundBreaks.peeki() >= rangeEnd) {
1031
0
        (void) foundBreaks.popi();
1032
0
        wordsFound -= 1;
1033
0
    }
1034
0
1035
0
    return wordsFound;
1036
0
}
1037
1038
#if !UCONFIG_NO_NORMALIZATION
1039
/*
1040
 ******************************************************************
1041
 * CjkBreakEngine
1042
 */
1043
static const uint32_t kuint32max = 0xFFFFFFFF;
1044
CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)
1045
0
: DictionaryBreakEngine(), fDictionary(adoptDictionary) {
1046
0
    // Korean dictionary only includes Hangul syllables
1047
0
    fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status);
1048
0
    fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status);
1049
0
    fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status);
1050
0
    fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status);
1051
0
    nfkcNorm2 = Normalizer2::getNFKCInstance(status);
1052
0
1053
0
    if (U_SUCCESS(status)) {
1054
0
        // handle Korean and Japanese/Chinese using different dictionaries
1055
0
        if (type == kKorean) {
1056
0
            setCharacters(fHangulWordSet);
1057
0
        } else { //Chinese and Japanese
1058
0
            UnicodeSet cjSet;
1059
0
            cjSet.addAll(fHanWordSet);
1060
0
            cjSet.addAll(fKatakanaWordSet);
1061
0
            cjSet.addAll(fHiraganaWordSet);
1062
0
            cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK
1063
0
            cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK
1064
0
            setCharacters(cjSet);
1065
0
        }
1066
0
    }
1067
0
}
1068
1069
0
CjkBreakEngine::~CjkBreakEngine(){
1070
0
    delete fDictionary;
1071
0
}
1072
1073
// The katakanaCost values below are based on the length frequencies of all
1074
// katakana phrases in the dictionary
1075
static const int32_t kMaxKatakanaLength = 8;
1076
static const int32_t kMaxKatakanaGroupLength = 20;
1077
static const uint32_t maxSnlp = 255;
1078
1079
0
static inline uint32_t getKatakanaCost(int32_t wordLength){
1080
0
    //TODO: fill array with actual values from dictionary!
1081
0
    static const uint32_t katakanaCost[kMaxKatakanaLength + 1]
1082
0
                                       = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
1083
0
    return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];
1084
0
}
1085
1086
0
static inline bool isKatakana(UChar32 value) {
1087
0
    return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||
1088
0
            (value >= 0xFF66 && value <= 0xFF9f);
1089
0
}
1090
1091
1092
// Function for accessing internal utext flags.
1093
//   Replicates an internal UText function.
1094
1095
0
static inline int32_t utext_i32_flag(int32_t bitIndex) {
1096
0
    return (int32_t)1 << bitIndex;
1097
0
}
1098
1099
       
1100
/*
1101
 * @param text A UText representing the text
1102
 * @param rangeStart The start of the range of dictionary characters
1103
 * @param rangeEnd The end of the range of dictionary characters
1104
 * @param foundBreaks vector<int32> to receive the break positions
1105
 * @return The number of breaks found
1106
 */
1107
int32_t 
1108
CjkBreakEngine::divideUpDictionaryRange( UText *inText,
1109
        int32_t rangeStart,
1110
        int32_t rangeEnd,
1111
0
        UVector32 &foundBreaks ) const {
1112
0
    if (rangeStart >= rangeEnd) {
1113
0
        return 0;
1114
0
    }
1115
0
1116
0
    // UnicodeString version of input UText, NFKC normalized if necessary.
1117
0
    UnicodeString inString;
1118
0
1119
0
    // inputMap[inStringIndex] = corresponding native index from UText inText.
1120
0
    // If NULL then mapping is 1:1
1121
0
    LocalPointer<UVector32>     inputMap;
1122
0
1123
0
    UErrorCode     status      = U_ZERO_ERROR;
1124
0
1125
0
1126
0
    // if UText has the input string as one contiguous UTF-16 chunk
1127
0
    if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&
1128
0
         inText->chunkNativeStart <= rangeStart &&
1129
0
         inText->chunkNativeLimit >= rangeEnd   &&
1130
0
         inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) {
1131
0
1132
0
        // Input UText is in one contiguous UTF-16 chunk.
1133
0
        // Use Read-only aliasing UnicodeString.
1134
0
        inString.setTo(FALSE,
1135
0
                       inText->chunkContents + rangeStart - inText->chunkNativeStart,
1136
0
                       rangeEnd - rangeStart);
1137
0
    } else {
1138
0
        // Copy the text from the original inText (UText) to inString (UnicodeString).
1139
0
        // Create a map from UnicodeString indices -> UText offsets.
1140
0
        utext_setNativeIndex(inText, rangeStart);
1141
0
        int32_t limit = rangeEnd;
1142
0
        U_ASSERT(limit <= utext_nativeLength(inText));
1143
0
        if (limit > utext_nativeLength(inText)) {
1144
0
            limit = (int32_t)utext_nativeLength(inText);
1145
0
        }
1146
0
        inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1147
0
        if (U_FAILURE(status)) {
1148
0
            return 0;
1149
0
        }
1150
0
        while (utext_getNativeIndex(inText) < limit) {
1151
0
            int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);
1152
0
            UChar32 c = utext_next32(inText);
1153
0
            U_ASSERT(c != U_SENTINEL);
1154
0
            inString.append(c);
1155
0
            while (inputMap->size() < inString.length()) {
1156
0
                inputMap->addElement(nativePosition, status);
1157
0
            }
1158
0
        }
1159
0
        inputMap->addElement(limit, status);
1160
0
    }
1161
0
1162
0
1163
0
    if (!nfkcNorm2->isNormalized(inString, status)) {
1164
0
        UnicodeString normalizedInput;
1165
0
        //  normalizedMap[normalizedInput position] ==  original UText position.
1166
0
        LocalPointer<UVector32> normalizedMap(new UVector32(status), status);
1167
0
        if (U_FAILURE(status)) {
1168
0
            return 0;
1169
0
        }
1170
0
        
1171
0
        UnicodeString fragment;
1172
0
        UnicodeString normalizedFragment;
1173
0
        for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk
1174
0
            fragment.remove();
1175
0
            int32_t fragmentStartI = srcI;
1176
0
            UChar32 c = inString.char32At(srcI);
1177
0
            for (;;) {
1178
0
                fragment.append(c);
1179
0
                srcI = inString.moveIndex32(srcI, 1);
1180
0
                if (srcI == inString.length()) {
1181
0
                    break;
1182
0
                }
1183
0
                c = inString.char32At(srcI);
1184
0
                if (nfkcNorm2->hasBoundaryBefore(c)) {
1185
0
                    break;
1186
0
                }
1187
0
            }
1188
0
            nfkcNorm2->normalize(fragment, normalizedFragment, status);
1189
0
            normalizedInput.append(normalizedFragment);
1190
0
1191
0
            // Map every position in the normalized chunk to the start of the chunk
1192
0
            //   in the original input.
1193
0
            int32_t fragmentOriginalStart = inputMap.isValid() ?
1194
0
                    inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;
1195
0
            while (normalizedMap->size() < normalizedInput.length()) {
1196
0
                normalizedMap->addElement(fragmentOriginalStart, status);
1197
0
                if (U_FAILURE(status)) {
1198
0
                    break;
1199
0
                }
1200
0
            }
1201
0
        }
1202
0
        U_ASSERT(normalizedMap->size() == normalizedInput.length());
1203
0
        int32_t nativeEnd = inputMap.isValid() ?
1204
0
                inputMap->elementAti(inString.length()) : inString.length()+rangeStart;
1205
0
        normalizedMap->addElement(nativeEnd, status);
1206
0
1207
0
        inputMap.moveFrom(normalizedMap);
1208
0
        inString.moveFrom(normalizedInput);
1209
0
    }
1210
0
1211
0
    int32_t numCodePts = inString.countChar32();
1212
0
    if (numCodePts != inString.length()) {
1213
0
        // There are supplementary characters in the input.
1214
0
        // The dictionary will produce boundary positions in terms of code point indexes,
1215
0
        //   not in terms of code unit string indexes.
1216
0
        // Use the inputMap mechanism to take care of this in addition to indexing differences
1217
0
        //    from normalization and/or UTF-8 input.
1218
0
        UBool hadExistingMap = inputMap.isValid();
1219
0
        if (!hadExistingMap) {
1220
0
            inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);
1221
0
            if (U_FAILURE(status)) {
1222
0
                return 0;
1223
0
            }
1224
0
        }
1225
0
        int32_t cpIdx = 0;
1226
0
        for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) {
1227
0
            U_ASSERT(cuIdx >= cpIdx);
1228
0
            if (hadExistingMap) {
1229
0
                inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);
1230
0
            } else {
1231
0
                inputMap->addElement(cuIdx+rangeStart, status);
1232
0
            }
1233
0
            cpIdx++;
1234
0
            if (cuIdx == inString.length()) {
1235
0
               break;
1236
0
            }
1237
0
        }
1238
0
    }
1239
0
                
1240
0
    // bestSnlp[i] is the snlp of the best segmentation of the first i
1241
0
    // code points in the range to be matched.
1242
0
    UVector32 bestSnlp(numCodePts + 1, status);
1243
0
    bestSnlp.addElement(0, status);
1244
0
    for(int32_t i = 1; i <= numCodePts; i++) {
1245
0
        bestSnlp.addElement(kuint32max, status);
1246
0
    }
1247
0
1248
0
1249
0
    // prev[i] is the index of the last CJK code point in the previous word in 
1250
0
    // the best segmentation of the first i characters.
1251
0
    UVector32 prev(numCodePts + 1, status);
1252
0
    for(int32_t i = 0; i <= numCodePts; i++){
1253
0
        prev.addElement(-1, status);
1254
0
    }
1255
0
1256
0
    const int32_t maxWordSize = 20;
1257
0
    UVector32 values(numCodePts, status);
1258
0
    values.setSize(numCodePts);
1259
0
    UVector32 lengths(numCodePts, status);
1260
0
    lengths.setSize(numCodePts);
1261
0
1262
0
    UText fu = UTEXT_INITIALIZER;
1263
0
    utext_openUnicodeString(&fu, &inString, &status);
1264
0
1265
0
    // Dynamic programming to find the best segmentation.
1266
0
1267
0
    // In outer loop, i  is the code point index,
1268
0
    //                ix is the corresponding string (code unit) index.
1269
0
    //    They differ when the string contains supplementary characters.
1270
0
    int32_t ix = 0;
1271
0
    bool is_prev_katakana = false;
1272
0
    for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) {
1273
0
        if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) {
1274
0
            continue;
1275
0
        }
1276
0
1277
0
        int32_t count;
1278
0
        utext_setNativeIndex(&fu, ix);
1279
0
        count = fDictionary->matches(&fu, maxWordSize, numCodePts,
1280
0
                             NULL, lengths.getBuffer(), values.getBuffer(), NULL);
1281
0
                             // Note: lengths is filled with code point lengths
1282
0
                             //       The NULL parameter is the ignored code unit lengths.
1283
0
1284
0
        // if there are no single character matches found in the dictionary 
1285
0
        // starting with this character, treat character as a 1-character word 
1286
0
        // with the highest value possible, i.e. the least likely to occur.
1287
0
        // Exclude Korean characters from this treatment, as they should be left
1288
0
        // together by default.
1289
0
        if ((count == 0 || lengths.elementAti(0) != 1) &&
1290
0
                !fHangulWordSet.contains(inString.char32At(ix))) {
1291
0
            values.setElementAt(maxSnlp, count);   // 255
1292
0
            lengths.setElementAt(1, count++);
1293
0
        }
1294
0
1295
0
        for (int32_t j = 0; j < count; j++) {
1296
0
            uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);
1297
0
            int32_t ln_j_i = lengths.elementAti(j) + i;
1298
0
            if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) {
1299
0
                bestSnlp.setElementAt(newSnlp, ln_j_i);
1300
0
                prev.setElementAt(i, ln_j_i);
1301
0
            }
1302
0
        }
1303
0
1304
0
        // In Japanese,
1305
0
        // Katakana word in single character is pretty rare. So we apply
1306
0
        // the following heuristic to Katakana: any continuous run of Katakana
1307
0
        // characters is considered a candidate word with a default cost
1308
0
        // specified in the katakanaCost table according to its length.
1309
0
1310
0
        bool is_katakana = isKatakana(inString.char32At(ix));
1311
0
        int32_t katakanaRunLength = 1;
1312
0
        if (!is_prev_katakana && is_katakana) {
1313
0
            int32_t j = inString.moveIndex32(ix, 1);
1314
0
            // Find the end of the continuous run of Katakana characters
1315
0
            while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&
1316
0
                    isKatakana(inString.char32At(j))) {
1317
0
                j = inString.moveIndex32(j, 1);
1318
0
                katakanaRunLength++;
1319
0
            }
1320
0
            if (katakanaRunLength < kMaxKatakanaGroupLength) {
1321
0
                uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);
1322
0
                if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) {
1323
0
                    bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);
1324
0
                    prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;
1325
0
                }
1326
0
            }
1327
0
        }
1328
0
        is_prev_katakana = is_katakana;
1329
0
    }
1330
0
    utext_close(&fu);
1331
0
1332
0
    // Start pushing the optimal offset index into t_boundary (t for tentative).
1333
0
    // prev[numCodePts] is guaranteed to be meaningful.
1334
0
    // We'll first push in the reverse order, i.e.,
1335
0
    // t_boundary[0] = numCodePts, and afterwards do a swap.
1336
0
    UVector32 t_boundary(numCodePts+1, status);
1337
0
1338
0
    int32_t numBreaks = 0;
1339
0
    // No segmentation found, set boundary to end of range
1340
0
    if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) {
1341
0
        t_boundary.addElement(numCodePts, status);
1342
0
        numBreaks++;
1343
0
    } else {
1344
0
        for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) {
1345
0
            t_boundary.addElement(i, status);
1346
0
            numBreaks++;
1347
0
        }
1348
0
        U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);
1349
0
    }
1350
0
1351
0
    // Add a break for the start of the dictionary range if there is not one
1352
0
    // there already.
1353
0
    if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) {
1354
0
        t_boundary.addElement(0, status);
1355
0
        numBreaks++;
1356
0
    }
1357
0
1358
0
    // Now that we're done, convert positions in t_boundary[] (indices in 
1359
0
    // the normalized input string) back to indices in the original input UText
1360
0
    // while reversing t_boundary and pushing values to foundBreaks.
1361
0
    int32_t prevCPPos = -1;
1362
0
    int32_t prevUTextPos = -1;
1363
0
    for (int32_t i = numBreaks-1; i >= 0; i--) {
1364
0
        int32_t cpPos = t_boundary.elementAti(i);
1365
0
        U_ASSERT(cpPos > prevCPPos);
1366
0
        int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;
1367
0
        U_ASSERT(utextPos >= prevUTextPos);
1368
0
        if (utextPos > prevUTextPos) {
1369
0
            // Boundaries are added to foundBreaks output in ascending order.
1370
0
            U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);
1371
0
            foundBreaks.push(utextPos, status);
1372
0
        } else {
1373
0
            // Normalization expanded the input text, the dictionary found a boundary
1374
0
            // within the expansion, giving two boundaries with the same index in the
1375
0
            // original text. Ignore the second. See ticket #12918.
1376
0
            --numBreaks;
1377
0
        }
1378
0
        prevCPPos = cpPos;
1379
0
        prevUTextPos = utextPos;
1380
0
    }
1381
0
    (void)prevCPPos; // suppress compiler warnings about unused variable
1382
0
1383
0
    // inString goes out of scope
1384
0
    // inputMap goes out of scope
1385
0
    return numBreaks;
1386
0
}
1387
#endif
1388
1389
U_NAMESPACE_END
1390
1391
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */
1392