/src/icu/source/common/dictbe.cpp
Line  | Count  | Source  | 
1  |  | // © 2016 and later: Unicode, Inc. and others.  | 
2  |  | // License & terms of use: http://www.unicode.org/copyright.html  | 
3  |  | /**  | 
4  |  |  *******************************************************************************  | 
5  |  |  * Copyright (C) 2006-2016, International Business Machines Corporation  | 
6  |  |  * and others. All Rights Reserved.  | 
7  |  |  *******************************************************************************  | 
8  |  |  */  | 
9  |  |  | 
10  |  | #include <utility>  | 
11  |  |  | 
12  |  | #include "unicode/utypes.h"  | 
13  |  |  | 
14  |  | #if !UCONFIG_NO_BREAK_ITERATION  | 
15  |  |  | 
16  |  | #include "brkeng.h"  | 
17  |  | #include "dictbe.h"  | 
18  |  | #include "unicode/uniset.h"  | 
19  |  | #include "unicode/chariter.h"  | 
20  |  | #include "unicode/ubrk.h"  | 
21  |  | #include "utracimp.h"  | 
22  |  | #include "uvectr32.h"  | 
23  |  | #include "uvector.h"  | 
24  |  | #include "uassert.h"  | 
25  |  | #include "unicode/normlzr.h"  | 
26  |  | #include "cmemory.h"  | 
27  |  | #include "dictionarydata.h"  | 
28  |  |  | 
29  |  | U_NAMESPACE_BEGIN  | 
30  |  |  | 
31  |  | /*  | 
32  |  |  ******************************************************************  | 
33  |  |  */  | 
34  |  |  | 
35  | 0  | DictionaryBreakEngine::DictionaryBreakEngine() { | 
36  | 0  | }  | 
37  |  |  | 
38  | 0  | DictionaryBreakEngine::~DictionaryBreakEngine() { | 
39  | 0  | }  | 
40  |  |  | 
41  |  | UBool  | 
42  | 0  | DictionaryBreakEngine::handles(UChar32 c) const { | 
43  | 0  |     return fSet.contains(c);  | 
44  | 0  | }  | 
45  |  |  | 
46  |  | int32_t  | 
47  |  | DictionaryBreakEngine::findBreaks( UText *text,  | 
48  |  |                                  int32_t startPos,  | 
49  |  |                                  int32_t endPos,  | 
50  |  |                                  UVector32 &foundBreaks,  | 
51  | 0  |                                  UErrorCode& status) const { | 
52  | 0  |     if (U_FAILURE(status)) return 0;  | 
53  | 0  |     (void)startPos;            // TODO: remove this param?  | 
54  | 0  |     int32_t result = 0;  | 
55  |  |  | 
56  |  |     // Find the span of characters included in the set.  | 
57  |  |     //   The span to break begins at the current position in the text, and  | 
58  |  |     //   extends towards the start or end of the text, depending on 'reverse'.  | 
59  |  |  | 
60  | 0  |     int32_t start = (int32_t)utext_getNativeIndex(text);  | 
61  | 0  |     int32_t current;  | 
62  | 0  |     int32_t rangeStart;  | 
63  | 0  |     int32_t rangeEnd;  | 
64  | 0  |     UChar32 c = utext_current32(text);  | 
65  | 0  |     while((current = (int32_t)utext_getNativeIndex(text)) < endPos && fSet.contains(c)) { | 
66  | 0  |         utext_next32(text);         // TODO:  recast loop for postincrement  | 
67  | 0  |         c = utext_current32(text);  | 
68  | 0  |     }  | 
69  | 0  |     rangeStart = start;  | 
70  | 0  |     rangeEnd = current;  | 
71  | 0  |     result = divideUpDictionaryRange(text, rangeStart, rangeEnd, foundBreaks, status);  | 
72  | 0  |     utext_setNativeIndex(text, current);  | 
73  |  |       | 
74  | 0  |     return result;  | 
75  | 0  | }  | 
76  |  |  | 
77  |  | void  | 
78  | 0  | DictionaryBreakEngine::setCharacters( const UnicodeSet &set ) { | 
79  | 0  |     fSet = set;  | 
80  |  |     // Compact for caching  | 
81  | 0  |     fSet.compact();  | 
82  | 0  | }  | 
83  |  |  | 
84  |  | /*  | 
85  |  |  ******************************************************************  | 
86  |  |  * PossibleWord  | 
87  |  |  */  | 
88  |  |  | 
89  |  | // Helper class for improving readability of the Thai/Lao/Khmer word break  | 
90  |  | // algorithm. The implementation is completely inline.  | 
91  |  |  | 
92  |  | // List size, limited by the maximum number of words in the dictionary  | 
93  |  | // that form a nested sequence.  | 
94  |  | static const int32_t POSSIBLE_WORD_LIST_MAX = 20;  | 
95  |  |  | 
96  |  | class PossibleWord { | 
97  |  | private:  | 
98  |  |     // list of word candidate lengths, in increasing length order  | 
99  |  |     // TODO: bytes would be sufficient for word lengths.  | 
100  |  |     int32_t   count;      // Count of candidates  | 
101  |  |     int32_t   prefix;     // The longest match with a dictionary word  | 
102  |  |     int32_t   offset;     // Offset in the text of these candidates  | 
103  |  |     int32_t   mark;       // The preferred candidate's offset  | 
104  |  |     int32_t   current;    // The candidate we're currently looking at  | 
105  |  |     int32_t   cuLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code units.  | 
106  |  |     int32_t   cpLengths[POSSIBLE_WORD_LIST_MAX];   // Word Lengths, in code points.  | 
107  |  |  | 
108  |  | public:  | 
109  | 0  |     PossibleWord() : count(0), prefix(0), offset(-1), mark(0), current(0) {} | 
110  | 0  |     ~PossibleWord() {} | 
111  |  |     | 
112  |  |     // Fill the list of candidates if needed, select the longest, and return the number found  | 
113  |  |     int32_t   candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd );  | 
114  |  |     | 
115  |  |     // Select the currently marked candidate, point after it in the text, and invalidate self  | 
116  |  |     int32_t   acceptMarked( UText *text );  | 
117  |  |     | 
118  |  |     // Back up from the current candidate to the next shorter one; return TRUE if that exists  | 
119  |  |     // and point the text after it  | 
120  |  |     UBool     backUp( UText *text );  | 
121  |  |     | 
122  |  |     // Return the longest prefix this candidate location shares with a dictionary word  | 
123  |  |     // Return value is in code points.  | 
124  | 0  |     int32_t   longestPrefix() { return prefix; } | 
125  |  |     | 
126  |  |     // Mark the current candidate as the one we like  | 
127  | 0  |     void      markCurrent() { mark = current; } | 
128  |  |       | 
129  |  |     // Get length in code points of the marked word.  | 
130  | 0  |     int32_t   markedCPLength() { return cpLengths[mark]; } | 
131  |  | };  | 
132  |  |  | 
133  |  |  | 
134  | 0  | int32_t PossibleWord::candidates( UText *text, DictionaryMatcher *dict, int32_t rangeEnd ) { | 
135  |  |     // TODO: If getIndex is too slow, use offset < 0 and add discardAll()  | 
136  | 0  |     int32_t start = (int32_t)utext_getNativeIndex(text);  | 
137  | 0  |     if (start != offset) { | 
138  | 0  |         offset = start;  | 
139  | 0  |         count = dict->matches(text, rangeEnd-start, UPRV_LENGTHOF(cuLengths), cuLengths, cpLengths, NULL, &prefix);  | 
140  |  |         // Dictionary leaves text after longest prefix, not longest word. Back up.  | 
141  | 0  |         if (count <= 0) { | 
142  | 0  |             utext_setNativeIndex(text, start);  | 
143  | 0  |         }  | 
144  | 0  |     }  | 
145  | 0  |     if (count > 0) { | 
146  | 0  |         utext_setNativeIndex(text, start+cuLengths[count-1]);  | 
147  | 0  |     }  | 
148  | 0  |     current = count-1;  | 
149  | 0  |     mark = current;  | 
150  | 0  |     return count;  | 
151  | 0  | }  | 
152  |  |  | 
153  |  | int32_t  | 
154  | 0  | PossibleWord::acceptMarked( UText *text ) { | 
155  | 0  |     utext_setNativeIndex(text, offset + cuLengths[mark]);  | 
156  | 0  |     return cuLengths[mark];  | 
157  | 0  | }  | 
158  |  |  | 
159  |  |  | 
160  |  | UBool  | 
161  | 0  | PossibleWord::backUp( UText *text ) { | 
162  | 0  |     if (current > 0) { | 
163  | 0  |         utext_setNativeIndex(text, offset + cuLengths[--current]);  | 
164  | 0  |         return TRUE;  | 
165  | 0  |     }  | 
166  | 0  |     return FALSE;  | 
167  | 0  | }  | 
168  |  |  | 
169  |  | /*  | 
170  |  |  ******************************************************************  | 
171  |  |  * ThaiBreakEngine  | 
172  |  |  */  | 
173  |  |  | 
174  |  | // How many words in a row are "good enough"?  | 
175  |  | static const int32_t THAI_LOOKAHEAD = 3;  | 
176  |  |  | 
177  |  | // Will not combine a non-word with a preceding dictionary word longer than this  | 
178  |  | static const int32_t THAI_ROOT_COMBINE_THRESHOLD = 3;  | 
179  |  |  | 
180  |  | // Will not combine a non-word that shares at least this much prefix with a  | 
181  |  | // dictionary word, with a preceding word  | 
182  |  | static const int32_t THAI_PREFIX_COMBINE_THRESHOLD = 3;  | 
183  |  |  | 
184  |  | // Elision character  | 
185  |  | static const int32_t THAI_PAIYANNOI = 0x0E2F;  | 
186  |  |  | 
187  |  | // Repeat character  | 
188  |  | static const int32_t THAI_MAIYAMOK = 0x0E46;  | 
189  |  |  | 
190  |  | // Minimum word size  | 
191  |  | static const int32_t THAI_MIN_WORD = 2;  | 
192  |  |  | 
193  |  | // Minimum number of characters for two words  | 
194  |  | static const int32_t THAI_MIN_WORD_SPAN = THAI_MIN_WORD * 2;  | 
195  |  |  | 
196  |  | ThaiBreakEngine::ThaiBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)  | 
197  | 0  |     : DictionaryBreakEngine(),  | 
198  | 0  |       fDictionary(adoptDictionary)  | 
199  | 0  | { | 
200  | 0  |     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);  | 
201  | 0  |     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Thai");  | 
202  | 0  |     fThaiWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]]"), status); | 
203  | 0  |     if (U_SUCCESS(status)) { | 
204  | 0  |         setCharacters(fThaiWordSet);  | 
205  | 0  |     }  | 
206  | 0  |     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Thai:]&[:LineBreak=SA:]&[:M:]]"), status); | 
207  | 0  |     fMarkSet.add(0x0020);  | 
208  | 0  |     fEndWordSet = fThaiWordSet;  | 
209  | 0  |     fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT  | 
210  | 0  |     fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI  | 
211  | 0  |     fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK  | 
212  | 0  |     fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI  | 
213  | 0  |     fSuffixSet.add(THAI_PAIYANNOI);  | 
214  | 0  |     fSuffixSet.add(THAI_MAIYAMOK);  | 
215  |  |  | 
216  |  |     // Compact for caching.  | 
217  | 0  |     fMarkSet.compact();  | 
218  | 0  |     fEndWordSet.compact();  | 
219  | 0  |     fBeginWordSet.compact();  | 
220  | 0  |     fSuffixSet.compact();  | 
221  | 0  |     UTRACE_EXIT_STATUS(status);  | 
222  | 0  | }  | 
223  |  |  | 
224  | 0  | ThaiBreakEngine::~ThaiBreakEngine() { | 
225  | 0  |     delete fDictionary;  | 
226  | 0  | }  | 
227  |  |  | 
228  |  | int32_t  | 
229  |  | ThaiBreakEngine::divideUpDictionaryRange( UText *text,  | 
230  |  |                                                 int32_t rangeStart,  | 
231  |  |                                                 int32_t rangeEnd,  | 
232  |  |                                                 UVector32 &foundBreaks,  | 
233  | 0  |                                                 UErrorCode& status) const { | 
234  | 0  |     if (U_FAILURE(status)) return 0;  | 
235  | 0  |     utext_setNativeIndex(text, rangeStart);  | 
236  | 0  |     utext_moveIndex32(text, THAI_MIN_WORD_SPAN);  | 
237  | 0  |     if (utext_getNativeIndex(text) >= rangeEnd) { | 
238  | 0  |         return 0;       // Not enough characters for two words  | 
239  | 0  |     }  | 
240  | 0  |     utext_setNativeIndex(text, rangeStart);  | 
241  |  |  | 
242  |  |  | 
243  | 0  |     uint32_t wordsFound = 0;  | 
244  | 0  |     int32_t cpWordLength = 0;    // Word Length in Code Points.  | 
245  | 0  |     int32_t cuWordLength = 0;    // Word length in code units (UText native indexing)  | 
246  | 0  |     int32_t current;  | 
247  | 0  |     PossibleWord words[THAI_LOOKAHEAD];  | 
248  |  |       | 
249  | 0  |     utext_setNativeIndex(text, rangeStart);  | 
250  |  |       | 
251  | 0  |     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | 
252  | 0  |         cpWordLength = 0;  | 
253  | 0  |         cuWordLength = 0;  | 
254  |  |  | 
255  |  |         // Look for candidate words at the current position  | 
256  | 0  |         int32_t candidates = words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
257  |  |           | 
258  |  |         // If we found exactly one, use that  | 
259  | 0  |         if (candidates == 1) { | 
260  | 0  |             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);  | 
261  | 0  |             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();  | 
262  | 0  |             wordsFound += 1;  | 
263  | 0  |         }  | 
264  |  |         // If there was more than one, see which one can take us forward the most words  | 
265  | 0  |         else if (candidates > 1) { | 
266  |  |             // If we're already at the end of the range, we're done  | 
267  | 0  |             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | 
268  | 0  |                 goto foundBest;  | 
269  | 0  |             }  | 
270  | 0  |             do { | 
271  | 0  |                 if (words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { | 
272  |  |                     // Followed by another dictionary word; mark first word as a good candidate  | 
273  | 0  |                     words[wordsFound%THAI_LOOKAHEAD].markCurrent();  | 
274  |  |                       | 
275  |  |                     // If we're already at the end of the range, we're done  | 
276  | 0  |                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | 
277  | 0  |                         goto foundBest;  | 
278  | 0  |                     }  | 
279  |  |                       | 
280  |  |                     // See if any of the possible second words is followed by a third word  | 
281  | 0  |                     do { | 
282  |  |                         // If we find a third word, stop right away  | 
283  | 0  |                         if (words[(wordsFound + 2) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { | 
284  | 0  |                             words[wordsFound % THAI_LOOKAHEAD].markCurrent();  | 
285  | 0  |                             goto foundBest;  | 
286  | 0  |                         }  | 
287  | 0  |                     }  | 
288  | 0  |                     while (words[(wordsFound + 1) % THAI_LOOKAHEAD].backUp(text));  | 
289  | 0  |                 }  | 
290  | 0  |             }  | 
291  | 0  |             while (words[wordsFound % THAI_LOOKAHEAD].backUp(text));  | 
292  | 0  | foundBest:  | 
293  |  |             // Set UText position to after the accepted word.  | 
294  | 0  |             cuWordLength = words[wordsFound % THAI_LOOKAHEAD].acceptMarked(text);  | 
295  | 0  |             cpWordLength = words[wordsFound % THAI_LOOKAHEAD].markedCPLength();  | 
296  | 0  |             wordsFound += 1;  | 
297  | 0  |         }  | 
298  |  |           | 
299  |  |         // We come here after having either found a word or not. We look ahead to the  | 
300  |  |         // next word. If it's not a dictionary word, we will combine it with the word we  | 
301  |  |         // just found (if there is one), but only if the preceding word does not exceed  | 
302  |  |         // the threshold.  | 
303  |  |         // The text iterator should now be positioned at the end of the word we found.  | 
304  |  |           | 
305  | 0  |         UChar32 uc = 0;  | 
306  | 0  |         if ((int32_t)utext_getNativeIndex(text) < rangeEnd &&  cpWordLength < THAI_ROOT_COMBINE_THRESHOLD) { | 
307  |  |             // if it is a dictionary word, do nothing. If it isn't, then if there is  | 
308  |  |             // no preceding word, or the non-word shares less than the minimum threshold  | 
309  |  |             // of characters with a dictionary word, then scan to resynchronize  | 
310  | 0  |             if (words[wordsFound % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0  | 
311  | 0  |                   && (cuWordLength == 0  | 
312  | 0  |                       || words[wordsFound%THAI_LOOKAHEAD].longestPrefix() < THAI_PREFIX_COMBINE_THRESHOLD)) { | 
313  |  |                 // Look for a plausible word boundary  | 
314  | 0  |                 int32_t remaining = rangeEnd - (current+cuWordLength);  | 
315  | 0  |                 UChar32 pc;  | 
316  | 0  |                 int32_t chars = 0;  | 
317  | 0  |                 for (;;) { | 
318  | 0  |                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);  | 
319  | 0  |                     pc = utext_next32(text);  | 
320  | 0  |                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;  | 
321  | 0  |                     chars += pcSize;  | 
322  | 0  |                     remaining -= pcSize;  | 
323  | 0  |                     if (remaining <= 0) { | 
324  | 0  |                         break;  | 
325  | 0  |                     }  | 
326  | 0  |                     uc = utext_current32(text);  | 
327  | 0  |                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | 
328  |  |                         // Maybe. See if it's in the dictionary.  | 
329  |  |                         // NOTE: In the original Apple code, checked that the next  | 
330  |  |                         // two characters after uc were not 0x0E4C THANTHAKHAT before  | 
331  |  |                         // checking the dictionary. That is just a performance filter,  | 
332  |  |                         // but it's not clear it's faster than checking the trie.  | 
333  | 0  |                         int32_t num_candidates = words[(wordsFound + 1) % THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
334  | 0  |                         utext_setNativeIndex(text, current + cuWordLength + chars);  | 
335  | 0  |                         if (num_candidates > 0) { | 
336  | 0  |                             break;  | 
337  | 0  |                         }  | 
338  | 0  |                     }  | 
339  | 0  |                 }  | 
340  |  |                   | 
341  |  |                 // Bump the word count if there wasn't already one  | 
342  | 0  |                 if (cuWordLength <= 0) { | 
343  | 0  |                     wordsFound += 1;  | 
344  | 0  |                 }  | 
345  |  |                   | 
346  |  |                 // Update the length with the passed-over characters  | 
347  | 0  |                 cuWordLength += chars;  | 
348  | 0  |             }  | 
349  | 0  |             else { | 
350  |  |                 // Back up to where we were for next iteration  | 
351  | 0  |                 utext_setNativeIndex(text, current+cuWordLength);  | 
352  | 0  |             }  | 
353  | 0  |         }  | 
354  |  |           | 
355  |  |         // Never stop before a combining mark.  | 
356  | 0  |         int32_t currPos;  | 
357  | 0  |         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | 
358  | 0  |             utext_next32(text);  | 
359  | 0  |             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;  | 
360  | 0  |         }  | 
361  |  |           | 
362  |  |         // Look ahead for possible suffixes if a dictionary word does not follow.  | 
363  |  |         // We do this in code rather than using a rule so that the heuristic  | 
364  |  |         // resynch continues to function. For example, one of the suffix characters  | 
365  |  |         // could be a typo in the middle of a word.  | 
366  | 0  |         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cuWordLength > 0) { | 
367  | 0  |             if (words[wordsFound%THAI_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0  | 
368  | 0  |                 && fSuffixSet.contains(uc = utext_current32(text))) { | 
369  | 0  |                 if (uc == THAI_PAIYANNOI) { | 
370  | 0  |                     if (!fSuffixSet.contains(utext_previous32(text))) { | 
371  |  |                         // Skip over previous end and PAIYANNOI  | 
372  | 0  |                         utext_next32(text);  | 
373  | 0  |                         int32_t paiyannoiIndex = (int32_t)utext_getNativeIndex(text);  | 
374  | 0  |                         utext_next32(text);  | 
375  | 0  |                         cuWordLength += (int32_t)utext_getNativeIndex(text) - paiyannoiIndex;    // Add PAIYANNOI to word  | 
376  | 0  |                         uc = utext_current32(text);     // Fetch next character  | 
377  | 0  |                     }  | 
378  | 0  |                     else { | 
379  |  |                         // Restore prior position  | 
380  | 0  |                         utext_next32(text);  | 
381  | 0  |                     }  | 
382  | 0  |                 }  | 
383  | 0  |                 if (uc == THAI_MAIYAMOK) { | 
384  | 0  |                     if (utext_previous32(text) != THAI_MAIYAMOK) { | 
385  |  |                         // Skip over previous end and MAIYAMOK  | 
386  | 0  |                         utext_next32(text);  | 
387  | 0  |                         int32_t maiyamokIndex = (int32_t)utext_getNativeIndex(text);  | 
388  | 0  |                         utext_next32(text);  | 
389  | 0  |                         cuWordLength += (int32_t)utext_getNativeIndex(text) - maiyamokIndex;    // Add MAIYAMOK to word  | 
390  | 0  |                     }  | 
391  | 0  |                     else { | 
392  |  |                         // Restore prior position  | 
393  | 0  |                         utext_next32(text);  | 
394  | 0  |                     }  | 
395  | 0  |                 }  | 
396  | 0  |             }  | 
397  | 0  |             else { | 
398  | 0  |                 utext_setNativeIndex(text, current+cuWordLength);  | 
399  | 0  |             }  | 
400  | 0  |         }  | 
401  |  |  | 
402  |  |         // Did we find a word on this iteration? If so, push it on the break stack  | 
403  | 0  |         if (cuWordLength > 0) { | 
404  | 0  |             foundBreaks.push((current+cuWordLength), status);  | 
405  | 0  |         }  | 
406  | 0  |     }  | 
407  |  |  | 
408  |  |     // Don't return a break for the end of the dictionary range if there is one there.  | 
409  | 0  |     if (foundBreaks.peeki() >= rangeEnd) { | 
410  | 0  |         (void) foundBreaks.popi();  | 
411  | 0  |         wordsFound -= 1;  | 
412  | 0  |     }  | 
413  |  |  | 
414  | 0  |     return wordsFound;  | 
415  | 0  | }  | 
416  |  |  | 
417  |  | /*  | 
418  |  |  ******************************************************************  | 
419  |  |  * LaoBreakEngine  | 
420  |  |  */  | 
421  |  |  | 
422  |  | // How many words in a row are "good enough"?  | 
423  |  | static const int32_t LAO_LOOKAHEAD = 3;  | 
424  |  |  | 
425  |  | // Will not combine a non-word with a preceding dictionary word longer than this  | 
426  |  | static const int32_t LAO_ROOT_COMBINE_THRESHOLD = 3;  | 
427  |  |  | 
428  |  | // Will not combine a non-word that shares at least this much prefix with a  | 
429  |  | // dictionary word, with a preceding word  | 
430  |  | static const int32_t LAO_PREFIX_COMBINE_THRESHOLD = 3;  | 
431  |  |  | 
432  |  | // Minimum word size  | 
433  |  | static const int32_t LAO_MIN_WORD = 2;  | 
434  |  |  | 
435  |  | // Minimum number of characters for two words  | 
436  |  | static const int32_t LAO_MIN_WORD_SPAN = LAO_MIN_WORD * 2;  | 
437  |  |  | 
438  |  | LaoBreakEngine::LaoBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)  | 
439  | 0  |     : DictionaryBreakEngine(),  | 
440  | 0  |       fDictionary(adoptDictionary)  | 
441  | 0  | { | 
442  | 0  |     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);  | 
443  | 0  |     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Laoo");  | 
444  | 0  |     fLaoWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]]"), status); | 
445  | 0  |     if (U_SUCCESS(status)) { | 
446  | 0  |         setCharacters(fLaoWordSet);  | 
447  | 0  |     }  | 
448  | 0  |     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Laoo:]&[:LineBreak=SA:]&[:M:]]"), status); | 
449  | 0  |     fMarkSet.add(0x0020);  | 
450  | 0  |     fEndWordSet = fLaoWordSet;  | 
451  | 0  |     fEndWordSet.remove(0x0EC0, 0x0EC4);     // prefix vowels  | 
452  | 0  |     fBeginWordSet.add(0x0E81, 0x0EAE);      // basic consonants (including holes for corresponding Thai characters)  | 
453  | 0  |     fBeginWordSet.add(0x0EDC, 0x0EDD);      // digraph consonants (no Thai equivalent)  | 
454  | 0  |     fBeginWordSet.add(0x0EC0, 0x0EC4);      // prefix vowels  | 
455  |  |  | 
456  |  |     // Compact for caching.  | 
457  | 0  |     fMarkSet.compact();  | 
458  | 0  |     fEndWordSet.compact();  | 
459  | 0  |     fBeginWordSet.compact();  | 
460  | 0  |     UTRACE_EXIT_STATUS(status);  | 
461  | 0  | }  | 
462  |  |  | 
463  | 0  | LaoBreakEngine::~LaoBreakEngine() { | 
464  | 0  |     delete fDictionary;  | 
465  | 0  | }  | 
466  |  |  | 
467  |  | int32_t  | 
468  |  | LaoBreakEngine::divideUpDictionaryRange( UText *text,  | 
469  |  |                                                 int32_t rangeStart,  | 
470  |  |                                                 int32_t rangeEnd,  | 
471  |  |                                                 UVector32 &foundBreaks,  | 
472  | 0  |                                                 UErrorCode& status) const { | 
473  | 0  |     if (U_FAILURE(status)) return 0;  | 
474  | 0  |     if ((rangeEnd - rangeStart) < LAO_MIN_WORD_SPAN) { | 
475  | 0  |         return 0;       // Not enough characters for two words  | 
476  | 0  |     }  | 
477  |  |  | 
478  | 0  |     uint32_t wordsFound = 0;  | 
479  | 0  |     int32_t cpWordLength = 0;  | 
480  | 0  |     int32_t cuWordLength = 0;  | 
481  | 0  |     int32_t current;  | 
482  | 0  |     PossibleWord words[LAO_LOOKAHEAD];  | 
483  |  |  | 
484  | 0  |     utext_setNativeIndex(text, rangeStart);  | 
485  |  |  | 
486  | 0  |     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | 
487  | 0  |         cuWordLength = 0;  | 
488  | 0  |         cpWordLength = 0;  | 
489  |  |  | 
490  |  |         // Look for candidate words at the current position  | 
491  | 0  |         int32_t candidates = words[wordsFound%LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
492  |  |           | 
493  |  |         // If we found exactly one, use that  | 
494  | 0  |         if (candidates == 1) { | 
495  | 0  |             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);  | 
496  | 0  |             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();  | 
497  | 0  |             wordsFound += 1;  | 
498  | 0  |         }  | 
499  |  |         // If there was more than one, see which one can take us forward the most words  | 
500  | 0  |         else if (candidates > 1) { | 
501  |  |             // If we're already at the end of the range, we're done  | 
502  | 0  |             if (utext_getNativeIndex(text) >= rangeEnd) { | 
503  | 0  |                 goto foundBest;  | 
504  | 0  |             }  | 
505  | 0  |             do { | 
506  | 0  |                 if (words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { | 
507  |  |                     // Followed by another dictionary word; mark first word as a good candidate  | 
508  | 0  |                     words[wordsFound%LAO_LOOKAHEAD].markCurrent();  | 
509  |  |                       | 
510  |  |                     // If we're already at the end of the range, we're done  | 
511  | 0  |                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | 
512  | 0  |                         goto foundBest;  | 
513  | 0  |                     }  | 
514  |  |                       | 
515  |  |                     // See if any of the possible second words is followed by a third word  | 
516  | 0  |                     do { | 
517  |  |                         // If we find a third word, stop right away  | 
518  | 0  |                         if (words[(wordsFound + 2) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { | 
519  | 0  |                             words[wordsFound % LAO_LOOKAHEAD].markCurrent();  | 
520  | 0  |                             goto foundBest;  | 
521  | 0  |                         }  | 
522  | 0  |                     }  | 
523  | 0  |                     while (words[(wordsFound + 1) % LAO_LOOKAHEAD].backUp(text));  | 
524  | 0  |                 }  | 
525  | 0  |             }  | 
526  | 0  |             while (words[wordsFound % LAO_LOOKAHEAD].backUp(text));  | 
527  | 0  | foundBest:  | 
528  | 0  |             cuWordLength = words[wordsFound % LAO_LOOKAHEAD].acceptMarked(text);  | 
529  | 0  |             cpWordLength = words[wordsFound % LAO_LOOKAHEAD].markedCPLength();  | 
530  | 0  |             wordsFound += 1;  | 
531  | 0  |         }  | 
532  |  |           | 
533  |  |         // We come here after having either found a word or not. We look ahead to the  | 
534  |  |         // next word. If it's not a dictionary word, we will combine it with the word we  | 
535  |  |         // just found (if there is one), but only if the preceding word does not exceed  | 
536  |  |         // the threshold.  | 
537  |  |         // The text iterator should now be positioned at the end of the word we found.  | 
538  | 0  |         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < LAO_ROOT_COMBINE_THRESHOLD) { | 
539  |  |             // if it is a dictionary word, do nothing. If it isn't, then if there is  | 
540  |  |             // no preceding word, or the non-word shares less than the minimum threshold  | 
541  |  |             // of characters with a dictionary word, then scan to resynchronize  | 
542  | 0  |             if (words[wordsFound % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0  | 
543  | 0  |                   && (cuWordLength == 0  | 
544  | 0  |                       || words[wordsFound%LAO_LOOKAHEAD].longestPrefix() < LAO_PREFIX_COMBINE_THRESHOLD)) { | 
545  |  |                 // Look for a plausible word boundary  | 
546  | 0  |                 int32_t remaining = rangeEnd - (current + cuWordLength);  | 
547  | 0  |                 UChar32 pc;  | 
548  | 0  |                 UChar32 uc;  | 
549  | 0  |                 int32_t chars = 0;  | 
550  | 0  |                 for (;;) { | 
551  | 0  |                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);  | 
552  | 0  |                     pc = utext_next32(text);  | 
553  | 0  |                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;  | 
554  | 0  |                     chars += pcSize;  | 
555  | 0  |                     remaining -= pcSize;  | 
556  | 0  |                     if (remaining <= 0) { | 
557  | 0  |                         break;  | 
558  | 0  |                     }  | 
559  | 0  |                     uc = utext_current32(text);  | 
560  | 0  |                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | 
561  |  |                         // Maybe. See if it's in the dictionary.  | 
562  |  |                         // TODO: this looks iffy; compare with old code.  | 
563  | 0  |                         int32_t num_candidates = words[(wordsFound + 1) % LAO_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
564  | 0  |                         utext_setNativeIndex(text, current + cuWordLength + chars);  | 
565  | 0  |                         if (num_candidates > 0) { | 
566  | 0  |                             break;  | 
567  | 0  |                         }  | 
568  | 0  |                     }  | 
569  | 0  |                 }  | 
570  |  |                   | 
571  |  |                 // Bump the word count if there wasn't already one  | 
572  | 0  |                 if (cuWordLength <= 0) { | 
573  | 0  |                     wordsFound += 1;  | 
574  | 0  |                 }  | 
575  |  |                   | 
576  |  |                 // Update the length with the passed-over characters  | 
577  | 0  |                 cuWordLength += chars;  | 
578  | 0  |             }  | 
579  | 0  |             else { | 
580  |  |                 // Back up to where we were for next iteration  | 
581  | 0  |                 utext_setNativeIndex(text, current + cuWordLength);  | 
582  | 0  |             }  | 
583  | 0  |         }  | 
584  |  |           | 
585  |  |         // Never stop before a combining mark.  | 
586  | 0  |         int32_t currPos;  | 
587  | 0  |         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | 
588  | 0  |             utext_next32(text);  | 
589  | 0  |             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;  | 
590  | 0  |         }  | 
591  |  |           | 
592  |  |         // Look ahead for possible suffixes if a dictionary word does not follow.  | 
593  |  |         // We do this in code rather than using a rule so that the heuristic  | 
594  |  |         // resynch continues to function. For example, one of the suffix characters  | 
595  |  |         // could be a typo in the middle of a word.  | 
596  |  |         // NOT CURRENTLY APPLICABLE TO LAO  | 
597  |  |  | 
598  |  |         // Did we find a word on this iteration? If so, push it on the break stack  | 
599  | 0  |         if (cuWordLength > 0) { | 
600  | 0  |             foundBreaks.push((current+cuWordLength), status);  | 
601  | 0  |         }  | 
602  | 0  |     }  | 
603  |  |  | 
604  |  |     // Don't return a break for the end of the dictionary range if there is one there.  | 
605  | 0  |     if (foundBreaks.peeki() >= rangeEnd) { | 
606  | 0  |         (void) foundBreaks.popi();  | 
607  | 0  |         wordsFound -= 1;  | 
608  | 0  |     }  | 
609  |  |  | 
610  | 0  |     return wordsFound;  | 
611  | 0  | }  | 
612  |  |  | 
613  |  | /*  | 
614  |  |  ******************************************************************  | 
615  |  |  * BurmeseBreakEngine  | 
616  |  |  */  | 
617  |  |  | 
618  |  | // How many words in a row are "good enough"?  | 
619  |  | static const int32_t BURMESE_LOOKAHEAD = 3;  | 
620  |  |  | 
621  |  | // Will not combine a non-word with a preceding dictionary word longer than this  | 
622  |  | static const int32_t BURMESE_ROOT_COMBINE_THRESHOLD = 3;  | 
623  |  |  | 
624  |  | // Will not combine a non-word that shares at least this much prefix with a  | 
625  |  | // dictionary word, with a preceding word  | 
626  |  | static const int32_t BURMESE_PREFIX_COMBINE_THRESHOLD = 3;  | 
627  |  |  | 
628  |  | // Minimum word size  | 
629  |  | static const int32_t BURMESE_MIN_WORD = 2;  | 
630  |  |  | 
631  |  | // Minimum number of characters for two words  | 
632  |  | static const int32_t BURMESE_MIN_WORD_SPAN = BURMESE_MIN_WORD * 2;  | 
633  |  |  | 
634  |  | BurmeseBreakEngine::BurmeseBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)  | 
635  | 0  |     : DictionaryBreakEngine(),  | 
636  | 0  |       fDictionary(adoptDictionary)  | 
637  | 0  | { | 
638  | 0  |     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);  | 
639  | 0  |     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Mymr");  | 
640  | 0  |     fBurmeseWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]]"), status); | 
641  | 0  |     if (U_SUCCESS(status)) { | 
642  | 0  |         setCharacters(fBurmeseWordSet);  | 
643  | 0  |     }  | 
644  | 0  |     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Mymr:]&[:LineBreak=SA:]&[:M:]]"), status); | 
645  | 0  |     fMarkSet.add(0x0020);  | 
646  | 0  |     fEndWordSet = fBurmeseWordSet;  | 
647  | 0  |     fBeginWordSet.add(0x1000, 0x102A);      // basic consonants and independent vowels  | 
648  |  |  | 
649  |  |     // Compact for caching.  | 
650  | 0  |     fMarkSet.compact();  | 
651  | 0  |     fEndWordSet.compact();  | 
652  | 0  |     fBeginWordSet.compact();  | 
653  | 0  |     UTRACE_EXIT_STATUS(status);  | 
654  | 0  | }  | 
655  |  |  | 
656  | 0  | BurmeseBreakEngine::~BurmeseBreakEngine() { | 
657  | 0  |     delete fDictionary;  | 
658  | 0  | }  | 
659  |  |  | 
660  |  | int32_t  | 
661  |  | BurmeseBreakEngine::divideUpDictionaryRange( UText *text,  | 
662  |  |                                                 int32_t rangeStart,  | 
663  |  |                                                 int32_t rangeEnd,  | 
664  |  |                                                 UVector32 &foundBreaks,  | 
665  | 0  |                                                 UErrorCode& status ) const { | 
666  | 0  |     if (U_FAILURE(status)) return 0;  | 
667  | 0  |     if ((rangeEnd - rangeStart) < BURMESE_MIN_WORD_SPAN) { | 
668  | 0  |         return 0;       // Not enough characters for two words  | 
669  | 0  |     }  | 
670  |  |  | 
671  | 0  |     uint32_t wordsFound = 0;  | 
672  | 0  |     int32_t cpWordLength = 0;  | 
673  | 0  |     int32_t cuWordLength = 0;  | 
674  | 0  |     int32_t current;  | 
675  | 0  |     PossibleWord words[BURMESE_LOOKAHEAD];  | 
676  |  |  | 
677  | 0  |     utext_setNativeIndex(text, rangeStart);  | 
678  |  |  | 
679  | 0  |     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | 
680  | 0  |         cuWordLength = 0;  | 
681  | 0  |         cpWordLength = 0;  | 
682  |  |  | 
683  |  |         // Look for candidate words at the current position  | 
684  | 0  |         int32_t candidates = words[wordsFound%BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
685  |  |           | 
686  |  |         // If we found exactly one, use that  | 
687  | 0  |         if (candidates == 1) { | 
688  | 0  |             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);  | 
689  | 0  |             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();  | 
690  | 0  |             wordsFound += 1;  | 
691  | 0  |         }  | 
692  |  |         // If there was more than one, see which one can take us forward the most words  | 
693  | 0  |         else if (candidates > 1) { | 
694  |  |             // If we're already at the end of the range, we're done  | 
695  | 0  |             if (utext_getNativeIndex(text) >= rangeEnd) { | 
696  | 0  |                 goto foundBest;  | 
697  | 0  |             }  | 
698  | 0  |             do { | 
699  | 0  |                 if (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { | 
700  |  |                     // Followed by another dictionary word; mark first word as a good candidate  | 
701  | 0  |                     words[wordsFound%BURMESE_LOOKAHEAD].markCurrent();  | 
702  |  |                       | 
703  |  |                     // If we're already at the end of the range, we're done  | 
704  | 0  |                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | 
705  | 0  |                         goto foundBest;  | 
706  | 0  |                     }  | 
707  |  |                       | 
708  |  |                     // See if any of the possible second words is followed by a third word  | 
709  | 0  |                     do { | 
710  |  |                         // If we find a third word, stop right away  | 
711  | 0  |                         if (words[(wordsFound + 2) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { | 
712  | 0  |                             words[wordsFound % BURMESE_LOOKAHEAD].markCurrent();  | 
713  | 0  |                             goto foundBest;  | 
714  | 0  |                         }  | 
715  | 0  |                     }  | 
716  | 0  |                     while (words[(wordsFound + 1) % BURMESE_LOOKAHEAD].backUp(text));  | 
717  | 0  |                 }  | 
718  | 0  |             }  | 
719  | 0  |             while (words[wordsFound % BURMESE_LOOKAHEAD].backUp(text));  | 
720  | 0  | foundBest:  | 
721  | 0  |             cuWordLength = words[wordsFound % BURMESE_LOOKAHEAD].acceptMarked(text);  | 
722  | 0  |             cpWordLength = words[wordsFound % BURMESE_LOOKAHEAD].markedCPLength();  | 
723  | 0  |             wordsFound += 1;  | 
724  | 0  |         }  | 
725  |  |           | 
726  |  |         // We come here after having either found a word or not. We look ahead to the  | 
727  |  |         // next word. If it's not a dictionary word, we will combine it with the word we  | 
728  |  |         // just found (if there is one), but only if the preceding word does not exceed  | 
729  |  |         // the threshold.  | 
730  |  |         // The text iterator should now be positioned at the end of the word we found.  | 
731  | 0  |         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < BURMESE_ROOT_COMBINE_THRESHOLD) { | 
732  |  |             // if it is a dictionary word, do nothing. If it isn't, then if there is  | 
733  |  |             // no preceding word, or the non-word shares less than the minimum threshold  | 
734  |  |             // of characters with a dictionary word, then scan to resynchronize  | 
735  | 0  |             if (words[wordsFound % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0  | 
736  | 0  |                   && (cuWordLength == 0  | 
737  | 0  |                       || words[wordsFound%BURMESE_LOOKAHEAD].longestPrefix() < BURMESE_PREFIX_COMBINE_THRESHOLD)) { | 
738  |  |                 // Look for a plausible word boundary  | 
739  | 0  |                 int32_t remaining = rangeEnd - (current + cuWordLength);  | 
740  | 0  |                 UChar32 pc;  | 
741  | 0  |                 UChar32 uc;  | 
742  | 0  |                 int32_t chars = 0;  | 
743  | 0  |                 for (;;) { | 
744  | 0  |                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);  | 
745  | 0  |                     pc = utext_next32(text);  | 
746  | 0  |                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;  | 
747  | 0  |                     chars += pcSize;  | 
748  | 0  |                     remaining -= pcSize;  | 
749  | 0  |                     if (remaining <= 0) { | 
750  | 0  |                         break;  | 
751  | 0  |                     }  | 
752  | 0  |                     uc = utext_current32(text);  | 
753  | 0  |                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | 
754  |  |                         // Maybe. See if it's in the dictionary.  | 
755  |  |                         // TODO: this looks iffy; compare with old code.  | 
756  | 0  |                         int32_t num_candidates = words[(wordsFound + 1) % BURMESE_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
757  | 0  |                         utext_setNativeIndex(text, current + cuWordLength + chars);  | 
758  | 0  |                         if (num_candidates > 0) { | 
759  | 0  |                             break;  | 
760  | 0  |                         }  | 
761  | 0  |                     }  | 
762  | 0  |                 }  | 
763  |  |                   | 
764  |  |                 // Bump the word count if there wasn't already one  | 
765  | 0  |                 if (cuWordLength <= 0) { | 
766  | 0  |                     wordsFound += 1;  | 
767  | 0  |                 }  | 
768  |  |                   | 
769  |  |                 // Update the length with the passed-over characters  | 
770  | 0  |                 cuWordLength += chars;  | 
771  | 0  |             }  | 
772  | 0  |             else { | 
773  |  |                 // Back up to where we were for next iteration  | 
774  | 0  |                 utext_setNativeIndex(text, current + cuWordLength);  | 
775  | 0  |             }  | 
776  | 0  |         }  | 
777  |  |           | 
778  |  |         // Never stop before a combining mark.  | 
779  | 0  |         int32_t currPos;  | 
780  | 0  |         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | 
781  | 0  |             utext_next32(text);  | 
782  | 0  |             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;  | 
783  | 0  |         }  | 
784  |  |           | 
785  |  |         // Look ahead for possible suffixes if a dictionary word does not follow.  | 
786  |  |         // We do this in code rather than using a rule so that the heuristic  | 
787  |  |         // resynch continues to function. For example, one of the suffix characters  | 
788  |  |         // could be a typo in the middle of a word.  | 
789  |  |         // NOT CURRENTLY APPLICABLE TO BURMESE  | 
790  |  |  | 
791  |  |         // Did we find a word on this iteration? If so, push it on the break stack  | 
792  | 0  |         if (cuWordLength > 0) { | 
793  | 0  |             foundBreaks.push((current+cuWordLength), status);  | 
794  | 0  |         }  | 
795  | 0  |     }  | 
796  |  |  | 
797  |  |     // Don't return a break for the end of the dictionary range if there is one there.  | 
798  | 0  |     if (foundBreaks.peeki() >= rangeEnd) { | 
799  | 0  |         (void) foundBreaks.popi();  | 
800  | 0  |         wordsFound -= 1;  | 
801  | 0  |     }  | 
802  |  |  | 
803  | 0  |     return wordsFound;  | 
804  | 0  | }  | 
805  |  |  | 
806  |  | /*  | 
807  |  |  ******************************************************************  | 
808  |  |  * KhmerBreakEngine  | 
809  |  |  */  | 
810  |  |  | 
811  |  | // How many words in a row are "good enough"?  | 
812  |  | static const int32_t KHMER_LOOKAHEAD = 3;  | 
813  |  |  | 
814  |  | // Will not combine a non-word with a preceding dictionary word longer than this  | 
815  |  | static const int32_t KHMER_ROOT_COMBINE_THRESHOLD = 3;  | 
816  |  |  | 
817  |  | // Will not combine a non-word that shares at least this much prefix with a  | 
818  |  | // dictionary word, with a preceding word  | 
819  |  | static const int32_t KHMER_PREFIX_COMBINE_THRESHOLD = 3;  | 
820  |  |  | 
821  |  | // Minimum word size  | 
822  |  | static const int32_t KHMER_MIN_WORD = 2;  | 
823  |  |  | 
824  |  | // Minimum number of characters for two words  | 
825  |  | static const int32_t KHMER_MIN_WORD_SPAN = KHMER_MIN_WORD * 2;  | 
826  |  |  | 
827  |  | KhmerBreakEngine::KhmerBreakEngine(DictionaryMatcher *adoptDictionary, UErrorCode &status)  | 
828  | 0  |     : DictionaryBreakEngine(),  | 
829  | 0  |       fDictionary(adoptDictionary)  | 
830  | 0  | { | 
831  | 0  |     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);  | 
832  | 0  |     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Khmr");  | 
833  | 0  |     fKhmerWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]]"), status); | 
834  | 0  |     if (U_SUCCESS(status)) { | 
835  | 0  |         setCharacters(fKhmerWordSet);  | 
836  | 0  |     }  | 
837  | 0  |     fMarkSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Khmr:]&[:LineBreak=SA:]&[:M:]]"), status); | 
838  | 0  |     fMarkSet.add(0x0020);  | 
839  | 0  |     fEndWordSet = fKhmerWordSet;  | 
840  | 0  |     fBeginWordSet.add(0x1780, 0x17B3);  | 
841  |  |     //fBeginWordSet.add(0x17A3, 0x17A4);      // deprecated vowels  | 
842  |  |     //fEndWordSet.remove(0x17A5, 0x17A9);     // Khmer independent vowels that can't end a word  | 
843  |  |     //fEndWordSet.remove(0x17B2);             // Khmer independent vowel that can't end a word  | 
844  | 0  |     fEndWordSet.remove(0x17D2);             // KHMER SIGN COENG that combines some following characters  | 
845  |  |     //fEndWordSet.remove(0x17B6, 0x17C5);     // Remove dependent vowels  | 
846  |  | //    fEndWordSet.remove(0x0E31);             // MAI HAN-AKAT  | 
847  |  | //    fEndWordSet.remove(0x0E40, 0x0E44);     // SARA E through SARA AI MAIMALAI  | 
848  |  | //    fBeginWordSet.add(0x0E01, 0x0E2E);      // KO KAI through HO NOKHUK  | 
849  |  | //    fBeginWordSet.add(0x0E40, 0x0E44);      // SARA E through SARA AI MAIMALAI  | 
850  |  | //    fSuffixSet.add(THAI_PAIYANNOI);  | 
851  |  | //    fSuffixSet.add(THAI_MAIYAMOK);  | 
852  |  |  | 
853  |  |     // Compact for caching.  | 
854  | 0  |     fMarkSet.compact();  | 
855  | 0  |     fEndWordSet.compact();  | 
856  | 0  |     fBeginWordSet.compact();  | 
857  |  | //    fSuffixSet.compact();  | 
858  | 0  |     UTRACE_EXIT_STATUS(status);  | 
859  | 0  | }  | 
860  |  |  | 
861  | 0  | KhmerBreakEngine::~KhmerBreakEngine() { | 
862  | 0  |     delete fDictionary;  | 
863  | 0  | }  | 
864  |  |  | 
865  |  | int32_t  | 
866  |  | KhmerBreakEngine::divideUpDictionaryRange( UText *text,  | 
867  |  |                                                 int32_t rangeStart,  | 
868  |  |                                                 int32_t rangeEnd,  | 
869  |  |                                                 UVector32 &foundBreaks,  | 
870  | 0  |                                                 UErrorCode& status ) const { | 
871  | 0  |     if (U_FAILURE(status)) return 0;  | 
872  | 0  |     if ((rangeEnd - rangeStart) < KHMER_MIN_WORD_SPAN) { | 
873  | 0  |         return 0;       // Not enough characters for two words  | 
874  | 0  |     }  | 
875  |  |  | 
876  | 0  |     uint32_t wordsFound = 0;  | 
877  | 0  |     int32_t cpWordLength = 0;  | 
878  | 0  |     int32_t cuWordLength = 0;  | 
879  | 0  |     int32_t current;  | 
880  | 0  |     PossibleWord words[KHMER_LOOKAHEAD];  | 
881  |  |  | 
882  | 0  |     utext_setNativeIndex(text, rangeStart);  | 
883  |  |  | 
884  | 0  |     while (U_SUCCESS(status) && (current = (int32_t)utext_getNativeIndex(text)) < rangeEnd) { | 
885  | 0  |         cuWordLength = 0;  | 
886  | 0  |         cpWordLength = 0;  | 
887  |  |  | 
888  |  |         // Look for candidate words at the current position  | 
889  | 0  |         int32_t candidates = words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
890  |  |  | 
891  |  |         // If we found exactly one, use that  | 
892  | 0  |         if (candidates == 1) { | 
893  | 0  |             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);  | 
894  | 0  |             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();  | 
895  | 0  |             wordsFound += 1;  | 
896  | 0  |         }  | 
897  |  |  | 
898  |  |         // If there was more than one, see which one can take us forward the most words  | 
899  | 0  |         else if (candidates > 1) { | 
900  |  |             // If we're already at the end of the range, we're done  | 
901  | 0  |             if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | 
902  | 0  |                 goto foundBest;  | 
903  | 0  |             }  | 
904  | 0  |             do { | 
905  | 0  |                 if (words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) > 0) { | 
906  |  |                     // Followed by another dictionary word; mark first word as a good candidate  | 
907  | 0  |                     words[wordsFound % KHMER_LOOKAHEAD].markCurrent();  | 
908  |  |  | 
909  |  |                     // If we're already at the end of the range, we're done  | 
910  | 0  |                     if ((int32_t)utext_getNativeIndex(text) >= rangeEnd) { | 
911  | 0  |                         goto foundBest;  | 
912  | 0  |                     }  | 
913  |  |  | 
914  |  |                     // See if any of the possible second words is followed by a third word  | 
915  | 0  |                     do { | 
916  |  |                         // If we find a third word, stop right away  | 
917  | 0  |                         if (words[(wordsFound + 2) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd)) { | 
918  | 0  |                             words[wordsFound % KHMER_LOOKAHEAD].markCurrent();  | 
919  | 0  |                             goto foundBest;  | 
920  | 0  |                         }  | 
921  | 0  |                     }  | 
922  | 0  |                     while (words[(wordsFound + 1) % KHMER_LOOKAHEAD].backUp(text));  | 
923  | 0  |                 }  | 
924  | 0  |             }  | 
925  | 0  |             while (words[wordsFound % KHMER_LOOKAHEAD].backUp(text));  | 
926  | 0  | foundBest:  | 
927  | 0  |             cuWordLength = words[wordsFound % KHMER_LOOKAHEAD].acceptMarked(text);  | 
928  | 0  |             cpWordLength = words[wordsFound % KHMER_LOOKAHEAD].markedCPLength();  | 
929  | 0  |             wordsFound += 1;  | 
930  | 0  |         }  | 
931  |  |  | 
932  |  |         // We come here after having either found a word or not. We look ahead to the  | 
933  |  |         // next word. If it's not a dictionary word, we will combine it with the word we  | 
934  |  |         // just found (if there is one), but only if the preceding word does not exceed  | 
935  |  |         // the threshold.  | 
936  |  |         // The text iterator should now be positioned at the end of the word we found.  | 
937  | 0  |         if ((int32_t)utext_getNativeIndex(text) < rangeEnd && cpWordLength < KHMER_ROOT_COMBINE_THRESHOLD) { | 
938  |  |             // if it is a dictionary word, do nothing. If it isn't, then if there is  | 
939  |  |             // no preceding word, or the non-word shares less than the minimum threshold  | 
940  |  |             // of characters with a dictionary word, then scan to resynchronize  | 
941  | 0  |             if (words[wordsFound % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0  | 
942  | 0  |                   && (cuWordLength == 0  | 
943  | 0  |                       || words[wordsFound % KHMER_LOOKAHEAD].longestPrefix() < KHMER_PREFIX_COMBINE_THRESHOLD)) { | 
944  |  |                 // Look for a plausible word boundary  | 
945  | 0  |                 int32_t remaining = rangeEnd - (current+cuWordLength);  | 
946  | 0  |                 UChar32 pc;  | 
947  | 0  |                 UChar32 uc;  | 
948  | 0  |                 int32_t chars = 0;  | 
949  | 0  |                 for (;;) { | 
950  | 0  |                     int32_t pcIndex = (int32_t)utext_getNativeIndex(text);  | 
951  | 0  |                     pc = utext_next32(text);  | 
952  | 0  |                     int32_t pcSize = (int32_t)utext_getNativeIndex(text) - pcIndex;  | 
953  | 0  |                     chars += pcSize;  | 
954  | 0  |                     remaining -= pcSize;  | 
955  | 0  |                     if (remaining <= 0) { | 
956  | 0  |                         break;  | 
957  | 0  |                     }  | 
958  | 0  |                     uc = utext_current32(text);  | 
959  | 0  |                     if (fEndWordSet.contains(pc) && fBeginWordSet.contains(uc)) { | 
960  |  |                         // Maybe. See if it's in the dictionary.  | 
961  | 0  |                         int32_t num_candidates = words[(wordsFound + 1) % KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd);  | 
962  | 0  |                         utext_setNativeIndex(text, current+cuWordLength+chars);  | 
963  | 0  |                         if (num_candidates > 0) { | 
964  | 0  |                             break;  | 
965  | 0  |                         }  | 
966  | 0  |                     }  | 
967  | 0  |                 }  | 
968  |  |  | 
969  |  |                 // Bump the word count if there wasn't already one  | 
970  | 0  |                 if (cuWordLength <= 0) { | 
971  | 0  |                     wordsFound += 1;  | 
972  | 0  |                 }  | 
973  |  |  | 
974  |  |                 // Update the length with the passed-over characters  | 
975  | 0  |                 cuWordLength += chars;  | 
976  | 0  |             }  | 
977  | 0  |             else { | 
978  |  |                 // Back up to where we were for next iteration  | 
979  | 0  |                 utext_setNativeIndex(text, current+cuWordLength);  | 
980  | 0  |             }  | 
981  | 0  |         }  | 
982  |  |  | 
983  |  |         // Never stop before a combining mark.  | 
984  | 0  |         int32_t currPos;  | 
985  | 0  |         while ((currPos = (int32_t)utext_getNativeIndex(text)) < rangeEnd && fMarkSet.contains(utext_current32(text))) { | 
986  | 0  |             utext_next32(text);  | 
987  | 0  |             cuWordLength += (int32_t)utext_getNativeIndex(text) - currPos;  | 
988  | 0  |         }  | 
989  |  |  | 
990  |  |         // Look ahead for possible suffixes if a dictionary word does not follow.  | 
991  |  |         // We do this in code rather than using a rule so that the heuristic  | 
992  |  |         // resynch continues to function. For example, one of the suffix characters  | 
993  |  |         // could be a typo in the middle of a word.  | 
994  |  | //        if ((int32_t)utext_getNativeIndex(text) < rangeEnd && wordLength > 0) { | 
995  |  | //            if (words[wordsFound%KHMER_LOOKAHEAD].candidates(text, fDictionary, rangeEnd) <= 0  | 
996  |  | //                && fSuffixSet.contains(uc = utext_current32(text))) { | 
997  |  | //                if (uc == KHMER_PAIYANNOI) { | 
998  |  | //                    if (!fSuffixSet.contains(utext_previous32(text))) { | 
999  |  | //                        // Skip over previous end and PAIYANNOI  | 
1000  |  | //                        utext_next32(text);  | 
1001  |  | //                        utext_next32(text);  | 
1002  |  | //                        wordLength += 1;            // Add PAIYANNOI to word  | 
1003  |  | //                        uc = utext_current32(text);     // Fetch next character  | 
1004  |  | //                    }  | 
1005  |  | //                    else { | 
1006  |  | //                        // Restore prior position  | 
1007  |  | //                        utext_next32(text);  | 
1008  |  | //                    }  | 
1009  |  | //                }  | 
1010  |  | //                if (uc == KHMER_MAIYAMOK) { | 
1011  |  | //                    if (utext_previous32(text) != KHMER_MAIYAMOK) { | 
1012  |  | //                        // Skip over previous end and MAIYAMOK  | 
1013  |  | //                        utext_next32(text);  | 
1014  |  | //                        utext_next32(text);  | 
1015  |  | //                        wordLength += 1;            // Add MAIYAMOK to word  | 
1016  |  | //                    }  | 
1017  |  | //                    else { | 
1018  |  | //                        // Restore prior position  | 
1019  |  | //                        utext_next32(text);  | 
1020  |  | //                    }  | 
1021  |  | //                }  | 
1022  |  | //            }  | 
1023  |  | //            else { | 
1024  |  | //                utext_setNativeIndex(text, current+wordLength);  | 
1025  |  | //            }  | 
1026  |  | //        }  | 
1027  |  |  | 
1028  |  |         // Did we find a word on this iteration? If so, push it on the break stack  | 
1029  | 0  |         if (cuWordLength > 0) { | 
1030  | 0  |             foundBreaks.push((current+cuWordLength), status);  | 
1031  | 0  |         }  | 
1032  | 0  |     }  | 
1033  |  |       | 
1034  |  |     // Don't return a break for the end of the dictionary range if there is one there.  | 
1035  | 0  |     if (foundBreaks.peeki() >= rangeEnd) { | 
1036  | 0  |         (void) foundBreaks.popi();  | 
1037  | 0  |         wordsFound -= 1;  | 
1038  | 0  |     }  | 
1039  |  |  | 
1040  | 0  |     return wordsFound;  | 
1041  | 0  | }  | 
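The resynchronization scan in the loop above advances until a code point that can end a word is immediately followed by one that can begin a word and the dictionary offers candidates from that point. A minimal stand-alone sketch of the scanning idea, using only public UText/UnicodeSet calls (the function name and the simplified stop condition are editorial assumptions, not engine code):

    #include "unicode/uniset.h"
    #include "unicode/utext.h"

    // Scan forward until a plausible word boundary: a code point that may end a
    // word immediately followed by one that may begin a word. Returns how many
    // native units were passed over; the text is left positioned after the last
    // skipped code point.
    static int32_t scanToPlausibleBoundary(UText *text, int64_t limit,
                                           const icu::UnicodeSet &endWordSet,
                                           const icu::UnicodeSet &beginWordSet) {
        int32_t passed = 0;
        while (utext_getNativeIndex(text) < limit) {
            int64_t before = utext_getNativeIndex(text);
            UChar32 pc = utext_next32(text);       // code point being skipped
            passed += (int32_t)(utext_getNativeIndex(text) - before);
            UChar32 uc = utext_current32(text);    // code point that would follow
            if (endWordSet.contains(pc) && beginWordSet.contains(uc)) {
                break;                             // plausible boundary found
            }
        }
        return passed;
    }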
1042  |  |  | 
1043  |  | #if !UCONFIG_NO_NORMALIZATION  | 
1044  |  | /*  | 
1045  |  |  ******************************************************************  | 
1046  |  |  * CjkBreakEngine  | 
1047  |  |  */  | 
1048  |  | static const uint32_t kuint32max = 0xFFFFFFFF;  | 
1049  |  | CjkBreakEngine::CjkBreakEngine(DictionaryMatcher *adoptDictionary, LanguageType type, UErrorCode &status)  | 
1050  | 0  | : DictionaryBreakEngine(), fDictionary(adoptDictionary) { | 
1051  | 0  |     UTRACE_ENTRY(UTRACE_UBRK_CREATE_BREAK_ENGINE);  | 
1052  | 0  |     UTRACE_DATA1(UTRACE_INFO, "dictbe=%s", "Hani");  | 
1053  |  |     // Korean dictionary only includes Hangul syllables  | 
1054  | 0  |     fHangulWordSet.applyPattern(UNICODE_STRING_SIMPLE("[\\uac00-\\ud7a3]"), status); | 
1055  | 0  |     fHanWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Han:]"), status); | 
1056  | 0  |     fKatakanaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[[:Katakana:]\\uff9e\\uff9f]"), status); | 
1057  | 0  |     fHiraganaWordSet.applyPattern(UNICODE_STRING_SIMPLE("[:Hiragana:]"), status); | 
1058  | 0  |     nfkcNorm2 = Normalizer2::getNFKCInstance(status);  | 
1059  |  |  | 
1060  | 0  |     if (U_SUCCESS(status)) { | 
1061  |  |         // handle Korean and Japanese/Chinese using different dictionaries  | 
1062  | 0  |         if (type == kKorean) { | 
1063  | 0  |             setCharacters(fHangulWordSet);  | 
1064  | 0  |         } else { //Chinese and Japanese | 
1065  | 0  |             UnicodeSet cjSet;  | 
1066  | 0  |             cjSet.addAll(fHanWordSet);  | 
1067  | 0  |             cjSet.addAll(fKatakanaWordSet);  | 
1068  | 0  |             cjSet.addAll(fHiraganaWordSet);  | 
1069  | 0  |             cjSet.add(0xFF70); // HALFWIDTH KATAKANA-HIRAGANA PROLONGED SOUND MARK  | 
1070  | 0  |             cjSet.add(0x30FC); // KATAKANA-HIRAGANA PROLONGED SOUND MARK  | 
1071  | 0  |             setCharacters(cjSet);  | 
1072  | 0  |         }  | 
1073  | 0  |     }  | 
1074  | 0  |     UTRACE_EXIT_STATUS(status);  | 
1075  | 0  | }  | 
1076  |  |  | 
1077  | 0  | CjkBreakEngine::~CjkBreakEngine(){ | 
1078  | 0  |     delete fDictionary;  | 
1079  | 0  | }  | 
1080  |  |  | 
1081  |  | // The katakanaCost values below are based on the length frequencies of all  | 
1082  |  | // katakana phrases in the dictionary  | 
1083  |  | static const int32_t kMaxKatakanaLength = 8;  | 
1084  |  | static const int32_t kMaxKatakanaGroupLength = 20;  | 
1085  |  | static const uint32_t maxSnlp = 255;  | 
1086  |  |  | 
1087  | 0  | static inline uint32_t getKatakanaCost(int32_t wordLength){ | 
1088  |  |     //TODO: fill array with actual values from dictionary!  | 
1089  | 0  |     static const uint32_t katakanaCost[kMaxKatakanaLength + 1]  | 
1090  | 0  |                                        = {8192, 984, 408, 240, 204, 252, 300, 372, 480}; | 
1091  | 0  |     return (wordLength > kMaxKatakanaLength) ? 8192 : katakanaCost[wordLength];  | 
1092  | 0  | }  | 
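The table above is a fixed cost per run length, cheapest at mid-length runs and prohibitive beyond kMaxKatakanaLength. A tiny self-contained check of that behaviour, using a copied table rather than the engine's static (illustrative only):

    #include <cassert>
    #include <cstdint>

    // Stand-alone copy of the lookup above, for illustration only.
    static uint32_t katakanaCostSketch(int32_t runLength) {
        static const uint32_t cost[9] = {8192, 984, 408, 240, 204, 252, 300, 372, 480};
        return (runLength > 8) ? 8192 : cost[runLength];
    }

    int main() {
        assert(katakanaCostSketch(1) == 984);    // one-character runs are expensive
        assert(katakanaCostSketch(4) == 204);    // mid-length runs are the cheapest
        assert(katakanaCostSketch(12) == 8192);  // past kMaxKatakanaLength: default cost
        return 0;
    }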
1093  |  |  | 
1094  | 0  | static inline bool isKatakana(UChar32 value) { | 
1095  | 0  |     return (value >= 0x30A1 && value <= 0x30FE && value != 0x30FB) ||  | 
1096  | 0  |             (value >= 0xFF66 && value <= 0xFF9f);  | 
1097  | 0  | }  | 
1098  |  |  | 
1099  |  |  | 
1100  |  | // Function for accessing internal utext flags.  | 
1101  |  | //   Replicates an internal UText function.  | 
1102  |  |  | 
1103  | 0  | static inline int32_t utext_i32_flag(int32_t bitIndex) { | 
1104  | 0  |     return (int32_t)1 << bitIndex;  | 
1105  | 0  | }  | 
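For context, the helper above builds the bit mask that is tested against UText::providerProperties further below; a minimal illustrative equivalent of that test (the helper name here is editorial, not ICU API):

    #include "unicode/utext.h"

    // True when the UText provider exposes its text as stable, directly
    // indexable UTF-16 chunks -- the fast path taken by divideUpDictionaryRange.
    static inline bool hasStableChunks(const UText *ut) {
        return (ut->providerProperties &
                ((int32_t)1 << UTEXT_PROVIDER_STABLE_CHUNKS)) != 0;
    }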
1106  |  |  | 
1107  |  |          | 
1108  |  | /*  | 
1109  |  |  * @param text A UText representing the text  | 
1110  |  |  * @param rangeStart The start of the range of dictionary characters  | 
1111  |  |  * @param rangeEnd The end of the range of dictionary characters  | 
1112  |  |  * @param foundBreaks vector<int32> to receive the break positions  | 
1113  |  |  * @return The number of breaks found  | 
1114  |  |  */  | 
1115  |  | int32_t   | 
1116  |  | CjkBreakEngine::divideUpDictionaryRange( UText *inText,  | 
1117  |  |         int32_t rangeStart,  | 
1118  |  |         int32_t rangeEnd,  | 
1119  |  |         UVector32 &foundBreaks,  | 
1120  | 0  |         UErrorCode& status) const { | 
1121  | 0  |     if (U_FAILURE(status)) return 0;  | 
1122  | 0  |     if (rangeStart >= rangeEnd) { | 
1123  | 0  |         return 0;  | 
1124  | 0  |     }  | 
1125  |  |  | 
1126  |  |     // UnicodeString version of input UText, NFKC normalized if necessary.  | 
1127  | 0  |     UnicodeString inString;  | 
1128  |  |  | 
1129  |  |     // inputMap[inStringIndex] = corresponding native index from UText inText.  | 
1130  |  |     // If NULL then mapping is 1:1  | 
1131  | 0  |     LocalPointer<UVector32>     inputMap;  | 
1132  |  |  | 
1133  |  |     // if UText has the input string as one contiguous UTF-16 chunk  | 
1134  | 0  |     if ((inText->providerProperties & utext_i32_flag(UTEXT_PROVIDER_STABLE_CHUNKS)) &&  | 
1135  | 0  |          inText->chunkNativeStart <= rangeStart &&  | 
1136  | 0  |          inText->chunkNativeLimit >= rangeEnd   &&  | 
1137  | 0  |          inText->nativeIndexingLimit >= rangeEnd - inText->chunkNativeStart) { | 
1138  |  |  | 
1139  |  |         // Input UText is in one contiguous UTF-16 chunk.  | 
1140  |  |         // Use Read-only aliasing UnicodeString.  | 
1141  | 0  |         inString.setTo(FALSE,  | 
1142  | 0  |                        inText->chunkContents + rangeStart - inText->chunkNativeStart,  | 
1143  | 0  |                        rangeEnd - rangeStart);  | 
1144  | 0  |     } else { | 
1145  |  |         // Copy the text from the original inText (UText) to inString (UnicodeString).  | 
1146  |  |         // Create a map from UnicodeString indices -> UText offsets.  | 
1147  | 0  |         utext_setNativeIndex(inText, rangeStart);  | 
1148  | 0  |         int32_t limit = rangeEnd;  | 
1149  | 0  |         U_ASSERT(limit <= utext_nativeLength(inText));  | 
1150  | 0  |         if (limit > utext_nativeLength(inText)) { | 
1151  | 0  |             limit = (int32_t)utext_nativeLength(inText);  | 
1152  | 0  |         }  | 
1153  | 0  |         inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);  | 
1154  | 0  |         if (U_FAILURE(status)) { | 
1155  | 0  |             return 0;  | 
1156  | 0  |         }  | 
1157  | 0  |         while (utext_getNativeIndex(inText) < limit) { | 
1158  | 0  |             int32_t nativePosition = (int32_t)utext_getNativeIndex(inText);  | 
1159  | 0  |             UChar32 c = utext_next32(inText);  | 
1160  | 0  |             U_ASSERT(c != U_SENTINEL);  | 
1161  | 0  |             inString.append(c);  | 
1162  | 0  |             while (inputMap->size() < inString.length()) { | 
1163  | 0  |                 inputMap->addElement(nativePosition, status);  | 
1164  | 0  |             }  | 
1165  | 0  |         }  | 
1166  | 0  |         inputMap->addElement(limit, status);  | 
1167  | 0  |     }  | 
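    // Editorial illustration (not engine code): with UTF-8 input, native indices
    // are byte offsets. If the range starts at byte 10 and holds two BMP
    // characters of three UTF-8 bytes each (one UTF-16 code unit each), the map
    // built above is inputMap = {10, 13, 16}, the final entry being the limit.
    // On the aliasing fast path, inputMap stays unset and a string index i simply
    // corresponds to native index i + rangeStart.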
1168  |  |  | 
1169  |  |  | 
1170  | 0  |     if (!nfkcNorm2->isNormalized(inString, status)) { | 
1171  | 0  |         UnicodeString normalizedInput;  | 
1172  |  |         //  normalizedMap[normalizedInput position] ==  original UText position.  | 
1173  | 0  |         LocalPointer<UVector32> normalizedMap(new UVector32(status), status);  | 
1174  | 0  |         if (U_FAILURE(status)) { | 
1175  | 0  |             return 0;  | 
1176  | 0  |         }  | 
1177  |  |           | 
1178  | 0  |         UnicodeString fragment;  | 
1179  | 0  |         UnicodeString normalizedFragment;  | 
1180  | 0  |         for (int32_t srcI = 0; srcI < inString.length();) {  // Once per normalization chunk | 
1181  | 0  |             fragment.remove();  | 
1182  | 0  |             int32_t fragmentStartI = srcI;  | 
1183  | 0  |             UChar32 c = inString.char32At(srcI);  | 
1184  | 0  |             for (;;) { | 
1185  | 0  |                 fragment.append(c);  | 
1186  | 0  |                 srcI = inString.moveIndex32(srcI, 1);  | 
1187  | 0  |                 if (srcI == inString.length()) { | 
1188  | 0  |                     break;  | 
1189  | 0  |                 }  | 
1190  | 0  |                 c = inString.char32At(srcI);  | 
1191  | 0  |                 if (nfkcNorm2->hasBoundaryBefore(c)) { | 
1192  | 0  |                     break;  | 
1193  | 0  |                 }  | 
1194  | 0  |             }  | 
1195  | 0  |             nfkcNorm2->normalize(fragment, normalizedFragment, status);  | 
1196  | 0  |             normalizedInput.append(normalizedFragment);  | 
1197  |  |  | 
1198  |  |             // Map every position in the normalized chunk to the start of the chunk  | 
1199  |  |             //   in the original input.  | 
1200  | 0  |             int32_t fragmentOriginalStart = inputMap.isValid() ?  | 
1201  | 0  |                     inputMap->elementAti(fragmentStartI) : fragmentStartI+rangeStart;  | 
1202  | 0  |             while (normalizedMap->size() < normalizedInput.length()) { | 
1203  | 0  |                 normalizedMap->addElement(fragmentOriginalStart, status);  | 
1204  | 0  |                 if (U_FAILURE(status)) { | 
1205  | 0  |                     break;  | 
1206  | 0  |                 }  | 
1207  | 0  |             }  | 
1208  | 0  |         }  | 
1209  | 0  |         U_ASSERT(normalizedMap->size() == normalizedInput.length());  | 
1210  | 0  |         int32_t nativeEnd = inputMap.isValid() ?  | 
1211  | 0  |                 inputMap->elementAti(inString.length()) : inString.length()+rangeStart;  | 
1212  | 0  |         normalizedMap->addElement(nativeEnd, status);  | 
1213  |  |  | 
1214  | 0  |         inputMap = std::move(normalizedMap);  | 
1215  | 0  |         inString = std::move(normalizedInput);  | 
1216  | 0  |     }  | 
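    // Editorial illustration (not engine code): if the original text held the
    // halfwidth pair U+FF76 U+FF9E at native positions 5 and 6, NFKC folds that
    // chunk to the single code unit U+30AC, and the one normalized position maps
    // back to the chunk start, 5; in general every position inside a normalized
    // chunk maps to the original start of that chunk.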
1217  |  |  | 
1218  | 0  |     int32_t numCodePts = inString.countChar32();  | 
1219  | 0  |     if (numCodePts != inString.length()) { | 
1220  |  |         // There are supplementary characters in the input.  | 
1221  |  |         // The dictionary will produce boundary positions in terms of code point indexes,  | 
1222  |  |         //   not in terms of code unit string indexes.  | 
1223  |  |         // Use the inputMap mechanism to take care of this in addition to indexing differences  | 
1224  |  |         //    from normalization and/or UTF-8 input.  | 
1225  | 0  |         UBool hadExistingMap = inputMap.isValid();  | 
1226  | 0  |         if (!hadExistingMap) { | 
1227  | 0  |             inputMap.adoptInsteadAndCheckErrorCode(new UVector32(status), status);  | 
1228  | 0  |             if (U_FAILURE(status)) { | 
1229  | 0  |                 return 0;  | 
1230  | 0  |             }  | 
1231  | 0  |         }  | 
1232  | 0  |         int32_t cpIdx = 0;  | 
1233  | 0  |         for (int32_t cuIdx = 0; ; cuIdx = inString.moveIndex32(cuIdx, 1)) { | 
1234  | 0  |             U_ASSERT(cuIdx >= cpIdx);  | 
1235  | 0  |             if (hadExistingMap) { | 
1236  | 0  |                 inputMap->setElementAt(inputMap->elementAti(cuIdx), cpIdx);  | 
1237  | 0  |             } else { | 
1238  | 0  |                 inputMap->addElement(cuIdx+rangeStart, status);  | 
1239  | 0  |             }  | 
1240  | 0  |             cpIdx++;  | 
1241  | 0  |             if (cuIdx == inString.length()) { | 
1242  | 0  |                break;  | 
1243  | 0  |             }  | 
1244  | 0  |         }  | 
1245  | 0  |     }  | 
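    // Editorial illustration (not engine code): for an inString of three code
    // points whose middle one is supplementary (a surrogate pair, e.g. U+20000),
    // there are 4 code units; after the remapping above, entry i of inputMap
    // corresponds to code point i, so code points 0, 1, 2 map from code unit
    // indices 0, 1, 3, and a final entry holds the index just past the end.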
1246  |  |                   | 
1247  |  |     // bestSnlp[i] is the snlp of the best segmentation of the first i  | 
1248  |  |     // code points in the range to be matched.  | 
1249  | 0  |     UVector32 bestSnlp(numCodePts + 1, status);  | 
1250  | 0  |     bestSnlp.addElement(0, status);  | 
1251  | 0  |     for(int32_t i = 1; i <= numCodePts; i++) { | 
1252  | 0  |         bestSnlp.addElement(kuint32max, status);  | 
1253  | 0  |     }  | 
1254  |  |  | 
1255  |  |  | 
1256  |  |     // prev[i] is the index of the last CJK code point in the previous word in   | 
1257  |  |     // the best segmentation of the first i characters.  | 
1258  | 0  |     UVector32 prev(numCodePts + 1, status);  | 
1259  | 0  |     for(int32_t i = 0; i <= numCodePts; i++){ | 
1260  | 0  |         prev.addElement(-1, status);  | 
1261  | 0  |     }  | 
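    // Editorial note: the loop below fills these tables with a shortest-path
    // style recurrence over dictionary matches, where the matched values act as
    // additive costs (lower is more likely):
    //   bestSnlp[0]       = 0
    //   bestSnlp[i + len] = min(bestSnlp[i + len], bestSnlp[i] + value(word))
    //                       for every dictionary word of length len starting at i
    //   prev[i + len]     = the i that produced that minimum
    // so bestSnlp[numCodePts] is the cost of the cheapest segmentation, and
    // walking prev[] back from numCodePts recovers its boundaries.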
1262  |  |  | 
1263  | 0  |     const int32_t maxWordSize = 20;  | 
1264  | 0  |     UVector32 values(numCodePts, status);  | 
1265  | 0  |     values.setSize(numCodePts);  | 
1266  | 0  |     UVector32 lengths(numCodePts, status);  | 
1267  | 0  |     lengths.setSize(numCodePts);  | 
1268  |  |  | 
1269  | 0  |     UText fu = UTEXT_INITIALIZER;  | 
1270  | 0  |     utext_openUnicodeString(&fu, &inString, &status);  | 
1271  |  |  | 
1272  |  |     // Dynamic programming to find the best segmentation.  | 
1273  |  |  | 
1274  |  |     // In outer loop, i  is the code point index,  | 
1275  |  |     //                ix is the corresponding string (code unit) index.  | 
1276  |  |     //    They differ when the string contains supplementary characters.  | 
1277  | 0  |     int32_t ix = 0;  | 
1278  | 0  |     bool is_prev_katakana = false;  | 
1279  | 0  |     for (int32_t i = 0;  i < numCodePts;  ++i, ix = inString.moveIndex32(ix, 1)) { | 
1280  | 0  |         if ((uint32_t)bestSnlp.elementAti(i) == kuint32max) { | 
1281  | 0  |             continue;  | 
1282  | 0  |         }  | 
1283  |  |  | 
1284  | 0  |         int32_t count;  | 
1285  | 0  |         utext_setNativeIndex(&fu, ix);  | 
1286  | 0  |         count = fDictionary->matches(&fu, maxWordSize, numCodePts,  | 
1287  | 0  |                              NULL, lengths.getBuffer(), values.getBuffer(), NULL);  | 
1288  |  |                              // Note: lengths is filled with code point lengths  | 
1289  |  |                              //       The NULL parameter is the ignored code unit lengths.  | 
1290  |  |  | 
1291  |  |         // If there are no single-character matches found in the dictionary  | 
1292  |  |         // starting with this character, treat the character as a 1-character word  | 
1293  |  |         // with the highest value possible, i.e. the least likely to occur.  | 
1294  |  |         // Exclude Korean characters from this treatment, as they should be left  | 
1295  |  |         // together by default.  | 
1296  | 0  |         if ((count == 0 || lengths.elementAti(0) != 1) &&  | 
1297  | 0  |                 !fHangulWordSet.contains(inString.char32At(ix))) { | 
1298  | 0  |             values.setElementAt(maxSnlp, count);   // 255  | 
1299  | 0  |             lengths.setElementAt(1, count++);  | 
1300  | 0  |         }  | 
1301  |  |  | 
1302  | 0  |         for (int32_t j = 0; j < count; j++) { | 
1303  | 0  |             uint32_t newSnlp = (uint32_t)bestSnlp.elementAti(i) + (uint32_t)values.elementAti(j);  | 
1304  | 0  |             int32_t ln_j_i = lengths.elementAti(j) + i;  | 
1305  | 0  |             if (newSnlp < (uint32_t)bestSnlp.elementAti(ln_j_i)) { | 
1306  | 0  |                 bestSnlp.setElementAt(newSnlp, ln_j_i);  | 
1307  | 0  |                 prev.setElementAt(i, ln_j_i);  | 
1308  | 0  |             }  | 
1309  | 0  |         }  | 
1310  |  |  | 
1311  |  |         // In Japanese, a single-character Katakana word is  | 
1312  |  |         // pretty rare, so we apply the following heuristic to Katakana:  | 
1313  |  |         // any continuous run of Katakana characters is considered a  | 
1314  |  |         // candidate word with a default cost specified in the  | 
1315  |  |         // katakanaCost table according to its length.  | 
1316  |  |  | 
1317  | 0  |         bool is_katakana = isKatakana(inString.char32At(ix));  | 
1318  | 0  |         int32_t katakanaRunLength = 1;  | 
1319  | 0  |         if (!is_prev_katakana && is_katakana) { | 
1320  | 0  |             int32_t j = inString.moveIndex32(ix, 1);  | 
1321  |  |             // Find the end of the continuous run of Katakana characters  | 
1322  | 0  |             while (j < inString.length() && katakanaRunLength < kMaxKatakanaGroupLength &&  | 
1323  | 0  |                     isKatakana(inString.char32At(j))) { | 
1324  | 0  |                 j = inString.moveIndex32(j, 1);  | 
1325  | 0  |                 katakanaRunLength++;  | 
1326  | 0  |             }  | 
1327  | 0  |             if (katakanaRunLength < kMaxKatakanaGroupLength) { | 
1328  | 0  |                 uint32_t newSnlp = bestSnlp.elementAti(i) + getKatakanaCost(katakanaRunLength);  | 
1329  | 0  |                 if (newSnlp < (uint32_t)bestSnlp.elementAti(i+katakanaRunLength)) { | 
1330  | 0  |                     bestSnlp.setElementAt(newSnlp, i+katakanaRunLength);  | 
1331  | 0  |                     prev.setElementAt(i, i+katakanaRunLength);  // prev[j] = i;  | 
1332  | 0  |                 }  | 
1333  | 0  |             }  | 
1334  | 0  |         }  | 
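        // Editorial illustration (not engine code): a run of three Katakana code
        // points starting at i adds one extra candidate spanning the whole run,
        // with cost getKatakanaCost(3) == 240, competing in bestSnlp[i + 3]
        // against any dictionary matches that end at the same position.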
1335  | 0  |         is_prev_katakana = is_katakana;  | 
1336  | 0  |     }  | 
1337  | 0  |     utext_close(&fu);  | 
1338  |  |  | 
1339  |  |     // Start pushing the optimal offset index into t_boundary (t for tentative).  | 
1340  |  |     // prev[numCodePts] is guaranteed to be meaningful.  | 
1341  |  |     // We'll first push in reverse order, i.e.,  | 
1342  |  |     // t_boundary[0] = numCodePts, and afterwards read it out back to front.  | 
1343  | 0  |     UVector32 t_boundary(numCodePts+1, status);  | 
1344  |  |  | 
1345  | 0  |     int32_t numBreaks = 0;  | 
1346  |  |     // No segmentation found, set boundary to end of range  | 
1347  | 0  |     if ((uint32_t)bestSnlp.elementAti(numCodePts) == kuint32max) { | 
1348  | 0  |         t_boundary.addElement(numCodePts, status);  | 
1349  | 0  |         numBreaks++;  | 
1350  | 0  |     } else { | 
1351  | 0  |         for (int32_t i = numCodePts; i > 0; i = prev.elementAti(i)) { | 
1352  | 0  |             t_boundary.addElement(i, status);  | 
1353  | 0  |             numBreaks++;  | 
1354  | 0  |         }  | 
1355  | 0  |         U_ASSERT(prev.elementAti(t_boundary.elementAti(numBreaks - 1)) == 0);  | 
1356  | 0  |     }  | 
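    // Editorial illustration (not engine code): with numCodePts == 5 and
    // prev == {-1, -1, 0, -1, 2, 4}, the loop above visits 5 -> 4 -> 2 and pushes
    // t_boundary = {5, 4, 2}; those entries are converted to original-text
    // indices and emitted in ascending order further below.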
1357  |  |  | 
1358  |  |     // Add a break for the start of the dictionary range if there is not one  | 
1359  |  |     // there already.  | 
1360  | 0  |     if (foundBreaks.size() == 0 || foundBreaks.peeki() < rangeStart) { | 
1361  | 0  |         t_boundary.addElement(0, status);  | 
1362  | 0  |         numBreaks++;  | 
1363  | 0  |     }  | 
1364  |  |  | 
1365  |  |     // Now that we're done, convert positions in t_boundary[] (indices in   | 
1366  |  |     // the normalized input string) back to indices in the original input UText  | 
1367  |  |     // while reversing t_boundary and pushing values to foundBreaks.  | 
1368  | 0  |     int32_t prevCPPos = -1;  | 
1369  | 0  |     int32_t prevUTextPos = -1;  | 
1370  | 0  |     for (int32_t i = numBreaks-1; i >= 0; i--) { | 
1371  | 0  |         int32_t cpPos = t_boundary.elementAti(i);  | 
1372  | 0  |         U_ASSERT(cpPos > prevCPPos);  | 
1373  | 0  |         int32_t utextPos =  inputMap.isValid() ? inputMap->elementAti(cpPos) : cpPos + rangeStart;  | 
1374  | 0  |         U_ASSERT(utextPos >= prevUTextPos);  | 
1375  | 0  |         if (utextPos > prevUTextPos) { | 
1376  |  |             // Boundaries are added to foundBreaks output in ascending order.  | 
1377  | 0  |             U_ASSERT(foundBreaks.size() == 0 || foundBreaks.peeki() < utextPos);  | 
1378  | 0  |             foundBreaks.push(utextPos, status);  | 
1379  | 0  |         } else { | 
1380  |  |             // Normalization expanded the input text, and the dictionary found a boundary  | 
1381  |  |             // within the expansion, giving two boundaries with the same index in the  | 
1382  |  |             // original text. Ignore the second. See ticket #12918.  | 
1383  | 0  |             --numBreaks;  | 
1384  | 0  |         }  | 
1385  | 0  |         prevCPPos = cpPos;  | 
1386  | 0  |         prevUTextPos = utextPos;  | 
1387  | 0  |     }  | 
1388  | 0  |     (void)prevCPPos; // suppress compiler warnings about unused variable  | 
1389  |  |  | 
1390  |  |     // inString goes out of scope  | 
1391  |  |     // inputMap goes out of scope  | 
1392  | 0  |     return numBreaks;  | 
1393  | 0  | }  | 
1394  |  | #endif  | 
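For reference, the breaks computed by these dictionary engines reach callers through the ordinary word-break iterator API; a minimal stand-alone sketch using the public C++ interface (the sample text and locale are arbitrary, and this code is not part of this file):

    #include <cstdio>
    #include "unicode/brkiter.h"
    #include "unicode/locid.h"
    #include "unicode/unistr.h"
    #include "unicode/utypes.h"

    int main() {
        UErrorCode status = U_ZERO_ERROR;
        icu::UnicodeString text(u"今日はいい天気ですね");   // Japanese sample text
        icu::BreakIterator *bi =
            icu::BreakIterator::createWordInstance(icu::Locale::getJapanese(), status);
        if (U_FAILURE(status)) {
            return 1;
        }
        bi->setText(text);
        // Boundaries are reported as UTF-16 offsets into 'text'.
        for (int32_t b = bi->first(); b != icu::BreakIterator::DONE; b = bi->next()) {
            std::printf("boundary at %d\n", (int)b);
        }
        delete bi;
        return 0;
    }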
1395  |  |  | 
1396  |  | U_NAMESPACE_END  | 
1397  |  |  | 
1398  |  | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */  | 
1399  |  |  |