/src/mozilla-central/intl/icu/source/common/brkeng.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | /* |
4 | | ************************************************************************************ |
5 | | * Copyright (C) 2006-2016, International Business Machines Corporation |
6 | | * and others. All Rights Reserved. |
7 | | ************************************************************************************ |
8 | | */ |
9 | | |
10 | | #include "unicode/utypes.h" |
11 | | |
12 | | #if !UCONFIG_NO_BREAK_ITERATION |
13 | | |
14 | | #include "unicode/uchar.h" |
15 | | #include "unicode/uniset.h" |
16 | | #include "unicode/chariter.h" |
17 | | #include "unicode/ures.h" |
18 | | #include "unicode/udata.h" |
19 | | #include "unicode/putil.h" |
20 | | #include "unicode/ustring.h" |
21 | | #include "unicode/uscript.h" |
22 | | #include "unicode/ucharstrie.h" |
23 | | #include "unicode/bytestrie.h" |
24 | | |
25 | | #include "brkeng.h" |
26 | | #include "cmemory.h" |
27 | | #include "dictbe.h" |
28 | | #include "charstr.h" |
29 | | #include "dictionarydata.h" |
30 | | #include "mutex.h" |
31 | | #include "uvector.h" |
32 | | #include "umutex.h" |
33 | | #include "uresimp.h" |
34 | | #include "ubrkimpl.h" |
35 | | |
36 | | U_NAMESPACE_BEGIN |
37 | | |
38 | | /* |
39 | | ****************************************************************** |
40 | | */ |
41 | | |
42 | 0 | LanguageBreakEngine::LanguageBreakEngine() { |
43 | 0 | } |
44 | | |
45 | 0 | LanguageBreakEngine::~LanguageBreakEngine() { |
46 | 0 | } |
47 | | |
48 | | /* |
49 | | ****************************************************************** |
50 | | */ |
51 | | |
52 | 0 | LanguageBreakFactory::LanguageBreakFactory() { |
53 | 0 | } |
54 | | |
55 | 0 | LanguageBreakFactory::~LanguageBreakFactory() { |
56 | 0 | } |
57 | | |
58 | | /* |
59 | | ****************************************************************** |
60 | | */ |
61 | | |
62 | 0 | UnhandledEngine::UnhandledEngine(UErrorCode &status) : fHandled(nullptr) { |
63 | 0 | (void)status; |
64 | 0 | } |
65 | | |
66 | 0 | UnhandledEngine::~UnhandledEngine() { |
67 | 0 | delete fHandled; |
68 | 0 | fHandled = nullptr; |
69 | 0 | } |
70 | | |
71 | | UBool |
72 | 0 | UnhandledEngine::handles(UChar32 c) const { |
73 | 0 | return fHandled && fHandled->contains(c); |
74 | 0 | } |
75 | | |
76 | | int32_t |
77 | | UnhandledEngine::findBreaks( UText *text, |
78 | | int32_t /* startPos */, |
79 | | int32_t endPos, |
80 | 0 | UVector32 &/*foundBreaks*/ ) const { |
81 | 0 | UChar32 c = utext_current32(text); |
82 | 0 | while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) { |
83 | 0 | utext_next32(text); // TODO: recast loop to work with post-increment operations. |
84 | 0 | c = utext_current32(text); |
85 | 0 | } |
86 | 0 | return 0; |
87 | 0 | } |
88 | | |
89 | | void |
90 | 0 | UnhandledEngine::handleCharacter(UChar32 c) { |
91 | 0 | if (fHandled == nullptr) { |
92 | 0 | fHandled = new UnicodeSet(); |
93 | 0 | if (fHandled == nullptr) { |
94 | 0 | return; |
95 | 0 | } |
96 | 0 | } |
97 | 0 | if (!fHandled->contains(c)) { |
98 | 0 | UErrorCode status = U_ZERO_ERROR; |
99 | 0 | // Apply the entire script of the character. |
100 | 0 | int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT); |
101 | 0 | fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status); |
102 | 0 | } |
103 | 0 | } |
104 | | |
105 | | /* |
106 | | ****************************************************************** |
107 | | */ |
108 | | |
109 | 0 | ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) { |
110 | 0 | fEngines = 0; |
111 | 0 | } |
112 | | |
113 | 0 | ICULanguageBreakFactory::~ICULanguageBreakFactory() { |
114 | 0 | if (fEngines != 0) { |
115 | 0 | delete fEngines; |
116 | 0 | } |
117 | 0 | } |
118 | | |
119 | | U_NAMESPACE_END |
120 | | U_CDECL_BEGIN |
121 | 0 | static void U_CALLCONV _deleteEngine(void *obj) { |
122 | 0 | delete (const icu::LanguageBreakEngine *) obj; |
123 | 0 | } |
124 | | U_CDECL_END |
125 | | U_NAMESPACE_BEGIN |
126 | | |
127 | | static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER; |
128 | | |
129 | | const LanguageBreakEngine * |
130 | 0 | ICULanguageBreakFactory::getEngineFor(UChar32 c) { |
131 | 0 | const LanguageBreakEngine *lbe = NULL; |
132 | 0 | UErrorCode status = U_ZERO_ERROR; |
133 | 0 |
|
134 | 0 | Mutex m(&gBreakEngineMutex); |
135 | 0 |
|
136 | 0 | if (fEngines == NULL) { |
137 | 0 | UStack *engines = new UStack(_deleteEngine, NULL, status); |
138 | 0 | if (U_FAILURE(status) || engines == NULL) { |
139 | 0 | // Note: no way to return error code to caller. |
140 | 0 | delete engines; |
141 | 0 | return NULL; |
142 | 0 | } |
143 | 0 | fEngines = engines; |
144 | 0 | } else { |
145 | 0 | int32_t i = fEngines->size(); |
146 | 0 | while (--i >= 0) { |
147 | 0 | lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i)); |
148 | 0 | if (lbe != NULL && lbe->handles(c)) { |
149 | 0 | return lbe; |
150 | 0 | } |
151 | 0 | } |
152 | 0 | } |
153 | 0 | |
154 | 0 | // We didn't find an engine. Create one. |
155 | 0 | lbe = loadEngineFor(c); |
156 | 0 | if (lbe != NULL) { |
157 | 0 | fEngines->push((void *)lbe, status); |
158 | 0 | } |
159 | 0 | return lbe; |
160 | 0 | } |
161 | | |
162 | | const LanguageBreakEngine * |
163 | 0 | ICULanguageBreakFactory::loadEngineFor(UChar32 c) { |
164 | 0 | UErrorCode status = U_ZERO_ERROR; |
165 | 0 | UScriptCode code = uscript_getScript(c, &status); |
166 | 0 | if (U_SUCCESS(status)) { |
167 | 0 | DictionaryMatcher *m = loadDictionaryMatcherFor(code); |
168 | 0 | if (m != NULL) { |
169 | 0 | const LanguageBreakEngine *engine = NULL; |
170 | 0 | switch(code) { |
171 | 0 | case USCRIPT_THAI: |
172 | 0 | engine = new ThaiBreakEngine(m, status); |
173 | 0 | break; |
174 | 0 | case USCRIPT_LAO: |
175 | 0 | engine = new LaoBreakEngine(m, status); |
176 | 0 | break; |
177 | 0 | case USCRIPT_MYANMAR: |
178 | 0 | engine = new BurmeseBreakEngine(m, status); |
179 | 0 | break; |
180 | 0 | case USCRIPT_KHMER: |
181 | 0 | engine = new KhmerBreakEngine(m, status); |
182 | 0 | break; |
183 | 0 |
|
184 | 0 | #if !UCONFIG_NO_NORMALIZATION |
185 | 0 | // CJK not available w/o normalization |
186 | 0 | case USCRIPT_HANGUL: |
187 | 0 | engine = new CjkBreakEngine(m, kKorean, status); |
188 | 0 | break; |
189 | 0 |
|
190 | 0 | // use same BreakEngine and dictionary for both Chinese and Japanese |
191 | 0 | case USCRIPT_HIRAGANA: |
192 | 0 | case USCRIPT_KATAKANA: |
193 | 0 | case USCRIPT_HAN: |
194 | 0 | engine = new CjkBreakEngine(m, kChineseJapanese, status); |
195 | 0 | break; |
196 | | #if 0 |
197 | | // TODO: Have to get some characters with script=common handled |
198 | | // by CjkBreakEngine (e.g. U+309B). Simply subjecting |
199 | | // them to CjkBreakEngine does not work. The engine has to |
200 | | // special-case them. |
201 | | case USCRIPT_COMMON: |
202 | | { |
203 | | UBlockCode block = ublock_getCode(code); |
204 | | if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA) |
205 | | engine = new CjkBreakEngine(dict, kChineseJapanese, status); |
206 | | break; |
207 | | } |
208 | | #endif |
209 | | #endif |
210 | 0 |
|
211 | 0 | default: |
212 | 0 | break; |
213 | 0 | } |
214 | 0 | if (engine == NULL) { |
215 | 0 | delete m; |
216 | 0 | } |
217 | 0 | else if (U_FAILURE(status)) { |
218 | 0 | delete engine; |
219 | 0 | engine = NULL; |
220 | 0 | } |
221 | 0 | return engine; |
222 | 0 | } |
223 | 0 | } |
224 | 0 | return NULL; |
225 | 0 | } |
226 | | |
227 | | DictionaryMatcher * |
228 | 0 | ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) { |
229 | 0 | UErrorCode status = U_ZERO_ERROR; |
230 | 0 | // open root from brkitr tree. |
231 | 0 | UResourceBundle *b = ures_open(U_ICUDATA_BRKITR, "", &status); |
232 | 0 | b = ures_getByKeyWithFallback(b, "dictionaries", b, &status); |
233 | 0 | int32_t dictnlength = 0; |
234 | 0 | const UChar *dictfname = |
235 | 0 | ures_getStringByKeyWithFallback(b, uscript_getShortName(script), &dictnlength, &status); |
236 | 0 | if (U_FAILURE(status)) { |
237 | 0 | ures_close(b); |
238 | 0 | return NULL; |
239 | 0 | } |
240 | 0 | CharString dictnbuf; |
241 | 0 | CharString ext; |
242 | 0 | const UChar *extStart = u_memrchr(dictfname, 0x002e, dictnlength); // last dot |
243 | 0 | if (extStart != NULL) { |
244 | 0 | int32_t len = (int32_t)(extStart - dictfname); |
245 | 0 | ext.appendInvariantChars(UnicodeString(FALSE, extStart + 1, dictnlength - len - 1), status); |
246 | 0 | dictnlength = len; |
247 | 0 | } |
248 | 0 | dictnbuf.appendInvariantChars(UnicodeString(FALSE, dictfname, dictnlength), status); |
249 | 0 | ures_close(b); |
250 | 0 |
|
251 | 0 | UDataMemory *file = udata_open(U_ICUDATA_BRKITR, ext.data(), dictnbuf.data(), &status); |
252 | 0 | if (U_SUCCESS(status)) { |
253 | 0 | // build trie |
254 | 0 | const uint8_t *data = (const uint8_t *)udata_getMemory(file); |
255 | 0 | const int32_t *indexes = (const int32_t *)data; |
256 | 0 | const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET]; |
257 | 0 | const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK; |
258 | 0 | DictionaryMatcher *m = NULL; |
259 | 0 | if (trieType == DictionaryData::TRIE_TYPE_BYTES) { |
260 | 0 | const int32_t transform = indexes[DictionaryData::IX_TRANSFORM]; |
261 | 0 | const char *characters = (const char *)(data + offset); |
262 | 0 | m = new BytesDictionaryMatcher(characters, transform, file); |
263 | 0 | } |
264 | 0 | else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) { |
265 | 0 | const UChar *characters = (const UChar *)(data + offset); |
266 | 0 | m = new UCharsDictionaryMatcher(characters, file); |
267 | 0 | } |
268 | 0 | if (m == NULL) { |
269 | 0 | // no matcher exists to take ownership - either we are an invalid |
270 | 0 | // type or memory allocation failed |
271 | 0 | udata_close(file); |
272 | 0 | } |
273 | 0 | return m; |
274 | 0 | } else if (dictfname != NULL) { |
275 | 0 | // we don't have a dictionary matcher. |
276 | 0 | // returning NULL here will cause us to fail to find a dictionary break engine, as expected |
277 | 0 | status = U_ZERO_ERROR; |
278 | 0 | return NULL; |
279 | 0 | } |
280 | 0 | return NULL; |
281 | 0 | } |
282 | | |
283 | | U_NAMESPACE_END |
284 | | |
285 | | #endif /* #if !UCONFIG_NO_BREAK_ITERATION */ |