Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/brkeng.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 ************************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 ************************************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_BREAK_ITERATION
13
14
#include "unicode/uchar.h"
15
#include "unicode/uniset.h"
16
#include "unicode/chariter.h"
17
#include "unicode/ures.h"
18
#include "unicode/udata.h"
19
#include "unicode/putil.h"
20
#include "unicode/ustring.h"
21
#include "unicode/uscript.h"
22
#include "unicode/ucharstrie.h"
23
#include "unicode/bytestrie.h"
24
25
#include "brkeng.h"
26
#include "cmemory.h"
27
#include "dictbe.h"
28
#include "charstr.h"
29
#include "dictionarydata.h"
30
#include "mutex.h"
31
#include "uvector.h"
32
#include "umutex.h"
33
#include "uresimp.h"
34
#include "ubrkimpl.h"
35
36
U_NAMESPACE_BEGIN
37
38
/*
39
 ******************************************************************
40
 */
41
42
0
// Constructor: nothing to initialize in this translation unit.
LanguageBreakEngine::LanguageBreakEngine() = default;
44
45
0
// Destructor: performs no explicit cleanup.
LanguageBreakEngine::~LanguageBreakEngine() = default;
47
48
/*
49
 ******************************************************************
50
 */
51
52
0
// Constructor: nothing to initialize in this translation unit.
LanguageBreakFactory::LanguageBreakFactory() = default;
54
55
0
// Destructor: performs no explicit cleanup.
LanguageBreakFactory::~LanguageBreakFactory() = default;
57
58
/*
59
 ******************************************************************
60
 */
61
62
0
// Starts with no handled-character set; handleCharacter() allocates it on
// first use. The error-code argument is accepted but not used.
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/)
    : fHandled(nullptr) {
}
65
66
0
// Releases the handled-character set (if any) and clears the member.
UnhandledEngine::~UnhandledEngine() {
    UnicodeSet *doomed = fHandled;
    fHandled = nullptr;
    delete doomed;
}
70
71
// Reports whether c is in the handled-character set.
// Returns FALSE when no set has been allocated yet.
UBool
UnhandledEngine::handles(UChar32 c) const {
    if (fHandled == nullptr) {
        return FALSE;
    }
    return fHandled->contains(c) ? TRUE : FALSE;
}
75
76
// Advances `text` past the run of characters that are in the handled set,
// stopping once the native index reaches endPos or a character outside the
// set is seen. No break positions are produced.
//
// @param text    Text to scan; its iteration position is advanced.
// @param endPos  Native index at which scanning stops.
// @return        Always 0.
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/ ) const {
    // fHandled stays null until handleCharacter() has allocated it; with no
    // set there is nothing this engine handles, so there is nothing to skip.
    if (fHandled == nullptr) {
        return 0;
    }
    UChar32 c = utext_current32(text);
    while((int32_t)utext_getNativeIndex(text) < endPos && fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
88
89
void
90
0
UnhandledEngine::handleCharacter(UChar32 c) {
91
0
    if (fHandled == nullptr) {
92
0
        fHandled = new UnicodeSet();
93
0
        if (fHandled == nullptr) {
94
0
            return;
95
0
        }
96
0
    }
97
0
    if (!fHandled->contains(c)) {
98
0
        UErrorCode status = U_ZERO_ERROR;
99
0
        // Apply the entire script of the character.
100
0
        int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
101
0
        fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
102
0
    }
103
0
}
104
105
/*
106
 ******************************************************************
107
 */
108
109
0
// Starts with no engine stack; getEngineFor() creates it on demand.
// The error-code argument is not used.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    fEngines = nullptr;
}
112
113
0
// Destroys the engine stack, if one was ever created.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    // Deleting a null pointer is a no-op, so no explicit test is needed.
    delete fEngines;
}
118
119
U_NAMESPACE_END
120
U_CDECL_BEGIN
121
0
// Element deleter handed to the UStack built in getEngineFor(): destroys
// obj as a LanguageBreakEngine.
static void U_CALLCONV _deleteEngine(void *obj) {
    const icu::LanguageBreakEngine *engine =
        static_cast<const icu::LanguageBreakEngine *>(obj);
    delete engine;
}
124
U_CDECL_END
125
U_NAMESPACE_BEGIN
126
127
// File-level mutex; ICULanguageBreakFactory::getEngineFor() holds it while it
// reads and updates fEngines.
static UMutex gBreakEngineMutex = U_MUTEX_INITIALIZER;
128
129
// Returns a break engine that handles character c, or NULL if none is
// available. Previously created engines are kept in fEngines and searched
// from the most recently added one; on a miss a new engine is loaded with
// loadEngineFor() and pushed onto the stack. The whole operation runs under
// gBreakEngineMutex.
//
// @param c  Code point for which an engine is wanted.
// @return   An engine cached in fEngines, or NULL on failure / no engine.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
    const LanguageBreakEngine *lbe = NULL;
    UErrorCode  status = U_ZERO_ERROR;

    Mutex m(&gBreakEngineMutex);

    if (fEngines == NULL) {
        UStack  *engines = new UStack(_deleteEngine, NULL, status);
        if (U_FAILURE(status) || engines == NULL) {
            // Note: no way to return error code to caller.
            delete engines;
            return NULL;
        }
        fEngines = engines;
    } else {
        int32_t i = fEngines->size();
        while (--i >= 0) {
            lbe = (const LanguageBreakEngine *)(fEngines->elementAt(i));
            if (lbe != NULL && lbe->handles(c)) {
                return lbe;
            }
        }
    }

    // We didn't find an engine. Create one.
    lbe = loadEngineFor(c);
    if (lbe != NULL) {
        fEngines->push((void *)lbe, status);
        if (U_FAILURE(status)) {
            // The push failed, so the stack does not hold the new engine and
            // nothing else would ever free it; release it here rather than
            // hand out an engine that is not cached.
            // NOTE(review): assumes a failed push leaves the element un-owned
            // (it does not invoke the deleter) -- confirm against uvector.h.
            delete lbe;
            lbe = NULL;
        }
    }
    return lbe;
}
161
162
// Builds a new dictionary-based break engine for the script of c.
// Returns NULL when the script lookup fails, when no dictionary matcher can
// be loaded for the script, when the script has no engine type in the switch
// below, or when engine construction fails.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;
    UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return NULL;
    }

    DictionaryMatcher *matcher = loadDictionaryMatcherFor(code);
    if (matcher == NULL) {
        return NULL;
    }

    const LanguageBreakEngine *engine = NULL;
    switch (code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(matcher, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(matcher, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(matcher, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(matcher, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
    // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(matcher, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(matcher, kChineseJapanese, status);
        break;
#if 0
    // TODO: Have to get some characters with script=common handled
    // by CjkBreakEngine (e.g. U+309B). Simply subjecting
    // them to CjkBreakEngine does not work. The engine has to
    // special-case them.
    case USCRIPT_COMMON:
    {
        UBlockCode block = ublock_getCode(code);
        if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
           engine = new CjkBreakEngine(dict, kChineseJapanese, status);
        break;
    }
#endif
#endif

    default:
        break;
    }

    if (engine == NULL) {
        // No engine was created, so the matcher is still ours to free.
        delete matcher;
        return NULL;
    }
    if (U_FAILURE(status)) {
        // Only the engine is deleted here; presumably it has taken ownership
        // of the matcher by this point -- verify against dictbe.h.
        delete engine;
        return NULL;
    }
    return engine;
}
226
227
// Looks up the dictionary file name registered for `script` under the
// "dictionaries" table of the break-iterator resource bundle, opens that data
// file, and wraps its trie in a DictionaryMatcher.
//
// @param script  Script whose short name is the lookup key.
// @return        A newly allocated matcher, or NULL if the resource lookup
//                fails, the data file cannot be opened, the trie type is not
//                one of the two recognized here, or allocation fails.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    UErrorCode status = U_ZERO_ERROR;

    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);
    int32_t nameLength = 0;
    const UChar *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return NULL;
    }

    // Split "name.ext" at the last dot into a base name and an extension.
    CharString baseName;
    CharString extension;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);  // last dot
    if (lastDot != NULL) {
        const int32_t baseLength = (int32_t)(lastDot - fileName);
        extension.appendInvariantChars(
            UnicodeString(FALSE, lastDot + 1, nameLength - baseLength - 1), status);
        nameLength = baseLength;
    }
    baseName.appendInvariantChars(UnicodeString(FALSE, fileName, nameLength), status);
    // The bundle is closed only after both names have been copied out;
    // presumably fileName points into bundle-owned storage.
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, extension.data(), baseName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
        return NULL;
    }

    // build trie
    const uint8_t *bytes = (const uint8_t *)udata_getMemory(file);
    const int32_t *header = (const int32_t *)bytes;
    const int32_t trieOffset = header[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieKind = header[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = NULL;
    if (trieKind == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = header[DictionaryData::IX_TRANSFORM];
        const char *trieBytes = (const char *)(bytes + trieOffset);
        matcher = new BytesDictionaryMatcher(trieBytes, transform, file);
    } else if (trieKind == DictionaryData::TRIE_TYPE_UCHARS) {
        const UChar *trieUnits = (const UChar *)(bytes + trieOffset);
        matcher = new UCharsDictionaryMatcher(trieUnits, file);
    }

    if (matcher == NULL) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}
282
283
U_NAMESPACE_END
284
285
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */