Coverage Report

Created: 2025-06-24 06:43

/src/icu/source/common/brkeng.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
 ************************************************************************************
5
 * Copyright (C) 2006-2016, International Business Machines Corporation
6
 * and others. All Rights Reserved.
7
 ************************************************************************************
8
 */
9
10
#include "unicode/utypes.h"
11
12
#if !UCONFIG_NO_BREAK_ITERATION
13
14
#include "unicode/uchar.h"
15
#include "unicode/uniset.h"
16
#include "unicode/chariter.h"
17
#include "unicode/ures.h"
18
#include "unicode/udata.h"
19
#include "unicode/putil.h"
20
#include "unicode/ustring.h"
21
#include "unicode/uscript.h"
22
#include "unicode/ucharstrie.h"
23
#include "unicode/bytestrie.h"
24
25
#include "brkeng.h"
26
#include "cmemory.h"
27
#include "dictbe.h"
28
#include "lstmbe.h"
29
#include "charstr.h"
30
#include "dictionarydata.h"
31
#include "mutex.h"
32
#include "uvector.h"
33
#include "umutex.h"
34
#include "uresimp.h"
35
#include "ubrkimpl.h"
36
37
U_NAMESPACE_BEGIN
38
39
/*
40
 ******************************************************************
41
 */
42
43
0
// Default constructor: the base engine has nothing to initialize here.
LanguageBreakEngine::LanguageBreakEngine() = default;
45
46
0
// Destructor: no resources are released at this level.
LanguageBreakEngine::~LanguageBreakEngine() = default;
48
49
/*
50
 ******************************************************************
51
 */
52
53
0
// Default constructor: the base factory has nothing to initialize here.
LanguageBreakFactory::LanguageBreakFactory() = default;
55
56
0
// Destructor: no resources are released at this level.
LanguageBreakFactory::~LanguageBreakFactory() = default;
58
59
/*
60
 ******************************************************************
61
 */
62
63
0
// Creates an engine whose handled-character set does not exist yet;
// handleCharacter() allocates it on first use.
// The error-code parameter is accepted but never read or written.
UnhandledEngine::UnhandledEngine(UErrorCode &/*status*/)
    : fHandled(nullptr) {
}
66
67
0
// Releases the handled-character set (if one was ever allocated) and
// leaves the member cleared.
UnhandledEngine::~UnhandledEngine() {
    auto *owned = fHandled;
    fHandled = nullptr;
    delete owned;
}
71
72
// Reports whether code point c is in the handled set.
// Returns false while no set has been allocated yet.
UBool
UnhandledEngine::handles(UChar32 c) const {
    const auto *handledSet = fHandled;
    return handledSet != nullptr && handledSet->contains(c);
}
76
77
// Advances the text iterator past the run of characters this engine has
// recorded as handled, without reporting any break positions.
//
// @param text    Text being scanned; iteration starts at its current position.
// @param endPos  Native index at which scanning must stop.
// @param status  If already a failure on entry, nothing is done.
// @return        Always 0: this engine never adds breaks.
int32_t
UnhandledEngine::findBreaks( UText *text,
                             int32_t /* startPos */,
                             int32_t endPos,
                             UVector32 &/*foundBreaks*/,
                             UErrorCode &status) const {
    if (U_FAILURE(status)) return 0;
    // fHandled stays nullptr until handleCharacter() allocates it; with no
    // set there is no handled character to skip, so avoid dereferencing it.
    if (fHandled == nullptr) return 0;
    UChar32 c = utext_current32(text);
    while (static_cast<int32_t>(utext_getNativeIndex(text)) < endPos && fHandled->contains(c)) {
        utext_next32(text);            // TODO:  recast loop to work with post-increment operations.
        c = utext_current32(text);
    }
    return 0;
}
91
92
void
93
0
UnhandledEngine::handleCharacter(UChar32 c) {
94
0
    if (fHandled == nullptr) {
95
0
        fHandled = new UnicodeSet();
96
0
        if (fHandled == nullptr) {
97
0
            return;
98
0
        }
99
0
    }
100
0
    if (!fHandled->contains(c)) {
101
0
        UErrorCode status = U_ZERO_ERROR;
102
        // Apply the entire script of the character.
103
0
        int32_t script = u_getIntPropertyValue(c, UCHAR_SCRIPT);
104
0
        fHandled->applyIntPropertyValue(UCHAR_SCRIPT, script, status);
105
0
    }
106
0
}
107
108
/*
109
 ******************************************************************
110
 */
111
112
0
// Starts with no engine stack; getEngineFor() creates it on first use.
// The error-code parameter is unused.
ICULanguageBreakFactory::ICULanguageBreakFactory(UErrorCode &/*status*/) {
    fEngines = nullptr;
}
115
116
0
// Destroys the engine stack, if one was created.
ICULanguageBreakFactory::~ICULanguageBreakFactory() {
    // Deleting a null pointer is a no-op, so no explicit check is needed.
    delete fEngines;
}
121
122
U_NAMESPACE_END
123
U_CDECL_BEGIN
124
0
// Deletes obj as a LanguageBreakEngine. It is handed to the UStack built in
// getEngineFor(), presumably as that stack's element deleter.
static void U_CALLCONV _deleteEngine(void *obj) {
    const icu::LanguageBreakEngine *engine =
        static_cast<const icu::LanguageBreakEngine *>(obj);
    delete engine;
}
127
U_CDECL_END
128
U_NAMESPACE_BEGIN
129
130
// Returns an engine that handles code point c, or nullptr when none can be
// found or created. Previously created engines are kept in fEngines and are
// searched from the most recently pushed one backwards; if none matches, a
// new engine is loaded via loadEngineFor() and pushed onto the stack.
// The whole operation runs while holding a function-local static mutex.
const LanguageBreakEngine *
ICULanguageBreakFactory::getEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;

    static UMutex gBreakEngineMutex;
    Mutex lock(&gBreakEngineMutex);

    if (fEngines != nullptr) {
        for (int32_t idx = fEngines->size() - 1; idx >= 0; --idx) {
            const LanguageBreakEngine *cached =
                static_cast<const LanguageBreakEngine *>(fEngines->elementAt(idx));
            if (cached != nullptr && cached->handles(c)) {
                return cached;
            }
        }
    } else {
        UStack *stack = new UStack(_deleteEngine, nullptr, status);
        if (stack == nullptr || U_FAILURE(status)) {
            // Note: no way to return error code to caller.
            delete stack;
            return nullptr;
        }
        fEngines = stack;
    }

    // We didn't find an engine. Create one.
    const LanguageBreakEngine *created = loadEngineFor(c);
    if (created != nullptr) {
        // NOTE(review): the status set by push() is not inspected; confirm
        // who owns the engine if the push fails.
        fEngines->push(const_cast<LanguageBreakEngine *>(created), status);
    }
    return created;
}
163
164
// Builds a break engine for the script of code point c.
// An LSTM-based engine is attempted first; if that does not yield a usable
// engine, a dictionary-based engine is created for Thai, Lao, Myanmar and
// Khmer, plus Hangul / Hiragana / Katakana / Han when normalization is
// compiled in. Returns nullptr if the script cannot be determined, no
// dictionary is available, the script has no engine, or construction fails.
const LanguageBreakEngine *
ICULanguageBreakFactory::loadEngineFor(UChar32 c) {
    UErrorCode status = U_ZERO_ERROR;
    const UScriptCode code = uscript_getScript(c, &status);
    if (U_FAILURE(status)) {
        return nullptr;
    }

    const LanguageBreakEngine *engine = nullptr;

    // Try to use LSTM first
    const LSTMData *data = CreateLSTMDataForScript(code, status);
    if (U_SUCCESS(status) && data != nullptr) {
        engine = CreateLSTMBreakEngine(code, data, status);
        if (engine != nullptr) {
            if (U_SUCCESS(status)) {
                return engine;
            }
            // An engine object came back together with an error: discard it.
            delete engine;
            engine = nullptr;
        } else {
            // No engine was created, so the LSTM data is released here.
            DeleteLSTMData(data);
        }
    }

    status = U_ZERO_ERROR;  // fallback to dictionary based
    DictionaryMatcher *matcher = loadDictionaryMatcherFor(code);
    if (matcher == nullptr) {
        return nullptr;
    }

    switch (code) {
    case USCRIPT_THAI:
        engine = new ThaiBreakEngine(matcher, status);
        break;
    case USCRIPT_LAO:
        engine = new LaoBreakEngine(matcher, status);
        break;
    case USCRIPT_MYANMAR:
        engine = new BurmeseBreakEngine(matcher, status);
        break;
    case USCRIPT_KHMER:
        engine = new KhmerBreakEngine(matcher, status);
        break;

#if !UCONFIG_NO_NORMALIZATION
        // CJK not available w/o normalization
    case USCRIPT_HANGUL:
        engine = new CjkBreakEngine(matcher, kKorean, status);
        break;

    // use same BreakEngine and dictionary for both Chinese and Japanese
    case USCRIPT_HIRAGANA:
    case USCRIPT_KATAKANA:
    case USCRIPT_HAN:
        engine = new CjkBreakEngine(matcher, kChineseJapanese, status);
        break;
#if 0
            // TODO: Have to get some characters with script=common handled
            // by CjkBreakEngine (e.g. U+309B). Simply subjecting
            // them to CjkBreakEngine does not work. The engine has to
            // special-case them.
            case USCRIPT_COMMON:
            {
                UBlockCode block = ublock_getCode(code);
                if (block == UBLOCK_HIRAGANA || block == UBLOCK_KATAKANA)
                   engine = new CjkBreakEngine(dict, kChineseJapanese, status);
                break;
            }
#endif
#endif

    default:
        break;
    }

    if (engine == nullptr) {
        // No engine was constructed for the matcher, so free it here.
        delete matcher;
    } else if (U_FAILURE(status)) {
        // Only the engine is deleted on this path; it is presumed to own
        // the matcher it was constructed with.
        delete engine;
        engine = nullptr;
    }
    return engine;
}
245
246
// Loads the dictionary registered for `script` in the break-iterator resource
// bundle and wraps it in a matcher.
// The dictionary file name is looked up under "dictionaries" using the
// script's short name, split at its last dot into name and extension, and
// opened with udata_open(). Depending on the trie type stored in the data, a
// bytes-trie or UChars-trie matcher is created around it.
// Returns nullptr when the lookup fails, the data cannot be opened, the trie
// type is not one of those two, or the matcher allocation fails.
DictionaryMatcher *
ICULanguageBreakFactory::loadDictionaryMatcherFor(UScriptCode script) {
    UErrorCode status = U_ZERO_ERROR;
    // open root from brkitr tree.
    UResourceBundle *bundle = ures_open(U_ICUDATA_BRKITR, "", &status);
    bundle = ures_getByKeyWithFallback(bundle, "dictionaries", bundle, &status);
    int32_t nameLength = 0;
    const UChar *fileName =
        ures_getStringByKeyWithFallback(bundle, uscript_getShortName(script), &nameLength, &status);
    if (U_FAILURE(status)) {
        ures_close(bundle);
        return nullptr;
    }

    CharString baseName;
    CharString extension;
    const UChar *lastDot = u_memrchr(fileName, 0x002e, nameLength);  // last dot
    if (lastDot != nullptr) {
        const int32_t baseLength = static_cast<int32_t>(lastDot - fileName);
        extension.appendInvariantChars(
            UnicodeString(FALSE, lastDot + 1, nameLength - baseLength - 1), status);
        nameLength = baseLength;
    }
    baseName.appendInvariantChars(UnicodeString(FALSE, fileName, nameLength), status);
    ures_close(bundle);

    UDataMemory *file = udata_open(U_ICUDATA_BRKITR, extension.data(), baseName.data(), &status);
    if (U_FAILURE(status)) {
        // we don't have a dictionary matcher.
        // returning NULL here will cause us to fail to find a dictionary break engine, as expected
        return nullptr;
    }

    // build trie
    const uint8_t *data = static_cast<const uint8_t *>(udata_getMemory(file));
    const int32_t *indexes = reinterpret_cast<const int32_t *>(data);
    const int32_t offset = indexes[DictionaryData::IX_STRING_TRIE_OFFSET];
    const int32_t trieType = indexes[DictionaryData::IX_TRIE_TYPE] & DictionaryData::TRIE_TYPE_MASK;

    DictionaryMatcher *matcher = nullptr;
    if (trieType == DictionaryData::TRIE_TYPE_BYTES) {
        const int32_t transform = indexes[DictionaryData::IX_TRANSFORM];
        const char *characters = reinterpret_cast<const char *>(data + offset);
        matcher = new BytesDictionaryMatcher(characters, transform, file);
    } else if (trieType == DictionaryData::TRIE_TYPE_UCHARS) {
        const UChar *characters = reinterpret_cast<const UChar *>(data + offset);
        matcher = new UCharsDictionaryMatcher(characters, file);
    }

    if (matcher == nullptr) {
        // no matcher exists to take ownership - either we are an invalid
        // type or memory allocation failed
        udata_close(file);
    }
    return matcher;
}
301
302
U_NAMESPACE_END
303
304
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */