Coverage Report

Created: 2025-06-13 06:38

/src/icu/icu4c/source/common/rbbirb.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  rbbirb.cpp
5
//
6
//  Copyright (C) 2002-2011, International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the RBBIRuleBuilder class implementation.  This is the main class for
10
//    building (compiling) break rules into the tables required by the runtime
11
//    RBBI engine.
12
//
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_BREAK_ITERATION
17
18
#include "unicode/brkiter.h"
19
#include "unicode/rbbi.h"
20
#include "unicode/ubrk.h"
21
#include "unicode/unistr.h"
22
#include "unicode/uniset.h"
23
#include "unicode/uchar.h"
24
#include "unicode/uchriter.h"
25
#include "unicode/ustring.h"
26
#include "unicode/parsepos.h"
27
#include "unicode/parseerr.h"
28
29
#include "cmemory.h"
30
#include "cstring.h"
31
#include "rbbirb.h"
32
#include "rbbinode.h"
33
#include "rbbiscan.h"
34
#include "rbbisetb.h"
35
#include "rbbitblb.h"
36
#include "rbbidata.h"
37
#include "uassert.h"
38
39
40
U_NAMESPACE_BEGIN
41
42
43
//----------------------------------------------------------------------------------------
44
//
45
//  Constructor.
46
//
47
//----------------------------------------------------------------------------------------
48
RBBIRuleBuilder::RBBIRuleBuilder(const UnicodeString   &rules,
49
                                       UParseError     *parseErr,
50
                                       UErrorCode      &status)
51
8.41k
 : fRules(rules), fStrippedRules(rules)
52
8.41k
{
53
8.41k
    fStatus = &status; // status is checked below
54
8.41k
    fParseError = parseErr;
55
8.41k
    fDebugEnv   = nullptr;
56
#ifdef RBBI_DEBUG
57
    fDebugEnv   = getenv("U_RBBIDEBUG");
58
#endif
59
60
61
8.41k
    fForwardTree        = nullptr;
62
8.41k
    fReverseTree        = nullptr;
63
8.41k
    fSafeFwdTree        = nullptr;
64
8.41k
    fSafeRevTree        = nullptr;
65
8.41k
    fDefaultTree        = &fForwardTree;
66
8.41k
    fForwardTable       = nullptr;
67
8.41k
    fRuleStatusVals     = nullptr;
68
8.41k
    fChainRules         = false;
69
8.41k
    fLookAheadHardBreak = false;
70
8.41k
    fUSetNodes          = nullptr;
71
8.41k
    fRuleStatusVals     = nullptr;
72
8.41k
    fScanner            = nullptr;
73
8.41k
    fSetBuilder         = nullptr;
74
8.41k
    if (parseErr) {
75
8.41k
        uprv_memset(parseErr, 0, sizeof(UParseError));
76
8.41k
    }
77
78
8.41k
    if (U_FAILURE(status)) {
79
0
        return;
80
0
    }
81
82
8.41k
    fUSetNodes          = new UVector(status); // bcos status gets overwritten here
83
8.41k
    fRuleStatusVals     = new UVector(status);
84
8.41k
    fScanner            = new RBBIRuleScanner(this);
85
8.41k
    fSetBuilder         = new RBBISetBuilder(this);
86
8.41k
    if (U_FAILURE(status)) {
87
0
        return;
88
0
    }
89
8.41k
    if (fSetBuilder == nullptr || fScanner == nullptr ||
90
8.41k
        fUSetNodes == nullptr || fRuleStatusVals == nullptr) {
91
0
        status = U_MEMORY_ALLOCATION_ERROR;
92
0
    }
93
8.41k
}
94
95
96
97
//----------------------------------------------------------------------------------------
98
//
99
//  Destructor
100
//
101
//----------------------------------------------------------------------------------------
102
8.41k
RBBIRuleBuilder::~RBBIRuleBuilder() {
103
104
8.41k
    int        i;
105
182k
    for (i=0; ; i++) {
106
182k
        RBBINode* n = static_cast<RBBINode*>(fUSetNodes->elementAt(i));
107
182k
        if (n==nullptr) {
108
8.41k
            break;
109
8.41k
        }
110
174k
        delete n;
111
174k
    }
112
113
8.41k
    delete fUSetNodes;
114
8.41k
    delete fSetBuilder;
115
8.41k
    delete fForwardTable;
116
8.41k
    delete fForwardTree;
117
8.41k
    delete fReverseTree;
118
8.41k
    delete fSafeFwdTree;
119
8.41k
    delete fSafeRevTree;
120
8.41k
    delete fScanner;
121
8.41k
    delete fRuleStatusVals;
122
8.41k
}
123
124
125
126
127
128
//----------------------------------------------------------------------------------------
129
//
130
//   flattenData() -  Collect up the compiled RBBI rule data and put it into
131
//                    the format for saving in ICU data files,
132
//                    which is also the format needed by the RBBI runtime engine.
133
//
134
//----------------------------------------------------------------------------------------
135
15.8k
static int32_t align8(int32_t i) {return (i+7) & 0xfffffff8;}
136
137
2.67k
RBBIDataHeader *RBBIRuleBuilder::flattenData() {
138
2.67k
    int32_t    i;
139
140
2.67k
    if (U_FAILURE(*fStatus)) {
141
41
        return nullptr;
142
41
    }
143
144
    // Remove whitespace from the rules to make it smaller.
145
    // The rule parser has already removed comments.
146
2.63k
    fStrippedRules = fScanner->stripRules(fStrippedRules);
147
148
    // Calculate the size of each section in the data.
149
    //   Sizes here are padded up to a multiple of 8 for better memory alignment.
150
    //   Sections sizes actually stored in the header are for the actual data
151
    //     without the padding.
152
    //
153
2.63k
    int32_t headerSize        = align8(sizeof(RBBIDataHeader));
154
2.63k
    int32_t forwardTableSize  = align8(fForwardTable->getTableSize());
155
2.63k
    int32_t reverseTableSize  = align8(fForwardTable->getSafeTableSize());
156
2.63k
    int32_t trieSize          = align8(fSetBuilder->getTrieSize());
157
2.63k
    int32_t statusTableSize   = align8(fRuleStatusVals->size() * sizeof(int32_t));
158
159
2.63k
    int32_t rulesLengthInUTF8 = 0;
160
2.63k
    u_strToUTF8WithSub(nullptr, 0, &rulesLengthInUTF8,
161
2.63k
                       fStrippedRules.getBuffer(), fStrippedRules.length(),
162
2.63k
                       0xfffd, nullptr, fStatus);
163
2.63k
    *fStatus = U_ZERO_ERROR;
164
165
2.63k
    int32_t rulesSize         = align8((rulesLengthInUTF8+1));
166
167
2.63k
    int32_t         totalSize = headerSize
168
2.63k
                                + forwardTableSize
169
2.63k
                                + reverseTableSize
170
2.63k
                                + statusTableSize + trieSize + rulesSize;
171
172
#ifdef RBBI_DEBUG
173
    if (fDebugEnv && uprv_strstr(fDebugEnv, "size")) {
174
        RBBIDebugPrintf("Header Size:        %8d\n", headerSize);
175
        RBBIDebugPrintf("Forward Table Size: %8d\n", forwardTableSize);
176
        RBBIDebugPrintf("Reverse Table Size: %8d\n", reverseTableSize);
177
        RBBIDebugPrintf("Trie Size:          %8d\n", trieSize);
178
        RBBIDebugPrintf("Status Table Size:  %8d\n", statusTableSize);
179
        RBBIDebugPrintf("Rules Size:         %8d\n", rulesSize);
180
        RBBIDebugPrintf("-----------------------------\n");
181
        RBBIDebugPrintf("Total Size:         %8d\n", totalSize);
182
    }
183
#endif
184
185
2.63k
    LocalMemory<RBBIDataHeader> data(static_cast<RBBIDataHeader*>(uprv_malloc(totalSize)));
186
2.63k
    if (data.isNull()) {
187
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
188
0
        return nullptr;
189
0
    }
190
2.63k
    uprv_memset(data.getAlias(), 0, totalSize);
191
192
193
2.63k
    data->fMagic            = 0xb1a0;
194
2.63k
    data->fFormatVersion[0] = RBBI_DATA_FORMAT_VERSION[0];
195
2.63k
    data->fFormatVersion[1] = RBBI_DATA_FORMAT_VERSION[1];
196
2.63k
    data->fFormatVersion[2] = RBBI_DATA_FORMAT_VERSION[2];
197
2.63k
    data->fFormatVersion[3] = RBBI_DATA_FORMAT_VERSION[3];
198
2.63k
    data->fLength           = totalSize;
199
2.63k
    data->fCatCount         = fSetBuilder->getNumCharCategories();
200
201
2.63k
    data->fFTable        = headerSize;
202
2.63k
    data->fFTableLen     = forwardTableSize;
203
204
2.63k
    data->fRTable        = data->fFTable  + data->fFTableLen;
205
2.63k
    data->fRTableLen     = reverseTableSize;
206
207
2.63k
    data->fTrie          = data->fRTable + data->fRTableLen;
208
2.63k
    data->fTrieLen       = trieSize;
209
2.63k
    data->fStatusTable   = data->fTrie    + data->fTrieLen;
210
2.63k
    data->fStatusTableLen= statusTableSize;
211
2.63k
    data->fRuleSource    = data->fStatusTable + statusTableSize;
212
2.63k
    data->fRuleSourceLen = rulesLengthInUTF8;
213
214
2.63k
    uprv_memset(data->fReserved, 0, sizeof(data->fReserved));
215
216
2.63k
    fForwardTable->exportTable(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fFTable);
217
2.63k
    fForwardTable->exportSafeTable(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fRTable);
218
2.63k
    fSetBuilder->serializeTrie(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fTrie);
219
220
2.63k
    int32_t* ruleStatusTable = reinterpret_cast<int32_t*>(reinterpret_cast<uint8_t*>(data.getAlias()) + data->fStatusTable);
221
16.4k
    for (i=0; i<fRuleStatusVals->size(); i++) {
222
13.8k
        ruleStatusTable[i] = fRuleStatusVals->elementAti(i);
223
13.8k
    }
224
225
2.63k
    u_strToUTF8WithSub(reinterpret_cast<char*>(data.getAlias()) + data->fRuleSource, rulesSize, &rulesLengthInUTF8,
226
2.63k
                       fStrippedRules.getBuffer(), fStrippedRules.length(),
227
2.63k
                       0xfffd, nullptr, fStatus);
228
2.63k
    if (U_FAILURE(*fStatus)) {
229
0
        return nullptr;
230
0
    }
231
232
2.63k
    return data.orphan();
233
2.63k
}
234
235
236
//----------------------------------------------------------------------------------------
237
//
238
//  createRuleBasedBreakIterator    construct from source rules that are passed in
239
//                                  in a UnicodeString
240
//
241
//----------------------------------------------------------------------------------------
242
BreakIterator *
243
RBBIRuleBuilder::createRuleBasedBreakIterator( const UnicodeString    &rules,
244
                                    UParseError      *parseError,
245
                                    UErrorCode       &status)
246
8.41k
{
247
    //
248
    // Read the input rules, generate a parse tree, symbol table,
249
    // and list of all Unicode Sets referenced by the rules.
250
    //
251
8.41k
    RBBIRuleBuilder  builder(rules, parseError, status);
252
8.41k
    if (U_FAILURE(status)) { // status checked here bcos build below doesn't
253
0
        return nullptr;
254
0
    }
255
256
8.41k
    RBBIDataHeader *data = builder.build(status);
257
258
8.41k
    if (U_FAILURE(status)) {
259
5.77k
        return nullptr;
260
5.77k
    }
261
262
    //
263
    //  Create a break iterator from the compiled rules.
264
    //     (Identical to creation from stored pre-compiled rules)
265
    //
266
    // status is checked after init in construction.
267
2.63k
    RuleBasedBreakIterator *This = new RuleBasedBreakIterator(data, status);
268
2.63k
    if (U_FAILURE(status)) {
269
0
        delete This;
270
0
        This = nullptr;
271
0
    } 
272
2.63k
    else if(This == nullptr) { // test for nullptr
273
0
        status = U_MEMORY_ALLOCATION_ERROR;
274
0
    }
275
2.63k
    return This;
276
8.41k
}
277
278
8.41k
RBBIDataHeader *RBBIRuleBuilder::build(UErrorCode &status) {
279
8.41k
    if (U_FAILURE(status)) {
280
0
        return nullptr;
281
0
    }
282
283
8.41k
    fScanner->parse();
284
8.41k
    if (U_FAILURE(status)) {
285
5.73k
        return nullptr;
286
5.73k
    }
287
288
    //
289
    // UnicodeSet processing.
290
    //    Munge the Unicode Sets to create an initial set of character categories.
291
    //
292
2.67k
    fSetBuilder->buildRanges();
293
294
    //
295
    //   Generate the DFA state transition table.
296
    //
297
2.67k
    fForwardTable = new RBBITableBuilder(this, &fForwardTree, status);
298
2.67k
    if (fForwardTable == nullptr) {
299
0
        status = U_MEMORY_ALLOCATION_ERROR;
300
0
        return nullptr;
301
0
    }
302
303
2.67k
    fForwardTable->buildForwardTable();
304
305
    // State table and character category optimization.
306
    // Merge equivalent rows and columns.
307
    // Note that this process alters the initial set of character categories,
308
    // causing the representation of UnicodeSets in the parse tree to become invalid.
309
310
2.67k
    optimizeTables();
311
2.67k
    fForwardTable->buildSafeReverseTable(status);
312
313
314
#ifdef RBBI_DEBUG
315
    if (fDebugEnv && uprv_strstr(fDebugEnv, "states")) {
316
        fForwardTable->printStates();
317
        fForwardTable->printRuleStatusTable();
318
        fForwardTable->printReverseTable();
319
    }
320
#endif
321
322
    //    Generate the mapping tables (TRIE) from input code points to
323
    //    the character categories.
324
    //
325
2.67k
    fSetBuilder->buildTrie();
326
327
    //
328
    //   Package up the compiled data into a memory image
329
    //      in the run-time format.
330
    //
331
2.67k
    RBBIDataHeader *data = flattenData(); // returns nullptr if error
332
2.67k
    if (U_FAILURE(status)) {
333
41
        return nullptr;
334
41
    }
335
2.63k
    return data;
336
2.67k
}
337
338
2.67k
void RBBIRuleBuilder::optimizeTables() {
339
2.67k
    bool didSomething;
340
3.26k
    do {
341
3.26k
        didSomething = false;
342
343
        // Begin looking for duplicates with char class 3.
344
        // Classes 0, 1 and 2 are special; they are unused, {bof} and {eof} respectively,
345
        // and should not have other categories merged into them.
346
3.26k
        IntPair duplPair = {3, 0};
347
15.7k
        while (fForwardTable->findDuplCharClassFrom(&duplPair)) {
348
12.4k
            fSetBuilder->mergeCategories(duplPair);
349
12.4k
            fForwardTable->removeColumn(duplPair.second);
350
12.4k
            didSomething = true;
351
12.4k
        }
352
353
4.95k
        while (fForwardTable->removeDuplicateStates() > 0) {
354
1.68k
            didSomething = true;
355
1.68k
        }
356
3.26k
    } while (didSomething);
357
2.67k
}
358
359
U_NAMESPACE_END
360
361
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */