Coverage Report

Created: 2026-01-22 06:31

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/rbbitblb.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (c) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*/
9
//
10
//  rbbitblb.cpp
11
//
12
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_BREAK_ITERATION
17
18
#include "unicode/unistr.h"
19
#include "rbbitblb.h"
20
#include "rbbirb.h"
21
#include "rbbiscan.h"
22
#include "rbbisetb.h"
23
#include "rbbidata.h"
24
#include "cstring.h"
25
#include "uassert.h"
26
#include "uvectr32.h"
27
#include "cmemory.h"
28
29
U_NAMESPACE_BEGIN
30
31
const int32_t kMaxStateFor8BitsTable = 255;
32
33
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
34
2.66k
        fRB(rb),
35
2.66k
        fTree(*rootNode),
36
2.66k
        fStatus(&status),
37
2.66k
        fDStates(nullptr),
38
2.66k
        fSafeTable(nullptr) {
39
2.66k
    if (U_FAILURE(status)) {
40
0
        return;
41
0
    }
42
    // fDStates is UVector<RBBIStateDescriptor *>
43
2.66k
    fDStates = new UVector(status);
44
2.66k
    if (U_SUCCESS(status) && fDStates == nullptr ) {
45
0
        status = U_MEMORY_ALLOCATION_ERROR;
46
0
    }
47
2.66k
}
48
49
50
51
2.66k
RBBITableBuilder::~RBBITableBuilder() {
52
2.66k
    int i;
53
244k
    for (i=0; i<fDStates->size(); i++) {
54
241k
        delete static_cast<RBBIStateDescriptor*>(fDStates->elementAt(i));
55
241k
    }
56
2.66k
    delete fDStates;
57
2.66k
    delete fSafeTable;
58
2.66k
    delete fLookAheadRuleMap;
59
2.66k
}
60
61
62
//-----------------------------------------------------------------------------
63
//
64
//   RBBITableBuilder::buildForwardTable  -  This is the main function for building
65
//                               the DFA state transition table from the RBBI rules parse tree.
66
//
67
//-----------------------------------------------------------------------------
68
2.66k
void  RBBITableBuilder::buildForwardTable() {
69
70
2.66k
    if (U_FAILURE(*fStatus)) {
71
0
        return;
72
0
    }
73
74
    // If there were no rules, just return.  This situation can easily arise
75
    //   for the reverse rules.
76
2.66k
    if (fTree==nullptr) {
77
0
        return;
78
0
    }
79
80
    //
81
    // Walk through the tree, replacing any references to $variables with a copy of the
82
    //   parse tree for the substitution expression.
83
    //
84
2.66k
    fTree = fTree->flattenVariables(*fStatus, 0);
85
2.66k
    if (U_FAILURE(*fStatus)) {
86
43
        return;
87
43
    }
88
#ifdef RBBI_DEBUG
89
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
90
        RBBIDebugPuts("\nParse tree after flattening variable references.");
91
        RBBINode::printTree(fTree, true);
92
    }
93
#endif
94
95
    //
96
    // If the rules contained any references to {bof} 
97
    //   add a {bof} <cat> <former root of tree> to the
98
    //   tree.  Means that all matches must start out with the 
99
    //   {bof} fake character.
100
    // 
101
2.62k
    if (fRB->fSetBuilder->sawBOF()) {
102
62
        RBBINode *bofTop    = new RBBINode(RBBINode::opCat, *fStatus);
103
62
        if (bofTop == nullptr) {
104
0
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
105
0
        }
106
62
        if (U_FAILURE(*fStatus)) {
107
0
            delete bofTop;
108
0
            return;
109
0
        }
110
62
        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar, *fStatus);
111
        // Delete and exit if memory allocation failed.
112
62
        if (bofLeaf == nullptr) {
113
0
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
114
0
        }
115
62
        if (U_FAILURE(*fStatus)) {
116
0
            delete bofLeaf;
117
0
            delete bofTop;
118
0
            return;
119
0
        }
120
62
        bofTop->fLeftChild  = bofLeaf;
121
62
        bofTop->fRightChild = fTree;
122
62
        bofLeaf->fParent    = bofTop;
123
62
        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
124
62
        fTree               = bofTop;
125
62
    }
126
127
    //
128
    // Add a unique right-end marker to the expression.
129
    //   Appears as a cat-node, left child being the original tree,
130
    //   right child being the end marker.
131
    //
132
2.62k
    RBBINode *cn = new RBBINode(RBBINode::opCat, *fStatus);
133
    // Exit if memory allocation failed.
134
2.62k
    if (cn == nullptr) {
135
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
136
0
    }
137
2.62k
    if (U_FAILURE(*fStatus)) {
138
0
        delete cn;
139
0
        return;
140
0
    }
141
2.62k
    cn->fLeftChild = fTree;
142
2.62k
    fTree->fParent = cn;
143
2.62k
    RBBINode *endMarkerNode = cn->fRightChild = new RBBINode(RBBINode::endMark, *fStatus);
144
    // Delete and exit if memory allocation failed.
145
2.62k
    if (cn->fRightChild == nullptr) {
146
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
147
0
    }
148
2.62k
    if (U_FAILURE(*fStatus)) {
149
0
        delete cn;
150
0
        return;
151
0
    }
152
2.62k
    cn->fRightChild->fParent = cn;
153
2.62k
    fTree = cn;
154
155
    //
156
    //  Replace all references to UnicodeSets with the tree for the equivalent
157
    //      expression.
158
    //
159
2.62k
    fTree->flattenSets(*fStatus, 0);
160
#ifdef RBBI_DEBUG
161
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
162
        RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
163
        RBBINode::printTree(fTree, true);
164
    }
165
#endif
166
167
168
    //
169
    // calculate the functions nullable, firstpos, lastpos and followpos on
170
    // nodes in the parse tree.
171
    //    See the algorithm description in Aho.
172
    //    Understanding how this works by looking at the code alone will be
173
    //       nearly impossible.
174
    //
175
2.62k
    calcNullable(fTree);
176
2.62k
    calcFirstPos(fTree);
177
2.62k
    calcLastPos(fTree);
178
2.62k
    calcFollowPos(fTree);
179
2.62k
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
180
0
        RBBIDebugPuts("\n");
181
0
        printPosSets(fTree);
182
0
    }
183
184
    //
185
    //  For "chained" rules, modify the followPos sets
186
    //
187
2.62k
    if (fRB->fChainRules) {
188
139
        calcChainedFollowPos(fTree, endMarkerNode);
189
139
    }
190
191
    //
192
    //  BOF (start of input) test fixup.
193
    //
194
2.62k
    if (fRB->fSetBuilder->sawBOF()) {
195
62
        bofFixup();
196
62
    }
197
198
    //
199
    // Build the DFA state transition tables.
200
    //
201
2.62k
    buildStateTable();
202
2.62k
    mapLookAheadRules();
203
2.62k
    flagAcceptingStates();
204
2.62k
    flagLookAheadStates();
205
2.62k
    flagTaggedStates();
206
207
    //
208
    // Update the global table of rule status {tag} values
209
    // The rule builder has a global vector of status values that are common
210
    //    for all tables.  Merge the ones from this table into the global set.
211
    //
212
2.62k
    mergeRuleStatusVals();
213
2.62k
}
214
215
216
217
//-----------------------------------------------------------------------------
218
//
219
//   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
220
//
221
//-----------------------------------------------------------------------------
222
2.59M
void RBBITableBuilder::calcNullable(RBBINode *n) {
223
2.59M
    if (n == nullptr) {
224
1.28M
        return;
225
1.28M
    }
226
1.31M
    if (n->fType == RBBINode::setRef ||
227
1.31M
        n->fType == RBBINode::endMark ) {
228
        // These are non-empty leaf node types.
229
9.67k
        n->fNullable = false;
230
9.67k
        return;
231
9.67k
    }
232
233
1.30M
    if (n->fType == RBBINode::lookAhead || n->fType == RBBINode::tag) {
234
        // Lookahead marker node.  It's a leaf, so no recursion on children.
235
        // It's nullable because it does not match any literal text from the input stream.
236
10.8k
        n->fNullable = true;
237
10.8k
        return;
238
10.8k
    }
239
240
241
    // The node is not a leaf.
242
    //  Calculate nullable on its children.
243
1.29M
    calcNullable(n->fLeftChild);
244
1.29M
    calcNullable(n->fRightChild);
245
246
    // Apply functions from table 3.40 in Aho
247
1.29M
    if (n->fType == RBBINode::opOr) {
248
436k
        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
249
436k
    }
250
861k
    else if (n->fType == RBBINode::opCat) {
251
219k
        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
252
219k
    }
253
641k
    else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opQuestion) {
254
1.87k
        n->fNullable = true;
255
1.87k
    }
256
639k
    else {
257
639k
        n->fNullable = false;
258
639k
    }
259
1.29M
}
260
261
262
263
264
//-----------------------------------------------------------------------------
265
//
266
//   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
267
//
268
//-----------------------------------------------------------------------------
269
1.32M
void RBBITableBuilder::calcFirstPos(RBBINode *n) {
270
1.32M
    if (n == nullptr) {
271
2.40k
        return;
272
2.40k
    }
273
1.31M
    if (n->fType == RBBINode::leafChar  ||
274
679k
        n->fType == RBBINode::endMark   ||
275
669k
        n->fType == RBBINode::lookAhead ||
276
660k
        n->fType == RBBINode::tag) {
277
        // These are non-empty leaf node types.
278
        // Note: In order to maintain the sort invariant on the set,
279
        // this function should only be called on a node whose set is
280
        // empty to start with.
281
659k
        n->fFirstPosSet->addElement(n, *fStatus);
282
659k
        return;
283
659k
    }
284
285
    // The node is not a leaf.
286
    //  Calculate firstPos on its children.
287
659k
    calcFirstPos(n->fLeftChild);
288
659k
    calcFirstPos(n->fRightChild);
289
290
    // Apply functions from table 3.40 in Aho
291
659k
    if (n->fType == RBBINode::opOr) {
292
436k
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
293
436k
        setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
294
436k
    }
295
222k
    else if (n->fType == RBBINode::opCat) {
296
219k
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
297
219k
        if (n->fLeftChild->fNullable) {
298
917
            setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
299
917
        }
300
219k
    }
301
2.40k
    else if (n->fType == RBBINode::opStar ||
302
765
             n->fType == RBBINode::opQuestion ||
303
2.40k
             n->fType == RBBINode::opPlus) {
304
2.40k
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
305
2.40k
    }
306
659k
}
307
308
309
310
//-----------------------------------------------------------------------------
311
//
312
//   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
313
//
314
//-----------------------------------------------------------------------------
315
1.32M
void RBBITableBuilder::calcLastPos(RBBINode *n) {
316
1.32M
    if (n == nullptr) {
317
2.40k
        return;
318
2.40k
    }
319
1.31M
    if (n->fType == RBBINode::leafChar  ||
320
679k
        n->fType == RBBINode::endMark   ||
321
669k
        n->fType == RBBINode::lookAhead ||
322
660k
        n->fType == RBBINode::tag) {
323
        // These are non-empty leaf node types.
324
        // Note: In order to maintain the sort invariant on the set,
325
        // this function should only be called on a node whose set is
326
        // empty to start with.
327
659k
        n->fLastPosSet->addElement(n, *fStatus);
328
659k
        return;
329
659k
    }
330
331
    // The node is not a leaf.
332
    //  Calculate lastPos on its children.
333
659k
    calcLastPos(n->fLeftChild);
334
659k
    calcLastPos(n->fRightChild);
335
336
    // Apply functions from table 3.40 in Aho
337
659k
    if (n->fType == RBBINode::opOr) {
338
436k
        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
339
436k
        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
340
436k
    }
341
222k
    else if (n->fType == RBBINode::opCat) {
342
219k
        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
343
219k
        if (n->fRightChild->fNullable) {
344
12.1k
            setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
345
12.1k
        }
346
219k
    }
347
2.40k
    else if (n->fType == RBBINode::opStar     ||
348
765
             n->fType == RBBINode::opQuestion ||
349
2.40k
             n->fType == RBBINode::opPlus) {
350
2.40k
        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
351
2.40k
    }
352
659k
}
353
354
355
356
//-----------------------------------------------------------------------------
357
//
358
//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
359
//
360
//-----------------------------------------------------------------------------
361
1.34M
void RBBITableBuilder::calcFollowPos(RBBINode *n) {
362
1.34M
    if (n == nullptr ||
363
1.31M
        n->fType == RBBINode::leafChar ||
364
679k
        n->fType == RBBINode::endMark) {
365
672k
        return;
366
672k
    }
367
368
669k
    calcFollowPos(n->fLeftChild);
369
669k
    calcFollowPos(n->fRightChild);
370
371
    // Aho rule #1
372
669k
    if (n->fType == RBBINode::opCat) {
373
219k
        RBBINode *i;   // is 'i' in Aho's description
374
219k
        uint32_t     ix;
375
376
219k
        UVector *LastPosOfLeftChild = n->fLeftChild->fLastPosSet;
377
378
916k
        for (ix = 0; ix < static_cast<uint32_t>(LastPosOfLeftChild->size()); ix++) {
379
696k
            i = static_cast<RBBINode*>(LastPosOfLeftChild->elementAt(ix));
380
696k
            setAdd(i->fFollowPos, n->fRightChild->fFirstPosSet);
381
696k
        }
382
219k
    }
383
384
    // Aho rule #2
385
669k
    if (n->fType == RBBINode::opStar ||
386
668k
        n->fType == RBBINode::opPlus) {
387
2.16k
        RBBINode   *i;  // again, n and i are the names from Aho's description.
388
2.16k
        uint32_t    ix;
389
390
14.4k
        for (ix = 0; ix < static_cast<uint32_t>(n->fLastPosSet->size()); ix++) {
391
12.2k
            i = static_cast<RBBINode*>(n->fLastPosSet->elementAt(ix));
392
12.2k
            setAdd(i->fFollowPos, n->fFirstPosSet);
393
12.2k
        }
394
2.16k
    }
395
396
397
398
669k
}
399
400
//-----------------------------------------------------------------------------
401
//
402
//    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
403
//                        as roots of a rule to a destination vector.
404
//
405
//-----------------------------------------------------------------------------
406
35.7k
void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
407
35.7k
    if (node == nullptr || U_FAILURE(*fStatus)) {
408
13.4k
        return;
409
13.4k
    }
410
22.2k
    U_ASSERT(!dest->hasDeleter());
411
22.2k
    if (node->fRuleRoot) {
412
4.45k
        dest->addElement(node, *fStatus);
413
        // Note: rules cannot nest. If we found a rule start node,
414
        //       no child node can also be a start node.
415
4.45k
        return;
416
4.45k
    }
417
17.7k
    addRuleRootNodes(dest, node->fLeftChild);
418
17.7k
    addRuleRootNodes(dest, node->fRightChild);
419
17.7k
}
420
421
//-----------------------------------------------------------------------------
422
//
423
//   calcChainedFollowPos.    Modify the previously calculated followPos sets
424
//                            to implement rule chaining.  NOT described by Aho
425
//
426
//-----------------------------------------------------------------------------
427
139
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree, RBBINode *endMarkNode) {
428
429
139
    UVector         leafNodes(*fStatus);
430
139
    if (U_FAILURE(*fStatus)) {
431
0
        return;
432
0
    }
433
434
    // get a list all leaf nodes
435
139
    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
436
139
    if (U_FAILURE(*fStatus)) {
437
0
        return;
438
0
    }
439
440
    // Collect all leaf nodes that can start matches for rules
441
    // with inbound chaining enabled, which is the union of the 
442
    // firstPosition sets from each of the rule root nodes.
443
    
444
139
    UVector ruleRootNodes(*fStatus);
445
139
    addRuleRootNodes(&ruleRootNodes, tree);
446
447
139
    UVector matchStartNodes(*fStatus);
448
4.59k
    for (int j=0; j<ruleRootNodes.size(); ++j) {
449
4.45k
        RBBINode *node = static_cast<RBBINode *>(ruleRootNodes.elementAt(j));
450
4.45k
        if (node->fChainIn) {
451
3.77k
            setAdd(&matchStartNodes, node->fFirstPosSet);
452
3.77k
        }
453
4.45k
    }
454
139
    if (U_FAILURE(*fStatus)) {
455
0
        return;
456
0
    }
457
458
139
    int32_t  endNodeIx;
459
139
    int32_t  startNodeIx;
460
461
209k
    for (endNodeIx=0; endNodeIx<leafNodes.size(); endNodeIx++) {
462
209k
        RBBINode* endNode = static_cast<RBBINode*>(leafNodes.elementAt(endNodeIx));
463
464
        // Identify leaf nodes that correspond to overall rule match positions.
465
        // These include the endMarkNode in their followPos sets.
466
        //
467
        // Note: do not consider other end marker nodes, those that are added to
468
        //       look-ahead rules. These can't chain; a match immediately stops
469
        //       further matching. This leaves exactly one end marker node, the one
470
        //       at the end of the complete tree.
471
472
209k
        if (!endNode->fFollowPos->contains(endMarkNode)) {
473
115k
            continue;
474
115k
        }
475
476
        // We've got a node that can end a match.
477
478
        // Now iterate over the nodes that can start a match, looking for ones
479
        //   with the same char class as our ending node.
480
93.6k
        RBBINode *startNode;
481
48.7M
        for (startNodeIx = 0; startNodeIx<matchStartNodes.size(); startNodeIx++) {
482
48.6M
            startNode = static_cast<RBBINode*>(matchStartNodes.elementAt(startNodeIx));
483
48.6M
            if (startNode->fType != RBBINode::leafChar) {
484
15.2k
                continue;
485
15.2k
            }
486
487
48.6M
            if (endNode->fVal == startNode->fVal) {
488
                // The end val (character class) of one possible match is the
489
                //   same as the start of another.
490
491
                // Add all nodes from the followPos of the start node to the
492
                //  followPos set of the end node, which will have the effect of
493
                //  letting matches transition from a match state at endNode
494
                //  to the second char of a match starting with startNode.
495
546k
                setAdd(endNode->fFollowPos, startNode->fFollowPos);
496
546k
            }
497
48.6M
        }
498
93.6k
    }
499
139
}
500
501
502
//-----------------------------------------------------------------------------
503
//
504
//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
505
//                Do an swizzle similar to chaining, modifying the followPos set of
506
//                the bofNode to include the followPos nodes from other {bot} nodes
507
//                scattered through the tree.
508
//
509
//                This function has much in common with calcChainedFollowPos().
510
//
511
//-----------------------------------------------------------------------------
512
62
void RBBITableBuilder::bofFixup() {
513
514
62
    if (U_FAILURE(*fStatus)) {
515
0
        return;
516
0
    }
517
518
    //   The parse tree looks like this ...
519
    //         fTree root  --->       <cat>
520
    //                               /     \       .
521
    //                            <cat>   <#end node>
522
    //                           /     \  .
523
    //                     <bofNode>   rest
524
    //                               of tree
525
    //
526
    //    We will be adding things to the followPos set of the <bofNode>
527
    //
528
62
    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
529
62
    U_ASSERT(bofNode->fType == RBBINode::leafChar);
530
62
    U_ASSERT(bofNode->fVal == 2);
531
532
    // Get all nodes that can be the start a match of the user-written rules
533
    //  (excluding the fake bofNode)
534
    //  We want the nodes that can start a match in the
535
    //     part labeled "rest of tree"
536
    // 
537
62
    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
538
539
62
    RBBINode *startNode;
540
62
    int       startNodeIx;
541
43.6k
    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
542
43.5k
        startNode = static_cast<RBBINode*>(matchStartNodes->elementAt(startNodeIx));
543
43.5k
        if (startNode->fType != RBBINode::leafChar) {
544
198
            continue;
545
198
        }
546
547
43.3k
        if (startNode->fVal == bofNode->fVal) {
548
            //  We found a leaf node corresponding to a {bof} that was
549
            //    explicitly written into a rule.
550
            //  Add everything from the followPos set of this node to the
551
            //    followPos set of the fake bofNode at the start of the tree.
552
            //  
553
687
            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
554
687
        }
555
43.3k
    }
556
62
}
557
558
//-----------------------------------------------------------------------------
559
//
560
//   buildStateTable()    Determine the set of runtime DFA states and the
561
//                        transition tables for these states, by the algorithm
562
//                        of fig. 3.44 in Aho.
563
//
564
//                        Most of the comments are quotes of Aho's psuedo-code.
565
//
566
//-----------------------------------------------------------------------------
567
2.62k
void RBBITableBuilder::buildStateTable() {
568
2.62k
    if (U_FAILURE(*fStatus)) {
569
0
        return;
570
0
    }
571
2.62k
    RBBIStateDescriptor *failState;
572
    // Set it to nullptr to avoid uninitialized warning
573
2.62k
    RBBIStateDescriptor *initialState = nullptr;
574
    //
575
    // Add a dummy state 0 - the stop state.  Not from Aho.
576
2.62k
    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
577
2.62k
    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
578
2.62k
    if (failState == nullptr) {
579
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
580
0
        goto ExitBuildSTdeleteall;
581
0
    }
582
2.62k
    failState->fPositions = new UVector(*fStatus);
583
2.62k
    if (failState->fPositions == nullptr) {
584
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
585
0
    }
586
2.62k
    if (failState->fPositions == nullptr || U_FAILURE(*fStatus)) {
587
0
        goto ExitBuildSTdeleteall;
588
0
    }
589
2.62k
    fDStates->addElement(failState, *fStatus);
590
2.62k
    if (U_FAILURE(*fStatus)) {
591
0
        goto ExitBuildSTdeleteall;
592
0
    }
593
594
    // initially, the only unmarked state in Dstates is firstpos(root),
595
    //       where toot is the root of the syntax tree for (r)#;
596
2.62k
    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
597
2.62k
    if (initialState == nullptr) {
598
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
599
0
    }
600
2.62k
    if (U_FAILURE(*fStatus)) {
601
0
        goto ExitBuildSTdeleteall;
602
0
    }
603
2.62k
    initialState->fPositions = new UVector(*fStatus);
604
2.62k
    if (initialState->fPositions == nullptr) {
605
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
606
0
    }
607
2.62k
    if (U_FAILURE(*fStatus)) {
608
0
        goto ExitBuildSTdeleteall;
609
0
    }
610
2.62k
    setAdd(initialState->fPositions, fTree->fFirstPosSet);
611
2.62k
    fDStates->addElement(initialState, *fStatus);
612
2.62k
    if (U_FAILURE(*fStatus)) {
613
0
        goto ExitBuildSTdeleteall;
614
0
    }
615
616
    // while there is an unmarked state T in Dstates do begin
617
257k
    for (;;) {
618
257k
        RBBIStateDescriptor *T = nullptr;
619
257k
        int32_t              tx;
620
190M
        for (tx=1; tx<fDStates->size(); tx++) {
621
190M
            RBBIStateDescriptor *temp;
622
190M
            temp = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(tx));
623
190M
            if (temp->fMarked == false) {
624
255k
                T = temp;
625
255k
                break;
626
255k
            }
627
190M
        }
628
257k
        if (T == nullptr) {
629
2.62k
            break;
630
2.62k
        }
631
632
        // mark T;
633
255k
        T->fMarked = true;
634
635
        // for each input symbol a do begin
636
255k
        int32_t  a;
637
19.7M
        for (a = 1; a<=lastInputSymbol; a++) {
638
            // let U be the set of positions that are in followpos(p)
639
            //    for some position p in T
640
            //    such that the symbol at position p is a;
641
19.5M
            UVector    *U = nullptr;
642
19.5M
            RBBINode   *p;
643
19.5M
            int32_t     px;
644
293M
            for (px=0; px<T->fPositions->size(); px++) {
645
273M
                p = static_cast<RBBINode*>(T->fPositions->elementAt(px));
646
273M
                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
647
5.01M
                    if (U == nullptr) {
648
1.41M
                        U = new UVector(*fStatus);
649
1.41M
                        if (U == nullptr) {
650
0
                          *fStatus = U_MEMORY_ALLOCATION_ERROR;
651
0
                          goto ExitBuildSTdeleteall;
652
0
                        }
653
1.41M
                    }
654
5.01M
                    setAdd(U, p->fFollowPos);
655
5.01M
                }
656
273M
            }
657
658
            // if U is not empty and not in DStates then
659
19.5M
            int32_t  ux = 0;
660
19.5M
            UBool    UinDstates = false;
661
19.5M
            if (U != nullptr) {
662
1.41M
                U_ASSERT(U->size() > 0);
663
1.41M
                int  ix;
664
554M
                for (ix=0; ix<fDStates->size(); ix++) {
665
554M
                    RBBIStateDescriptor *temp2;
666
554M
                    temp2 = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(ix));
667
554M
                    if (setEquals(U, temp2->fPositions)) {
668
1.15M
                        delete U;
669
1.15M
                        U  = temp2->fPositions;
670
1.15M
                        ux = ix;
671
1.15M
                        UinDstates = true;
672
1.15M
                        break;
673
1.15M
                    }
674
554M
                }
675
676
                // Add U as an unmarked state to Dstates
677
1.41M
                if (!UinDstates)
678
252k
                {
679
252k
                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
680
252k
                    if (newState == nullptr) {
681
0
                      *fStatus = U_MEMORY_ALLOCATION_ERROR;
682
0
                    }
683
252k
                    if (U_FAILURE(*fStatus)) {
684
0
                        goto ExitBuildSTdeleteall;
685
0
                    }
686
252k
                    newState->fPositions = U;
687
252k
                    fDStates->addElement(newState, *fStatus);
688
252k
                    if (U_FAILURE(*fStatus)) {
689
0
                        return;
690
0
                    }
691
252k
                    ux = fDStates->size()-1;
692
252k
                }
693
694
                // Dtran[T, a] := U;
695
1.41M
                T->fDtran->setElementAt(ux, a);
696
1.41M
            }
697
19.5M
        }
698
255k
    }
699
2.62k
    return;
700
    // delete local pointers only if error occurred.
701
2.62k
ExitBuildSTdeleteall:
702
0
    delete initialState;
703
0
    delete failState;
704
0
}
705
706
707
/**
708
 * mapLookAheadRules
709
 *
710
 */
711
2.62k
void RBBITableBuilder::mapLookAheadRules() {
712
2.62k
    fLookAheadRuleMap =  new UVector32(fRB->fScanner->numRules() + 1, *fStatus);
713
2.62k
    if (fLookAheadRuleMap == nullptr) {
714
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
715
0
    }
716
2.62k
    if (U_FAILURE(*fStatus)) {
717
0
        return;
718
0
    }
719
2.62k
    fLookAheadRuleMap->setSize(fRB->fScanner->numRules() + 1);
720
721
260k
    for (int32_t n=0; n<fDStates->size(); n++) {
722
257k
        RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n));
723
257k
        int32_t laSlotForState = 0;
724
725
        // Establish the look-ahead slot for this state, if the state covers
726
        // any look-ahead nodes - corresponding to the '/' in look-ahead rules.
727
728
        // If any of the look-ahead nodes already have a slot assigned, use it,
729
        // otherwise assign a new one.
730
731
257k
        bool sawLookAheadNode = false;
732
5.39M
        for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
733
5.13M
            RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
734
5.13M
            if (node->fType != RBBINode::NodeType::lookAhead) {
735
5.11M
                continue;
736
5.11M
            }
737
20.6k
            sawLookAheadNode = true;
738
20.6k
            int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
739
20.6k
            U_ASSERT(ruleNum < fLookAheadRuleMap->size());
740
20.6k
            U_ASSERT(ruleNum > 0);
741
20.6k
            int32_t laSlot = fLookAheadRuleMap->elementAti(ruleNum);
742
20.6k
            if (laSlot != 0) {
743
13.5k
                if (laSlotForState == 0) {
744
9.69k
                    laSlotForState = laSlot;
745
9.69k
                } else {
746
                    // TODO: figure out if this can fail, change to setting an error code if so.
747
3.90k
                    U_ASSERT(laSlot == laSlotForState);
748
3.90k
                }
749
13.5k
            }
750
20.6k
        }
751
257k
        if (!sawLookAheadNode) {
752
247k
            continue;
753
247k
        }
754
755
10.3k
        if (laSlotForState == 0) {
756
634
            laSlotForState = ++fLASlotsInUse;
757
634
        }
758
759
        // For each look ahead node covered by this state,
760
        // set the mapping from the node's rule number to the look ahead slot.
761
        // There can be multiple nodes/rule numbers going to the same la slot.
762
763
725k
        for (int32_t ipos=0; ipos<sd->fPositions->size(); ++ipos) {
764
715k
            RBBINode *node = static_cast<RBBINode *>(sd->fPositions->elementAt(ipos));
765
715k
            if (node->fType != RBBINode::NodeType::lookAhead) {
766
694k
                continue;
767
694k
            }
768
20.6k
            int32_t ruleNum = node->fVal;     // Set when rule was originally parsed.
769
20.6k
            int32_t existingVal = fLookAheadRuleMap->elementAti(ruleNum);
770
20.6k
            (void)existingVal;
771
20.6k
            U_ASSERT(existingVal == 0 || existingVal == laSlotForState);
772
20.6k
            fLookAheadRuleMap->setElementAt(laSlotForState, ruleNum);
773
20.6k
        }
774
10.3k
    }
775
776
2.62k
}
777
778
//-----------------------------------------------------------------------------
779
//
780
//   flagAcceptingStates    Identify accepting states.
781
//                          First get a list of all of the end marker nodes.
782
//                          Then, for each state s,
783
//                              if s contains one of the end marker nodes in its list of tree positions then
784
//                                  s is an accepting state.
785
//
786
//-----------------------------------------------------------------------------
787
2.62k
void     RBBITableBuilder::flagAcceptingStates() {
788
2.62k
    if (U_FAILURE(*fStatus)) {
789
0
        return;
790
0
    }
791
2.62k
    UVector     endMarkerNodes(*fStatus);
792
2.62k
    RBBINode    *endMarker;
793
2.62k
    int32_t     i;
794
2.62k
    int32_t     n;
795
796
2.62k
    if (U_FAILURE(*fStatus)) {
797
0
        return;
798
0
    }
799
800
2.62k
    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
801
2.62k
    if (U_FAILURE(*fStatus)) {
802
0
        return;
803
0
    }
804
805
12.3k
    for (i=0; i<endMarkerNodes.size(); i++) {
806
9.67k
        endMarker = static_cast<RBBINode*>(endMarkerNodes.elementAt(i));
807
1.18M
        for (n=0; n<fDStates->size(); n++) {
808
1.17M
            RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n));
809
1.17M
            if (sd->fPositions->indexOf(endMarker) >= 0) {
810
                // Any non-zero value for fAccepting means this is an accepting node.
811
                // The value is what will be returned to the user as the break status.
812
                // If no other value was specified, force it to ACCEPTING_UNCONDITIONAL (1).
813
814
39.1k
                if (sd->fAccepting==0) {
815
                    // State hasn't been marked as accepting yet.  Do it now.
816
31.4k
                    sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
817
31.4k
                    if (sd->fAccepting == 0) {
818
27.5k
                        sd->fAccepting = ACCEPTING_UNCONDITIONAL;
819
27.5k
                    }
820
31.4k
                }
821
39.1k
                if (sd->fAccepting==ACCEPTING_UNCONDITIONAL && endMarker->fVal != 0) {
822
                    // Both lookahead and non-lookahead accepting for this state.
823
                    // Favor the look-ahead, because a look-ahead match needs to
824
                    // immediately stop the run-time engine. First match, not longest.
825
861
                    sd->fAccepting = fLookAheadRuleMap->elementAti(endMarker->fVal);
826
861
                }
827
                // implicit else:
828
                // if sd->fAccepting already had a value other than 0 or 1, leave it be.
829
39.1k
            }
830
1.17M
        }
831
9.67k
    }
832
2.62k
}
833
834
835
//-----------------------------------------------------------------------------
836
//
837
//    flagLookAheadStates   Very similar to flagAcceptingStates, above.
838
//
839
//-----------------------------------------------------------------------------
840
2.62k
void     RBBITableBuilder::flagLookAheadStates() {
841
2.62k
    if (U_FAILURE(*fStatus)) {
842
0
        return;
843
0
    }
844
2.62k
    UVector     lookAheadNodes(*fStatus);
845
2.62k
    RBBINode    *lookAheadNode;
846
2.62k
    int32_t     i;
847
2.62k
    int32_t     n;
848
849
2.62k
    fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
850
2.62k
    if (U_FAILURE(*fStatus)) {
851
0
        return;
852
0
    }
853
12.1k
    for (i=0; i<lookAheadNodes.size(); i++) {
854
9.50k
        lookAheadNode = static_cast<RBBINode*>(lookAheadNodes.elementAt(i));
855
9.50k
        U_ASSERT(lookAheadNode->fType == RBBINode::NodeType::lookAhead);
856
857
4.09M
        for (n=0; n<fDStates->size(); n++) {
858
4.08M
            RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n));
859
4.08M
            int32_t positionsIdx = sd->fPositions->indexOf(lookAheadNode);
860
4.08M
            if (positionsIdx >= 0) {
861
20.6k
                U_ASSERT(lookAheadNode == sd->fPositions->elementAt(positionsIdx));
862
20.6k
                uint32_t lookaheadSlot = fLookAheadRuleMap->elementAti(lookAheadNode->fVal);
863
20.6k
                U_ASSERT(sd->fLookAhead == 0 || sd->fLookAhead == lookaheadSlot);
864
                // if (sd->fLookAhead != 0 && sd->fLookAhead != lookaheadSlot) {
865
                //     printf("%s:%d Bingo. sd->fLookAhead:%d   lookaheadSlot:%d\n",
866
                //            __FILE__, __LINE__, sd->fLookAhead, lookaheadSlot);
867
                // }
868
20.6k
                sd->fLookAhead = lookaheadSlot;
869
20.6k
            }
870
4.08M
        }
871
9.50k
    }
872
2.62k
}
873
874
875
876
877
//-----------------------------------------------------------------------------
878
//
879
//    flagTaggedStates
880
//
881
//-----------------------------------------------------------------------------
882
2.62k
void     RBBITableBuilder::flagTaggedStates() {
883
2.62k
    if (U_FAILURE(*fStatus)) {
884
0
        return;
885
0
    }
886
2.62k
    UVector     tagNodes(*fStatus);
887
2.62k
    RBBINode    *tagNode;
888
2.62k
    int32_t     i;
889
2.62k
    int32_t     n;
890
891
2.62k
    if (U_FAILURE(*fStatus)) {
892
0
        return;
893
0
    }
894
2.62k
    fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
895
2.62k
    if (U_FAILURE(*fStatus)) {
896
0
        return;
897
0
    }
898
3.92k
    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
899
1.30k
        tagNode = static_cast<RBBINode*>(tagNodes.elementAt(i));
900
901
245k
        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
902
243k
            RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n));
903
243k
            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
904
59.9k
                sortedAdd(&sd->fTagVals, tagNode->fVal);
905
59.9k
            }
906
243k
        }
907
1.30k
    }
908
2.62k
}
909
910
911
912
913
//-----------------------------------------------------------------------------
914
//
915
//  mergeRuleStatusVals
916
//
917
//      Update the global table of rule status {tag} values
918
//      The rule builder has a global vector of status values that are common
919
//      for all tables.  Merge the ones from this table into the global set.
920
//
921
//-----------------------------------------------------------------------------
922
2.62k
void  RBBITableBuilder::mergeRuleStatusVals() {
923
    //
924
    //  The basic outline of what happens here is this...
925
    //
926
    //    for each state in this state table
927
    //       if the status tag list for this state is in the global statuses list
928
    //           record where and
929
    //           continue with the next state
930
    //       else
931
    //           add the tag list for this state to the global list.
932
    //
933
2.62k
    int i;
934
2.62k
    int n;
935
936
    // Pre-set a single tag of {0} into the table.
937
    //   We will need this as a default, for rule sets with no explicit tagging.
938
2.62k
    if (fRB->fRuleStatusVals->size() == 0) {
939
2.62k
        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
940
2.62k
        fRB->fRuleStatusVals->addElement(static_cast<int32_t>(0), *fStatus); // and our single status of zero
941
2.62k
    }
942
943
    //    For each state
944
260k
    for (n=0; n<fDStates->size(); n++) {
945
257k
        RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(n));
946
257k
        UVector *thisStatesTagValues = sd->fTagVals;
947
257k
        if (thisStatesTagValues == nullptr) {
948
            // No tag values are explicitly associated with this state.
949
            //   Set the default tag value.
950
241k
            sd->fTagsIdx = 0;
951
241k
            continue;
952
241k
        }
953
954
        // There are tag(s) associated with this state.
955
        //   fTagsIdx will be the index into the global tag list for this state's tag values.
956
        //   Initial value of -1 flags that we haven't got it set yet.
957
16.1k
        sd->fTagsIdx = -1;
958
16.1k
        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
959
16.1k
        int32_t  nextTagGroupStart = 0;
960
961
        // Loop runs once per group of tags in the global list
962
1.13M
        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
963
1.12M
            thisTagGroupStart = nextTagGroupStart;
964
1.12M
            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
965
1.12M
            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
966
                // The number of tags for this state is different from
967
                //    the number of tags in this group from the global list.
968
                //    Continue with the next group from the global list.
969
892k
                continue;
970
892k
            }
971
            // The lengths match, go ahead and compare the actual tag values
972
            //    between this state and the group from the global list.
973
380k
            for (i=0; i<thisStatesTagValues->size(); i++) {
974
366k
                if (thisStatesTagValues->elementAti(i) !=
975
366k
                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
976
                    // Mismatch.
977
222k
                    break;
978
222k
                }
979
366k
            }
980
981
237k
            if (i == thisStatesTagValues->size()) {
982
                // We found a set of tag values in the global list that match
983
                //   those for this state.  Use them.
984
14.1k
                sd->fTagsIdx = thisTagGroupStart;
985
14.1k
                break;
986
14.1k
            }
987
237k
        }
988
989
16.1k
        if (sd->fTagsIdx == -1) {
990
            // No suitable entry in the global tag list already.  Add one
991
2.04k
            sd->fTagsIdx = fRB->fRuleStatusVals->size();
992
2.04k
            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
993
9.80k
            for (i=0; i<thisStatesTagValues->size(); i++) {
994
7.76k
                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
995
7.76k
            }
996
2.04k
        }
997
16.1k
    }
998
2.62k
}
999
1000
1001
1002
1003
1004
1005
1006
//-----------------------------------------------------------------------------
1007
//
1008
//  sortedAdd  Add a value to a vector of sorted values (ints).
1009
//             Do not replicate entries; if the value is already there, do not
1010
//                add a second one.
1011
//             Lazily create the vector if it does not already exist.
1012
//
1013
//-----------------------------------------------------------------------------
1014
59.9k
void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
1015
59.9k
    int32_t i;
1016
1017
59.9k
    if (*vector == nullptr) {
1018
16.1k
        *vector = new UVector(*fStatus);
1019
16.1k
    }
1020
59.9k
    if (*vector == nullptr || U_FAILURE(*fStatus)) {
1021
0
        return;
1022
0
    }
1023
59.9k
    UVector *vec = *vector;
1024
59.9k
    int32_t  vSize = vec->size();
1025
117k
    for (i=0; i<vSize; i++) {
1026
86.7k
        int32_t valAtI = vec->elementAti(i);
1027
86.7k
        if (valAtI == val) {
1028
            // The value is already in the vector.  Don't add it again.
1029
4.26k
            return;
1030
4.26k
        }
1031
82.5k
        if (valAtI > val) {
1032
24.6k
            break;
1033
24.6k
        }
1034
82.5k
    }
1035
55.6k
    vec->insertElementAt(val, i, *fStatus);
1036
55.6k
}
1037
1038
1039
1040
//-----------------------------------------------------------------------------
1041
//
1042
//  setAdd     Set operation on UVector
1043
//             dest = dest union source
1044
//             Elements may only appear once and must be sorted.
1045
//
1046
//-----------------------------------------------------------------------------
1047
8.48M
void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
1048
8.48M
    U_ASSERT(!dest->hasDeleter());
1049
8.48M
    U_ASSERT(!source->hasDeleter());
1050
8.48M
    int32_t destOriginalSize = dest->size();
1051
8.48M
    int32_t sourceSize       = source->size();
1052
8.48M
    int32_t di           = 0;
1053
8.48M
    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
1054
8.48M
    void **destPtr, **sourcePtr;
1055
8.48M
    void **destLim, **sourceLim;
1056
1057
8.48M
    if (destOriginalSize > destArray.getCapacity()) {
1058
3.08M
        if (destArray.resize(destOriginalSize) == nullptr) {
1059
0
            return;
1060
0
        }
1061
3.08M
    }
1062
8.48M
    destPtr = destArray.getAlias();
1063
8.48M
    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?
1064
1065
8.48M
    if (sourceSize > sourceArray.getCapacity()) {
1066
1.91M
        if (sourceArray.resize(sourceSize) == nullptr) {
1067
0
            return;
1068
0
        }
1069
1.91M
    }
1070
8.48M
    sourcePtr = sourceArray.getAlias();
1071
8.48M
    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?
1072
1073
    // Avoid multiple "get element" calls by getting the contents into arrays
1074
8.48M
    (void) dest->toArray(destPtr);
1075
8.48M
    (void) source->toArray(sourcePtr);
1076
1077
8.48M
    dest->setSize(sourceSize+destOriginalSize, *fStatus);
1078
8.48M
    if (U_FAILURE(*fStatus)) {
1079
0
        return;
1080
0
    }
1081
1082
441M
    while (sourcePtr < sourceLim && destPtr < destLim) {
1083
433M
        if (*destPtr == *sourcePtr) {
1084
38.3M
            dest->setElementAt(*sourcePtr++, di++);
1085
38.3M
            destPtr++;
1086
38.3M
        }
1087
        // This check is required for machines with segmented memory, like i5/OS.
1088
        // Direct pointer comparison is not recommended.
1089
394M
        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
1090
350M
            dest->setElementAt(*destPtr++, di++);
1091
350M
        }
1092
44.2M
        else { /* *sourcePtr < *destPtr */
1093
44.2M
            dest->setElementAt(*sourcePtr++, di++);
1094
44.2M
        }
1095
433M
    }
1096
1097
    // At most one of these two cleanup loops will execute
1098
140M
    while (destPtr < destLim) {
1099
131M
        dest->setElementAt(*destPtr++, di++);
1100
131M
    }
1101
212M
    while (sourcePtr < sourceLim) {
1102
204M
        dest->setElementAt(*sourcePtr++, di++);
1103
204M
    }
1104
1105
8.48M
    dest->setSize(di, *fStatus);
1106
8.48M
}
1107
1108
1109
1110
//-----------------------------------------------------------------------------
1111
//
1112
//  setEqual    Set operation on UVector.
1113
//              Compare for equality.
1114
//              Elements must be sorted.
1115
//
1116
//-----------------------------------------------------------------------------
1117
554M
UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
1118
554M
    return a->equals(*b);
1119
554M
}
1120
1121
1122
//-----------------------------------------------------------------------------
1123
//
1124
//  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1125
//                 for each node in the tree.
1126
//
1127
//-----------------------------------------------------------------------------
1128
#ifdef RBBI_DEBUG
1129
void RBBITableBuilder::printPosSets(RBBINode *n) {
1130
    if (n==nullptr) {
1131
        return;
1132
    }
1133
    printf("\n");
1134
    RBBINode::printNodeHeader();
1135
    RBBINode::printNode(n);
1136
    RBBIDebugPrintf("         Nullable:  %s\n", n->fNullable?"true":"false");
1137
1138
    RBBIDebugPrintf("         firstpos:  ");
1139
    printSet(n->fFirstPosSet);
1140
1141
    RBBIDebugPrintf("         lastpos:   ");
1142
    printSet(n->fLastPosSet);
1143
1144
    RBBIDebugPrintf("         followpos: ");
1145
    printSet(n->fFollowPos);
1146
1147
    printPosSets(n->fLeftChild);
1148
    printPosSets(n->fRightChild);
1149
}
1150
#endif
1151
1152
//
1153
//    findDuplCharClassFrom()
1154
//
1155
19.8k
bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
1156
19.8k
    int32_t numStates = fDStates->size();
1157
19.8k
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1158
1159
123k
    for (; categories->first < numCols-1; categories->first++) {
1160
        // Note: dictionary & non-dictionary columns cannot be merged.
1161
        //       The limitSecond value prevents considering mixed pairs.
1162
        //       Dictionary categories are >= DictCategoriesStart.
1163
        //       Non dict categories are   <  DictCategoriesStart.
1164
119k
        int limitSecond = categories->first < fRB->fSetBuilder->getDictCategoriesStart() ?
1165
119k
            fRB->fSetBuilder->getDictCategoriesStart() : numCols;
1166
21.6M
        for (categories->second=categories->first+1; categories->second < limitSecond; categories->second++) {
1167
            // Initialized to different values to prevent returning true if numStates = 0 (implies no duplicates).
1168
21.5M
            uint16_t table_base = 0;
1169
21.5M
            uint16_t table_dupl = 1;
1170
504M
            for (int32_t state=0; state<numStates; state++) {
1171
488M
                RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state));
1172
488M
                table_base = static_cast<uint16_t>(sd->fDtran->elementAti(categories->first));
1173
488M
                table_dupl = static_cast<uint16_t>(sd->fDtran->elementAti(categories->second));
1174
488M
                if (table_base != table_dupl) {
1175
5.34M
                    break;
1176
5.34M
                }
1177
488M
            }
1178
21.5M
            if (table_base == table_dupl) {
1179
16.6k
                return true;
1180
16.6k
            }
1181
21.5M
        }
1182
119k
    }
1183
3.24k
    return false;
1184
19.8k
}
1185
1186
1187
//
1188
//    removeColumn()
1189
//
1190
16.6k
void RBBITableBuilder::removeColumn(int32_t column) {
1191
16.6k
    int32_t numStates = fDStates->size();
1192
364k
    for (int32_t state=0; state<numStates; state++) {
1193
347k
        RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state));
1194
347k
        U_ASSERT(column < sd->fDtran->size());
1195
347k
        sd->fDtran->removeElementAt(column);
1196
347k
    }
1197
16.6k
}
1198
1199
/*
1200
 * findDuplicateState
1201
 */
1202
21.0k
bool RBBITableBuilder::findDuplicateState(IntPair *states) {
1203
21.0k
    int32_t numStates = fDStates->size();
1204
21.0k
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1205
1206
647k
    for (; states->first<numStates-1; states->first++) {
1207
642k
        RBBIStateDescriptor* firstSD = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(states->first));
1208
296M
        for (states->second=states->first+1; states->second<numStates; states->second++) {
1209
295M
            RBBIStateDescriptor* duplSD = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(states->second));
1210
295M
            if (firstSD->fAccepting != duplSD->fAccepting ||
1211
226M
                firstSD->fLookAhead != duplSD->fLookAhead ||
1212
219M
                firstSD->fTagsIdx   != duplSD->fTagsIdx) {
1213
89.4M
                continue;
1214
89.4M
            }
1215
206M
            bool rowsMatch = true;
1216
2.97G
            for (int32_t col=0; col < numCols; ++col) {
1217
2.97G
                int32_t firstVal = firstSD->fDtran->elementAti(col);
1218
2.97G
                int32_t duplVal = duplSD->fDtran->elementAti(col);
1219
2.97G
                if (!((firstVal == duplVal) ||
1220
206M
                        ((firstVal == states->first || firstVal == states->second) &&
1221
206M
                        (duplVal  == states->first || duplVal  == states->second)))) {
1222
206M
                    rowsMatch = false;
1223
206M
                    break;
1224
206M
                }
1225
2.97G
            }
1226
206M
            if (rowsMatch) {
1227
15.9k
                return true;
1228
15.9k
            }
1229
206M
        }
1230
642k
    }
1231
5.09k
    return false;
1232
21.0k
}
1233
1234
1235
28.5k
bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
1236
28.5k
    int32_t numStates = fSafeTable->size();
1237
1238
78.0k
    for (; states->first<numStates-1; states->first++) {
1239
75.4k
        UnicodeString *firstRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
1240
3.57M
        for (states->second=states->first+1; states->second<numStates; states->second++) {
1241
3.52M
            UnicodeString *duplRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));
1242
3.52M
            bool rowsMatch = true;
1243
3.52M
            int32_t numCols = firstRow->length();
1244
239M
            for (int32_t col=0; col < numCols; ++col) {
1245
239M
                int32_t firstVal = firstRow->charAt(col);
1246
239M
                int32_t duplVal = duplRow->charAt(col);
1247
239M
                if (!((firstVal == duplVal) ||
1248
3.50M
                        ((firstVal == states->first || firstVal == states->second) &&
1249
3.50M
                        (duplVal  == states->first || duplVal  == states->second)))) {
1250
3.50M
                    rowsMatch = false;
1251
3.50M
                    break;
1252
3.50M
                }
1253
239M
            }
1254
3.52M
            if (rowsMatch) {
1255
25.9k
                return true;
1256
25.9k
            }
1257
3.52M
        }
1258
75.4k
    }
1259
2.62k
    return false;
1260
28.5k
}
1261
1262
1263
15.9k
void RBBITableBuilder::removeState(IntPair duplStates) {
1264
15.9k
    const int32_t keepState = duplStates.first;
1265
15.9k
    const int32_t duplState = duplStates.second;
1266
15.9k
    U_ASSERT(keepState < duplState);
1267
15.9k
    U_ASSERT(duplState < fDStates->size());
1268
1269
15.9k
    RBBIStateDescriptor* duplSD = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(duplState));
1270
15.9k
    fDStates->removeElementAt(duplState);
1271
15.9k
    delete duplSD;
1272
1273
15.9k
    int32_t numStates = fDStates->size();
1274
15.9k
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();
1275
10.4M
    for (int32_t state=0; state<numStates; ++state) {
1276
10.4M
        RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state));
1277
181M
        for (int32_t col=0; col<numCols; col++) {
1278
170M
            int32_t existingVal = sd->fDtran->elementAti(col);
1279
170M
            int32_t newVal = existingVal;
1280
170M
            if (existingVal == duplState) {
1281
70.0k
                newVal = keepState;
1282
170M
            } else if (existingVal > duplState) {
1283
9.34M
                newVal = existingVal - 1;
1284
9.34M
            }
1285
170M
            sd->fDtran->setElementAt(newVal, col);
1286
170M
        }
1287
10.4M
    }
1288
15.9k
}
1289
1290
25.9k
void RBBITableBuilder::removeSafeState(IntPair duplStates) {
1291
25.9k
    const int32_t keepState = duplStates.first;
1292
25.9k
    const int32_t duplState = duplStates.second;
1293
25.9k
    U_ASSERT(keepState < duplState);
1294
25.9k
    U_ASSERT(duplState < fSafeTable->size());
1295
1296
25.9k
    fSafeTable->removeElementAt(duplState);   // Note that fSafeTable has a deleter function
1297
                                              // and will auto-delete the removed element.
1298
25.9k
    int32_t numStates = fSafeTable->size();
1299
1.53M
    for (int32_t state=0; state<numStates; ++state) {
1300
1.51M
        UnicodeString* sd = static_cast<UnicodeString*>(fSafeTable->elementAt(state));
1301
1.51M
        int32_t numCols = sd->length();
1302
306M
        for (int32_t col=0; col<numCols; col++) {
1303
304M
            int32_t existingVal = sd->charAt(col);
1304
304M
            int32_t newVal = existingVal;
1305
304M
            if (existingVal == duplState) {
1306
950k
                newVal = keepState;
1307
303M
            } else if (existingVal > duplState) {
1308
144M
                newVal = existingVal - 1;
1309
144M
            }
1310
304M
            sd->setCharAt(col, static_cast<char16_t>(newVal));
1311
304M
        }
1312
1.51M
    }
1313
25.9k
}
1314
1315
1316
/*
1317
 * RemoveDuplicateStates
1318
 */
1319
5.09k
int32_t RBBITableBuilder::removeDuplicateStates() {
1320
5.09k
    IntPair dupls = {3, 0};
1321
5.09k
    int32_t numStatesRemoved = 0;
1322
1323
21.0k
    while (findDuplicateState(&dupls)) {
1324
        // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1325
15.9k
        removeState(dupls);
1326
15.9k
        ++numStatesRemoved;
1327
15.9k
    }
1328
5.09k
    return numStatesRemoved;
1329
5.09k
}
1330
1331
1332
//-----------------------------------------------------------------------------
1333
//
1334
//   getTableSize()    Calculate the size of the runtime form of this
1335
//                     state transition table.
1336
//
1337
//-----------------------------------------------------------------------------
1338
2.62k
int32_t  RBBITableBuilder::getTableSize() const {
1339
2.62k
    int32_t    size = 0;
1340
2.62k
    int32_t    numRows;
1341
2.62k
    int32_t    numCols;
1342
2.62k
    int32_t    rowSize;
1343
1344
2.62k
    if (fTree == nullptr) {
1345
0
        return 0;
1346
0
    }
1347
1348
2.62k
    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1349
1350
2.62k
    numRows = fDStates->size();
1351
2.62k
    numCols = fRB->fSetBuilder->getNumCharCategories();
1352
1353
2.62k
    if (use8BitsForTable()) {
1354
2.41k
        rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1355
2.41k
    } else {
1356
206
        rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1357
206
    }
1358
2.62k
    size   += numRows * rowSize;
1359
2.62k
    return size;
1360
2.62k
}
1361
1362
247k
bool RBBITableBuilder::use8BitsForTable() const {
1363
247k
    return fDStates->size() <= kMaxStateFor8BitsTable;
1364
247k
}
1365
1366
//-----------------------------------------------------------------------------
1367
//
1368
//   exportTable()    export the state transition table in the format required
1369
//                    by the runtime engine.  getTableSize() bytes of memory
1370
//                    must be available at the output address "where".
1371
//
1372
//-----------------------------------------------------------------------------
1373
2.62k
void RBBITableBuilder::exportTable(void *where) {
1374
2.62k
    RBBIStateTable* table = static_cast<RBBIStateTable*>(where);
1375
2.62k
    uint32_t           state;
1376
2.62k
    int                col;
1377
1378
2.62k
    if (U_FAILURE(*fStatus) || fTree == nullptr) {
1379
0
        return;
1380
0
    }
1381
1382
2.62k
    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1383
2.62k
    if (catCount > 0x7fff ||
1384
2.62k
        fDStates->size() > 0x7fff) {
1385
0
        *fStatus = U_BRK_INTERNAL_ERROR;
1386
0
        return;
1387
0
    }
1388
1389
2.62k
    table->fNumStates = fDStates->size();
1390
2.62k
    table->fDictCategoriesStart = fRB->fSetBuilder->getDictCategoriesStart();
1391
2.62k
    table->fLookAheadResultsSize = fLASlotsInUse == ACCEPTING_UNCONDITIONAL ? 0 : fLASlotsInUse + 1;
1392
2.62k
    table->fFlags     = 0;
1393
2.62k
    if (use8BitsForTable()) {
1394
2.41k
        table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1395
2.41k
        table->fFlags  |= RBBI_8BITS_ROWS;
1396
2.41k
    } else {
1397
206
        table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1398
206
    }
1399
2.62k
    if (fRB->fLookAheadHardBreak) {
1400
4
        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1401
4
    }
1402
2.62k
    if (fRB->fSetBuilder->sawBOF()) {
1403
62
        table->fFlags  |= RBBI_BOF_REQUIRED;
1404
62
    }
1405
1406
244k
    for (state=0; state<table->fNumStates; state++) {
1407
241k
        RBBIStateDescriptor* sd = static_cast<RBBIStateDescriptor*>(fDStates->elementAt(state));
1408
241k
        RBBIStateTableRow* row = reinterpret_cast<RBBIStateTableRow*>(table->fTableData + state * table->fRowLen);
1409
241k
        if (use8BitsForTable()) {
1410
72.7k
            U_ASSERT (sd->fAccepting <= 255);
1411
72.7k
            U_ASSERT (sd->fLookAhead <= 255);
1412
72.7k
            U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 255);
1413
72.7k
            RBBIStateTableRow8* r8 = reinterpret_cast<RBBIStateTableRow8*>(row);
1414
72.7k
            r8->fAccepting = sd->fAccepting;
1415
72.7k
            r8->fLookAhead = sd->fLookAhead;
1416
72.7k
            r8->fTagsIdx   = sd->fTagsIdx;
1417
5.14M
            for (col=0; col<catCount; col++) {
1418
5.07M
                U_ASSERT (sd->fDtran->elementAti(col) <= kMaxStateFor8BitsTable);
1419
5.07M
                r8->fNextState[col] = sd->fDtran->elementAti(col);
1420
5.07M
            }
1421
169k
        } else {
1422
169k
            U_ASSERT (sd->fAccepting <= 0xffff);
1423
169k
            U_ASSERT (sd->fLookAhead <= 0xffff);
1424
169k
            U_ASSERT (0 <= sd->fTagsIdx && sd->fTagsIdx <= 0xffff);
1425
169k
            row->r16.fAccepting = sd->fAccepting;
1426
169k
            row->r16.fLookAhead = sd->fLookAhead;
1427
169k
            row->r16.fTagsIdx   = sd->fTagsIdx;
1428
14.1M
            for (col=0; col<catCount; col++) {
1429
13.9M
                row->r16.fNextState[col] = sd->fDtran->elementAti(col);
1430
13.9M
            }
1431
169k
        }
1432
241k
    }
1433
2.62k
}
1434
1435
1436
/**
1437
 *   Synthesize a safe state table from the main state table.
1438
 */
1439
2.66k
void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
1440
    // The safe table creation has three steps:
1441
1442
    // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
1443
    // following the pair do not depend on context or state before the pair. To test
1444
    // whether a pair is safe, run it through the main forward state table, starting
1445
    // from each state. If the final state is the same, no matter what the starting state,
1446
    // the pair is safe.
1447
    //
1448
    // 2. Build a state table that recognizes the safe pairs. It's similar to their
1449
    // forward table, with a column for each input character [class], and a row for
1450
    // each state. Row 1 is the start state, and row 0 is the stop state. Initially
1451
    // create an additional state for each input character category; being in
1452
    // one of these states means that the character has been seen, and is potentially
1453
    // the first of a pair. In each of these rows, the entry for the second character
1454
    // of a safe pair is set to the stop state (0), indicating that a match was found.
1455
    // All other table entries are set to the state corresponding the current input
1456
    // character, allowing that character to be the of a start following pair.
1457
    //
1458
    // Because the safe rules are to be run in reverse, moving backwards in the text,
1459
    // the first and second pair categories are swapped when building the table.
1460
    //
1461
    // 3. Compress the table. There are typically many rows (states) that are
1462
    // equivalent - that have zeroes (match completed) in the same columns -
1463
    // and can be folded together.
1464
1465
    // Each safe pair is stored as two UChars in the safePair string.
1466
2.66k
    UnicodeString safePairs;
1467
1468
2.66k
    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
1469
2.66k
    int32_t numStates = fDStates->size();
1470
1471
99.7k
    for (int32_t c1=0; c1<numCharClasses; ++c1) {
1472
42.0M
        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
1473
41.9M
            int32_t wantedEndState = -1;
1474
41.9M
            int32_t endState = 0;
1475
2.01G
            for (int32_t startState = 1; startState < numStates; ++startState) {
1476
1.97G
                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
1477
1.97G
                int32_t s2 = startStateD->fDtran->elementAti(c1);
1478
1.97G
                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
1479
1.97G
                endState = s2StateD->fDtran->elementAti(c2);
1480
1.97G
                if (wantedEndState < 0) {
1481
9.47M
                    wantedEndState = endState;
1482
1.96G
                } else {
1483
1.96G
                    if (wantedEndState != endState) {
1484
1.95M
                        break;
1485
1.95M
                    }
1486
1.96G
                }
1487
1.97G
            }
1488
41.9M
            if (wantedEndState == endState) {
1489
7.51M
                safePairs.append(static_cast<char16_t>(c1));
1490
7.51M
                safePairs.append(static_cast<char16_t>(c2));
1491
                // printf("(%d, %d) ", c1, c2);
1492
7.51M
            }
1493
41.9M
        }
1494
        // printf("\n");
1495
97.0k
    }
1496
1497
    // Populate the initial safe table.
1498
    // The table as a whole is UVector<UnicodeString>
1499
    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
1500
    // Row 0 is the stop state.
1501
    // Row 1 is the start state.
1502
    // Row 2 and beyond are other states, initially one per char class, but
1503
    //   after initial construction, many of the states will be combined, compacting the table.
1504
    // The String holds the nextState data only. The four leading fields of a row, fAccepting,
1505
    // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.
1506
1507
2.66k
    U_ASSERT(fSafeTable == nullptr);
1508
2.66k
    LocalPointer<UVector> lpSafeTable(
1509
2.66k
        new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status), status);
1510
2.66k
    if (U_FAILURE(status)) {
1511
43
        return;
1512
43
    }
1513
2.62k
    fSafeTable = lpSafeTable.orphan();
1514
83.3k
    for (int32_t row=0; row<numCharClasses + 2; ++row) {
1515
80.7k
        LocalPointer<UnicodeString> lpString(new UnicodeString(numCharClasses, 0, numCharClasses+4), status);
1516
80.7k
        fSafeTable->adoptElement(lpString.orphan(), status);
1517
80.7k
    }
1518
2.62k
    if (U_FAILURE(status)) {
1519
0
        return;
1520
0
    }
1521
1522
    // From the start state, each input char class transitions to the state for that input.
1523
2.62k
    UnicodeString &startState = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
1524
78.0k
    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
1525
        // Note: +2 for the start & stop state.
1526
75.4k
        startState.setCharAt(charClass, static_cast<char16_t>(charClass+2));
1527
75.4k
    }
1528
1529
    // Initially make every other state table row look like the start state row,
1530
78.0k
    for (int32_t row=2; row<numCharClasses+2; ++row) {
1531
75.4k
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
1532
75.4k
        rowState = startState;   // UnicodeString assignment, copies contents.
1533
75.4k
    }
1534
1535
    // Run through the safe pairs, set the next state to zero when pair has been seen.
1536
    // Zero being the stop state, meaning we found a safe point.
1537
7.51M
    for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
1538
7.51M
        int32_t c1 = safePairs.charAt(pairIdx);
1539
7.51M
        int32_t c2 = safePairs.charAt(pairIdx + 1);
1540
1541
7.51M
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
1542
7.51M
        rowState.setCharAt(c1, 0);
1543
7.51M
    }
1544
1545
    // Remove duplicate or redundant rows from the table.
1546
2.62k
    IntPair states = {1, 0};
1547
28.5k
    while (findDuplicateSafeState(&states)) {
1548
        // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
1549
25.9k
        removeSafeState(states);
1550
25.9k
    }
1551
2.62k
}
1552
1553
1554
//-----------------------------------------------------------------------------
1555
//
1556
//   getSafeTableSize()    Calculate the size of the runtime form of this
1557
//                         safe state table.
1558
//
1559
//-----------------------------------------------------------------------------
1560
2.62k
int32_t  RBBITableBuilder::getSafeTableSize() const {
1561
2.62k
    int32_t    size = 0;
1562
2.62k
    int32_t    numRows;
1563
2.62k
    int32_t    numCols;
1564
2.62k
    int32_t    rowSize;
1565
1566
2.62k
    if (fSafeTable == nullptr) {
1567
0
        return 0;
1568
0
    }
1569
1570
2.62k
    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1571
1572
2.62k
    numRows = fSafeTable->size();
1573
2.62k
    numCols = fRB->fSetBuilder->getNumCharCategories();
1574
1575
2.62k
    if (use8BitsForSafeTable()) {
1576
2.59k
        rowSize = offsetof(RBBIStateTableRow8, fNextState) + sizeof(int8_t)*numCols;
1577
2.59k
    } else {
1578
27
        rowSize = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t)*numCols;
1579
27
    }
1580
2.62k
    size   += numRows * rowSize;
1581
2.62k
    return size;
1582
2.62k
}
1583
1584
59.9k
bool RBBITableBuilder::use8BitsForSafeTable() const {
1585
59.9k
    return fSafeTable->size() <= kMaxStateFor8BitsTable;
1586
59.9k
}
1587
1588
//-----------------------------------------------------------------------------
1589
//
1590
//   exportSafeTable()   export the state transition table in the format required
1591
//                       by the runtime engine.  getTableSize() bytes of memory
1592
//                       must be available at the output address "where".
1593
//
1594
//-----------------------------------------------------------------------------
1595
2.62k
void RBBITableBuilder::exportSafeTable(void *where) {
1596
2.62k
    RBBIStateTable* table = static_cast<RBBIStateTable*>(where);
1597
2.62k
    uint32_t           state;
1598
2.62k
    int                col;
1599
1600
2.62k
    if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1601
0
        return;
1602
0
    }
1603
1604
2.62k
    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1605
2.62k
    if (catCount > 0x7fff ||
1606
2.62k
            fSafeTable->size() > 0x7fff) {
1607
0
        *fStatus = U_BRK_INTERNAL_ERROR;
1608
0
        return;
1609
0
    }
1610
1611
2.62k
    table->fNumStates = fSafeTable->size();
1612
2.62k
    table->fFlags     = 0;
1613
2.62k
    if (use8BitsForSafeTable()) {
1614
2.59k
        table->fRowLen    = offsetof(RBBIStateTableRow8, fNextState) + sizeof(uint8_t) * catCount;
1615
2.59k
        table->fFlags  |= RBBI_8BITS_ROWS;
1616
2.59k
    } else {
1617
27
        table->fRowLen    = offsetof(RBBIStateTableRow16, fNextState) + sizeof(int16_t) * catCount;
1618
27
    }
1619
1620
57.3k
    for (state=0; state<table->fNumStates; state++) {
1621
54.7k
        UnicodeString* rowString = static_cast<UnicodeString*>(fSafeTable->elementAt(state));
1622
54.7k
        RBBIStateTableRow* row = reinterpret_cast<RBBIStateTableRow*>(table->fTableData + state * table->fRowLen);
1623
54.7k
        if (use8BitsForSafeTable()) {
1624
47.3k
            RBBIStateTableRow8* r8 = reinterpret_cast<RBBIStateTableRow8*>(row);
1625
47.3k
            r8->fAccepting = 0;
1626
47.3k
            r8->fLookAhead = 0;
1627
47.3k
            r8->fTagsIdx    = 0;
1628
5.11M
            for (col=0; col<catCount; col++) {
1629
5.07M
                U_ASSERT(rowString->charAt(col) <= kMaxStateFor8BitsTable);
1630
5.07M
                r8->fNextState[col] = static_cast<uint8_t>(rowString->charAt(col));
1631
5.07M
            }
1632
47.3k
        } else {
1633
7.40k
            row->r16.fAccepting = 0;
1634
7.40k
            row->r16.fLookAhead = 0;
1635
7.40k
            row->r16.fTagsIdx    = 0;
1636
2.09M
            for (col=0; col<catCount; col++) {
1637
2.08M
                row->r16.fNextState[col] = rowString->charAt(col);
1638
2.08M
            }
1639
7.40k
        }
1640
54.7k
    }
1641
2.62k
}
1642
1643
1644
1645
1646
//-----------------------------------------------------------------------------
1647
//
1648
//   printSet    Debug function.   Print the contents of a UVector
1649
//
1650
//-----------------------------------------------------------------------------
1651
#ifdef RBBI_DEBUG
1652
void RBBITableBuilder::printSet(UVector *s) {
1653
    int32_t  i;
1654
    for (i=0; i<s->size(); i++) {
1655
        const RBBINode *v = static_cast<const RBBINode *>(s->elementAt(i));
1656
        RBBIDebugPrintf("%5d", v==nullptr? -1 : v->fSerialNum);
1657
    }
1658
    RBBIDebugPrintf("\n");
1659
}
1660
#endif
1661
1662
1663
//-----------------------------------------------------------------------------
1664
//
1665
//   printStates    Debug Function.  Dump the fully constructed state transition table.
1666
//
1667
//-----------------------------------------------------------------------------
1668
#ifdef RBBI_DEBUG
1669
void RBBITableBuilder::printStates() {
1670
    int     c;    // input "character"
1671
    int     n;    // state number
1672
1673
    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1674
    RBBIDebugPrintf("      | Acc  LA    Tag");
1675
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1676
        RBBIDebugPrintf(" %3d", c);
1677
    }
1678
    RBBIDebugPrintf("\n");
1679
    RBBIDebugPrintf("      |---------------");
1680
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1681
        RBBIDebugPrintf("----");
1682
    }
1683
    RBBIDebugPrintf("\n");
1684
1685
    for (n=0; n<fDStates->size(); n++) {
1686
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1687
        RBBIDebugPrintf("  %3d | " , n);
1688
        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1689
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1690
            RBBIDebugPrintf(" %3d", sd->fDtran->elementAti(c));
1691
        }
1692
        RBBIDebugPrintf("\n");
1693
    }
1694
    RBBIDebugPrintf("\n\n");
1695
}
1696
#endif
1697
1698
1699
//-----------------------------------------------------------------------------
1700
//
1701
//   printSafeTable    Debug Function.  Dump the fully constructed safe table.
1702
//
1703
//-----------------------------------------------------------------------------
1704
#ifdef RBBI_DEBUG
1705
void RBBITableBuilder::printReverseTable() {
1706
    int     c;    // input "character"
1707
    int     n;    // state number
1708
1709
    RBBIDebugPrintf("    Safe Reverse Table \n");
1710
    if (fSafeTable == nullptr) {
1711
        RBBIDebugPrintf("   --- nullptr ---\n");
1712
        return;
1713
    }
1714
    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1715
    RBBIDebugPrintf("      | Acc  LA    Tag");
1716
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1717
        RBBIDebugPrintf(" %2d", c);
1718
    }
1719
    RBBIDebugPrintf("\n");
1720
    RBBIDebugPrintf("      |---------------");
1721
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1722
        RBBIDebugPrintf("---");
1723
    }
1724
    RBBIDebugPrintf("\n");
1725
1726
    for (n=0; n<fSafeTable->size(); n++) {
1727
        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1728
        RBBIDebugPrintf("  %3d | " , n);
1729
        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1730
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1731
            RBBIDebugPrintf(" %2d", rowString->charAt(c));
1732
        }
1733
        RBBIDebugPrintf("\n");
1734
    }
1735
    RBBIDebugPrintf("\n\n");
1736
}
1737
#endif
1738
1739
1740
1741
//-----------------------------------------------------------------------------
1742
//
1743
//   printRuleStatusTable    Debug Function.  Dump the common rule status table
1744
//
1745
//-----------------------------------------------------------------------------
1746
#ifdef RBBI_DEBUG
1747
void RBBITableBuilder::printRuleStatusTable() {
1748
    int32_t  thisRecord = 0;
1749
    int32_t  nextRecord = 0;
1750
    int      i;
1751
    UVector  *tbl = fRB->fRuleStatusVals;
1752
1753
    RBBIDebugPrintf("index |  tags \n");
1754
    RBBIDebugPrintf("-------------------\n");
1755
1756
    while (nextRecord < tbl->size()) {
1757
        thisRecord = nextRecord;
1758
        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1759
        RBBIDebugPrintf("%4d   ", thisRecord);
1760
        for (i=thisRecord+1; i<nextRecord; i++) {
1761
            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1762
        }
1763
        RBBIDebugPrintf("\n");
1764
    }
1765
    RBBIDebugPrintf("\n\n");
1766
}
1767
#endif
1768
1769
1770
//-----------------------------------------------------------------------------
1771
//
1772
//   RBBIStateDescriptor     Methods.  This is a very struct-like class
1773
//                           Most access is directly to the fields.
1774
//
1775
//-----------------------------------------------------------------------------
1776
1777
257k
RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
1778
257k
    fMarked    = false;
1779
257k
    fAccepting = 0;
1780
257k
    fLookAhead = 0;
1781
257k
    fTagsIdx   = 0;
1782
257k
    fTagVals   = nullptr;
1783
257k
    fPositions = nullptr;
1784
257k
    fDtran     = nullptr;
1785
1786
257k
    fDtran     = new UVector32(lastInputSymbol+1, *fStatus);
1787
257k
    if (U_FAILURE(*fStatus)) {
1788
0
        return;
1789
0
    }
1790
257k
    if (fDtran == nullptr) {
1791
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
1792
0
        return;
1793
0
    }
1794
257k
    fDtran->setSize(lastInputSymbol+1);    // fDtran needs to be pre-sized.
1795
                                           //   It is indexed by input symbols, and will
1796
                                           //   hold  the next state number for each
1797
                                           //   symbol.
1798
257k
}
1799
1800
1801
257k
RBBIStateDescriptor::~RBBIStateDescriptor() {
1802
257k
    delete       fPositions;
1803
257k
    delete       fDtran;
1804
257k
    delete       fTagVals;
1805
257k
    fPositions = nullptr;
1806
257k
    fDtran     = nullptr;
1807
257k
    fTagVals   = nullptr;
1808
257k
}
1809
1810
U_NAMESPACE_END
1811
1812
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */