Coverage Report

Created: 2018-09-25 14:53

/src/mozilla-central/intl/icu/source/common/rbbitblb.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (c) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*/
9
//
10
//  rbbitblb.cpp
11
//
12
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_BREAK_ITERATION
17
18
#include "unicode/unistr.h"
19
#include "rbbitblb.h"
20
#include "rbbirb.h"
21
#include "rbbisetb.h"
22
#include "rbbidata.h"
23
#include "cstring.h"
24
#include "uassert.h"
25
#include "uvectr32.h"
26
#include "cmemory.h"
27
28
U_NAMESPACE_BEGIN
29
30
//
// Constructor.  Stores the rule builder, the root of the parse tree and a
// pointer to the caller's error code, then allocates the (empty) vector of
// DFA states.  When the incoming status already indicates failure nothing
// is allocated and fDStates is left null.
//
RBBITableBuilder::RBBITableBuilder(RBBIRuleBuilder *rb, RBBINode **rootNode, UErrorCode &status) :
        fRB(rb),
        fTree(*rootNode),
        fStatus(&status),
        fDStates(nullptr),
        fSafeTable(nullptr) {
    if (U_SUCCESS(status)) {
        // fDStates is UVector<RBBIStateDescriptor *>
        fDStates = new UVector(status);
        if (fDStates == nullptr && U_SUCCESS(status)) {
            // Allocation failed and the UVector constructor never ran to report it.
            status = U_MEMORY_ALLOCATION_ERROR;
        }
    }
}
45
46
47
48
0
//
// Destructor.  Deletes every RBBIStateDescriptor stored in fDStates, then the
// vector itself and the safe table.
//
RBBITableBuilder::~RBBITableBuilder() {
    // fDStates is still null when the constructor returned early because of a
    // failing incoming status, or when its allocation failed; only walk the
    // vector when it actually exists.
    if (fDStates != nullptr) {
        for (int32_t i = 0; i < fDStates->size(); i++) {
            delete static_cast<RBBIStateDescriptor *>(fDStates->elementAt(i));
        }
    }
    // Deleting a null pointer is a no-op, so no further guards are needed.
    delete fDStates;
    delete fSafeTable;
}
56
57
58
//-----------------------------------------------------------------------------
59
//
60
//   RBBITableBuilder::buildForwardTable  -  This is the main function for building
61
//                               the DFA state transition table from the RBBI rules parse tree.
62
//
63
//-----------------------------------------------------------------------------
64
0
void  RBBITableBuilder::buildForwardTable() {
65
0
66
0
    if (U_FAILURE(*fStatus)) {
67
0
        return;
68
0
    }
69
0
70
0
    // If there were no rules, just return.  This situation can easily arise
71
0
    //   for the reverse rules.
72
0
    if (fTree==NULL) {
73
0
        return;
74
0
    }
75
0
76
0
    //
77
0
    // Walk through the tree, replacing any references to $variables with a copy of the
78
0
    //   parse tree for the substition expression.
79
0
    //
80
0
    fTree = fTree->flattenVariables();
81
#ifdef RBBI_DEBUG
82
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ftree")) {
83
        RBBIDebugPuts("\nParse tree after flattening variable references.");
84
        RBBINode::printTree(fTree, TRUE);
85
    }
86
#endif
87
88
0
    //
89
0
    // If the rules contained any references to {bof} 
90
0
    //   add a {bof} <cat> <former root of tree> to the
91
0
    //   tree.  Means that all matches must start out with the 
92
0
    //   {bof} fake character.
93
0
    // 
94
0
    if (fRB->fSetBuilder->sawBOF()) {
95
0
        RBBINode *bofTop    = new RBBINode(RBBINode::opCat);
96
0
        RBBINode *bofLeaf   = new RBBINode(RBBINode::leafChar);
97
0
        // Delete and exit if memory allocation failed.
98
0
        if (bofTop == NULL || bofLeaf == NULL) {
99
0
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
100
0
            delete bofTop;
101
0
            delete bofLeaf;
102
0
            return;
103
0
        }
104
0
        bofTop->fLeftChild  = bofLeaf;
105
0
        bofTop->fRightChild = fTree;
106
0
        bofLeaf->fParent    = bofTop;
107
0
        bofLeaf->fVal       = 2;      // Reserved value for {bof}.
108
0
        fTree               = bofTop;
109
0
    }
110
0
111
0
    //
112
0
    // Add a unique right-end marker to the expression.
113
0
    //   Appears as a cat-node, left child being the original tree,
114
0
    //   right child being the end marker.
115
0
    //
116
0
    RBBINode *cn = new RBBINode(RBBINode::opCat);
117
0
    // Exit if memory allocation failed.
118
0
    if (cn == NULL) {
119
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
120
0
        return;
121
0
    }
122
0
    cn->fLeftChild = fTree;
123
0
    fTree->fParent = cn;
124
0
    cn->fRightChild = new RBBINode(RBBINode::endMark);
125
0
    // Delete and exit if memory allocation failed.
126
0
    if (cn->fRightChild == NULL) {
127
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
128
0
        delete cn;
129
0
        return;
130
0
    }
131
0
    cn->fRightChild->fParent = cn;
132
0
    fTree = cn;
133
0
134
0
    //
135
0
    //  Replace all references to UnicodeSets with the tree for the equivalent
136
0
    //      expression.
137
0
    //
138
0
    fTree->flattenSets();
139
#ifdef RBBI_DEBUG
140
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "stree")) {
141
        RBBIDebugPuts("\nParse tree after flattening Unicode Set references.");
142
        RBBINode::printTree(fTree, TRUE);
143
    }
144
#endif
145
146
0
147
0
    //
148
0
    // calculate the functions nullable, firstpos, lastpos and followpos on
149
0
    // nodes in the parse tree.
150
0
    //    See the alogrithm description in Aho.
151
0
    //    Understanding how this works by looking at the code alone will be
152
0
    //       nearly impossible.
153
0
    //
154
0
    calcNullable(fTree);
155
0
    calcFirstPos(fTree);
156
0
    calcLastPos(fTree);
157
0
    calcFollowPos(fTree);
158
0
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "pos")) {
159
0
        RBBIDebugPuts("\n");
160
0
        printPosSets(fTree);
161
0
    }
162
0
163
0
    //
164
0
    //  For "chained" rules, modify the followPos sets
165
0
    //
166
0
    if (fRB->fChainRules) {
167
0
        calcChainedFollowPos(fTree);
168
0
    }
169
0
170
0
    //
171
0
    //  BOF (start of input) test fixup.
172
0
    //
173
0
    if (fRB->fSetBuilder->sawBOF()) {
174
0
        bofFixup();
175
0
    }
176
0
177
0
    //
178
0
    // Build the DFA state transition tables.
179
0
    //
180
0
    buildStateTable();
181
0
    flagAcceptingStates();
182
0
    flagLookAheadStates();
183
0
    flagTaggedStates();
184
0
185
0
    //
186
0
    // Update the global table of rule status {tag} values
187
0
    // The rule builder has a global vector of status values that are common
188
0
    //    for all tables.  Merge the ones from this table into the global set.
189
0
    //
190
0
    mergeRuleStatusVals();
191
0
}
192
193
194
195
//-----------------------------------------------------------------------------
196
//
197
//   calcNullable.    Impossible to explain succinctly.  See Aho, section 3.9
198
//
199
//-----------------------------------------------------------------------------
200
0
// Sets fNullable on node n and, for non-leaf nodes, on everything below it.
// A null n is ignored.
void RBBITableBuilder::calcNullable(RBBINode *n) {
    if (n == NULL) {
        return;
    }

    // Node types that are settled without looking at any children.
    switch (n->fType) {
    case RBBINode::setRef:
    case RBBINode::endMark:
        // These are non-empty leaf node types.
        n->fNullable = FALSE;
        return;

    case RBBINode::lookAhead:
    case RBBINode::tag:
        // Lookahead marker / tag node.  It's a leaf, so no recursion on children.
        // It's nullable because it does not match any literal text from the input stream.
        n->fNullable = TRUE;
        return;

    default:
        break;
    }

    // Everything else: compute nullable for both children first ...
    calcNullable(n->fLeftChild);
    calcNullable(n->fRightChild);

    // ... then apply the functions from table 3.40 in Aho.
    switch (n->fType) {
    case RBBINode::opOr:
        n->fNullable = n->fLeftChild->fNullable || n->fRightChild->fNullable;
        break;

    case RBBINode::opCat:
        n->fNullable = n->fLeftChild->fNullable && n->fRightChild->fNullable;
        break;

    case RBBINode::opStar:
    case RBBINode::opQuestion:
        n->fNullable = TRUE;
        break;

    default:
        n->fNullable = FALSE;
        break;
    }
}
238
239
240
241
242
//-----------------------------------------------------------------------------
243
//
244
//   calcFirstPos.    Impossible to explain succinctly.  See Aho, section 3.9
245
//
246
//-----------------------------------------------------------------------------
247
0
// Fills in fFirstPosSet for node n and, for non-leaf nodes, for everything
// below it.  Relies on fNullable having been computed already (read for opCat).
// A null n is ignored.
void RBBITableBuilder::calcFirstPos(RBBINode *n) {
    if (n == NULL) {
        return;
    }

    switch (n->fType) {
    case RBBINode::leafChar:
    case RBBINode::endMark:
    case RBBINode::lookAhead:
    case RBBINode::tag:
        // These are non-empty leaf node types: the set is just the node itself.
        // Note: In order to maintain the sort invariant on the set,
        // this function should only be called on a node whose set is
        // empty to start with.
        n->fFirstPosSet->addElement(n, *fStatus);
        return;

    default:
        break;
    }

    // Not a leaf: handle both children before combining their results.
    calcFirstPos(n->fLeftChild);
    calcFirstPos(n->fRightChild);

    // Apply functions from table 3.40 in Aho
    switch (n->fType) {
    case RBBINode::opOr:
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
        setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
        break;

    case RBBINode::opCat:
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
        if (n->fLeftChild->fNullable) {
            setAdd(n->fFirstPosSet, n->fRightChild->fFirstPosSet);
        }
        break;

    case RBBINode::opStar:
    case RBBINode::opQuestion:
    case RBBINode::opPlus:
        setAdd(n->fFirstPosSet, n->fLeftChild->fFirstPosSet);
        break;

    default:
        // Other node types leave their set as it is.
        break;
    }
}
285
286
287
288
//-----------------------------------------------------------------------------
289
//
290
//   calcLastPos.    Impossible to explain succinctly.  See Aho, section 3.9
291
//
292
//-----------------------------------------------------------------------------
293
0
// Fills in fLastPosSet for node n and, for non-leaf nodes, for everything
// below it.  Relies on fNullable having been computed already (read for opCat).
// A null n is ignored.
void RBBITableBuilder::calcLastPos(RBBINode *n) {
    if (n == NULL) {
        return;
    }

    switch (n->fType) {
    case RBBINode::leafChar:
    case RBBINode::endMark:
    case RBBINode::lookAhead:
    case RBBINode::tag:
        // These are non-empty leaf node types: the set is just the node itself.
        // Note: In order to maintain the sort invariant on the set,
        // this function should only be called on a node whose set is
        // empty to start with.
        n->fLastPosSet->addElement(n, *fStatus);
        return;

    default:
        break;
    }

    // Not a leaf: handle both children before combining their results.
    calcLastPos(n->fLeftChild);
    calcLastPos(n->fRightChild);

    // Apply functions from table 3.40 in Aho
    switch (n->fType) {
    case RBBINode::opOr:
        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
        break;

    case RBBINode::opCat:
        setAdd(n->fLastPosSet, n->fRightChild->fLastPosSet);
        if (n->fRightChild->fNullable) {
            setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
        }
        break;

    case RBBINode::opStar:
    case RBBINode::opQuestion:
    case RBBINode::opPlus:
        setAdd(n->fLastPosSet, n->fLeftChild->fLastPosSet);
        break;

    default:
        // Other node types leave their set as it is.
        break;
    }
}
331
332
333
334
//-----------------------------------------------------------------------------
335
//
336
//   calcFollowPos.    Impossible to explain succinctly.  See Aho, section 3.9
337
//
338
//-----------------------------------------------------------------------------
339
0
// Adds to the fFollowPos sets of the nodes found in the lastPos sets of
// opCat, opStar and opPlus nodes at or below n.  Reads the fFirstPosSet and
// fLastPosSet members, so those must already be filled in.
void RBBITableBuilder::calcFollowPos(RBBINode *n) {
    if (n == NULL) {
        return;
    }
    if (n->fType == RBBINode::leafChar || n->fType == RBBINode::endMark) {
        // Leaves of these types contribute nothing and are not descended into.
        return;
    }

    calcFollowPos(n->fLeftChild);
    calcFollowPos(n->fRightChild);

    if (n->fType == RBBINode::opCat) {
        // Aho rule #1:  every position in lastpos(left child) is followed
        //               by every position in firstpos(right child).
        UVector *lastOfLeft = n->fLeftChild->fLastPosSet;
        for (int32_t ix = 0; ix < lastOfLeft->size(); ++ix) {
            // 'pos' is 'i' in Aho's description.
            RBBINode *pos = static_cast<RBBINode *>(lastOfLeft->elementAt(ix));
            setAdd(pos->fFollowPos, n->fRightChild->fFirstPosSet);
        }
    } else if (n->fType == RBBINode::opStar || n->fType == RBBINode::opPlus) {
        // Aho rule #2:  every position in lastpos(n) is followed by every
        //               position in firstpos(n).
        for (int32_t ix = 0; ix < n->fLastPosSet->size(); ++ix) {
            RBBINode *pos = static_cast<RBBINode *>(n->fLastPosSet->elementAt(ix));
            setAdd(pos->fFollowPos, n->fFirstPosSet);
        }
    }
}
377
378
//-----------------------------------------------------------------------------
379
//
380
//    addRuleRootNodes    Recursively walk a parse tree, adding all nodes flagged
381
//                        as roots of a rule to a destination vector.
382
//
383
//-----------------------------------------------------------------------------
384
0
// Appends to dest every node at or below 'node' whose fRuleRoot flag is set.
// Does nothing for a null node or once *fStatus holds a failure.
void RBBITableBuilder::addRuleRootNodes(UVector *dest, RBBINode *node) {
    if (U_FAILURE(*fStatus) || node == NULL) {
        return;
    }
    if (!node->fRuleRoot) {
        // Not a rule root itself; keep looking further down on both sides.
        addRuleRootNodes(dest, node->fLeftChild);
        addRuleRootNodes(dest, node->fRightChild);
        return;
    }
    // Note: rules cannot nest. If we found a rule start node,
    //       no child node can also be a start node, so the search stops here.
    dest->addElement(node, *fStatus);
}
397
398
//-----------------------------------------------------------------------------
399
//
400
//   calcChainedFollowPos.    Modify the previously calculated followPos sets
401
//                            to implement rule chaining.  NOT described by Aho
402
//
403
//-----------------------------------------------------------------------------
404
0
// For every leaf that can end a match, merges into its followPos set the
// followPos sets of chain-in start leaves that have the same fVal.
void RBBITableBuilder::calcChainedFollowPos(RBBINode *tree) {

    UVector         endMarkerNodes(*fStatus);
    UVector         leafNodes(*fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Gather all end-marker nodes and all leaf (character class) nodes.
    tree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
    tree->findNodes(&leafNodes, RBBINode::leafChar, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }

    // Collect all leaf nodes that can start matches for rules
    // with inbound chaining enabled, which is the union of the
    // firstPosition sets from each of the rule root nodes.
    UVector ruleRootNodes(*fStatus);
    addRuleRootNodes(&ruleRootNodes, tree);

    UVector matchStartNodes(*fStatus);
    for (int32_t rootIx = 0; rootIx < ruleRootNodes.size(); ++rootIx) {
        RBBINode *ruleRoot = static_cast<RBBINode *>(ruleRootNodes.elementAt(rootIx));
        if (ruleRoot->fChainIn) {
            setAdd(&matchStartNodes, ruleRoot->fFirstPosSet);
        }
    }
    if (U_FAILURE(*fStatus)) {
        return;
    }

    for (int32_t leafIx = 0; leafIx < leafNodes.size(); ++leafIx) {
        RBBINode *endNode = static_cast<RBBINode *>(leafNodes.elementAt(leafIx));

        // Identify leaf nodes that correspond to overall rule match positions.
        //   These include an endMarkerNode in their followPos sets.
        UBool canEndMatch = FALSE;
        for (int32_t markIx = 0; markIx < endMarkerNodes.size(); ++markIx) {
            if (endNode->fFollowPos->contains(endMarkerNodes.elementAt(markIx))) {
                canEndMatch = TRUE;
                break;
            }
        }
        if (!canEndMatch) {
            // node wasn't an end node.  Try again with the next.
            continue;
        }

        // We've got a node that can end a match.

        // Line Break Specific hack:  If this node's val correspond to the $CM char class,
        //                            don't chain from it.
        // TODO:  Add rule syntax for this behavior, get specifics out of here and
        //        into the rule file.
        if (fRB->fLBCMNoChain) {
            UChar32 c = this->fRB->fSetBuilder->getFirstChar(endNode->fVal);
            // c == -1 occurs with sets containing only the {eof} marker string.
            if (c != -1 &&
                static_cast<ULineBreak>(u_getIntPropertyValue(c, UCHAR_LINE_BREAK)) == U_LB_COMBINING_MARK) {
                continue;
            }
        }

        // Now iterate over the nodes that can start a match, looking for ones
        //   with the same char class as our ending node.
        for (int32_t startIx = 0; startIx < matchStartNodes.size(); ++startIx) {
            RBBINode *startNode = static_cast<RBBINode *>(matchStartNodes.elementAt(startIx));
            if (startNode->fType == RBBINode::leafChar && endNode->fVal == startNode->fVal) {
                // The end val (character class) of one possible match is the
                //   same as the start of another.
                //
                // Add all nodes from the followPos of the start node to the
                //  followPos set of the end node, which will have the effect of
                //  letting matches transition from a match state at endNode
                //  to the second char of a match starting with startNode.
                setAdd(endNode->fFollowPos, startNode->fFollowPos);
            }
        }
    }
}
501
502
503
//-----------------------------------------------------------------------------
504
//
505
//   bofFixup.    Fixup for state tables that include {bof} beginning of input testing.
506
//                Do a swizzle similar to chaining, modifying the followPos set of
507
//                the bofNode to include the followPos nodes from other {bof} nodes
508
//                scattered through the tree.
509
//
510
//                This function has much in common with calcChainedFollowPos().
511
//
512
//-----------------------------------------------------------------------------
513
0
void RBBITableBuilder::bofFixup() {
514
0
515
0
    if (U_FAILURE(*fStatus)) {
516
0
        return;
517
0
    }
518
0
519
0
    //   The parse tree looks like this ...
520
0
    //         fTree root  --->       <cat>
521
0
    //                               /     \       .
522
0
    //                            <cat>   <#end node>
523
0
    //                           /     \  .
524
0
    //                     <bofNode>   rest
525
0
    //                               of tree
526
0
    //
527
0
    //    We will be adding things to the followPos set of the <bofNode>
528
0
    //
529
0
    RBBINode  *bofNode = fTree->fLeftChild->fLeftChild;
530
0
    U_ASSERT(bofNode->fType == RBBINode::leafChar);
531
0
    U_ASSERT(bofNode->fVal == 2);
532
0
533
0
    // Get all nodes that can be the start a match of the user-written rules
534
0
    //  (excluding the fake bofNode)
535
0
    //  We want the nodes that can start a match in the
536
0
    //     part labeled "rest of tree"
537
0
    // 
538
0
    UVector *matchStartNodes = fTree->fLeftChild->fRightChild->fFirstPosSet;
539
0
540
0
    RBBINode *startNode;
541
0
    int       startNodeIx;
542
0
    for (startNodeIx = 0; startNodeIx<matchStartNodes->size(); startNodeIx++) {
543
0
        startNode = (RBBINode *)matchStartNodes->elementAt(startNodeIx);
544
0
        if (startNode->fType != RBBINode::leafChar) {
545
0
            continue;
546
0
        }
547
0
548
0
        if (startNode->fVal == bofNode->fVal) {
549
0
            //  We found a leaf node corresponding to a {bof} that was
550
0
            //    explicitly written into a rule.
551
0
            //  Add everything from the followPos set of this node to the
552
0
            //    followPos set of the fake bofNode at the start of the tree.
553
0
            //  
554
0
            setAdd(bofNode->fFollowPos, startNode->fFollowPos);
555
0
        }
556
0
    }
557
0
}
558
559
//-----------------------------------------------------------------------------
560
//
561
//   buildStateTable()    Determine the set of runtime DFA states and the
562
//                        transition tables for these states, by the algorithm
563
//                        of fig. 3.44 in Aho.
564
//
565
//                        Most of the comments are quotes of Aho's pseudo-code.
566
//
567
//-----------------------------------------------------------------------------
568
0
void RBBITableBuilder::buildStateTable() {
569
0
    if (U_FAILURE(*fStatus)) {
570
0
        return;
571
0
    }
572
0
    RBBIStateDescriptor *failState;
573
0
    // Set it to NULL to avoid uninitialized warning
574
0
    RBBIStateDescriptor *initialState = NULL; 
575
0
    //
576
0
    // Add a dummy state 0 - the stop state.  Not from Aho.
577
0
    int      lastInputSymbol = fRB->fSetBuilder->getNumCharCategories() - 1;
578
0
    failState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
579
0
    if (failState == NULL) {
580
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
581
0
        goto ExitBuildSTdeleteall;
582
0
    }
583
0
    failState->fPositions = new UVector(*fStatus);
584
0
    if (failState->fPositions == NULL) {
585
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
586
0
    }
587
0
    if (failState->fPositions == NULL || U_FAILURE(*fStatus)) {
588
0
        goto ExitBuildSTdeleteall;
589
0
    }
590
0
    fDStates->addElement(failState, *fStatus);
591
0
    if (U_FAILURE(*fStatus)) {
592
0
        goto ExitBuildSTdeleteall;
593
0
    }
594
0
595
0
    // initially, the only unmarked state in Dstates is firstpos(root),
596
0
    //       where toot is the root of the syntax tree for (r)#;
597
0
    initialState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
598
0
    if (initialState == NULL) {
599
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
600
0
    }
601
0
    if (U_FAILURE(*fStatus)) {
602
0
        goto ExitBuildSTdeleteall;
603
0
    }
604
0
    initialState->fPositions = new UVector(*fStatus);
605
0
    if (initialState->fPositions == NULL) {
606
0
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
607
0
    }
608
0
    if (U_FAILURE(*fStatus)) {
609
0
        goto ExitBuildSTdeleteall;
610
0
    }
611
0
    setAdd(initialState->fPositions, fTree->fFirstPosSet);
612
0
    fDStates->addElement(initialState, *fStatus);
613
0
    if (U_FAILURE(*fStatus)) {
614
0
        goto ExitBuildSTdeleteall;
615
0
    }
616
0
617
0
    // while there is an unmarked state T in Dstates do begin
618
0
    for (;;) {
619
0
        RBBIStateDescriptor *T = NULL;
620
0
        int32_t              tx;
621
0
        for (tx=1; tx<fDStates->size(); tx++) {
622
0
            RBBIStateDescriptor *temp;
623
0
            temp = (RBBIStateDescriptor *)fDStates->elementAt(tx);
624
0
            if (temp->fMarked == FALSE) {
625
0
                T = temp;
626
0
                break;
627
0
            }
628
0
        }
629
0
        if (T == NULL) {
630
0
            break;
631
0
        }
632
0
633
0
        // mark T;
634
0
        T->fMarked = TRUE;
635
0
636
0
        // for each input symbol a do begin
637
0
        int32_t  a;
638
0
        for (a = 1; a<=lastInputSymbol; a++) {
639
0
            // let U be the set of positions that are in followpos(p)
640
0
            //    for some position p in T
641
0
            //    such that the symbol at position p is a;
642
0
            UVector    *U = NULL;
643
0
            RBBINode   *p;
644
0
            int32_t     px;
645
0
            for (px=0; px<T->fPositions->size(); px++) {
646
0
                p = (RBBINode *)T->fPositions->elementAt(px);
647
0
                if ((p->fType == RBBINode::leafChar) &&  (p->fVal == a)) {
648
0
                    if (U == NULL) {
649
0
                        U = new UVector(*fStatus);
650
0
                        if (U == NULL) {
651
0
                          *fStatus = U_MEMORY_ALLOCATION_ERROR;
652
0
                          goto ExitBuildSTdeleteall;
653
0
                        }
654
0
                    }
655
0
                    setAdd(U, p->fFollowPos);
656
0
                }
657
0
            }
658
0
659
0
            // if U is not empty and not in DStates then
660
0
            int32_t  ux = 0;
661
0
            UBool    UinDstates = FALSE;
662
0
            if (U != NULL) {
663
0
                U_ASSERT(U->size() > 0);
664
0
                int  ix;
665
0
                for (ix=0; ix<fDStates->size(); ix++) {
666
0
                    RBBIStateDescriptor *temp2;
667
0
                    temp2 = (RBBIStateDescriptor *)fDStates->elementAt(ix);
668
0
                    if (setEquals(U, temp2->fPositions)) {
669
0
                        delete U;
670
0
                        U  = temp2->fPositions;
671
0
                        ux = ix;
672
0
                        UinDstates = TRUE;
673
0
                        break;
674
0
                    }
675
0
                }
676
0
677
0
                // Add U as an unmarked state to Dstates
678
0
                if (!UinDstates)
679
0
                {
680
0
                    RBBIStateDescriptor *newState = new RBBIStateDescriptor(lastInputSymbol, fStatus);
681
0
                    if (newState == NULL) {
682
0
                      *fStatus = U_MEMORY_ALLOCATION_ERROR;
683
0
                    }
684
0
                    if (U_FAILURE(*fStatus)) {
685
0
                        goto ExitBuildSTdeleteall;
686
0
                    }
687
0
                    newState->fPositions = U;
688
0
                    fDStates->addElement(newState, *fStatus);
689
0
                    if (U_FAILURE(*fStatus)) {
690
0
                        return;
691
0
                    }
692
0
                    ux = fDStates->size()-1;
693
0
                }
694
0
695
0
                // Dtran[T, a] := U;
696
0
                T->fDtran->setElementAt(ux, a);
697
0
            }
698
0
        }
699
0
    }
700
0
    return;
701
0
    // delete local pointers only if error occured.
702
0
ExitBuildSTdeleteall:
703
0
    delete initialState;
704
0
    delete failState;
705
0
}
706
707
708
709
//-----------------------------------------------------------------------------
710
//
711
//   flagAcceptingStates    Identify accepting states.
712
//                          First get a list of all of the end marker nodes.
713
//                          Then, for each state s,
714
//                              if s contains one of the end marker nodes in its list of tree positions then
715
//                                  s is an accepting state.
716
//
717
//-----------------------------------------------------------------------------
718
0
void     RBBITableBuilder::flagAcceptingStates() {
719
0
    if (U_FAILURE(*fStatus)) {
720
0
        return;
721
0
    }
722
0
    UVector     endMarkerNodes(*fStatus);
723
0
    RBBINode    *endMarker;
724
0
    int32_t     i;
725
0
    int32_t     n;
726
0
727
0
    if (U_FAILURE(*fStatus)) {
728
0
        return;
729
0
    }
730
0
731
0
    fTree->findNodes(&endMarkerNodes, RBBINode::endMark, *fStatus);
732
0
    if (U_FAILURE(*fStatus)) {
733
0
        return;
734
0
    }
735
0
736
0
    for (i=0; i<endMarkerNodes.size(); i++) {
737
0
        endMarker = (RBBINode *)endMarkerNodes.elementAt(i);
738
0
        for (n=0; n<fDStates->size(); n++) {
739
0
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
740
0
            if (sd->fPositions->indexOf(endMarker) >= 0) {
741
0
                // Any non-zero value for fAccepting means this is an accepting node.
742
0
                // The value is what will be returned to the user as the break status.
743
0
                // If no other value was specified, force it to -1.
744
0
745
0
                if (sd->fAccepting==0) {
746
0
                    // State hasn't been marked as accepting yet.  Do it now.
747
0
                    sd->fAccepting = endMarker->fVal;
748
0
                    if (sd->fAccepting == 0) {
749
0
                        sd->fAccepting = -1;
750
0
                    }
751
0
                }
752
0
                if (sd->fAccepting==-1 && endMarker->fVal != 0) {
753
0
                    // Both lookahead and non-lookahead accepting for this state.
754
0
                    // Favor the look-ahead.  Expedient for line break.
755
0
                    // TODO:  need a more elegant resolution for conflicting rules.
756
0
                    sd->fAccepting = endMarker->fVal;
757
0
                }
758
0
                // implicit else:
759
0
                // if sd->fAccepting already had a value other than 0 or -1, leave it be.
760
0
761
0
                // If the end marker node is from a look-ahead rule, set
762
0
                //   the fLookAhead field for this state also.
763
0
                if (endMarker->fLookAheadEnd) {
764
0
                    // TODO:  don't change value if already set?
765
0
                    // TODO:  allow for more than one active look-ahead rule in engine.
766
0
                    //        Make value here an index to a side array in engine?
767
0
                    sd->fLookAhead = sd->fAccepting;
768
0
                }
769
0
            }
770
0
        }
771
0
    }
772
0
}
773
774
775
//-----------------------------------------------------------------------------
776
//
777
//    flagLookAheadStates   Very similar to flagAcceptingStates, above.
778
//
779
//-----------------------------------------------------------------------------
780
0
void     RBBITableBuilder::flagLookAheadStates() {
781
0
    if (U_FAILURE(*fStatus)) {
782
0
        return;
783
0
    }
784
0
    UVector     lookAheadNodes(*fStatus);
785
0
    RBBINode    *lookAheadNode;
786
0
    int32_t     i;
787
0
    int32_t     n;
788
0
789
0
    fTree->findNodes(&lookAheadNodes, RBBINode::lookAhead, *fStatus);
790
0
    if (U_FAILURE(*fStatus)) {
791
0
        return;
792
0
    }
793
0
    for (i=0; i<lookAheadNodes.size(); i++) {
794
0
        lookAheadNode = (RBBINode *)lookAheadNodes.elementAt(i);
795
0
796
0
        for (n=0; n<fDStates->size(); n++) {
797
0
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
798
0
            if (sd->fPositions->indexOf(lookAheadNode) >= 0) {
799
0
                sd->fLookAhead = lookAheadNode->fVal;
800
0
            }
801
0
        }
802
0
    }
803
0
}
804
805
806
807
808
//-----------------------------------------------------------------------------
809
//
810
//    flagTaggedStates
811
//
812
//-----------------------------------------------------------------------------
813
0
void     RBBITableBuilder::flagTaggedStates() {
814
0
    if (U_FAILURE(*fStatus)) {
815
0
        return;
816
0
    }
817
0
    UVector     tagNodes(*fStatus);
818
0
    RBBINode    *tagNode;
819
0
    int32_t     i;
820
0
    int32_t     n;
821
0
822
0
    if (U_FAILURE(*fStatus)) {
823
0
        return;
824
0
    }
825
0
    fTree->findNodes(&tagNodes, RBBINode::tag, *fStatus);
826
0
    if (U_FAILURE(*fStatus)) {
827
0
        return;
828
0
    }
829
0
    for (i=0; i<tagNodes.size(); i++) {                   // For each tag node t (all of 'em)
830
0
        tagNode = (RBBINode *)tagNodes.elementAt(i);
831
0
832
0
        for (n=0; n<fDStates->size(); n++) {              //    For each state  s (row in the state table)
833
0
            RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
834
0
            if (sd->fPositions->indexOf(tagNode) >= 0) {  //       if  s include the tag node t
835
0
                sortedAdd(&sd->fTagVals, tagNode->fVal);
836
0
            }
837
0
        }
838
0
    }
839
0
}
840
841
842
843
844
//-----------------------------------------------------------------------------
845
//
846
//  mergeRuleStatusVals
847
//
848
//      Update the global table of rule status {tag} values
849
//      The rule builder has a global vector of status values that are common
850
//      for all tables.  Merge the ones from this table into the global set.
851
//
852
//-----------------------------------------------------------------------------
853
0
void  RBBITableBuilder::mergeRuleStatusVals() {
854
0
    //
855
0
    //  The basic outline of what happens here is this...
856
0
    //
857
0
    //    for each state in this state table
858
0
    //       if the status tag list for this state is in the global statuses list
859
0
    //           record where and
860
0
    //           continue with the next state
861
0
    //       else
862
0
    //           add the tag list for this state to the global list.
863
0
    //
864
0
    int i;
865
0
    int n;
866
0
867
0
    // Pre-set a single tag of {0} into the table.
868
0
    //   We will need this as a default, for rule sets with no explicit tagging.
869
0
    if (fRB->fRuleStatusVals->size() == 0) {
870
0
        fRB->fRuleStatusVals->addElement(1, *fStatus);  // Num of statuses in group
871
0
        fRB->fRuleStatusVals->addElement((int32_t)0, *fStatus);  //   and our single status of zero
872
0
    }
873
0
874
0
    //    For each state
875
0
    for (n=0; n<fDStates->size(); n++) {
876
0
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
877
0
        UVector *thisStatesTagValues = sd->fTagVals;
878
0
        if (thisStatesTagValues == NULL) {
879
0
            // No tag values are explicitly associated with this state.
880
0
            //   Set the default tag value.
881
0
            sd->fTagsIdx = 0;
882
0
            continue;
883
0
        }
884
0
885
0
        // There are tag(s) associated with this state.
886
0
        //   fTagsIdx will be the index into the global tag list for this state's tag values.
887
0
        //   Initial value of -1 flags that we haven't got it set yet.
888
0
        sd->fTagsIdx = -1;
889
0
        int32_t  thisTagGroupStart = 0;   // indexes into the global rule status vals list
890
0
        int32_t  nextTagGroupStart = 0;
891
0
892
0
        // Loop runs once per group of tags in the global list
893
0
        while (nextTagGroupStart < fRB->fRuleStatusVals->size()) {
894
0
            thisTagGroupStart = nextTagGroupStart;
895
0
            nextTagGroupStart += fRB->fRuleStatusVals->elementAti(thisTagGroupStart) + 1;
896
0
            if (thisStatesTagValues->size() != fRB->fRuleStatusVals->elementAti(thisTagGroupStart)) {
897
0
                // The number of tags for this state is different from
898
0
                //    the number of tags in this group from the global list.
899
0
                //    Continue with the next group from the global list.
900
0
                continue;
901
0
            }
902
0
            // The lengths match, go ahead and compare the actual tag values
903
0
            //    between this state and the group from the global list.
904
0
            for (i=0; i<thisStatesTagValues->size(); i++) {
905
0
                if (thisStatesTagValues->elementAti(i) !=
906
0
                    fRB->fRuleStatusVals->elementAti(thisTagGroupStart + 1 + i) ) {
907
0
                    // Mismatch.
908
0
                    break;
909
0
                }
910
0
            }
911
0
912
0
            if (i == thisStatesTagValues->size()) {
913
0
                // We found a set of tag values in the global list that match
914
0
                //   those for this state.  Use them.
915
0
                sd->fTagsIdx = thisTagGroupStart;
916
0
                break;
917
0
            }
918
0
        }
919
0
920
0
        if (sd->fTagsIdx == -1) {
921
0
            // No suitable entry in the global tag list already.  Add one
922
0
            sd->fTagsIdx = fRB->fRuleStatusVals->size();
923
0
            fRB->fRuleStatusVals->addElement(thisStatesTagValues->size(), *fStatus);
924
0
            for (i=0; i<thisStatesTagValues->size(); i++) {
925
0
                fRB->fRuleStatusVals->addElement(thisStatesTagValues->elementAti(i), *fStatus);
926
0
            }
927
0
        }
928
0
    }
929
0
}
930
931
932
933
934
935
936
937
//-----------------------------------------------------------------------------
938
//
939
//  sortedAdd  Add a value to a vector of sorted values (ints).
940
//             Do not replicate entries; if the value is already there, do not
941
//                add a second one.
942
//             Lazily create the vector if it does not already exist.
943
//
944
//-----------------------------------------------------------------------------
945
0
void RBBITableBuilder::sortedAdd(UVector **vector, int32_t val) {
    int32_t i;

    if (*vector == NULL) {
        // Lazily create the vector on first use.
        *vector = new UVector(*fStatus);
        if (*vector == NULL && U_SUCCESS(*fStatus)) {
            // Report the allocation failure instead of silently dropping the value,
            // in the same way the RBBITableBuilder constructor does for fDStates.
            *fStatus = U_MEMORY_ALLOCATION_ERROR;
        }
    }
    if (*vector == NULL || U_FAILURE(*fStatus)) {
        return;
    }

    // Find the insertion point that keeps the vector in ascending order.
    UVector *vec = *vector;
    int32_t  vSize = vec->size();
    for (i=0; i<vSize; i++) {
        int32_t valAtI = vec->elementAti(i);
        if (valAtI == val) {
            // The value is already in the vector.  Don't add it again.
            return;
        }
        if (valAtI > val) {
            // First element larger than val: val belongs just before it.
            break;
        }
    }
    // i == vSize here when val is larger than every existing element (append).
    vec->insertElementAt(val, i, *fStatus);
}
968
969
970
971
//-----------------------------------------------------------------------------
972
//
973
//  setAdd     Set operation on UVector
974
//             dest = dest union source
975
//             Elements may only appear once and must be sorted.
976
//
977
//-----------------------------------------------------------------------------
978
0
void RBBITableBuilder::setAdd(UVector *dest, UVector *source) {
    // Merges the sorted, duplicate-free pointer set "source" into "dest".
    // On failure (*fStatus set to an error) dest is left as it was on entry.
    int32_t destOriginalSize = dest->size();
    int32_t sourceSize       = source->size();
    int32_t di           = 0;
    MaybeStackArray<void *, 16> destArray, sourceArray;  // Handle small cases without malloc
    void **destPtr, **sourcePtr;
    void **destLim, **sourceLim;

    if (destOriginalSize > destArray.getCapacity()) {
        if (destArray.resize(destOriginalSize) == NULL) {
            // Report the failure rather than silently skipping the union.
            if (U_SUCCESS(*fStatus)) {
                *fStatus = U_MEMORY_ALLOCATION_ERROR;
            }
            return;
        }
    }
    destPtr = destArray.getAlias();
    destLim = destPtr + destOriginalSize;  // destArray.getArrayLimit()?

    if (sourceSize > sourceArray.getCapacity()) {
        if (sourceArray.resize(sourceSize) == NULL) {
            if (U_SUCCESS(*fStatus)) {
                *fStatus = U_MEMORY_ALLOCATION_ERROR;
            }
            return;
        }
    }
    sourcePtr = sourceArray.getAlias();
    sourceLim = sourcePtr + sourceSize;  // sourceArray.getArrayLimit()?

    // Avoid multiple "get element" calls by getting the contents into arrays
    (void) dest->toArray(destPtr);
    (void) source->toArray(sourcePtr);

    // Grow dest to the largest size the union can have; it is trimmed at the end.
    dest->setSize(sourceSize+destOriginalSize, *fStatus);
    if (U_FAILURE(*fStatus)) {
        // dest could not be grown; do not attempt the merge into it.
        return;
    }

    // Standard merge of two sorted sequences, emitting equal elements once.
    while (sourcePtr < sourceLim && destPtr < destLim) {
        if (*destPtr == *sourcePtr) {
            dest->setElementAt(*sourcePtr++, di++);
            destPtr++;
        }
        // This check is required for machines with segmented memory, like i5/OS.
        // Direct pointer comparison is not recommended.
        else if (uprv_memcmp(destPtr, sourcePtr, sizeof(void *)) < 0) {
            dest->setElementAt(*destPtr++, di++);
        }
        else { /* *sourcePtr < *destPtr */
            dest->setElementAt(*sourcePtr++, di++);
        }
    }

    // At most one of these two cleanup loops will execute
    while (destPtr < destLim) {
        dest->setElementAt(*destPtr++, di++);
    }
    while (sourcePtr < sourceLim) {
        dest->setElementAt(*sourcePtr++, di++);
    }

    // Trim dest to the number of elements actually written.
    dest->setSize(di, *fStatus);
}
1033
1034
1035
1036
//-----------------------------------------------------------------------------
1037
//
1038
//  setEquals   Set operation on UVector.
1039
//              Compare for equality.
1040
//              Elements must be sorted.
1041
//
1042
//-----------------------------------------------------------------------------
1043
0
UBool RBBITableBuilder::setEquals(UVector *a, UVector *b) {
    // Equality is delegated entirely to UVector::equals().
    UVector &other = *b;
    return a->equals(other);
}
1046
1047
1048
//-----------------------------------------------------------------------------
1049
//
1050
//  printPosSets   Debug function.  Dump Nullable, firstpos, lastpos and followpos
1051
//                 for each node in the tree.
1052
//
1053
//-----------------------------------------------------------------------------
1054
#ifdef RBBI_DEBUG
1055
void RBBITableBuilder::printPosSets(RBBINode *n) {
    // Pre-order dump: this node first, then its left and right subtrees.
    if (n != NULL) {
        printf("\n");
        RBBINode::printNodeHeader();
        RBBINode::printNode(n);

        const char *nullableText = n->fNullable ? "TRUE" : "FALSE";
        RBBIDebugPrintf("         Nullable:  %s\n", nullableText);

        RBBIDebugPrintf("         firstpos:  ");
        printSet(n->fFirstPosSet);

        RBBIDebugPrintf("         lastpos:   ");
        printSet(n->fLastPosSet);

        RBBIDebugPrintf("         followpos: ");
        printSet(n->fFollowPos);

        printPosSets(n->fLeftChild);
        printPosSets(n->fRightChild);
    }
}
1076
#endif
1077
1078
//
1079
//    findDuplCharClassFrom()
1080
//
1081
0
bool RBBITableBuilder::findDuplCharClassFrom(IntPair *categories) {
    // Searches for a pair of character categories (columns) whose entries are
    // identical in every state row, resuming from categories->first.
    // On success returns true with the pair in categories->first / ->second
    // (first < second); returns false when no further duplicate columns exist.
    int32_t numStates = fDStates->size();
    int32_t numCols = fRB->fSetBuilder->getNumCharCategories();

    for (; categories->first < numCols-1; categories->first++) {
        for (categories->second=categories->first+1; categories->second < numCols; categories->second++) {
            // Seeded with different values so that an empty state table (numStates == 0)
            // reports "no duplicates" instead of comparing uninitialized variables.
            uint16_t table_base = 0;
            uint16_t table_dupl = 1;
            for (int32_t state=0; state<numStates; state++) {
                RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
                table_base = (uint16_t)sd->fDtran->elementAti(categories->first);
                table_dupl = (uint16_t)sd->fDtran->elementAti(categories->second);
                if (table_base != table_dupl) {
                    // Columns differ in this row; this pair is not a duplicate.
                    break;
                }
            }
            // Still equal here only if the loop ran through every row without a mismatch.
            if (table_base == table_dupl) {
                return true;
            }
        }
    }
    return false;
}
1104
1105
1106
//
1107
//    removeColumn()
1108
//
1109
0
void RBBITableBuilder::removeColumn(int32_t column) {
1110
0
    int32_t numStates = fDStates->size();
1111
0
    for (int32_t state=0; state<numStates; state++) {
1112
0
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1113
0
        U_ASSERT(column < sd->fDtran->size());
1114
0
        sd->fDtran->removeElementAt(column);
1115
0
    }
1116
0
}
1117
1118
/*
1119
 * findDuplicateState
1120
 */
1121
0
bool RBBITableBuilder::findDuplicateState(IntPair *states) {
    // Searches for two equivalent states, resuming from states->first.
    // On success returns true with the pair in states->first / states->second
    // (first < second); returns false when no further duplicates exist.
    const int32_t stateCount = fDStates->size();
    const int32_t colCount = fRB->fSetBuilder->getNumCharCategories();

    for (; states->first < stateCount-1; states->first++) {
        RBBIStateDescriptor *keepSD =
                static_cast<RBBIStateDescriptor *>(fDStates->elementAt(states->first));

        for (states->second = states->first+1; states->second < stateCount; states->second++) {
            RBBIStateDescriptor *candSD =
                    static_cast<RBBIStateDescriptor *>(fDStates->elementAt(states->second));

            // The per-state attributes must all agree before the rows are compared.
            const bool attributesAgree = keepSD->fAccepting == candSD->fAccepting &&
                                         keepSD->fLookAhead == candSD->fLookAhead &&
                                         keepSD->fTagsIdx   == candSD->fTagsIdx;
            if (!attributesAgree) {
                continue;
            }

            bool rowsMatch = true;
            for (int32_t col = 0; col < colCount; ++col) {
                const int32_t keepVal = keepSD->fDtran->elementAti(col);
                const int32_t candVal = candSD->fDtran->elementAti(col);
                if (keepVal == candVal) {
                    continue;
                }
                // Differing entries still count as equal when both of them
                // refer to one of the two states being compared.
                const bool keepInPair = (keepVal == states->first || keepVal == states->second);
                const bool candInPair = (candVal == states->first || candVal == states->second);
                if (!(keepInPair && candInPair)) {
                    rowsMatch = false;
                    break;
                }
            }
            if (rowsMatch) {
                return true;
            }
        }
    }
    return false;
}
1152
1153
1154
0
bool RBBITableBuilder::findDuplicateSafeState(IntPair *states) {
    // Safe-table counterpart of findDuplicateState(): rows are UnicodeStrings
    // holding only next-state values, so only the row contents are compared.
    const int32_t stateCount = fSafeTable->size();

    for (; states->first < stateCount-1; states->first++) {
        UnicodeString *keepRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->first));
        const int32_t colCount = keepRow->length();

        for (states->second = states->first+1; states->second < stateCount; states->second++) {
            UnicodeString *candRow = static_cast<UnicodeString *>(fSafeTable->elementAt(states->second));

            bool rowsMatch = true;
            for (int32_t col = 0; col < colCount; ++col) {
                const int32_t keepVal = keepRow->charAt(col);
                const int32_t candVal = candRow->charAt(col);
                if (keepVal == candVal) {
                    continue;
                }
                // Differing entries still count as equal when both of them
                // refer to one of the two states being compared.
                const bool keepInPair = (keepVal == states->first || keepVal == states->second);
                const bool candInPair = (candVal == states->first || candVal == states->second);
                if (!(keepInPair && candInPair)) {
                    rowsMatch = false;
                    break;
                }
            }
            if (rowsMatch) {
                return true;
            }
        }
    }
    return false;
}
1180
1181
1182
0
void RBBITableBuilder::removeState(IntPair duplStates) {
    // Deletes state duplStates.second and renumbers the remaining references:
    // references to the deleted state become duplStates.first, and references
    // to any higher-numbered state move down by one.
    const int32_t keepState = duplStates.first;
    const int32_t duplState = duplStates.second;
    U_ASSERT(keepState < duplState);
    U_ASSERT(duplState < fDStates->size());

    // Take the descriptor out of the vector first, then free it.
    RBBIStateDescriptor *removedSD =
            static_cast<RBBIStateDescriptor *>(fDStates->elementAt(duplState));
    fDStates->removeElementAt(duplState);
    delete removedSD;

    const int32_t stateCount = fDStates->size();
    const int32_t colCount = fRB->fSetBuilder->getNumCharCategories();
    for (int32_t stateIdx = 0; stateIdx < stateCount; ++stateIdx) {
        RBBIStateDescriptor *state =
                static_cast<RBBIStateDescriptor *>(fDStates->elementAt(stateIdx));

        // Renumber every transition of this row (the entry is always rewritten).
        for (int32_t col = 0; col < colCount; col++) {
            const int32_t oldTarget = state->fDtran->elementAti(col);
            const int32_t newTarget = (oldTarget == duplState) ? keepState
                                    : (oldTarget >  duplState) ? oldTarget - 1
                                    :                            oldTarget;
            state->fDtran->setElementAt(newTarget, col);
        }

        // NOTE(review): fAccepting and fLookAhead receive the same renumbering as the
        // transitions, yet elsewhere in this file they are assigned from RBBINode::fVal
        // rather than from state indexes - confirm that treating them as state numbers
        // is intended.
        if (state->fAccepting > duplState) {
            state->fAccepting--;
        } else if (state->fAccepting == duplState) {
            state->fAccepting = keepState;
        }

        if (state->fLookAhead > duplState) {
            state->fLookAhead--;
        } else if (state->fLookAhead == duplState) {
            state->fLookAhead = keepState;
        }
    }
}
1218
1219
0
void RBBITableBuilder::removeSafeState(IntPair duplStates) {
    // Safe-table counterpart of removeState(): drops row duplStates.second and
    // renumbers the next-state values held in the remaining rows.
    const int32_t keepState = duplStates.first;
    const int32_t duplState = duplStates.second;
    U_ASSERT(keepState < duplState);
    U_ASSERT(duplState < fSafeTable->size());

    // fSafeTable has a deleter function, so the removed row is freed automatically.
    fSafeTable->removeElementAt(duplState);

    const int32_t rowCount = fSafeTable->size();
    for (int32_t rowIdx = 0; rowIdx < rowCount; ++rowIdx) {
        UnicodeString *row = static_cast<UnicodeString *>(fSafeTable->elementAt(rowIdx));
        const int32_t colCount = row->length();
        for (int32_t col = 0; col < colCount; col++) {
            const int32_t oldTarget = row->charAt(col);
            // References to the removed row go to the kept row; rows above it shift down.
            const int32_t newTarget = (oldTarget == duplState) ? keepState
                                    : (oldTarget >  duplState) ? oldTarget - 1
                                    :                            oldTarget;
            row->setCharAt(col, newTarget);
        }
    }
}
1243
1244
1245
/*
1246
 * RemoveDuplicateStates
1247
 */
1248
0
void RBBITableBuilder::removeDuplicateStates() {
1249
0
    IntPair dupls = {3, 0};
1250
0
    while (findDuplicateState(&dupls)) {
1251
0
        // printf("Removing duplicate states (%d, %d)\n", dupls.first, dupls.second);
1252
0
        removeState(dupls);
1253
0
    }
1254
0
}
1255
1256
1257
//-----------------------------------------------------------------------------
1258
//
1259
//   getTableSize()    Calculate the size of the runtime form of this
1260
//                     state transition table.
1261
//
1262
//-----------------------------------------------------------------------------
1263
0
int32_t  RBBITableBuilder::getTableSize() const {
1264
0
    int32_t    size = 0;
1265
0
    int32_t    numRows;
1266
0
    int32_t    numCols;
1267
0
    int32_t    rowSize;
1268
0
1269
0
    if (fTree == NULL) {
1270
0
        return 0;
1271
0
    }
1272
0
1273
0
    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1274
0
1275
0
    numRows = fDStates->size();
1276
0
    numCols = fRB->fSetBuilder->getNumCharCategories();
1277
0
1278
0
    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1279
0
    size   += numRows * rowSize;
1280
0
    return size;
1281
0
}
1282
1283
1284
//-----------------------------------------------------------------------------
1285
//
1286
//   exportTable()    export the state transition table in the format required
1287
//                    by the runtime engine.  getTableSize() bytes of memory
1288
//                    must be available at the output address "where".
1289
//
1290
//-----------------------------------------------------------------------------
1291
0
void RBBITableBuilder::exportTable(void *where) {
1292
0
    RBBIStateTable    *table = (RBBIStateTable *)where;
1293
0
    uint32_t           state;
1294
0
    int                col;
1295
0
1296
0
    if (U_FAILURE(*fStatus) || fTree == NULL) {
1297
0
        return;
1298
0
    }
1299
0
1300
0
    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1301
0
    if (catCount > 0x7fff ||
1302
0
        fDStates->size() > 0x7fff) {
1303
0
        *fStatus = U_BRK_INTERNAL_ERROR;
1304
0
        return;
1305
0
    }
1306
0
1307
0
    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1308
0
    table->fNumStates = fDStates->size();
1309
0
    table->fFlags     = 0;
1310
0
    if (fRB->fLookAheadHardBreak) {
1311
0
        table->fFlags  |= RBBI_LOOKAHEAD_HARD_BREAK;
1312
0
    }
1313
0
    if (fRB->fSetBuilder->sawBOF()) {
1314
0
        table->fFlags  |= RBBI_BOF_REQUIRED;
1315
0
    }
1316
0
    table->fReserved  = 0;
1317
0
1318
0
    for (state=0; state<table->fNumStates; state++) {
1319
0
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(state);
1320
0
        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1321
0
        U_ASSERT (-32768 < sd->fAccepting && sd->fAccepting <= 32767);
1322
0
        U_ASSERT (-32768 < sd->fLookAhead && sd->fLookAhead <= 32767);
1323
0
        row->fAccepting = (int16_t)sd->fAccepting;
1324
0
        row->fLookAhead = (int16_t)sd->fLookAhead;
1325
0
        row->fTagIdx    = (int16_t)sd->fTagsIdx;
1326
0
        for (col=0; col<catCount; col++) {
1327
0
            row->fNextState[col] = (uint16_t)sd->fDtran->elementAti(col);
1328
0
        }
1329
0
    }
1330
0
}
1331
1332
1333
/**
1334
 *   Synthesize a safe state table from the main state table.
1335
 */
1336
0
void RBBITableBuilder::buildSafeReverseTable(UErrorCode &status) {
    // The safe table creation has three steps:

    // 1. Identify pairs of character classes that are "safe." Safe means that boundaries
    // following the pair do not depend on context or state before the pair. To test
    // whether a pair is safe, run it through the main forward state table, starting
    // from each state. If the final state is the same, no matter what the starting state,
    // the pair is safe.
    //
    // 2. Build a state table that recognizes the safe pairs. It's similar to the
    // forward table, with a column for each input character [class], and a row for
    // each state. Row 1 is the start state, and row 0 is the stop state. Initially
    // create an additional state for each input character category; being in
    // one of these states means that the character has been seen, and is potentially
    // the first of a pair. In each of these rows, the entry for the second character
    // of a safe pair is set to the stop state (0), indicating that a match was found.
    // All other table entries are set to the state corresponding to the current input
    // character, allowing that character to be the start of a following pair.
    //
    // Because the safe rules are to be run in reverse, moving backwards in the text,
    // the first and second pair categories are swapped when building the table.
    //
    // 3. Compress the table. There are typically many rows (states) that are
    // equivalent - that have zeroes (match completed) in the same columns -
    // and can be folded together.
    //
    // On failure, "status" is set and fSafeTable may be left null or partially populated.

    if (U_FAILURE(status)) {
        return;
    }

    // Each safe pair is stored as two UChars in the safePair string.
    UnicodeString safePairs;

    int32_t numCharClasses = fRB->fSetBuilder->getNumCharCategories();
    int32_t numStates = fDStates->size();

    for (int32_t c1=0; c1<numCharClasses; ++c1) {
        for (int32_t c2=0; c2 < numCharClasses; ++c2) {
            int32_t wantedEndState = -1;
            int32_t endState = 0;
            // State 0 is not used as a starting state.
            for (int32_t startState = 1; startState < numStates; ++startState) {
                RBBIStateDescriptor *startStateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(startState));
                int32_t s2 = startStateD->fDtran->elementAti(c1);
                RBBIStateDescriptor *s2StateD = static_cast<RBBIStateDescriptor *>(fDStates->elementAt(s2));
                endState = s2StateD->fDtran->elementAti(c2);
                if (wantedEndState < 0) {
                    // First starting state: remember where the pair took us.
                    wantedEndState = endState;
                } else {
                    if (wantedEndState != endState) {
                        // A different starting state ended elsewhere: not a safe pair.
                        break;
                    }
                }
            }
            if (wantedEndState == endState) {
                safePairs.append((char16_t)c1);
                safePairs.append((char16_t)c2);
                // printf("(%d, %d) ", c1, c2);
            }
        }
        // printf("\n");
    }

    // Populate the initial safe table.
    // The table as a whole is UVector<UnicodeString>
    // Each row is represented by a UnicodeString, being used as a Vector<int16>.
    // Row 0 is the stop state.
    // Row 1 is the start state.
    // Row 2 and beyond are other states, initially one per char class, but
    //   after initial construction, many of the states will be combined, compacting the table.
    // The String holds the nextState data only. The four leading fields of a row, fAccepting,
    // fLookAhead, etc. are not needed for the safe table, and are omitted at this stage of building.

    U_ASSERT(fSafeTable == nullptr);
    fSafeTable = new UVector(uprv_deleteUObject, uhash_compareUnicodeString, numCharClasses + 2, status);
    if (fSafeTable == nullptr) {
        if (U_SUCCESS(status)) {
            status = U_MEMORY_ALLOCATION_ERROR;
        }
        return;
    }
    if (U_FAILURE(status)) {
        return;
    }
    for (int32_t row=0; row<numCharClasses + 2; ++row) {
        UnicodeString *rowString = new UnicodeString(numCharClasses, 0, numCharClasses+4);
        if (rowString == nullptr) {
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
        const int32_t sizeBefore = fSafeTable->size();
        fSafeTable->addElement(rowString, status);
        if (fSafeTable->size() == sizeBefore) {
            // The vector did not take the row, so it is still ours to free.
            delete rowString;
            if (U_SUCCESS(status)) {
                status = U_MEMORY_ALLOCATION_ERROR;
            }
            return;
        }
        if (U_FAILURE(status)) {
            return;
        }
    }

    // From the start state, each input char class transitions to the state for that input.
    UnicodeString &startRow = *static_cast<UnicodeString *>(fSafeTable->elementAt(1));
    for (int32_t charClass=0; charClass < numCharClasses; ++charClass) {
        // Note: +2 for the start & stop state.
        startRow.setCharAt(charClass, charClass+2);
    }

    // Initially make every other state table row look like the start state row,
    for (int32_t row=2; row<numCharClasses+2; ++row) {
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(row));
        rowState = startRow;   // UnicodeString assignment, copies contents.
    }

    // Run through the safe pairs, set the next state to zero when pair has been seen.
    // Zero being the stop state, meaning we found a safe point.
    for (int32_t pairIdx=0; pairIdx<safePairs.length(); pairIdx+=2) {
        int32_t c1 = safePairs.charAt(pairIdx);
        int32_t c2 = safePairs.charAt(pairIdx + 1);

        // Swapped on purpose: the row is selected by c2 and the column by c1,
        // because the safe table is run backwards over the text.
        UnicodeString &rowState = *static_cast<UnicodeString *>(fSafeTable->elementAt(c2 + 2));
        rowState.setCharAt(c1, 0);
    }

    // Remove duplicate or redundant rows from the table.
    IntPair states = {1, 0};
    while (findDuplicateSafeState(&states)) {
        // printf("Removing duplicate safe states (%d, %d)\n", states.first, states.second);
        removeSafeState(states);
    }
}
1440
1441
1442
//-----------------------------------------------------------------------------
1443
//
1444
//   getSafeTableSize()    Calculate the size of the runtime form of this
1445
//                         safe state table.
1446
//
1447
//-----------------------------------------------------------------------------
1448
0
int32_t  RBBITableBuilder::getSafeTableSize() const {
1449
0
    int32_t    size = 0;
1450
0
    int32_t    numRows;
1451
0
    int32_t    numCols;
1452
0
    int32_t    rowSize;
1453
0
1454
0
    if (fSafeTable == nullptr) {
1455
0
        return 0;
1456
0
    }
1457
0
1458
0
    size    = offsetof(RBBIStateTable, fTableData);    // The header, with no rows to the table.
1459
0
1460
0
    numRows = fSafeTable->size();
1461
0
    numCols = fRB->fSetBuilder->getNumCharCategories();
1462
0
1463
0
    rowSize = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t)*numCols;
1464
0
    size   += numRows * rowSize;
1465
0
    return size;
1466
0
}
1467
1468
1469
//-----------------------------------------------------------------------------
1470
//
1471
//   exportSafeTable()   export the state transition table in the format required
1472
//                       by the runtime engine.  getSafeTableSize() bytes of memory
1473
//                       must be available at the output address "where".
1474
//
1475
//-----------------------------------------------------------------------------
1476
0
void RBBITableBuilder::exportSafeTable(void *where) {
1477
0
    RBBIStateTable    *table = (RBBIStateTable *)where;
1478
0
    uint32_t           state;
1479
0
    int                col;
1480
0
1481
0
    if (U_FAILURE(*fStatus) || fSafeTable == nullptr) {
1482
0
        return;
1483
0
    }
1484
0
1485
0
    int32_t catCount = fRB->fSetBuilder->getNumCharCategories();
1486
0
    if (catCount > 0x7fff ||
1487
0
            fSafeTable->size() > 0x7fff) {
1488
0
        *fStatus = U_BRK_INTERNAL_ERROR;
1489
0
        return;
1490
0
    }
1491
0
1492
0
    table->fRowLen    = offsetof(RBBIStateTableRow, fNextState) + sizeof(uint16_t) * catCount;
1493
0
    table->fNumStates = fSafeTable->size();
1494
0
    table->fFlags     = 0;
1495
0
    table->fReserved  = 0;
1496
0
1497
0
    for (state=0; state<table->fNumStates; state++) {
1498
0
        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(state);
1499
0
        RBBIStateTableRow   *row = (RBBIStateTableRow *)(table->fTableData + state*table->fRowLen);
1500
0
        row->fAccepting = 0;
1501
0
        row->fLookAhead = 0;
1502
0
        row->fTagIdx    = 0;
1503
0
        row->fReserved  = 0;
1504
0
        for (col=0; col<catCount; col++) {
1505
0
            row->fNextState[col] = rowString->charAt(col);
1506
0
        }
1507
0
    }
1508
0
}
1509
1510
1511
1512
1513
//-----------------------------------------------------------------------------
1514
//
1515
//   printSet    Debug function.   Print the contents of a UVector
1516
//
1517
//-----------------------------------------------------------------------------
1518
#ifdef RBBI_DEBUG
1519
void RBBITableBuilder::printSet(UVector *s) {
    // Print the serial number of each node in the set (-1 for a null entry) on one line.
    for (int32_t idx = 0; idx < s->size(); ++idx) {
        const RBBINode *node = static_cast<const RBBINode *>(s->elementAt(idx));
        if (node == NULL) {
            RBBIDebugPrintf("%5d", -1);
        } else {
            RBBIDebugPrintf("%5d", node->fSerialNum);
        }
    }
    RBBIDebugPrintf("\n");
}
1527
#endif
1528
1529
1530
//-----------------------------------------------------------------------------
1531
//
1532
//   printStates    Debug Function.  Dump the fully constructed state transition table.
1533
//
1534
//-----------------------------------------------------------------------------
1535
#ifdef RBBI_DEBUG
1536
void RBBITableBuilder::printStates() {
1537
    int     c;    // input "character"
1538
    int     n;    // state number
1539
1540
    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1541
    RBBIDebugPrintf("      | Acc  LA    Tag");
1542
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1543
        RBBIDebugPrintf(" %2d", c);
1544
    }
1545
    RBBIDebugPrintf("\n");
1546
    RBBIDebugPrintf("      |---------------");
1547
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1548
        RBBIDebugPrintf("---");
1549
    }
1550
    RBBIDebugPrintf("\n");
1551
1552
    for (n=0; n<fDStates->size(); n++) {
1553
        RBBIStateDescriptor *sd = (RBBIStateDescriptor *)fDStates->elementAt(n);
1554
        RBBIDebugPrintf("  %3d | " , n);
1555
        RBBIDebugPrintf("%3d %3d %5d ", sd->fAccepting, sd->fLookAhead, sd->fTagsIdx);
1556
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1557
            RBBIDebugPrintf(" %2d", sd->fDtran->elementAti(c));
1558
        }
1559
        RBBIDebugPrintf("\n");
1560
    }
1561
    RBBIDebugPrintf("\n\n");
1562
}
1563
#endif
1564
1565
1566
//-----------------------------------------------------------------------------
1567
//
1568
//   printReverseTable    Debug Function.  Dump the fully constructed safe reverse table.
1569
//
1570
//-----------------------------------------------------------------------------
1571
#ifdef RBBI_DEBUG
1572
void RBBITableBuilder::printReverseTable() {
1573
    int     c;    // input "character"
1574
    int     n;    // state number
1575
1576
    RBBIDebugPrintf("    Safe Reverse Table \n");
1577
    if (fSafeTable == nullptr) {
1578
        RBBIDebugPrintf("   --- nullptr ---\n");
1579
        return;
1580
    }
1581
    RBBIDebugPrintf("state |           i n p u t     s y m b o l s \n");
1582
    RBBIDebugPrintf("      | Acc  LA    Tag");
1583
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1584
        RBBIDebugPrintf(" %2d", c);
1585
    }
1586
    RBBIDebugPrintf("\n");
1587
    RBBIDebugPrintf("      |---------------");
1588
    for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1589
        RBBIDebugPrintf("---");
1590
    }
1591
    RBBIDebugPrintf("\n");
1592
1593
    for (n=0; n<fSafeTable->size(); n++) {
1594
        UnicodeString *rowString = (UnicodeString *)fSafeTable->elementAt(n);
1595
        RBBIDebugPrintf("  %3d | " , n);
1596
        RBBIDebugPrintf("%3d %3d %5d ", 0, 0, 0);  // Accepting, LookAhead, Tags
1597
        for (c=0; c<fRB->fSetBuilder->getNumCharCategories(); c++) {
1598
            RBBIDebugPrintf(" %2d", rowString->charAt(c));
1599
        }
1600
        RBBIDebugPrintf("\n");
1601
    }
1602
    RBBIDebugPrintf("\n\n");
1603
}
1604
#endif
1605
1606
1607
1608
//-----------------------------------------------------------------------------
1609
//
1610
//   printRuleStatusTable    Debug Function.  Dump the common rule status table
1611
//
1612
//-----------------------------------------------------------------------------
1613
#ifdef RBBI_DEBUG
1614
void RBBITableBuilder::printRuleStatusTable() {
1615
    int32_t  thisRecord = 0;
1616
    int32_t  nextRecord = 0;
1617
    int      i;
1618
    UVector  *tbl = fRB->fRuleStatusVals;
1619
1620
    RBBIDebugPrintf("index |  tags \n");
1621
    RBBIDebugPrintf("-------------------\n");
1622
1623
    while (nextRecord < tbl->size()) {
1624
        thisRecord = nextRecord;
1625
        nextRecord = thisRecord + tbl->elementAti(thisRecord) + 1;
1626
        RBBIDebugPrintf("%4d   ", thisRecord);
1627
        for (i=thisRecord+1; i<nextRecord; i++) {
1628
            RBBIDebugPrintf("  %5d", tbl->elementAti(i));
1629
        }
1630
        RBBIDebugPrintf("\n");
1631
    }
1632
    RBBIDebugPrintf("\n\n");
1633
}
1634
#endif
1635
1636
1637
//-----------------------------------------------------------------------------
1638
//
1639
//   RBBIStateDescriptor     Methods.  This is a very struct-like class
1640
//                           Most access is directly to the fields.
1641
//
1642
//-----------------------------------------------------------------------------
1643
1644
0
RBBIStateDescriptor::RBBIStateDescriptor(int lastInputSymbol, UErrorCode *fStatus) {
    // Plain fields start out cleared.
    fMarked    = FALSE;
    fAccepting = 0;
    fLookAhead = 0;
    fTagsIdx   = 0;

    // These two vectors are not allocated by the constructor.
    fTagVals   = nullptr;
    fPositions = nullptr;

    // fDtran is indexed by input symbol and holds the next state number for
    // each symbol, so it needs one slot for every symbol 0..lastInputSymbol.
    const int symbolSlots = lastInputSymbol + 1;
    fDtran = new UVector32(symbolSlots, *fStatus);
    if (U_FAILURE(*fStatus)) {
        return;
    }
    if (fDtran == nullptr) {
        *fStatus = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Pre-size the vector so that every symbol index is addressable.
    fDtran->setSize(symbolSlots);
}
1666
1667
1668
0
RBBIStateDescriptor::~RBBIStateDescriptor() {
    // Release the three owned vectors first ...
    delete fPositions;
    delete fDtran;
    delete fTagVals;

    // ... then clear the pointers so no stale addresses remain.
    fTagVals   = nullptr;
    fDtran     = nullptr;
    fPositions = nullptr;
}
1676
1677
U_NAMESPACE_END
1678
1679
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */