Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/common/rbbiscan.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  rbbiscan.cpp
5
//
6
//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the Rule Based Break Iterator Rule Builder functions for
10
//   scanning the rules and assembling a parse tree.  This is the first phase
11
//   of compiling the rules.
12
//
13
//  The overall parsing of the rules is managed by class RBBIRuleBuilder, which will
14
//  create and use an instance of this class as part of the process.
15
//
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_BREAK_ITERATION
20
21
#include "unicode/unistr.h"
22
#include "unicode/uniset.h"
23
#include "unicode/uchar.h"
24
#include "unicode/uchriter.h"
25
#include "unicode/parsepos.h"
26
#include "unicode/parseerr.h"
27
#include "cmemory.h"
28
#include "cstring.h"
29
30
#include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
31
                       //   generated by a Perl script.
32
#include "rbbirb.h"
33
#include "rbbinode.h"
34
#include "rbbiscan.h"
35
#include "rbbitblb.h"
36
37
#include "uassert.h"
38
39
//------------------------------------------------------------------------------
//
// Unicode Set init strings for each of the character classes needed for parsing a rule file.
//               Written as UTF-16 (u"...") string literals.  The code units of such a
//               literal are fixed by the language and do not depend on the platform's
//               narrow execution character set, so the arrays hold the same values on
//               ASCII and EBCDIC based machines, exactly as explicit hex initializers would.
//
//              The sets are referred to by name in the rbbirpt.txt, which is the
//              source form of the state transition table for the RBBI rule parser.
//
//------------------------------------------------------------------------------

// Characters that may appear as literals in patterns without escaping or quoting:
//   [^[\p{Z}\u0020-\u007f]-[\p{L}]-[\p{N}]]
// (The backslashes are doubled so that the pattern text itself contains "\p" and "\u".)
static const char16_t gRuleSet_rule_char_pattern[] =
    u"[^[\\p{Z}\\u0020-\\u007f]-[\\p{L}]-[\\p{N}]]";

// Characters that may appear anywhere after the first character of a name:  [_\p{L}\p{N}]
static const char16_t gRuleSet_name_char_pattern[] = u"[_\\p{L}\\p{N}]";

// Decimal digits:  [0-9]
static const char16_t gRuleSet_digit_char_pattern[] = u"[0-9]";

// Characters that may start a name:  [_\p{L}]
static const char16_t gRuleSet_name_start_char_pattern[] = u"[_\\p{L}]";

// Key under which the "match any character" set is looked up.
static const char16_t kAny[] = u"any";
71
72
73
U_CDECL_BEGIN
74
172k
// Value deleter installed on the scanner's set hash table.
// Releases one RBBISetTableEl: its key is deleted and the element storage itself
// is returned with uprv_free().
static void U_CALLCONV RBBISetTable_deleter(void *p) {
    icu::RBBISetTableEl *entry = static_cast<icu::RBBISetTableEl *>(p);
    delete entry->key;
    // Note:  entry->val is owned by the linked list "fSetsListHead" in scanner.
    //        The value nodes must not be deleted here.
    uprv_free(entry);
}
81
U_CDECL_END
82
83
U_NAMESPACE_BEGIN
84
85
//------------------------------------------------------------------------------
86
//
87
//  Constructor.
88
//
89
//------------------------------------------------------------------------------
90
RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
{
    // Put every field the destructor looks at into a safe state first.
    // The incoming status is deliberately not examined until that is done,
    // so that the destructor can always run cleanly.
    fRB                 = rb;

    // Owned resources, released by the destructor.
    fSymbolTable        = nullptr;
    fSetTable           = nullptr;

    // Parse state machine and its stacks.
    fStateTable         = nullptr;
    fStackPtr           = 0;
    fStack[0]           = 0;
    fNodeStackPtr       = 0;
    fNodeStack[0]       = nullptr;

    // Position within the rule source.
    fScanIndex          = 0;
    fNextIndex          = 0;
    fLineNum            = 1;
    fCharNum            = 0;
    fLastChar           = 0;
    fQuoteMode          = false;

    // Per-rule flags and counters.
    fReverseRule        = false;
    fLookAheadRule      = false;
    fNoChainInRule      = false;
    fRuleNum            = 0;
    fOptionStart        = 0;

    UErrorCode &status = *rb->fStatus;
    if (U_FAILURE(status)) {
        return;
    }

    //
    //  Build the constant Unicode Sets used by the rule parser.
    //     Note:  These could be made static, lazily initialized, and shared among
    //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,
    //            and the time to build these few sets should be small compared to a
    //            full break iterator build.
    fRuleSets[kRuleSet_rule_char-128] =
        UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), status);

    // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
    fRuleSets[kRuleSet_white_space-128]
        .add(9, 0xd)
        .add(0x20)
        .add(0x85)
        .add(0x200e, 0x200f)
        .add(0x2028, 0x2029);

    fRuleSets[kRuleSet_name_char-128] =
        UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), status);
    fRuleSets[kRuleSet_name_start_char-128] =
        UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), status);
    fRuleSets[kRuleSet_digit_char-128] =
        UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), status);

    if (status == U_ILLEGAL_ARGUMENT_ERROR) {
        // This case happens if ICU's data is missing.  UnicodeSet tries to look up property
        //   names from the init string, can't find them, and claims an illegal argument.
        //   Change the error so that the actual problem will be clearer to users.
        status = U_BRK_INIT_ERROR;
    }
    if (U_FAILURE(status)) {
        return;
    }

    fSymbolTable = new RBBISymbolTable(this, rb->fRules, status);
    if (fSymbolTable == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, &status);
    if (U_FAILURE(status)) {
        return;
    }
    uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
}
159
160
161
162
//------------------------------------------------------------------------------
163
//
164
//  Destructor
165
//
166
//------------------------------------------------------------------------------
167
8.25k
RBBIRuleScanner::~RBBIRuleScanner() {
    delete fSymbolTable;

    if (fSetTable != nullptr) {
        uhash_close(fSetTable);
        fSetTable = nullptr;
    }

    // Node Stack.
    //   Normally has one entry, which is the entire parse tree for the rules.
    //   If errors occurred, there may be additional subtrees left on the stack.
    //   Slot 0 is never deleted here; entries are released from the top down.
    for (; fNodeStackPtr > 0; fNodeStackPtr--) {
        delete fNodeStack[fNodeStackPtr];
    }
}
185
186
//------------------------------------------------------------------------------
187
//
188
//  doParseActions       Do some action during rule parsing.
189
//                       Called by the parse state machine.
190
//                       Actions build the parse tree and Unicode Sets,
191
//                       and maintain the parse stack for nested expressions.
192
//
193
//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
194
//                              They represent exactly the same thing.  They're separate
195
//                              only to work around enum forward declaration restrictions
196
//                              in some compilers, while at the same time avoiding multiple
197
//                              definitions problems.  I'm sure that there's a better way.
198
//
199
//------------------------------------------------------------------------------
200
UBool RBBIRuleScanner::doParseActions(int32_t action)
{
    // Result handed back to the parse state machine.  Individual actions clear it
    // to stop the parse; a failing builder status also makes the final result false.
    UBool keepParsing = true;

    // Pops the operand on top of the node stack, pushes a new operator node of the
    // requested type in its place, and links the operand in as the operator's left child.
    // If the status is failing after the push, the nodes are left unlinked.
    auto adoptTopAsLeftChild = [this](decltype(RBBINode::opOr) opType) {
        RBBINode *operand = fNodeStack[fNodeStackPtr--];
        RBBINode *opNode  = pushNewNode(opType);
        if (U_FAILURE(*fRB->fStatus)) {
            return;
        }
        opNode->fLeftChild = operand;
        operand->fParent   = opNode;
    };

    // Records [fScanIndex, fNextIndex) as the node's source span and copies that
    // piece of the rule text into the node.
    auto captureCurrentText = [this](RBBINode *node) {
        node->fFirstPos = fScanIndex;
        node->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(node->fFirstPos, node->fLastPos, node->fText);
    };

    switch (action) {

    case doExprStart:
        pushNewNode(RBBINode::opStart);
        fRuleNum++;
        break;

    case doNoChain:
        // Scanned a '^' while on the rule start state.
        fNoChainInRule = true;
        break;

    case doExprOrOperator:
        fixOpStack(RBBINode::precOpCat);
        adoptTopAsLeftChild(RBBINode::opOr);
        break;

    case doExprCatOperator:
        // Concatenation operator.
        // For the implicit concatenation of adjacent terms in an expression that are
        //   not separated by any other operator.  Action is invoked between the
        //   actions for the two terms.
        fixOpStack(RBBINode::precOpCat);
        adoptTopAsLeftChild(RBBINode::opCat);
        break;

    case doLParen:
        // Open Paren.
        //   The openParen node is a dummy operation type with a low precedence,
        //     which has the effect of ensuring that any real binary op that
        //     follows within the parens binds more tightly to the operands than
        //     stuff outside of the parens.
        pushNewNode(RBBINode::opLParen);
        break;

    case doExprRParen:
        fixOpStack(RBBINode::precLParen);
        break;

    case doNOP:
        break;

    case doStartAssign:
        {
            // We've just scanned "$variable = "
            // The top of the node stack has the $variable ref node.

            // Save the start position of the RHS text in the StartExpression node
            //   that precedes the $variableReference node on the stack.
            //   This will eventually be used when saving the full $variable replacement
            //   text as a string.
            RBBINode *exprStart = fNodeStack[fNodeStackPtr-1];
            exprStart->fFirstPos = fNextIndex;              // move past the '='

            // Push a new start-of-expression node; needed to keep parse of the
            //   RHS expression happy.
            pushNewNode(RBBINode::opStart);
        }
        break;

    case doEndAssign:
        {
            // We have reached the end of an assignment statement.
            //   Current scan char is the ';' that terminates the assignment.

            // Terminate expression, leaves expression parse tree rooted in TOS node.
            fixOpStack(RBBINode::precStart);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }

            RBBINode *exprStart = fNodeStack[fNodeStackPtr-2];
            RBBINode *variable  = fNodeStack[fNodeStackPtr-1];
            RBBINode *rhs       = fNodeStack[fNodeStackPtr];

            // Save original text of right side of assignment, excluding the terminating ';'
            //  in the root of the node for the right-hand-side expression.
            rhs->fFirstPos = exprStart->fFirstPos;
            rhs->fLastPos  = fScanIndex;
            fRB->fRules.extractBetween(rhs->fFirstPos, rhs->fLastPos, rhs->fText);

            // Expression parse tree becomes l. child of the $variable reference node.
            variable->fLeftChild = rhs;
            rhs->fParent         = variable;

            // Make a symbol table entry for the $variableRef node.
            fSymbolTable->addEntry(variable->fText, variable, *fRB->fStatus);
            if (U_FAILURE(*fRB->fStatus)) {
                // This is a round-about way to get the parse position set
                //  so that duplicate symbols error messages include a line number.
                const UErrorCode addFailure = *fRB->fStatus;
                *fRB->fStatus = U_ZERO_ERROR;
                error(addFailure);
                // When adding $variableRef to the symbol table fails, delete
                // both nodes because deleting the variable node will not delete
                // the right-hand-side node internally.
                delete rhs;
                delete variable;
            }

            // Clean up the stack.
            delete exprStart;
            fNodeStackPtr -= 3;
        }
        break;

    case doEndOfRule:
        {
            // Terminate expression, leaves expression parse tree rooted in TOS node.
            fixOpStack(RBBINode::precStart);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
#ifdef RBBI_DEBUG
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
#endif
            U_ASSERT(fNodeStackPtr == 1);
            RBBINode *ruleRoot = fNodeStack[fNodeStackPtr];

            // If this rule includes a look-ahead '/', add a endMark node to the
            //   expression tree.
            if (fLookAheadRule) {
                RBBINode *endNode = pushNewNode(RBBINode::endMark);
                RBBINode *catNode = pushNewNode(RBBINode::opCat);
                if (U_FAILURE(*fRB->fStatus)) {
                    break;
                }
                fNodeStackPtr -= 2;
                catNode->fLeftChild       = ruleRoot;
                catNode->fRightChild      = endNode;
                fNodeStack[fNodeStackPtr] = catNode;
                endNode->fVal             = fRuleNum;
                endNode->fLookAheadEnd    = true;
                ruleRoot                  = catNode;

                // TODO: Disable chaining out of look-ahead (hard break) rules.
                //   The break on rule match is forced, so there is no point in building up
                //   the state table to chain into another rule for a longer match.
            }

            // Mark this node as being the root of a rule.
            ruleRoot->fRuleRoot = true;

            // Flag if chaining into this rule is wanted:
            //   rule chaining is enabled globally via !!chain
            //   and no '^' chain-in inhibit was on this rule.
            if (fRB->fChainRules && !fNoChainInRule) {
                ruleRoot->fChainIn = true;
            }

            // All rule expressions are ORed together.
            // The ';' that terminates an expression really just functions as a '|' with
            //   a low operator precedence.
            //
            // Each of the four sets of rules are collected separately.
            //  (forward, reverse, safe_forward, safe_reverse)
            //  OR this rule into the appropriate group of them.
            //
            RBBINode **destRules = fRB->fDefaultTree;
            if (fReverseRule) {
                destRules = &fRB->fSafeRevTree;
            }

            if (*destRules == nullptr) {
                // This is the first rule encountered (for this direction).
                // Just move its parse tree from the stack to *destRules.
                *destRules = fNodeStack[fNodeStackPtr];
            } else {
                // This is not the first rule encountered.
                // OR previous stuff  (from *destRules)
                // with the current rule expression (on the Node Stack)
                //  with the resulting OR expression going to *destRules
                //
                ruleRoot               = fNodeStack[fNodeStackPtr];
                RBBINode *earlierRules = *destRules;
                RBBINode *orNode       = pushNewNode(RBBINode::opOr);
                if (U_FAILURE(*fRB->fStatus)) {
                    break;
                }
                orNode->fLeftChild    = earlierRules;
                earlierRules->fParent = orNode;
                orNode->fRightChild   = ruleRoot;
                ruleRoot->fParent     = orNode;
                *destRules            = orNode;
            }

            // Reset the per-rule state in preparation for the next rule.
            fReverseRule   = false;
            fLookAheadRule = false;
            fNoChainInRule = false;
            fNodeStackPtr  = 0;
        }
        break;

    case doRuleError:
        error(U_BRK_RULE_SYNTAX);
        keepParsing = false;
        break;

    case doVariableNameExpectedErr:
        error(U_BRK_RULE_SYNTAX);
        break;

    //
    //  Unary operands  + ? *
    //    These all appear after the operand to which they apply.
    //    When we hit one, the operand (may be a whole sub expression)
    //    will be on the top of the stack.
    //    Unary Operator becomes TOS, with the old TOS as its one child.
    case doUnaryOpPlus:
        adoptTopAsLeftChild(RBBINode::opPlus);
        break;

    case doUnaryOpQuestion:
        adoptTopAsLeftChild(RBBINode::opQuestion);
        break;

    case doUnaryOpStar:
        adoptTopAsLeftChild(RBBINode::opStar);
        break;

    case doRuleChar:
        // A "Rule Character" is any single character that is a literal part
        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
        // These are pretty uncommon in break rules; the terms are more commonly
        //  sets.  To keep things uniform, treat these characters as
        // sets that just happen to contain only one character.
        {
            RBBINode *charRef = pushNewNode(RBBINode::setRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            findSetFor(UnicodeString(fC.fChar), charRef);
            captureCurrentText(charRef);
        }
        break;

    case doDotAny:
        // Scanned a ".", meaning match any single character.
        {
            RBBINode *anyRef = pushNewNode(RBBINode::setRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            findSetFor(UnicodeString(true, kAny, 3), anyRef);
            captureCurrentText(anyRef);
        }
        break;

    case doSlash:
        // Scanned a '/', which identifies a look-ahead break position in a rule.
        {
            RBBINode *lookAheadNode = pushNewNode(RBBINode::lookAhead);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            lookAheadNode->fVal = fRuleNum;
            captureCurrentText(lookAheadNode);
            fLookAheadRule = true;
        }
        break;

    case doStartTagValue:
        // Scanned a '{', the opening delimiter for a tag value within a rule.
        {
            RBBINode *tagNode = pushNewNode(RBBINode::tag);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            tagNode->fVal      = 0;
            tagNode->fFirstPos = fScanIndex;
            tagNode->fLastPos  = fNextIndex;
        }
        break;

    case doTagDigit:
        // Just scanned a decimal digit that's part of a tag value.
        {
            RBBINode *tagNode = fNodeStack[fNodeStackPtr];
            uint32_t digit = u_charDigitValue(fC.fChar);
            U_ASSERT(digit < 10);
            // Accumulate in 64 bits so that a value too large for fVal is
            // reported as a syntax error instead of overflowing.
            int64_t widened = static_cast<int64_t>(tagNode->fVal)*10 + digit;
            if (widened > INT32_MAX) {
                error(U_BRK_RULE_SYNTAX);
                break;
            }
            tagNode->fVal = static_cast<int32_t>(widened);
        }
        break;

    case doTagValue:
        {
            RBBINode *tagNode = fNodeStack[fNodeStackPtr];
            tagNode->fLastPos = fNextIndex;
            fRB->fRules.extractBetween(tagNode->fFirstPos, tagNode->fLastPos, tagNode->fText);
        }
        break;

    case doTagExpectedError:
        error(U_BRK_MALFORMED_RULE_TAG);
        keepParsing = false;
        break;

    case doOptionStart:
        // Scanning a !!option.   At the start of string.
        fOptionStart = fScanIndex;
        break;

    case doOptionEnd:
        {
            // The option name is the rule text from the saved start up to the scan position.
            UnicodeString optionName(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
            if (optionName == UNICODE_STRING("chain", 5)) {
                fRB->fChainRules = true;
            } else if (optionName == UNICODE_STRING("forward", 7)) {
                fRB->fDefaultTree   = &fRB->fForwardTree;
            } else if (optionName == UNICODE_STRING("reverse", 7)) {
                fRB->fDefaultTree   = &fRB->fReverseTree;
            } else if (optionName == UNICODE_STRING("safe_forward", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
            } else if (optionName == UNICODE_STRING("safe_reverse", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeRevTree;
            } else if (optionName == UNICODE_STRING("lookAheadHardBreak", 18)) {
                fRB->fLookAheadHardBreak = true;
            } else if (optionName == UNICODE_STRING("quoted_literals_only", 20)) {
                fRuleSets[kRuleSet_rule_char-128].clear();
            } else if (optionName == UNICODE_STRING("unquoted_literals",  17)) {
                fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
            } else {
                error(U_BRK_UNRECOGNIZED_OPTION);
            }
        }
        break;

    case doReverseDir:
        fReverseRule = true;
        break;

    case doStartVariableName:
        {
            RBBINode *varNode = pushNewNode(RBBINode::varRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            varNode->fFirstPos = fScanIndex;
        }
        break;

    case doEndVariableName:
        {
            RBBINode *varNode = fNodeStack[fNodeStackPtr];
            const bool isVarRef = (varNode != nullptr && varNode->fType == RBBINode::varRef);
            if (!isVarRef) {
                error(U_BRK_INTERNAL_ERROR);
                break;
            }
            varNode->fLastPos = fScanIndex;
            // The extracted name starts one position after fFirstPos.
            fRB->fRules.extractBetween(varNode->fFirstPos+1, varNode->fLastPos, varNode->fText);
            // Look the newly scanned name up in the symbol table
            //   If there's an entry, set the l. child of the var ref to the replacement expression.
            //   (We also pass through here when scanning assignments, but no harm is done, other
            //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
            varNode->fLeftChild = fSymbolTable->lookupNode(varNode->fText);
        }
        break;

    case doCheckVarDef:
        {
            RBBINode *varNode = fNodeStack[fNodeStackPtr];
            if (varNode->fLeftChild == nullptr) {
                error(U_BRK_UNDEFINED_VARIABLE);
                keepParsing = false;
            }
        }
        break;

    case doExprFinished:
        break;

    case doRuleErrorAssignExpr:
        error(U_BRK_ASSIGN_ERROR);
        keepParsing = false;
        break;

    case doExit:
        keepParsing = false;
        break;

    case doScanUnicodeSet:
        scanSet();
        break;

    default:
        error(U_BRK_INTERNAL_ERROR);
        keepParsing = false;
        break;
    }

    return keepParsing && U_SUCCESS(*fRB->fStatus);
}
642
643
644
645
646
//------------------------------------------------------------------------------
647
//
648
//  Error         Report a rule parse error.
649
//                Only report it if no previous error has been recorded.
650
//
651
//------------------------------------------------------------------------------
652
5.59k
void RBBIRuleScanner::error(UErrorCode e) {
    // Only the first error is kept; once the builder status is failing,
    // later reports leave it (and the parse error details) untouched.
    if (U_FAILURE(*fRB->fStatus)) {
        return;
    }
    *fRB->fStatus = e;

    // The parse error structure is optional.
    if (!fRB->fParseError) {
        return;
    }
    // Report the current line and character position; both context strings are left empty.
    fRB->fParseError->line           = fLineNum;
    fRB->fParseError->offset         = fCharNum;
    fRB->fParseError->preContext[0]  = 0;
    fRB->fParseError->postContext[0] = 0;
}
663
664
665
666
667
//------------------------------------------------------------------------------
668
//
669
//  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
670
//               An entry on the stack may be as small as a single setRef node,
671
//               or as large as the parse tree
672
//               for an entire expression (this will be the one item left on the stack
673
//               when the parsing of an RBBI rule completes).
674
//
675
//               This function is called when a binary operator is encountered.
676
//               It looks back up the stack for operators that are not yet associated
677
//               with a right operand, and if the precedence of the stacked operator >=
678
//               the precedence of the current operator, binds the operand left,
679
//               to the previously encountered operator.
680
//
681
//------------------------------------------------------------------------------
682
4.88M
void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
    // The operator node sitting just below the top-of-stack operand.
    RBBINode *stackedOp = nullptr;

    // printNodeStack("entering fixOpStack()");
    while (true) {
        stackedOp = fNodeStack[fNodeStackPtr-1];
        if (stackedOp->fPrecedence == 0) {
            RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
            error(U_BRK_INTERNAL_ERROR);
            return;
        }

        // The stacked operator takes the TOS operand only when its precedence is at
        //   least that of the current operator and it is above the paren level.
        //   Otherwise the most recent operand goes with the current operator,
        //   not with the previously stacked one.
        const bool bindsOperand = stackedOp->fPrecedence >= p &&
                                  stackedOp->fPrecedence > RBBINode::precLParen;
        if (!bindsOperand) {
            break;
        }

        // Stack operator is a binary op  ( '|' or concatenation)
        //   TOS operand becomes right child of this operator.
        //   Resulting subexpression becomes the TOS operand.
        RBBINode *operand = fNodeStack[fNodeStackPtr];
        stackedOp->fRightChild = operand;
        operand->fParent       = stackedOp;
        fNodeStackPtr--;
        // printNodeStack("looping in fixOpStack()   ");
    }

    if (p > RBBINode::precLParen) {
        // An ordinary operator: nothing to discard from the stack.
        // printNodeStack("leaving fixOpStack()");
        return;
    }

    // Scan is at a right paren or end of expression.
    //  The scanned item must match the stack, or else there was an error.
    //  Discard the left paren (or start expr) node from the stack,
    //  leaving the completed (sub)expression as TOS.
    if (stackedOp->fPrecedence != p) {
        // Right paren encountered matched start of expression node, or
        // end of expression matched with a left paren node.
        error(U_BRK_MISMATCHED_PAREN);
    }
    fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
    fNodeStackPtr--;
    // Delete the now-discarded LParen or Start node.
    delete stackedOp;
    // printNodeStack("leaving fixOpStack()");
}
724
725
726
727
728
//------------------------------------------------------------------------------
729
//
730
//   findSetFor    given a UnicodeString,
731
//                  - find the corresponding Unicode Set  (uset node)
732
//                         (create one if necessary)
733
//                  - Set fLeftChild of the caller's node (should be a setRef node)
734
//                         to the uset node
735
//                 Maintain a hash table of uset nodes, so the same one is always used
736
//                    for the same string.
737
//                 If a "to adopt" set is provided and we haven't seen this key before,
738
//                    add the provided set to the hash table.
739
//                 If the string is one (32 bit) char in length, the set contains
740
//                    just one element which is the char in question.
741
//                 If the string is "any", return a set containing all chars.
742
//
743
//------------------------------------------------------------------------------
744
4.87M
void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
745
746
4.87M
    RBBISetTableEl   *el;
747
748
    // First check whether we've already cached a set for this string.
749
    // If so, just use the cached set in the new node.
750
    //   delete any set provided by the caller, since we own it.
751
4.87M
    el = static_cast<RBBISetTableEl*>(uhash_get(fSetTable, &s));
752
4.87M
    if (el != nullptr) {
753
4.69M
        delete setToAdopt;
754
4.69M
        node->fLeftChild = el->val;
755
4.69M
        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
756
4.69M
        return;
757
4.69M
    }
758
759
    // Haven't seen this set before.
760
    // If the caller didn't provide us with a prebuilt set,
761
    //   create a new UnicodeSet now.
762
172k
    if (setToAdopt == nullptr) {
763
167k
        if (s.compare(kAny, -1) == 0) {
764
981
            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
765
166k
        } else {
766
166k
            UChar32 c;
767
166k
            c = s.char32At(0);
768
166k
            setToAdopt = new UnicodeSet(c, c);
769
166k
        }
770
167k
        if (setToAdopt == nullptr) {
771
0
            error(U_MEMORY_ALLOCATION_ERROR);
772
0
            return;
773
0
        }
774
167k
    }
775
776
    //
777
    // Make a new uset node to refer to this UnicodeSet
778
    // This new uset node becomes the child of the caller's setReference node.
779
    //
780
172k
    UErrorCode localStatus = U_ZERO_ERROR;
781
172k
    RBBINode *usetNode    = new RBBINode(RBBINode::uset, localStatus);
782
172k
    if (usetNode == nullptr) {
783
0
        localStatus = U_MEMORY_ALLOCATION_ERROR;
784
0
    }
785
172k
    if (U_FAILURE(localStatus)) {
786
0
        delete usetNode;
787
0
        error(localStatus);
788
0
        delete setToAdopt;
789
0
        return;
790
0
    }
791
172k
    usetNode->fInputSet   = setToAdopt;
792
172k
    usetNode->fParent     = node;
793
172k
    node->fLeftChild      = usetNode;
794
172k
    usetNode->fText = s;
795
796
797
    //
798
    // Add the new uset node to the list of all uset nodes.
799
    //
800
172k
    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
801
802
803
    //
804
    // Add the new set to the set hash table.
805
    //
806
172k
    el = static_cast<RBBISetTableEl*>(uprv_malloc(sizeof(RBBISetTableEl)));
807
172k
    UnicodeString *tkey = new UnicodeString(s);
808
172k
    if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) {
809
        // Delete to avoid memory leak
810
0
        delete tkey;
811
0
        tkey = nullptr;
812
0
        uprv_free(el);
813
0
        el = nullptr;
814
0
        delete setToAdopt;
815
0
        setToAdopt = nullptr;
816
817
0
        error(U_MEMORY_ALLOCATION_ERROR);
818
0
        return;
819
0
    }
820
172k
    el->key = tkey;
821
172k
    el->val = usetNode;
822
172k
    uhash_put(fSetTable, el->key, el, fRB->fStatus);
823
172k
}
824
825
826
827
//
828
//  Assorted Unicode character constants.
829
//     Numeric because there is no portable way to enter them as literals.
830
//     (Think EBCDIC).
831
//
832
// Line terminators (these end comments and bump the line counter).
static constexpr char16_t chCR        = 0x000D;   // Carriage Return
static constexpr char16_t chLF        = 0x000A;   // Line Feed
static constexpr char16_t chNEL       = 0x0085;   // NEL, Next Line
static constexpr char16_t chLS        = 0x2028;   // Unicode Line Separator
// Rule syntax characters.
static constexpr char16_t chApos      = 0x0027;   // single quote, for quoted chars
static constexpr char16_t chPound     = 0x0023;   // '#', introduces a comment
static constexpr char16_t chBackSlash = 0x005C;   // '\', introduces a char escape
static constexpr char16_t chLParen    = 0x0028;   // '('
static constexpr char16_t chRParen    = 0x0029;   // ')'
841
842
843
//------------------------------------------------------------------------------
844
//
845
//  stripRules    Return a rules string without extra spaces.
846
//                (Comments are removed separately, during rule parsing.)
847
//
848
//------------------------------------------------------------------------------
849
2.61k
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
    // Walk the input one code point at a time and copy across everything that
    // does not have the Pattern_White_Space binary property.
    UnicodeString result;
    const int32_t limit = rules.length();

    int32_t pos = 0;
    while (pos < limit) {
        const UChar32 c = rules.char32At(pos);
        if (!u_hasBinaryProperty(c, UCHAR_PATTERN_WHITE_SPACE)) {
            result.append(c);
        }
        pos = rules.moveIndex32(pos, 1);
    }
    return result;
}
863
864
865
//------------------------------------------------------------------------------
866
//
867
//  nextCharLL    Low Level Next Char from rule input source.
868
//                Get a char from the input character iterator,
869
//                keep track of input position for error reporting.
870
//
871
//------------------------------------------------------------------------------
872
18.0M
UChar32  RBBIRuleScanner::nextCharLL() {
    // End of the rule text: report -1 without moving the input position.
    if (fNextIndex >= fRB->fRules.length()) {
        return static_cast<UChar32>(-1);
    }

    const UChar32 ch = fRB->fRules.char32At(fNextIndex);
    if (U_IS_SURROGATE(ch)) {
        // A surrogate code point in the rules is an error; the input position
        // is left where it was.
        error(U_ILLEGAL_CHAR_FOUND);
        return U_SENTINEL;
    }
    fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);

    // CR, NEL and LS always begin a new line. LF does too, unless it is the
    // second half of a CR LF pair (the CR has already been counted).
    const bool startsNewLine = (ch == chCR) ||
                               (ch == chNEL) ||
                               (ch == chLS) ||
                               (ch == chLF && fLastChar != chCR);
    if (startsNewLine) {
        // Bump the line number and reset the column to 0.
        fLineNum++;
        fCharNum = 0;
        if (fQuoteMode) {
            // A quoted string may not run across a line boundary.
            error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
            fQuoteMode = false;
        }
    } else if (ch != chLF) {
        // Ordinary character: advance the column. (An LF that follows a CR
        // changes neither the line nor the column.)
        fCharNum++;
    }

    fLastChar = ch;
    return ch;
}
908
909
910
//------------------------------------------------------------------------------
911
//
912
//   nextChar     for rules scanning.  At this level, we handle stripping
913
//                out comments and processing backslash character escapes.
914
//                The rest of the rules grammar is handled at the next level up.
915
//
916
//------------------------------------------------------------------------------
917
5.75M
void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
    // Fetches the next logical rule character into c:
    //   c.fChar     the character, or -1 at end of input.
    //   c.fEscaped  true if the character was quoted or backslash escaped.
    // fScanIndex is set to the input index at which this character starts.

    fScanIndex = fNextIndex;
    c.fChar    = nextCharLL();
    c.fEscaped = false;

    //
    //  check for '' sequence.
    //  These are recognized in all contexts, whether in quoted text or not.
    //
    if (c.fChar == chApos) {
        if (fRB->fRules.char32At(fNextIndex) == chApos) {
            c.fChar    = nextCharLL();        // get nextChar officially so character counts
            c.fEscaped = true;                //   stay correct.
        }
        else
        {
            // Single quote, by itself.
            //   Toggle quoting mode.
            //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
            fQuoteMode = !fQuoteMode;
            if (fQuoteMode) {
                c.fChar = chLParen;
            } else {
                c.fChar = chRParen;
            }
            c.fEscaped = false;      // The paren that we return is not escaped.
            return;
        }
    }

    if (c.fChar == static_cast<UChar32>(-1)) {
        return;
    }
    if (fQuoteMode) {
        // Everything inside a quoted region is treated as escaped.
        c.fEscaped = true;
    }
    else
    {
        // We are not in a 'quoted region' of the source.
        //
        if (c.fChar == chPound) {
            // Start of a comment.  Consume the rest of it.
            //  The new-line char that terminates the comment is always returned.
            //  It will be treated as white-space, and serves to break up anything
            //    that might otherwise incorrectly clump together with a comment in
            //    the middle (a variable name, for example.)
            int32_t commentStart = fScanIndex;
            for (;;) {
                c.fChar = nextCharLL();
                if (c.fChar == static_cast<UChar32>(-1) || // EOF
                    c.fChar == chCR     ||
                    c.fChar == chLF     ||
                    c.fChar == chNEL    ||
                    c.fChar == chLS)       {break;}
            }
            // Blank out the comment text in the stripped copy of the rules.
            //   When a new-line ended the comment, fNextIndex is just past that
            //   new-line, which must be preserved, so stop one short of it.
            //   When the scan stopped with -1 instead, nextCharLL() did not
            //   advance fNextIndex, so everything up to fNextIndex is comment
            //   text; stopping one short would leave the final comment
            //   character behind in the stripped rules.
            int32_t commentLimit = fNextIndex - 1;
            if (c.fChar == static_cast<UChar32>(-1)) {
                commentLimit = fNextIndex;
            }
            for (int32_t i=commentStart; i<commentLimit; ++i) {
                fRB->fStrippedRules.setCharAt(i, u' ');
            }
        }
        if (c.fChar == static_cast<UChar32>(-1)) {
            return;
        }

        //
        //  check for backslash escaped characters.
        //  Use UnicodeString::unescapeAt() to handle them.
        //
        if (c.fChar == chBackSlash) {
            c.fEscaped = true;
            int32_t startX = fNextIndex;
            c.fChar = fRB->fRules.unescapeAt(fNextIndex);
            if (fNextIndex == startX) {
                // unescapeAt() consumed nothing: the escape sequence is malformed.
                error(U_BRK_HEX_DIGITS_EXPECTED);
            }
            // Keep the column count in step with the characters consumed by the escape.
            fCharNum += fNextIndex-startX;
        }
    }
    // putc(c.fChar, stdout);
}
1000
1001
//------------------------------------------------------------------------------
1002
//
1003
//  Parse RBBI rules.   The state machine for rules parsing is here.
1004
//                      The state tables are hand-written in the file rbbirpt.txt,
1005
//                      and converted to the form used here by a perl
1006
//                      script rbbicst.pl
1007
//
1008
//------------------------------------------------------------------------------
1009
8.25k
void RBBIRuleScanner::parse() {
1010
8.25k
    uint16_t                state;
1011
8.25k
    const RBBIRuleTableEl  *tableEl;
1012
1013
8.25k
    if (U_FAILURE(*fRB->fStatus)) {
1014
0
        return;
1015
0
    }
1016
1017
8.25k
    state = 1;
1018
8.25k
    nextChar(fC);
1019
    //
1020
    // Main loop for the rule parsing state machine.
1021
    //   Runs once per state transition.
1022
    //   Each time through optionally performs, depending on the state table,
1023
    //      - an advance to the next input char
1024
    //      - an action to be performed.
1025
    //      - pushing or popping a state to/from the local state return stack.
1026
    //
1027
15.5M
    for (;;) {
1028
        //  Bail out if anything has gone wrong.
1029
        //  RBBI rule file parsing stops on the first error encountered.
1030
15.5M
        if (U_FAILURE(*fRB->fStatus)) {
1031
149
            break;
1032
149
        }
1033
1034
        // Quit if state == 0.  This is the normal way to exit the state machine.
1035
        //
1036
15.5M
        if (state == 0) {
1037
2.76k
            break;
1038
2.76k
        }
1039
1040
        // Find the state table element that matches the input char from the rule, or the
1041
        //    class of the input character.  Start with the first table row for this
1042
        //    state, then linearly scan forward until we find a row that matches the
1043
        //    character.  The last row for each state always matches all characters, so
1044
        //    the search will stop there, if not before.
1045
        //
1046
15.5M
        tableEl = &gRuleParseStateTable[state];
1047
        #ifdef RBBI_DEBUG
1048
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
1049
                RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
1050
                    fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
1051
            }
1052
        #endif
1053
1054
55.2M
        for (;;) {
1055
            #ifdef RBBI_DEBUG
1056
                if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
1057
            #endif
1058
55.2M
            if (tableEl->fCharClass < 127 && fC.fEscaped == false &&   tableEl->fCharClass == fC.fChar) {
1059
                // Table row specified an individual character, not a set, and
1060
                //   the input character is not escaped, and
1061
                //   the input character matched it.
1062
170k
                break;
1063
170k
            }
1064
55.0M
            if (tableEl->fCharClass == 255) {
1065
                // Table row specified default, match anything character class.
1066
4.95M
                break;
1067
4.95M
            }
1068
50.1M
            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
1069
                // Table row specified "escaped" and the char was escaped.
1070
121k
                break;
1071
121k
            }
1072
50.0M
            if (tableEl->fCharClass == 253 && fC.fEscaped &&
1073
0
                (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
1074
                // Table row specified "escaped P" and the char is either 'p' or 'P'.
1075
0
                break;
1076
0
            }
1077
50.0M
            if (tableEl->fCharClass == 252 && fC.fChar == static_cast<UChar32>(-1)) {
1078
                // Table row specified eof and we hit eof on the input.
1079
2.77k
                break;
1080
2.77k
            }
1081
1082
50.0M
            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
1083
25.0M
                fC.fEscaped == false &&                                      //   char is not escaped &&
1084
24.9M
                fC.fChar != static_cast<UChar32>(-1)) {                      //   char is not EOF
1085
24.9M
                U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
1086
24.9M
                if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
1087
                    // Table row specified a character class, or set of characters,
1088
                    //   and the current char matches it.
1089
10.3M
                    break;
1090
10.3M
                }
1091
24.9M
            }
1092
1093
            // No match on this row, advance to the next  row for this state,
1094
39.6M
            tableEl++;
1095
39.6M
        }
1096
15.5M
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1097
1098
        //
1099
        // We've found the row of the state table that matches the current input
1100
        //   character from the rules string.
1101
        // Perform any action specified  by this row in the state table.
1102
15.5M
        if (doParseActions(static_cast<int32_t>(tableEl->fAction)) == false) {
1103
            // Break out of the state machine loop if the
1104
            //   the action signalled some kind of error, or
1105
            //   the action was to exit, occurs on normal end-of-rules-input.
1106
5.33k
            break;
1107
5.33k
        }
1108
1109
15.5M
        if (tableEl->fPushState != 0) {
1110
63.7k
            fStackPtr++;
1111
63.7k
            if (fStackPtr >= kStackSize) {
1112
1
                error(U_BRK_INTERNAL_ERROR);
1113
1
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1114
1
                fStackPtr--;
1115
1
            }
1116
63.7k
            fStack[fStackPtr] = tableEl->fPushState;
1117
63.7k
        }
1118
1119
15.5M
        if (tableEl->fNextChar) {
1120
5.75M
            nextChar(fC);
1121
5.75M
        }
1122
1123
        // Get the next state from the table entry, or from the
1124
        //   state stack if the next state was specified as "pop".
1125
15.5M
        if (tableEl->fNextState != 255) {
1126
15.5M
            state = tableEl->fNextState;
1127
15.5M
        } else {
1128
54.3k
            state = fStack[fStackPtr];
1129
54.3k
            fStackPtr--;
1130
54.3k
            if (fStackPtr < 0) {
1131
0
                error(U_BRK_INTERNAL_ERROR);
1132
0
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1133
0
                fStackPtr++;
1134
0
            }
1135
54.3k
        }
1136
1137
15.5M
    }
1138
1139
8.25k
    if (U_FAILURE(*fRB->fStatus)) {
1140
5.48k
        return;
1141
5.48k
    }
1142
    
1143
    // If there are no forward rules set an error.
1144
    //
1145
2.76k
    if (fRB->fForwardTree == nullptr) {
1146
107
        error(U_BRK_RULE_SYNTAX);
1147
107
        return;
1148
107
    }
1149
1150
    //
1151
    // Parsing of the input RBBI rules is complete.
1152
    // We now have a parse tree for the rule expressions
1153
    // and a list of all UnicodeSets that are referenced.
1154
    //
1155
#ifdef RBBI_DEBUG
1156
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1157
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
1158
        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1159
        RBBINode::printTree(fRB->fForwardTree, true);
1160
        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1161
        RBBINode::printTree(fRB->fReverseTree, true);
1162
        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1163
        RBBINode::printTree(fRB->fSafeFwdTree, true);
1164
        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1165
        RBBINode::printTree(fRB->fSafeRevTree, true);
1166
    }
1167
#endif
1168
2.76k
}
1169
1170
1171
//------------------------------------------------------------------------------
1172
//
1173
//  printNodeStack     for debugging...
1174
//
1175
//------------------------------------------------------------------------------
1176
#ifdef RBBI_DEBUG
1177
void RBBIRuleScanner::printNodeStack(const char *title) {
1178
    int i;
1179
    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
1180
    for (i=fNodeStackPtr; i>0; i--) {RBBINode::printTree(fNodeStack[i], true);}
1181
}
1182
#endif
1183
1184
1185
1186
1187
//------------------------------------------------------------------------------
1188
//
1189
//  pushNewNode   create a new RBBINode of the specified type and push it
1190
//                onto the stack of nodes.
1191
//
1192
//------------------------------------------------------------------------------
1193
9.82M
RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
    // Nothing is created once an error has been recorded.
    if (U_FAILURE(*fRB->fStatus)) {
        return nullptr;
    }

    // Refuse to push beyond the last slot of the node stack.
    if (fNodeStackPtr >= kStackSize - 1) {
        error(U_BRK_RULE_SYNTAX);
        RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
        return nullptr;
    }

    // Create the node and make it the new top of stack.  On allocation failure
    // the (null) entry is still pushed and the error is recorded in the status.
    RBBINode *newNode = new RBBINode(t, *fRB->fStatus);
    if (newNode == nullptr) {
        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
    }
    fNodeStack[++fNodeStackPtr] = newNode;
    return newNode;
}
1209
1210
1211
1212
//------------------------------------------------------------------------------
1213
//
1214
//  scanSet    Construct a UnicodeSet from the text at the current scan
1215
//             position.  Advance the scan position to the first character
1216
//             after the set.
1217
//
1218
//             A new RBBI setref node referring to the set is pushed onto the node
1219
//             stack.
1220
//
1221
//             The scan position is normally under the control of the state machine
1222
//             that controls rule parsing.  UnicodeSets, however, are parsed by
1223
//             the UnicodeSet constructor, not by the RBBI rule parser.
1224
//
1225
//------------------------------------------------------------------------------
1226
19.8k
void RBBIRuleScanner::scanSet() {
1227
19.8k
    ParsePosition  pos;
1228
19.8k
    int            startPos;
1229
19.8k
    int            i;
1230
1231
19.8k
    if (U_FAILURE(*fRB->fStatus)) {
1232
0
        return;
1233
0
    }
1234
1235
19.8k
    pos.setIndex(fScanIndex);
1236
19.8k
    startPos = fScanIndex;
1237
19.8k
    UErrorCode localStatus = U_ZERO_ERROR;
1238
19.8k
    LocalPointer<UnicodeSet> uset(new UnicodeSet(), localStatus);
1239
19.8k
    if (U_FAILURE(localStatus)) {
1240
0
        error(localStatus);
1241
0
        return;
1242
0
    }
1243
19.8k
    uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
1244
19.8k
    if (U_FAILURE(localStatus)) {
1245
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
1246
        //         UnicodeSet appears to not be reporting correctly at this time.
1247
        #ifdef RBBI_DEBUG
1248
            RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
1249
        #endif
1250
3.88k
        error(localStatus);
1251
3.88k
        return;
1252
3.88k
    }
1253
1254
    // Verify that the set contains at least one code point.
1255
    //
1256
15.9k
    U_ASSERT(uset.isValid());
1257
15.9k
    UnicodeSet tempSet(*uset);
1258
    // Use tempSet to handle the case that the UnicodeSet contains
1259
    // only string element, such as [{ab}] and treat it as empty set.
1260
15.9k
    tempSet.removeAllStrings();
1261
15.9k
    if (tempSet.isEmpty()) {
1262
        // This set is empty.
1263
        //  Make it an error, because it almost certainly is not what the user wanted.
1264
        //  Also, avoids having to think about corner cases in the tree manipulation code
1265
        //   that occurs later on.
1266
40
        error(U_BRK_RULE_EMPTY_SET);
1267
40
        return;
1268
40
    }
1269
1270
1271
    // Advance the RBBI parse position over the UnicodeSet pattern.
1272
    //   Don't just set fScanIndex because the line/char positions maintained
1273
    //   for error reporting would be thrown off.
1274
15.9k
    i = pos.getIndex();
1275
11.7M
    for (;U_SUCCESS(*fRB->fStatus);) {
1276
11.7M
        if (fNextIndex >= i) {
1277
15.7k
            break;
1278
15.7k
        }
1279
11.7M
        nextCharLL();
1280
11.7M
    }
1281
1282
15.9k
    if (U_SUCCESS(*fRB->fStatus)) {
1283
15.7k
        RBBINode         *n;
1284
1285
15.7k
        n = pushNewNode(RBBINode::setRef);
1286
15.7k
        if (U_FAILURE(*fRB->fStatus)) {
1287
1
            return;
1288
1
        }
1289
15.7k
        n->fFirstPos = startPos;
1290
15.7k
        n->fLastPos  = fNextIndex;
1291
15.7k
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1292
        //  findSetFor() serves several purposes here:
1293
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1294
        //     - Maintains collection of all sets in use, needed later for establishing
1295
        //          character categories for run time engine.
1296
        //     - Eliminates mulitiple instances of the same set.
1297
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1298
15.7k
        findSetFor(n->fText, n, uset.orphan());
1299
15.7k
    }
1300
1301
15.9k
}
1302
1303
5.22k
// Accessor: returns the current value of fRuleNum.
int32_t RBBIRuleScanner::numRules() {
    return fRuleNum;
}
1306
1307
U_NAMESPACE_END
1308
1309
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */