Coverage Report

Created: 2023-11-19 06:22

/src/icu/icu4c/source/common/rbbiscan.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  rbbiscan.cpp
5
//
6
//  Copyright (C) 2002-2016, International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the Rule Based Break Iterator Rule Builder functions for
10
//   scanning the rules and assembling a parse tree.  This is the first phase
11
//   of compiling the rules.
12
//
13
//  The overall compilation of the rules is managed by class RBBIRuleBuilder, which will
14
//  create and use an instance of this class as part of the process.
15
//
16
17
#include "unicode/utypes.h"
18
19
#if !UCONFIG_NO_BREAK_ITERATION
20
21
#include "unicode/unistr.h"
22
#include "unicode/uniset.h"
23
#include "unicode/uchar.h"
24
#include "unicode/uchriter.h"
25
#include "unicode/parsepos.h"
26
#include "unicode/parseerr.h"
27
#include "cmemory.h"
28
#include "cstring.h"
29
30
#include "rbbirpt.h"   // Contains state table for the rbbi rules parser.
31
                       //   generated by a Perl script.
32
#include "rbbirb.h"
33
#include "rbbinode.h"
34
#include "rbbiscan.h"
35
#include "rbbitblb.h"
36
37
#include "uassert.h"
38
39
//------------------------------------------------------------------------------
40
//
41
// Unicode Set init strings for each of the character classes needed for parsing a rule file.
42
//               (Initialized with hex values for portability to EBCDIC based machines.
43
//                Really ugly, but there's no good way to avoid it.)
44
//
45
//              The sets are referred to by name in the rbbirpt.txt, which is the
46
//              source form of the state transition table for the RBBI rule parser.
47
//
48
//------------------------------------------------------------------------------
49
// Characters that may appear as literals in patterns without escaping or quoting.
// Spells out:  [^[\p{Z}\u0020-\u007f]-[\p{L}]-[\p{N}]]
static const char16_t gRuleSet_rule_char_pattern[] = {
    0x5b, 0x5e,                                      //  [ ^
    0x5b, 0x5c, 0x70, 0x7b, 0x5a, 0x7d,              //  [ \ p { Z }
    0x5c, 0x75, 0x30, 0x30, 0x32, 0x30,              //  \ u 0 0 2 0
    0x2d,                                            //  -
    0x5c, 0x75, 0x30, 0x30, 0x37, 0x66,              //  \ u 0 0 7 f
    0x5d,                                            //  ]
    0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4c, 0x7d, 0x5d,  //  - [ \ p { L } ]
    0x2d, 0x5b, 0x5c, 0x70, 0x7b, 0x4e, 0x7d, 0x5d,  //  - [ \ p { N } ]
    0x5d,                                            //  ]
    0};

// Spells out:  [_\p{L}\p{N}]
static const char16_t gRuleSet_name_char_pattern[] = {
    0x5b, 0x5f,                                      //  [ _
    0x5c, 0x70, 0x7b, 0x4c, 0x7d,                    //  \ p { L }
    0x5c, 0x70, 0x7b, 0x4e, 0x7d,                    //  \ p { N }
    0x5d,                                            //  ]
    0};

// Spells out:  [0-9]
static const char16_t gRuleSet_digit_char_pattern[] = {
    0x5b,                                            //  [
    0x30, 0x2d, 0x39,                                //  0 - 9
    0x5d,                                            //  ]
    0};

// Spells out:  [_\p{L}]
static const char16_t gRuleSet_name_start_char_pattern[] = {
    0x5b, 0x5f,                                      //  [ _
    0x5c, 0x70, 0x7b, 0x4c, 0x7d,                    //  \ p { L }
    0x5d,                                            //  ]
    0};

// The string "any"; findSetFor() compares its key against this.
static const char16_t kAny[] = {
    0x61, 0x6e, 0x79,                                //  a n y
    0x00};
71
72
73
U_CDECL_BEGIN
74
0
static void U_CALLCONV RBBISetTable_deleter(void *p) {
    // Value deleter installed on the scanner's set hash table (see the
    // uhash_setValueDeleter() call in the RBBIRuleScanner constructor).
    // Releases one RBBISetTableEl: the key string it holds, then the
    // element storage itself.
    icu::RBBISetTableEl *entry = static_cast<icu::RBBISetTableEl *>(p);
    delete entry->key;

    // Note:  entry->val is owned by the linked list "fSetsListHead" in scanner.
    //        The value node is deliberately not deleted here.

    // Released with uprv_free() rather than delete.
    uprv_free(entry);
}
81
U_CDECL_END
82
83
U_NAMESPACE_BEGIN
84
85
//------------------------------------------------------------------------------
86
//
87
//  Constructor.
88
//
89
//------------------------------------------------------------------------------
90
RBBIRuleScanner::RBBIRuleScanner(RBBIRuleBuilder *rb)
{
    // Phase 1: put every field into a known state.  Nothing here can fail, and
    //          it must all happen before the first status check so that the
    //          destructor can run cleanly on a partially constructed scanner.
    fRB            = rb;

    // Scan position and character bookkeeping.
    fScanIndex     = 0;
    fNextIndex     = 0;
    fLineNum       = 1;
    fCharNum       = 0;
    fLastChar      = 0;
    fQuoteMode     = false;

    // Parser state table, state stack and node stack.
    fStateTable    = nullptr;
    fStack[0]      = 0;
    fStackPtr      = 0;
    fNodeStack[0]  = nullptr;
    fNodeStackPtr  = 0;

    // Per-rule flags.
    fReverseRule   = false;
    fLookAheadRule = false;
    fNoChainInRule = false;

    // Owned tables and counters.
    fSymbolTable   = nullptr;
    fSetTable      = nullptr;
    fRuleNum       = 0;
    fOptionStart   = 0;

    // Phase 2: everything below can fail; bail out as soon as the builder's
    //          status reports an error.
    UErrorCode &status = *rb->fStatus;
    if (U_FAILURE(status)) {
        return;
    }

    //
    //  Set up the constant Unicode Sets.
    //     Note:  These could be made static, lazily initialized, and shared among
    //            all instances of RBBIRuleScanners.  BUT this is quite a bit simpler,
    //            and the time to build these few sets should be small compared to a
    //            full break iterator build.
    //
    fRuleSets[kRuleSet_rule_char-128] =
            UnicodeSet(UnicodeString(gRuleSet_rule_char_pattern), status);

    // fRuleSets[kRuleSet_white_space-128] = [:Pattern_White_Space:]
    UnicodeSet &whiteSpace = fRuleSets[kRuleSet_white_space-128];
    whiteSpace.add(9, 0xd);
    whiteSpace.add(0x20);
    whiteSpace.add(0x85);
    whiteSpace.add(0x200e, 0x200f);
    whiteSpace.add(0x2028, 0x2029);

    fRuleSets[kRuleSet_name_char-128] =
            UnicodeSet(UnicodeString(gRuleSet_name_char_pattern), status);
    fRuleSets[kRuleSet_name_start_char-128] =
            UnicodeSet(UnicodeString(gRuleSet_name_start_char_pattern), status);
    fRuleSets[kRuleSet_digit_char-128] =
            UnicodeSet(UnicodeString(gRuleSet_digit_char_pattern), status);

    if (status == U_ILLEGAL_ARGUMENT_ERROR) {
        // This case happens if ICU's data is missing.  UnicodeSet tries to look up property
        //   names from the init string, can't find them, and claims an illegal argument.
        //   Change the error so that the actual problem will be clearer to users.
        status = U_BRK_INIT_ERROR;
    }
    if (U_FAILURE(status)) {
        return;
    }

    // Symbol table for $variables.
    fSymbolTable = new RBBISymbolTable(this, rb->fRules, status);
    if (fSymbolTable == nullptr) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    // Hash table of set table entries, keyed by UnicodeString.  The value
    // deleter is installed only once the table is known to have opened.
    fSetTable = uhash_open(uhash_hashUnicodeString, uhash_compareUnicodeString, nullptr, &status);
    if (U_FAILURE(status)) {
        return;
    }
    uhash_setValueDeleter(fSetTable, RBBISetTable_deleter);
}
159
160
161
162
//------------------------------------------------------------------------------
163
//
164
//  Destructor
165
//
166
//------------------------------------------------------------------------------
167
0
RBBIRuleScanner::~RBBIRuleScanner() {
    // The symbol table is owned by the scanner (may be null if construction
    // stopped early).
    delete fSymbolTable;

    // Closing the hash table runs the value deleter installed by the constructor.
    if (fSetTable != nullptr) {
        uhash_close(fSetTable);
        fSetTable = nullptr;
    }

    // Node Stack.
    //   Normally has one entry, which is the entire parse tree for the rules.
    //   If errors occurred, there may be additional subtrees left on the stack.
    //   Slot 0 is never deleted; live entries are 1..fNodeStackPtr.
    for (; fNodeStackPtr > 0; --fNodeStackPtr) {
        delete fNodeStack[fNodeStackPtr];
    }
}
185
186
//------------------------------------------------------------------------------
187
//
188
//  doParseActions       Do some action during rule parsing.
189
//                       Called by the parse state machine.
190
//                       Actions build the parse tree and Unicode Sets,
191
//                       and maintain the parse stack for nested expressions.
192
//
193
//                       TODO:  unify EParseAction and RBBI_RuleParseAction enum types.
194
//                              They represent exactly the same thing.  They're separate
195
//                              only to work around enum forward declaration restrictions
196
//                              in some compilers, while at the same time avoiding multiple
197
//                              definitions problems.  I'm sure that there's a better way.
198
//
199
//------------------------------------------------------------------------------
200
UBool RBBIRuleScanner::doParseActions(int32_t action)
{
    // Performs the single parser action identified by `action` (one of the
    // do* action codes), updating the node stack, the per-rule flags and the
    // rule builder (fRB) as needed.
    //
    // Returns true only when the action did not ask to stop (returnVal left
    // true) AND no error is recorded in *fRB->fStatus; returns false otherwise.
    //
    // Node stack convention used throughout: fNodeStack[fNodeStackPtr] is the
    // top of stack (TOS); pushNewNode() pushes a freshly created node.
    RBBINode *n       = nullptr;

    UBool   returnVal = true;

    switch (action) {

    case doExprStart:
        pushNewNode(RBBINode::opStart);
        fRuleNum++;
        break;


    case doNoChain:
        // Scanned a '^' while on the rule start state.
        fNoChainInRule = true;
        break;


    case doExprOrOperator:
        {
            fixOpStack(RBBINode::precOpCat);
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            orNode->fLeftChild     = operandNode;
            operandNode->fParent   = orNode;
        }
        break;

    case doExprCatOperator:
        // concatenation operator.
        // For the implicit concatenation of adjacent terms in an expression that are
        //   not separated by any other operator.  Action is invoked between the
        //   actions for the two terms.
        {
            fixOpStack(RBBINode::precOpCat);
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *catNode     = pushNewNode(RBBINode::opCat);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            catNode->fLeftChild    = operandNode;
            operandNode->fParent   = catNode;
        }
        break;

    case doLParen:
        // Open Paren.
        //   The openParen node is a dummy operation type with a low precedence,
        //     which has the effect of ensuring that any real binary op that
        //     follows within the parens binds more tightly to the operands than
        //     stuff outside of the parens.
        pushNewNode(RBBINode::opLParen);
        break;

    case doExprRParen:
        fixOpStack(RBBINode::precLParen);
        break;

    case doNOP:
        break;

    case doStartAssign:
        // We've just scanned "$variable = "
        // The top of the node stack has the $variable ref node.

        // Save the start position of the RHS text in the StartExpression node
        //   that precedes the $variableReference node on the stack.
        //   This will eventually be used when saving the full $variable replacement
        //   text as a string.
        n = fNodeStack[fNodeStackPtr-1];
        n->fFirstPos = fNextIndex;              // move past the '='

        // Push a new start-of-expression node; needed to keep parse of the
        //   RHS expression happy.
        pushNewNode(RBBINode::opStart);
        break;




    case doEndAssign:
        {
            // We have reached the end of an assignment statement.
            //   Current scan char is the ';' that terminates the assignment.

            // Terminate expression, leaves expression parse tree rooted in TOS node.
            fixOpStack(RBBINode::precStart);

            // If fixOpStack() reported an error (for example a mismatched paren),
            //   the stack no longer has the  start / $variable / RHS  layout that
            //   the code below relies on.  Stop here, as doEndOfRule does; nodes
            //   still on the stack are released by the destructor.
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }

            RBBINode *startExprNode  = fNodeStack[fNodeStackPtr-2];
            RBBINode *varRefNode     = fNodeStack[fNodeStackPtr-1];
            RBBINode *RHSExprNode    = fNodeStack[fNodeStackPtr];

            // Save original text of right side of assignment, excluding the terminating ';'
            //  in the root of the node for the right-hand-side expression.
            RHSExprNode->fFirstPos = startExprNode->fFirstPos;
            RHSExprNode->fLastPos  = fScanIndex;
            fRB->fRules.extractBetween(RHSExprNode->fFirstPos, RHSExprNode->fLastPos, RHSExprNode->fText);

            // Expression parse tree becomes l. child of the $variable reference node.
            varRefNode->fLeftChild = RHSExprNode;
            RHSExprNode->fParent   = varRefNode;

            // Make a symbol table entry for the $variableRef node.
            fSymbolTable->addEntry(varRefNode->fText, varRefNode, *fRB->fStatus);
            if (U_FAILURE(*fRB->fStatus)) {
                // This is a round-about way to get the parse position set
                //  so that duplicate symbols error messages include a line number.
                UErrorCode t = *fRB->fStatus;
                *fRB->fStatus = U_ZERO_ERROR;
                error(t);
            }

            // Clean up the stack.
            delete startExprNode;
            fNodeStackPtr-=3;
            break;
        }

    case doEndOfRule:
        {
        fixOpStack(RBBINode::precStart);      // Terminate expression, leaves expression
        if (U_FAILURE(*fRB->fStatus)) {       //   parse tree rooted in TOS node.
            break;
        }
#ifdef RBBI_DEBUG
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "rtree")) {printNodeStack("end of rule");}
#endif
        U_ASSERT(fNodeStackPtr == 1);
        RBBINode *thisRule = fNodeStack[fNodeStackPtr];

        // If this rule includes a look-ahead '/', add a endMark node to the
        //   expression tree.
        if (fLookAheadRule) {
            RBBINode  *endNode        = pushNewNode(RBBINode::endMark);
            RBBINode  *catNode        = pushNewNode(RBBINode::opCat);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            fNodeStackPtr -= 2;
            catNode->fLeftChild       = thisRule;
            catNode->fRightChild      = endNode;
            fNodeStack[fNodeStackPtr] = catNode;
            endNode->fVal             = fRuleNum;
            endNode->fLookAheadEnd    = true;
            thisRule                  = catNode;

            // TODO: Disable chaining out of look-ahead (hard break) rules.
            //   The break on rule match is forced, so there is no point in building up
            //   the state table to chain into another rule for a longer match.
        }

        // Mark this node as being the root of a rule.
        thisRule->fRuleRoot = true;

        // Flag if chaining into this rule is wanted.
        //
        if (fRB->fChainRules &&         // If rule chaining is enabled globally via !!chain
                !fNoChainInRule) {      //     and no '^' chain-in inhibit was on this rule
            thisRule->fChainIn = true;
        }


        // All rule expressions are ORed together.
        // The ';' that terminates an expression really just functions as a '|' with
        //   a low operator precedence.
        //
        // Each of the four sets of rules are collected separately.
        //  (forward, reverse, safe_forward, safe_reverse)
        //  OR this rule into the appropriate group of them.
        //
        RBBINode **destRules = (fReverseRule? &fRB->fSafeRevTree : fRB->fDefaultTree);

        if (*destRules != nullptr) {
            // This is not the first rule encountered.
            // OR previous stuff  (from *destRules)
            // with the current rule expression (on the Node Stack)
            //  with the resulting OR expression going to *destRules
            //
                       thisRule    = fNodeStack[fNodeStackPtr];
            RBBINode  *prevRules   = *destRules;
            RBBINode  *orNode      = pushNewNode(RBBINode::opOr);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            orNode->fLeftChild     = prevRules;
            prevRules->fParent     = orNode;
            orNode->fRightChild    = thisRule;
            thisRule->fParent      = orNode;
            *destRules             = orNode;
        }
        else
        {
            // This is the first rule encountered (for this direction).
            // Just move its parse tree from the stack to *destRules.
            *destRules = fNodeStack[fNodeStackPtr];
        }
        fReverseRule   = false;   // in preparation for the next rule.
        fLookAheadRule = false;
        fNoChainInRule = false;
        fNodeStackPtr  = 0;
        }
        break;


    case doRuleError:
        error(U_BRK_RULE_SYNTAX);
        returnVal = false;
        break;


    case doVariableNameExpectedErr:
        error(U_BRK_RULE_SYNTAX);
        break;


    //
    //  Unary operators  + ? *
    //    These all appear after the operand to which they apply.
    //    When we hit one, the operand (may be a whole sub expression)
    //    will be on the top of the stack.
    //    Unary Operator becomes TOS, with the old TOS as its one child.
    case doUnaryOpPlus:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *plusNode    = pushNewNode(RBBINode::opPlus);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            plusNode->fLeftChild   = operandNode;
            operandNode->fParent   = plusNode;
        }
        break;

    case doUnaryOpQuestion:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *qNode       = pushNewNode(RBBINode::opQuestion);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            qNode->fLeftChild      = operandNode;
            operandNode->fParent   = qNode;
        }
        break;

    case doUnaryOpStar:
        {
            RBBINode  *operandNode = fNodeStack[fNodeStackPtr--];
            RBBINode  *starNode    = pushNewNode(RBBINode::opStar);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            starNode->fLeftChild   = operandNode;
            operandNode->fParent   = starNode;
        }
        break;

    case doRuleChar:
        // A "Rule Character" is any single character that is a literal part
        // of the regular expression.  Like a, b and c in the expression "(abc*) | [:L:]"
        // These are pretty uncommon in break rules; the terms are more commonly
        //  sets.  To keep things uniform, treat these characters as
        // sets that just happen to contain only one character.
        {
            n = pushNewNode(RBBINode::setRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            findSetFor(UnicodeString(fC.fChar), n);
            n->fFirstPos = fScanIndex;
            n->fLastPos  = fNextIndex;
            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
            break;
        }

    case doDotAny:
        // scanned a ".", meaning match any single character.
        {
            n = pushNewNode(RBBINode::setRef);
            if (U_FAILURE(*fRB->fStatus)) {
                break;
            }
            findSetFor(UnicodeString(true, kAny, 3), n);
            n->fFirstPos = fScanIndex;
            n->fLastPos  = fNextIndex;
            fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
            break;
        }

    case doSlash:
        // Scanned a '/', which identifies a look-ahead break position in a rule.
        n = pushNewNode(RBBINode::lookAhead);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fVal      = fRuleNum;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        fLookAheadRule = true;
        break;


    case doStartTagValue:
        // Scanned a '{', the opening delimiter for a tag value within a rule.
        n = pushNewNode(RBBINode::tag);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fVal      = 0;
        n->fFirstPos = fScanIndex;
        n->fLastPos  = fNextIndex;
        break;

    case doTagDigit:
        // Just scanned a decimal digit that's part of a tag value
        {
            n = fNodeStack[fNodeStackPtr];
            uint32_t v = u_charDigitValue(fC.fChar);
            U_ASSERT(v < 10);
            // Accumulate in 64 bits so that a tag value with too many digits is
            //   reported as a syntax error instead of silently wrapping around.
            int64_t updated = static_cast<int64_t>(n->fVal)*10 + v;
            if (updated > INT32_MAX) {
                error(U_BRK_RULE_SYNTAX);
                break;
            }
            n->fVal = static_cast<int32_t>(updated);
            break;
        }

    case doTagValue:
        n = fNodeStack[fNodeStackPtr];
        n->fLastPos = fNextIndex;
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
        break;

    case doTagExpectedError:
        error(U_BRK_MALFORMED_RULE_TAG);
        returnVal = false;
        break;

    case doOptionStart:
        // Scanning a !!option.   At the start of string.
        fOptionStart = fScanIndex;
        break;

    case doOptionEnd:
        {
            // The option name is the rule text from fOptionStart up to the
            //   current scan position.
            UnicodeString opt(fRB->fRules, fOptionStart, fScanIndex-fOptionStart);
            if (opt == UNICODE_STRING("chain", 5)) {
                fRB->fChainRules = true;
            } else if (opt == UNICODE_STRING("forward", 7)) {
                fRB->fDefaultTree   = &fRB->fForwardTree;
            } else if (opt == UNICODE_STRING("reverse", 7)) {
                fRB->fDefaultTree   = &fRB->fReverseTree;
            } else if (opt == UNICODE_STRING("safe_forward", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeFwdTree;
            } else if (opt == UNICODE_STRING("safe_reverse", 12)) {
                fRB->fDefaultTree   = &fRB->fSafeRevTree;
            } else if (opt == UNICODE_STRING("lookAheadHardBreak", 18)) {
                fRB->fLookAheadHardBreak = true;
            } else if (opt == UNICODE_STRING("quoted_literals_only", 20)) {
                // Empty the rule-character set.
                fRuleSets[kRuleSet_rule_char-128].clear();
            } else if (opt == UNICODE_STRING("unquoted_literals",  17)) {
                // Restore the rule-character set from its original pattern.
                fRuleSets[kRuleSet_rule_char-128].applyPattern(UnicodeString(gRuleSet_rule_char_pattern), *fRB->fStatus);
            } else {
                error(U_BRK_UNRECOGNIZED_OPTION);
            }
        }
        break;

    case doReverseDir:
        fReverseRule = true;
        break;

    case doStartVariableName:
        n = pushNewNode(RBBINode::varRef);
        if (U_FAILURE(*fRB->fStatus)) {
            break;
        }
        n->fFirstPos = fScanIndex;
        break;

    case doEndVariableName:
        n = fNodeStack[fNodeStackPtr];
        if (n==nullptr || n->fType != RBBINode::varRef) {
            error(U_BRK_INTERNAL_ERROR);
            break;
        }
        n->fLastPos = fScanIndex;
        // The name text starts one position after fFirstPos.
        fRB->fRules.extractBetween(n->fFirstPos+1, n->fLastPos, n->fText);
        // Look the newly scanned name up in the symbol table
        //   If there's an entry, set the l. child of the var ref to the replacement expression.
        //   (We also pass through here when scanning assignments, but no harm is done, other
        //    than a slight wasted effort that seems hard to avoid.  Lookup will be null)
        n->fLeftChild = fSymbolTable->lookupNode(n->fText);
        break;

    case doCheckVarDef:
        // A variable reference whose lookup found nothing is an error.
        n = fNodeStack[fNodeStackPtr];
        if (n->fLeftChild == nullptr) {
            error(U_BRK_UNDEFINED_VARIABLE);
            returnVal = false;
        }
        break;

    case doExprFinished:
        break;

    case doRuleErrorAssignExpr:
        error(U_BRK_ASSIGN_ERROR);
        returnVal = false;
        break;

    case doExit:
        returnVal = false;
        break;

    case doScanUnicodeSet:
        scanSet();
        break;

    default:
        error(U_BRK_INTERNAL_ERROR);
        returnVal = false;
        break;
    }
    return returnVal && U_SUCCESS(*fRB->fStatus);
}
628
629
630
631
632
//------------------------------------------------------------------------------
633
//
634
//  Error         Report a rule parse error.
635
//                Only report it if no previous error has been recorded.
636
//
637
//------------------------------------------------------------------------------
638
0
void RBBIRuleScanner::error(UErrorCode e) {
    // Records error code `e` in the rule builder's status, together with the
    // current line and character position when a parse-error struct was
    // supplied.  Only the first error is kept.
    if (U_FAILURE(*fRB->fStatus)) {
        // An earlier error is already recorded; leave it untouched.
        return;
    }
    *fRB->fStatus = e;

    if (fRB->fParseError == nullptr) {
        // No parse-error struct to fill in.
        return;
    }
    fRB->fParseError->line           = fLineNum;
    fRB->fParseError->offset         = fCharNum;
    // Context strings are left empty.
    fRB->fParseError->preContext[0]  = 0;
    fRB->fParseError->postContext[0] = 0;
}
649
650
651
652
653
//------------------------------------------------------------------------------
654
//
655
//  fixOpStack   The parse stack holds partially assembled chunks of the parse tree.
656
//               An entry on the stack may be as small as a single setRef node,
657
//               or as large as the parse tree
658
//               for an entire expression (this will be the one item left on the stack
659
//               when the parsing of an RBBI rule completes).
660
//
661
//               This function is called when a binary operator is encountered.
662
//               It looks back up the stack for operators that are not yet associated
663
//               with a right operand, and if the precedence of the stacked operator >=
664
//               the precedence of the current operator, binds the operand left,
665
//               to the previously encountered operator.
666
//
667
//------------------------------------------------------------------------------
668
0
void RBBIRuleScanner::fixOpStack(RBBINode::OpPrecedence p) {
    // Binds the top-of-stack operand to previously stacked operators whose
    // precedence is >= p, working down the node stack.  When p is at or below
    // precLParen (right paren or end of expression), also removes the matching
    // LParen / Start node, leaving the finished (sub)expression as TOS.
    RBBINode *opNode = nullptr;
    // printNodeStack("entering fixOpStack()");
    while (true) {
        opNode = fNodeStack[fNodeStackPtr-1];   // an operator node
        if (opNode->fPrecedence == 0) {
            RBBIDebugPuts("RBBIRuleScanner::fixOpStack, bad operator node");
            error(U_BRK_INTERNAL_ERROR);
            return;
        }

        // The stacked operator takes the TOS operand only if it binds at least
        //   as tightly as p and is a real operator (above precLParen).
        //   Otherwise the most recent operand goes with the current operator,
        //   not with the previously stacked one.
        const bool stackedOpTakesOperand =
                opNode->fPrecedence >= p && opNode->fPrecedence > RBBINode::precLParen;
        if (!stackedOpTakesOperand) {
            break;
        }

        // Stack operator is a binary op  ( '|' or concatenation)
        //   TOS operand becomes right child of this operator.
        //   Resulting subexpression becomes the TOS operand.
        RBBINode *operand   = fNodeStack[fNodeStackPtr];
        opNode->fRightChild = operand;
        operand->fParent    = opNode;
        --fNodeStackPtr;
        // printNodeStack("looping in fixOpStack()   ");
    }

    if (p > RBBINode::precLParen) {
        // An ordinary binary operator; nothing further to discard.
        // printNodeStack("leaving fixOpStack()");
        return;
    }

    // Scan is at a right paren or end of expression.
    //  The scanned item must match the stack, or else there was an error.
    //  Discard the left paren (or start expr) node from the stack,
    //  leaving the completed (sub)expression as TOS.
    if (opNode->fPrecedence != p) {
        // Right paren encountered matched start of expression node, or
        // end of expression matched with a left paren node.
        error(U_BRK_MISMATCHED_PAREN);
    }
    fNodeStack[fNodeStackPtr-1] = fNodeStack[fNodeStackPtr];
    --fNodeStackPtr;
    // Delete the now-discarded LParen or Start node.
    delete opNode;
    // printNodeStack("leaving fixOpStack()");
}
710
711
712
713
714
//------------------------------------------------------------------------------
715
//
716
//   findSetFor    given a UnicodeString,
717
//                  - find the corresponding Unicode Set  (uset node)
718
//                         (create one if necessary)
719
//                  - Set fLeftChild of the caller's node (should be a setRef node)
720
//                         to the uset node
721
//                 Maintain a hash table of uset nodes, so the same one is always used
722
//                    for the same string.
723
//                 If a "to adopt" set is provided and we haven't seen this key before,
724
//                    add the provided set to the hash table.
725
//                 If the string is one (32 bit) char in length, the set contains
726
//                    just one element which is the char in question.
727
//                 If the string is "any", return a set containing all chars.
728
//
729
//------------------------------------------------------------------------------
730
0
void RBBIRuleScanner::findSetFor(const UnicodeString &s, RBBINode *node, UnicodeSet *setToAdopt) {
731
732
0
    RBBISetTableEl   *el;
733
734
    // First check whether we've already cached a set for this string.
735
    // If so, just use the cached set in the new node.
736
    //   delete any set provided by the caller, since we own it.
737
0
    el = (RBBISetTableEl *)uhash_get(fSetTable, &s);
738
0
    if (el != nullptr) {
739
0
        delete setToAdopt;
740
0
        node->fLeftChild = el->val;
741
0
        U_ASSERT(node->fLeftChild->fType == RBBINode::uset);
742
0
        return;
743
0
    }
744
745
    // Haven't seen this set before.
746
    // If the caller didn't provide us with a prebuilt set,
747
    //   create a new UnicodeSet now.
748
0
    if (setToAdopt == nullptr) {
749
0
        if (s.compare(kAny, -1) == 0) {
750
0
            setToAdopt = new UnicodeSet(0x000000, 0x10ffff);
751
0
        } else {
752
0
            UChar32 c;
753
0
            c = s.char32At(0);
754
0
            setToAdopt = new UnicodeSet(c, c);
755
0
        }
756
0
    }
757
758
    //
759
    // Make a new uset node to refer to this UnicodeSet
760
    // This new uset node becomes the child of the caller's setReference node.
761
    //
762
0
    RBBINode *usetNode    = new RBBINode(RBBINode::uset);
763
0
    if (usetNode == nullptr) {
764
0
        error(U_MEMORY_ALLOCATION_ERROR);
765
0
        return;
766
0
    }
767
0
    usetNode->fInputSet   = setToAdopt;
768
0
    usetNode->fParent     = node;
769
0
    node->fLeftChild      = usetNode;
770
0
    usetNode->fText = s;
771
772
773
    //
774
    // Add the new uset node to the list of all uset nodes.
775
    //
776
0
    fRB->fUSetNodes->addElement(usetNode, *fRB->fStatus);
777
778
779
    //
780
    // Add the new set to the set hash table.
781
    //
782
0
    el      = (RBBISetTableEl *)uprv_malloc(sizeof(RBBISetTableEl));
783
0
    UnicodeString *tkey = new UnicodeString(s);
784
0
    if (tkey == nullptr || el == nullptr || setToAdopt == nullptr) {
785
        // Delete to avoid memory leak
786
0
        delete tkey;
787
0
        tkey = nullptr;
788
0
        uprv_free(el);
789
0
        el = nullptr;
790
0
        delete setToAdopt;
791
0
        setToAdopt = nullptr;
792
793
0
        error(U_MEMORY_ALLOCATION_ERROR);
794
0
        return;
795
0
    }
796
0
    el->key = tkey;
797
0
    el->val = usetNode;
798
0
    uhash_put(fSetTable, el->key, el, fRB->fStatus);
799
800
0
    return;
801
0
}
802
803
804
805
//
//  Unicode character constants used by the rule scanner.
//     Written as numeric code points rather than character literals, because
//     literals are not portable to every platform (think EBCDIC).
//
static constexpr char16_t chCR        = 0x0d;    // carriage return; new lines terminate comments
static constexpr char16_t chLF        = 0x0a;    // line feed
static constexpr char16_t chNEL       = 0x85;    // NEL newline variant
static constexpr char16_t chLS        = 0x2028;  // Unicode Line Separator
static constexpr char16_t chApos      = 0x27;    // single quote, for quoted chars
static constexpr char16_t chPound     = 0x23;    // '#', introduces a comment
static constexpr char16_t chBackSlash = 0x5c;    // '\', introduces a char escape
static constexpr char16_t chLParen    = 0x28;    // '('
static constexpr char16_t chRParen    = 0x29;    // ')'
819
820
821
//------------------------------------------------------------------------------
822
//
823
//  stripRules    Return a rules string without extra spaces.
824
//                (Comments are removed separately, during rule parsing.)
825
//
826
//------------------------------------------------------------------------------
827
0
UnicodeString RBBIRuleScanner::stripRules(const UnicodeString &rules) {
828
0
    UnicodeString strippedRules;
829
0
    int32_t rulesLength = rules.length();
830
831
0
    for (int32_t idx=0; idx<rulesLength; idx = rules.moveIndex32(idx, 1)) {
832
0
        UChar32 cp = rules.char32At(idx);
833
0
        bool whiteSpace = u_hasBinaryProperty(cp, UCHAR_PATTERN_WHITE_SPACE);
834
0
        if (whiteSpace) {
835
0
            continue;
836
0
        }
837
0
        strippedRules.append(cp);
838
0
    }
839
0
    return strippedRules;
840
0
}
841
842
843
//------------------------------------------------------------------------------
844
//
845
//  nextCharLL    Low Level Next Char from rule input source.
846
//                Get a char from the input character iterator,
847
//                keep track of input position for error reporting.
848
//
849
//------------------------------------------------------------------------------
850
0
UChar32  RBBIRuleScanner::nextCharLL() {
851
0
    UChar32  ch;
852
853
0
    if (fNextIndex >= fRB->fRules.length()) {
854
0
        return (UChar32)-1;
855
0
    }
856
0
    ch         = fRB->fRules.char32At(fNextIndex);
857
0
    if (U_IS_SURROGATE(ch)) {
858
0
        error(U_ILLEGAL_CHAR_FOUND);
859
0
        return U_SENTINEL;
860
0
    }
861
0
    fNextIndex = fRB->fRules.moveIndex32(fNextIndex, 1);
862
863
0
    if (ch == chCR ||
864
0
        ch == chNEL ||
865
0
        ch == chLS   ||
866
0
        (ch == chLF && fLastChar != chCR)) {
867
        // Character is starting a new line.  Bump up the line number, and
868
        //  reset the column to 0.
869
0
        fLineNum++;
870
0
        fCharNum=0;
871
0
        if (fQuoteMode) {
872
0
            error(U_BRK_NEW_LINE_IN_QUOTED_STRING);
873
0
            fQuoteMode = false;
874
0
        }
875
0
    }
876
0
    else {
877
        // Character is not starting a new line.  Except in the case of a
878
        //   LF following a CR, increment the column position.
879
0
        if (ch != chLF) {
880
0
            fCharNum++;
881
0
        }
882
0
    }
883
0
    fLastChar = ch;
884
0
    return ch;
885
0
}
886
887
888
//------------------------------------------------------------------------------
889
//
890
//   nextChar     for rules scanning.  At this level, we handle stripping
891
//                out comments and processing backslash character escapes.
892
//                The rest of the rules grammar is handled at the next level up.
893
//
894
//------------------------------------------------------------------------------
895
0
void RBBIRuleScanner::nextChar(RBBIRuleChar &c) {
896
897
    // Unicode Character constants needed for the processing done by nextChar(),
898
    //   in hex because literals wont work on EBCDIC machines.
899
900
0
    fScanIndex = fNextIndex;
901
0
    c.fChar    = nextCharLL();
902
0
    c.fEscaped = false;
903
904
    //
905
    //  check for '' sequence.
906
    //  These are recognized in all contexts, whether in quoted text or not.
907
    //
908
0
    if (c.fChar == chApos) {
909
0
        if (fRB->fRules.char32At(fNextIndex) == chApos) {
910
0
            c.fChar    = nextCharLL();        // get nextChar officially so character counts
911
0
            c.fEscaped = true;                //   stay correct.
912
0
        }
913
0
        else
914
0
        {
915
            // Single quote, by itself.
916
            //   Toggle quoting mode.
917
            //   Return either '('  or ')', because quotes cause a grouping of the quoted text.
918
0
            fQuoteMode = !fQuoteMode;
919
0
            if (fQuoteMode) {
920
0
                c.fChar = chLParen;
921
0
            } else {
922
0
                c.fChar = chRParen;
923
0
            }
924
0
            c.fEscaped = false;      // The paren that we return is not escaped.
925
0
            return;
926
0
        }
927
0
    }
928
929
0
    if (fQuoteMode) {
930
0
        c.fEscaped = true;
931
0
    }
932
0
    else
933
0
    {
934
        // We are not in a 'quoted region' of the source.
935
        //
936
0
        if (c.fChar == chPound) {
937
            // Start of a comment.  Consume the rest of it.
938
            //  The new-line char that terminates the comment is always returned.
939
            //  It will be treated as white-space, and serves to break up anything
940
            //    that might otherwise incorrectly clump together with a comment in
941
            //    the middle (a variable name, for example.)
942
0
            int32_t commentStart = fScanIndex;
943
0
            for (;;) {
944
0
                c.fChar = nextCharLL();
945
0
                if (c.fChar == (UChar32)-1 ||  // EOF
946
0
                    c.fChar == chCR     ||
947
0
                    c.fChar == chLF     ||
948
0
                    c.fChar == chNEL    ||
949
0
                    c.fChar == chLS)       {break;}
950
0
            }
951
0
            for (int32_t i=commentStart; i<fNextIndex-1; ++i) {
952
0
                fRB->fStrippedRules.setCharAt(i, u' ');
953
0
            }
954
0
        }
955
0
        if (c.fChar == (UChar32)-1) {
956
0
            return;
957
0
        }
958
959
        //
960
        //  check for backslash escaped characters.
961
        //  Use UnicodeString::unescapeAt() to handle them.
962
        //
963
0
        if (c.fChar == chBackSlash) {
964
0
            c.fEscaped = true;
965
0
            int32_t startX = fNextIndex;
966
0
            c.fChar = fRB->fRules.unescapeAt(fNextIndex);
967
0
            if (fNextIndex == startX) {
968
0
                error(U_BRK_HEX_DIGITS_EXPECTED);
969
0
            }
970
0
            fCharNum += fNextIndex-startX;
971
0
        }
972
0
    }
973
    // putc(c.fChar, stdout);
974
0
}
975
976
//------------------------------------------------------------------------------
977
//
978
//  Parse RBBI rules.   The state machine for rules parsing is here.
979
//                      The state tables are hand-written in the file rbbirpt.txt,
980
//                      and converted to the form used here by a perl
981
//                      script rbbicst.pl
982
//
983
//------------------------------------------------------------------------------
984
0
void RBBIRuleScanner::parse() {
985
0
    uint16_t                state;
986
0
    const RBBIRuleTableEl  *tableEl;
987
988
0
    if (U_FAILURE(*fRB->fStatus)) {
989
0
        return;
990
0
    }
991
992
0
    state = 1;
993
0
    nextChar(fC);
994
    //
995
    // Main loop for the rule parsing state machine.
996
    //   Runs once per state transition.
997
    //   Each time through optionally performs, depending on the state table,
998
    //      - an advance to the the next input char
999
    //      - an action to be performed.
1000
    //      - pushing or popping a state to/from the local state return stack.
1001
    //
1002
0
    for (;;) {
1003
        //  Bail out if anything has gone wrong.
1004
        //  RBBI rule file parsing stops on the first error encountered.
1005
0
        if (U_FAILURE(*fRB->fStatus)) {
1006
0
            break;
1007
0
        }
1008
1009
        // Quit if state == 0.  This is the normal way to exit the state machine.
1010
        //
1011
0
        if (state == 0) {
1012
0
            break;
1013
0
        }
1014
1015
        // Find the state table element that matches the input char from the rule, or the
1016
        //    class of the input character.  Start with the first table row for this
1017
        //    state, then linearly scan forward until we find a row that matches the
1018
        //    character.  The last row for each state always matches all characters, so
1019
        //    the search will stop there, if not before.
1020
        //
1021
0
        tableEl = &gRuleParseStateTable[state];
1022
        #ifdef RBBI_DEBUG
1023
            if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) {
1024
                RBBIDebugPrintf("char, line, col = (\'%c\', %d, %d)    state=%s ",
1025
                    fC.fChar, fLineNum, fCharNum, RBBIRuleStateNames[state]);
1026
            }
1027
        #endif
1028
1029
0
        for (;;) {
1030
            #ifdef RBBI_DEBUG
1031
                if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPrintf("."); fflush(stdout);}
1032
            #endif
1033
0
            if (tableEl->fCharClass < 127 && fC.fEscaped == false &&   tableEl->fCharClass == fC.fChar) {
1034
                // Table row specified an individual character, not a set, and
1035
                //   the input character is not escaped, and
1036
                //   the input character matched it.
1037
0
                break;
1038
0
            }
1039
0
            if (tableEl->fCharClass == 255) {
1040
                // Table row specified default, match anything character class.
1041
0
                break;
1042
0
            }
1043
0
            if (tableEl->fCharClass == 254 && fC.fEscaped)  {
1044
                // Table row specified "escaped" and the char was escaped.
1045
0
                break;
1046
0
            }
1047
0
            if (tableEl->fCharClass == 253 && fC.fEscaped &&
1048
0
                (fC.fChar == 0x50 || fC.fChar == 0x70 ))  {
1049
                // Table row specified "escaped P" and the char is either 'p' or 'P'.
1050
0
                break;
1051
0
            }
1052
0
            if (tableEl->fCharClass == 252 && fC.fChar == (UChar32)-1)  {
1053
                // Table row specified eof and we hit eof on the input.
1054
0
                break;
1055
0
            }
1056
1057
0
            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
1058
0
                fC.fEscaped == false &&                                      //   char is not escaped &&
1059
0
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
1060
0
                U_ASSERT((tableEl->fCharClass-128) < UPRV_LENGTHOF(fRuleSets));
1061
0
                if (fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
1062
                    // Table row specified a character class, or set of characters,
1063
                    //   and the current char matches it.
1064
0
                    break;
1065
0
                }
1066
0
            }
1067
1068
            // No match on this row, advance to the next  row for this state,
1069
0
            tableEl++;
1070
0
        }
1071
0
        if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "scan")) { RBBIDebugPuts("");}
1072
1073
        //
1074
        // We've found the row of the state table that matches the current input
1075
        //   character from the rules string.
1076
        // Perform any action specified  by this row in the state table.
1077
0
        if (doParseActions((int32_t)tableEl->fAction) == false) {
1078
            // Break out of the state machine loop if the
1079
            //   the action signalled some kind of error, or
1080
            //   the action was to exit, occurs on normal end-of-rules-input.
1081
0
            break;
1082
0
        }
1083
1084
0
        if (tableEl->fPushState != 0) {
1085
0
            fStackPtr++;
1086
0
            if (fStackPtr >= kStackSize) {
1087
0
                error(U_BRK_INTERNAL_ERROR);
1088
0
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack overflow.");
1089
0
                fStackPtr--;
1090
0
            }
1091
0
            fStack[fStackPtr] = tableEl->fPushState;
1092
0
        }
1093
1094
0
        if (tableEl->fNextChar) {
1095
0
            nextChar(fC);
1096
0
        }
1097
1098
        // Get the next state from the table entry, or from the
1099
        //   state stack if the next state was specified as "pop".
1100
0
        if (tableEl->fNextState != 255) {
1101
0
            state = tableEl->fNextState;
1102
0
        } else {
1103
0
            state = fStack[fStackPtr];
1104
0
            fStackPtr--;
1105
0
            if (fStackPtr < 0) {
1106
0
                error(U_BRK_INTERNAL_ERROR);
1107
0
                RBBIDebugPuts("RBBIRuleScanner::parse() - state stack underflow.");
1108
0
                fStackPtr++;
1109
0
            }
1110
0
        }
1111
1112
0
    }
1113
1114
0
    if (U_FAILURE(*fRB->fStatus)) {
1115
0
        return;
1116
0
    }
1117
    
1118
    // If there are no forward rules set an error.
1119
    //
1120
0
    if (fRB->fForwardTree == nullptr) {
1121
0
        error(U_BRK_RULE_SYNTAX);
1122
0
        return;
1123
0
    }
1124
1125
    //
1126
    // Parsing of the input RBBI rules is complete.
1127
    // We now have a parse tree for the rule expressions
1128
    // and a list of all UnicodeSets that are referenced.
1129
    //
1130
#ifdef RBBI_DEBUG
1131
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "symbols")) {fSymbolTable->rbbiSymtablePrint();}
1132
    if (fRB->fDebugEnv && uprv_strstr(fRB->fDebugEnv, "ptree")) {
1133
        RBBIDebugPrintf("Completed Forward Rules Parse Tree...\n");
1134
        RBBINode::printTree(fRB->fForwardTree, true);
1135
        RBBIDebugPrintf("\nCompleted Reverse Rules Parse Tree...\n");
1136
        RBBINode::printTree(fRB->fReverseTree, true);
1137
        RBBIDebugPrintf("\nCompleted Safe Point Forward Rules Parse Tree...\n");
1138
        RBBINode::printTree(fRB->fSafeFwdTree, true);
1139
        RBBIDebugPrintf("\nCompleted Safe Point Reverse Rules Parse Tree...\n");
1140
        RBBINode::printTree(fRB->fSafeRevTree, true);
1141
    }
1142
#endif
1143
0
}
1144
1145
1146
//------------------------------------------------------------------------------
//
//  printNodeStack     for debugging...
//                     Prints the title, then the tree of every node on the
//                     node stack, from the top entry down to entry 1.
//
//------------------------------------------------------------------------------
#ifdef RBBI_DEBUG
void RBBIRuleScanner::printNodeStack(const char *title) {
    RBBIDebugPrintf("%s.  Dumping node stack...\n", title);
    int depth = fNodeStackPtr;
    while (depth > 0) {
        RBBINode::printTree(fNodeStack[depth], true);
        --depth;
    }
}
#endif
1158
1159
1160
1161
1162
//------------------------------------------------------------------------------
1163
//
1164
//  pushNewNode   create a new RBBINode of the specified type and push it
1165
//                onto the stack of nodes.
1166
//
1167
//------------------------------------------------------------------------------
1168
0
RBBINode  *RBBIRuleScanner::pushNewNode(RBBINode::NodeType  t) {
1169
0
    if (U_FAILURE(*fRB->fStatus)) {
1170
0
        return nullptr;
1171
0
    }
1172
0
    if (fNodeStackPtr >= kStackSize - 1) {
1173
0
        error(U_BRK_RULE_SYNTAX);
1174
0
        RBBIDebugPuts("RBBIRuleScanner::pushNewNode - stack overflow.");
1175
0
        return nullptr;
1176
0
    }
1177
0
    fNodeStackPtr++;
1178
0
    fNodeStack[fNodeStackPtr] = new RBBINode(t);
1179
0
    if (fNodeStack[fNodeStackPtr] == nullptr) {
1180
0
        *fRB->fStatus = U_MEMORY_ALLOCATION_ERROR;
1181
0
    }
1182
0
    return fNodeStack[fNodeStackPtr];
1183
0
}
1184
1185
1186
1187
//------------------------------------------------------------------------------
1188
//
1189
//  scanSet    Construct a UnicodeSet from the text at the current scan
1190
//             position.  Advance the scan position to the first character
1191
//             after the set.
1192
//
1193
//             A new RBBI setref node referring to the set is pushed onto the node
1194
//             stack.
1195
//
1196
//             The scan position is normally under the control of the state machine
1197
//             that controls rule parsing.  UnicodeSets, however, are parsed by
1198
//             the UnicodeSet constructor, not by the RBBI rule parser.
1199
//
1200
//------------------------------------------------------------------------------
1201
0
void RBBIRuleScanner::scanSet() {
1202
0
    UnicodeSet    *uset;
1203
0
    ParsePosition  pos;
1204
0
    int            startPos;
1205
0
    int            i;
1206
1207
0
    if (U_FAILURE(*fRB->fStatus)) {
1208
0
        return;
1209
0
    }
1210
1211
0
    pos.setIndex(fScanIndex);
1212
0
    startPos = fScanIndex;
1213
0
    UErrorCode localStatus = U_ZERO_ERROR;
1214
0
    uset = new UnicodeSet();
1215
0
    if (uset == nullptr) {
1216
0
        localStatus = U_MEMORY_ALLOCATION_ERROR;
1217
0
    } else {
1218
0
        uset->applyPatternIgnoreSpace(fRB->fRules, pos, fSymbolTable, localStatus);
1219
0
    }
1220
0
    if (U_FAILURE(localStatus)) {
1221
        //  TODO:  Get more accurate position of the error from UnicodeSet's return info.
1222
        //         UnicodeSet appears to not be reporting correctly at this time.
1223
        #ifdef RBBI_DEBUG
1224
            RBBIDebugPrintf("UnicodeSet parse position.ErrorIndex = %d\n", pos.getIndex());
1225
        #endif
1226
0
        error(localStatus);
1227
0
        delete uset;
1228
0
        return;
1229
0
    }
1230
1231
    // Verify that the set contains at least one code point.
1232
    //
1233
0
    U_ASSERT(uset!=nullptr);
1234
0
    if (uset->isEmpty()) {
1235
        // This set is empty.
1236
        //  Make it an error, because it almost certainly is not what the user wanted.
1237
        //  Also, avoids having to think about corner cases in the tree manipulation code
1238
        //   that occurs later on.
1239
0
        error(U_BRK_RULE_EMPTY_SET);
1240
0
        delete uset;
1241
0
        return;
1242
0
    }
1243
1244
1245
    // Advance the RBBI parse position over the UnicodeSet pattern.
1246
    //   Don't just set fScanIndex because the line/char positions maintained
1247
    //   for error reporting would be thrown off.
1248
0
    i = pos.getIndex();
1249
0
    for (;;) {
1250
0
        if (fNextIndex >= i) {
1251
0
            break;
1252
0
        }
1253
0
        nextCharLL();
1254
0
    }
1255
1256
0
    if (U_SUCCESS(*fRB->fStatus)) {
1257
0
        RBBINode         *n;
1258
1259
0
        n = pushNewNode(RBBINode::setRef);
1260
0
        if (U_FAILURE(*fRB->fStatus)) {
1261
0
            return;
1262
0
        }
1263
0
        n->fFirstPos = startPos;
1264
0
        n->fLastPos  = fNextIndex;
1265
0
        fRB->fRules.extractBetween(n->fFirstPos, n->fLastPos, n->fText);
1266
        //  findSetFor() serves several purposes here:
1267
        //     - Adopts storage for the UnicodeSet, will be responsible for deleting.
1268
        //     - Maintains collection of all sets in use, needed later for establishing
1269
        //          character categories for run time engine.
1270
        //     - Eliminates mulitiple instances of the same set.
1271
        //     - Creates a new uset node if necessary (if this isn't a duplicate.)
1272
0
        findSetFor(n->fText, n, uset);
1273
0
    }
1274
1275
0
}
1276
1277
0
// numRules    Accessor: returns the current value of fRuleNum.
int32_t RBBIRuleScanner::numRules() {
    const int32_t ruleCount = fRuleNum;
    return ruleCount;
}
1280
1281
U_NAMESPACE_END
1282
1283
#endif /* #if !UCONFIG_NO_BREAK_ITERATION */