Coverage Report

Created: 2023-03-04 07:00

/src/icu/icu4c/source/i18n/regexcmp.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  regexcmp.cpp
5
//
6
//  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the ICU regular expression compiler, which is responsible
10
//  for processing a regular expression pattern into the compiled form that
11
//  is used by the match finding engine.
12
//
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18
#include "unicode/ustring.h"
19
#include "unicode/unistr.h"
20
#include "unicode/uniset.h"
21
#include "unicode/uchar.h"
22
#include "unicode/uchriter.h"
23
#include "unicode/parsepos.h"
24
#include "unicode/parseerr.h"
25
#include "unicode/regex.h"
26
#include "unicode/utf.h"
27
#include "unicode/utf16.h"
28
#include "patternprops.h"
29
#include "putilimp.h"
30
#include "cmemory.h"
31
#include "cstr.h"
32
#include "cstring.h"
33
#include "uvectr32.h"
34
#include "uvectr64.h"
35
#include "uassert.h"
36
#include "uinvchar.h"
37
38
#include "regeximp.h"
39
#include "regexcst.h"   // Contains state table for the regex pattern parser.
40
                        //   generated by a Perl script.
41
#include "regexcmp.h"
42
#include "regexst.h"
43
#include "regextxt.h"
44
45
46
47
U_NAMESPACE_BEGIN
48
49
50
//------------------------------------------------------------------------------
51
//
52
//  Constructor.
53
//
54
//------------------------------------------------------------------------------
55
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56
   fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
57
11.5k
{
58
    // Lazy init of all shared global sets (needed for init()'s empty text)
59
11.5k
    RegexStaticSets::initGlobals(&status);
60
61
11.5k
    fStatus           = &status;
62
63
11.5k
    fRXPat            = rxp;
64
11.5k
    fScanIndex        = 0;
65
11.5k
    fLastChar         = -1;
66
11.5k
    fPeekChar         = -1;
67
11.5k
    fLineNum          = 1;
68
11.5k
    fCharNum          = 0;
69
11.5k
    fQuoteMode        = false;
70
11.5k
    fInBackslashQuote = false;
71
11.5k
    fModeFlags        = fRXPat->fFlags | 0x80000000;
72
11.5k
    fEOLComments      = true;
73
74
11.5k
    fMatchOpenParen   = -1;
75
11.5k
    fMatchCloseParen  = -1;
76
11.5k
    fCaptureName      = nullptr;
77
11.5k
    fLastSetLiteral   = U_SENTINEL;
78
79
11.5k
    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80
0
        status = rxp->fDeferredStatus;
81
0
    }
82
11.5k
}
83
84
static const char16_t   chAmp       = 0x26;      // '&'
85
static const char16_t   chDash      = 0x2d;      // '-'
86
87
88
//------------------------------------------------------------------------------
89
//
90
//  Destructor
91
//
92
//------------------------------------------------------------------------------
93
11.5k
RegexCompile::~RegexCompile() {
94
11.5k
    delete fCaptureName;         // Normally will be nullptr, but can exist if pattern
95
                                 //   compilation stops with a syntax error.
96
11.5k
}
97
98
580
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99
580
    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100
580
}
101
102
//------------------------------------------------------------------------------
103
//
104
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105
//                           The state tables are hand-written in the file regexcst.txt,
106
//                           and converted to the form used here by a perl
107
//                           script regexcst.pl
108
//
109
//------------------------------------------------------------------------------
110
void    RegexCompile::compile(
111
                         const UnicodeString &pat,   // Source pat to be compiled.
112
                         UParseError &pp,            // Error position info
113
                         UErrorCode &e)              // Error Code
114
0
{
115
0
    fRXPat->fPatternString = new UnicodeString(pat);
116
0
    UText patternText = UTEXT_INITIALIZER;
117
0
    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119
0
    if (U_SUCCESS(e)) {
120
0
        compile(&patternText, pp, e);
121
0
        utext_close(&patternText);
122
0
    }
123
0
}
124
125
//
126
//   compile, UText mode
127
//     All the work is actually done here.
128
//
129
void    RegexCompile::compile(
130
                         UText *pat,                 // Source pat to be compiled.
131
                         UParseError &pp,            // Error position info
132
                         UErrorCode &e)              // Error Code
133
11.5k
{
134
11.5k
    fStatus             = &e;
135
11.5k
    fParseErr           = &pp;
136
11.5k
    fStackPtr           = 0;
137
11.5k
    fStack[fStackPtr]   = 0;
138
139
11.5k
    if (U_FAILURE(*fStatus)) {
140
0
        return;
141
0
    }
142
143
    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144
11.5k
    U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0);
145
146
    // Prepare the RegexPattern object to receive the compiled pattern.
147
11.5k
    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, false, true, fStatus);
148
11.5k
    if (U_FAILURE(*fStatus)) {
149
0
        return;
150
0
    }
151
152
    // Initialize the pattern scanning state machine
153
11.5k
    fPatternLength = utext_nativeLength(pat);
154
11.5k
    uint16_t                state = 1;
155
11.5k
    const RegexTableEl      *tableEl;
156
157
    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158
11.5k
    if (fModeFlags & UREGEX_LITERAL) {
159
0
        fQuoteMode = true;
160
0
    }
161
162
11.5k
    nextChar(fC);                        // Fetch the first char from the pattern string.
163
164
    //
165
    // Main loop for the regex pattern parsing state machine.
166
    //   Runs once per state transition.
167
    //   Each time through optionally performs, depending on the state table,
168
    //      - an advance to the the next pattern char
169
    //      - an action to be performed.
170
    //      - pushing or popping a state to/from the local state return stack.
171
    //   file regexcst.txt is the source for the state table.  The logic behind
172
    //     recongizing the pattern syntax is there, not here.
173
    //
174
100M
    for (;;) {
175
        //  Bail out if anything has gone wrong.
176
        //  Regex pattern parsing stops on the first error encountered.
177
100M
        if (U_FAILURE(*fStatus)) {
178
211
            break;
179
211
        }
180
181
100M
        U_ASSERT(state != 0);
182
183
        // Find the state table element that matches the input char from the pattern, or the
184
        //    class of the input character.  Start with the first table row for this
185
        //    state, then linearly scan forward until we find a row that matches the
186
        //    character.  The last row for each state always matches all characters, so
187
        //    the search will stop there, if not before.
188
        //
189
100M
        tableEl = &gRuleParseStateTable[state];
190
100M
        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191
100M
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193
527M
        for (;;) {    // loop through table rows belonging to this state, looking for one
194
                      //   that matches the current input char.
195
527M
            REGEX_SCAN_DEBUG_PRINTF(("."));
196
527M
            if (tableEl->fCharClass < 127 && fC.fQuoted == false &&   tableEl->fCharClass == fC.fChar) {
197
                // Table row specified an individual character, not a set, and
198
                //   the input character is not quoted, and
199
                //   the input character matched it.
200
29.2M
                break;
201
29.2M
            }
202
497M
            if (tableEl->fCharClass == 255) {
203
                // Table row specified default, match anything character class.
204
51.1M
                break;
205
51.1M
            }
206
446M
            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207
                // Table row specified "quoted" and the char was quoted.
208
66.2k
                break;
209
66.2k
            }
210
446M
            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
211
                // Table row specified eof and we hit eof on the input.
212
8.35k
                break;
213
8.35k
            }
214
215
446M
            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216
446M
                fC.fQuoted == false &&                                       //   char is not escaped &&
217
446M
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
218
47.2M
                U_ASSERT(tableEl->fCharClass <= 137);
219
47.2M
                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220
                    // Table row specified a character class, or set of characters,
221
                    //   and the current char matches it.
222
20.1M
                    break;
223
20.1M
                }
224
47.2M
            }
225
226
            // No match on this row, advance to the next  row for this state,
227
426M
            tableEl++;
228
426M
        }
229
100M
        REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231
        //
232
        // We've found the row of the state table that matches the current input
233
        //   character from the rules string.
234
        // Perform any action specified  by this row in the state table.
235
100M
        if (doParseActions(tableEl->fAction) == false) {
236
            // Break out of the state machine loop if the
237
            //   the action signalled some kind of error, or
238
            //   the action was to exit, occurs on normal end-of-rules-input.
239
11.3k
            break;
240
11.3k
        }
241
242
100M
        if (tableEl->fPushState != 0) {
243
1.03M
            fStackPtr++;
244
1.03M
            if (fStackPtr >= kStackSize) {
245
5
                error(U_REGEX_INTERNAL_ERROR);
246
5
                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247
5
                fStackPtr--;
248
5
            }
249
1.03M
            fStack[fStackPtr] = tableEl->fPushState;
250
1.03M
        }
251
252
        //
253
        //  NextChar.  This is where characters are actually fetched from the pattern.
254
        //             Happens under control of the 'n' tag in the state table.
255
        //
256
100M
        if (tableEl->fNextChar) {
257
57.4M
            nextChar(fC);
258
57.4M
        }
259
260
        // Get the next state from the table entry, or from the
261
        //   state stack if the next state was specified as "pop".
262
100M
        if (tableEl->fNextState != 255) {
263
99.5M
            state = tableEl->fNextState;
264
99.5M
        } else {
265
1.01M
            state = fStack[fStackPtr];
266
1.01M
            fStackPtr--;
267
1.01M
            if (fStackPtr < 0) {
268
                // state stack underflow
269
                // This will occur if the user pattern has mis-matched parentheses,
270
                //   with extra close parens.
271
                //
272
0
                fStackPtr++;
273
0
                error(U_REGEX_MISMATCHED_PAREN);
274
0
            }
275
1.01M
        }
276
277
100M
    }
278
279
11.5k
    if (U_FAILURE(*fStatus)) {
280
        // Bail out if the pattern had errors.
281
6.91k
        return;
282
6.91k
    }
283
284
    //
285
    // The pattern has now been read and processed, and the compiled code generated.
286
    //
287
288
    //
289
    // The pattern's fFrameSize so far has accumulated the requirements for
290
    //   storage for capture parentheses, counters, etc. that are encountered
291
    //   in the pattern.  Add space for the two variables that are always
292
    //   present in the saved state:  the input string position (int64_t) and
293
    //   the position in the compiled pattern.
294
    //
295
4.64k
    allocateStackData(RESTACKFRAME_HDRCOUNT);
296
297
    //
298
    // Optimization pass 1: NOPs, back-references, and case-folding
299
    //
300
4.64k
    stripNOPs();
301
302
    //
303
    // Get bounds for the minimum and maximum length of a string that this
304
    //   pattern can match.  Used to avoid looking for matches in strings that
305
    //   are too short.
306
    //
307
4.64k
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308
309
    //
310
    // Optimization pass 2: match start type
311
    //
312
4.64k
    matchStartType();
313
314
    //
315
    // Set up fast latin-1 range sets
316
    //
317
4.64k
    int32_t numSets = fRXPat->fSets->size();
318
4.64k
    fRXPat->fSets8 = new Regex8BitSet[numSets];
319
    // Null pointer check.
320
4.64k
    if (fRXPat->fSets8 == nullptr) {
321
0
        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322
0
        return;
323
0
    }
324
4.64k
    int32_t i;
325
403k
    for (i=0; i<numSets; i++) {
326
398k
        UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
327
398k
        fRXPat->fSets8[i].init(s);
328
398k
    }
329
330
4.64k
}
331
332
333
334
335
336
//------------------------------------------------------------------------------
337
//
338
//  doParseAction        Do some action during regex pattern parsing.
339
//                       Called by the parse state machine.
340
//
341
//                       Generation of the match engine PCode happens here, or
342
//                       in functions called from the parse actions defined here.
343
//
344
//
345
//------------------------------------------------------------------------------
346
UBool RegexCompile::doParseActions(int32_t action)
347
100M
{
348
100M
    UBool   returnVal = true;
349
350
100M
    switch ((Regex_PatternParseAction)action) {
351
352
11.4k
    case doPatStart:
353
        // Start of pattern compiles to:
354
        //0   SAVE   2        Fall back to position of FAIL
355
        //1   jmp    3
356
        //2   FAIL            Stop if we ever reach here.
357
        //3   NOP             Dummy, so start of pattern looks the same as
358
        //                    the start of an ( grouping.
359
        //4   NOP             Resreved, will be replaced by a save if there are
360
        //                    OR | operators at the top level
361
11.4k
        appendOp(URX_STATE_SAVE, 2);
362
11.4k
        appendOp(URX_JMP,  3);
363
11.4k
        appendOp(URX_FAIL, 0);
364
365
        // Standard open nonCapture paren action emits the two NOPs and
366
        //   sets up the paren stack frame.
367
11.4k
        doParseActions(doOpenNonCaptureParen);
368
11.4k
        break;
369
370
6.75k
    case doPatFinish:
371
        // We've scanned to the end of the pattern
372
        //  The end of pattern compiles to:
373
        //        URX_END
374
        //    which will stop the runtime match engine.
375
        //  Encountering end of pattern also behaves like a close paren,
376
        //   and forces fixups of the State Save at the beginning of the compiled pattern
377
        //   and of any OR operations at the top level.
378
        //
379
6.75k
        handleCloseParen();
380
6.75k
        if (fParenStack.size() > 0) {
381
            // Missing close paren in pattern.
382
2.11k
            error(U_REGEX_MISMATCHED_PAREN);
383
2.11k
        }
384
385
        // add the END operation to the compiled pattern.
386
6.75k
        appendOp(URX_END, 0);
387
388
        // Terminate the pattern compilation state machine.
389
6.75k
        returnVal = false;
390
6.75k
        break;
391
392
393
394
26.2M
    case doOrOperator:
395
        // Scanning a '|', as in (A|B)
396
26.2M
        {
397
            // Generate code for any pending literals preceding the '|'
398
26.2M
            fixLiterals(false);
399
400
            // Insert a SAVE operation at the start of the pattern section preceding
401
            //   this OR at this level.  This SAVE will branch the match forward
402
            //   to the right hand side of the OR in the event that the left hand
403
            //   side fails to match and backtracks.  Locate the position for the
404
            //   save from the location on the top of the parentheses stack.
405
26.2M
            int32_t savePosition = fParenStack.popi();
406
26.2M
            int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
407
26.2M
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
408
26.2M
            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409
26.2M
            fRXPat->fCompiledPat->setElementAt(op, savePosition);
410
411
            // Append an JMP operation into the compiled pattern.  The operand for
412
            //  the JMP will eventually be the location following the ')' for the
413
            //  group.  This will be patched in later, when the ')' is encountered.
414
26.2M
            appendOp(URX_JMP, 0);
415
416
            // Push the position of the newly added JMP op onto the parentheses stack.
417
            // This registers if for fixup when this block's close paren is encountered.
418
26.2M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419
420
            // Append a NOP to the compiled pattern.  This is the slot reserved
421
            //   for a SAVE in the event that there is yet another '|' following
422
            //   this one.
423
26.2M
            appendOp(URX_NOP, 0);
424
26.2M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425
26.2M
        }
426
26.2M
        break;
427
428
429
595
    case doBeginNamedCapture:
430
        // Scanning (?<letter.
431
        //   The first letter of the name will come through again under doConinueNamedCapture.
432
595
        fCaptureName = new UnicodeString();
433
595
        if (fCaptureName == nullptr) {
434
0
            error(U_MEMORY_ALLOCATION_ERROR);
435
0
        }
436
595
        break;
437
438
2.17k
    case  doContinueNamedCapture:
439
2.17k
        fCaptureName->append(fC.fChar);
440
2.17k
        break;
441
442
34
    case doBadNamedCapture:
443
34
        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444
34
        break;
445
        
446
307k
    case doOpenCaptureParen:
447
        // Open Capturing Paren, possibly named.
448
        //   Compile to a
449
        //      - NOP, which later may be replaced by a save-state if the
450
        //         parenthesized group gets a * quantifier, followed by
451
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
452
        //      - NOP, which may later be replaced by a save-state if there
453
        //             is an '|' alternation within the parens.
454
        //
455
        //    Each capture group gets three slots in the save stack frame:
456
        //         0: Capture Group start position (in input string being matched.)
457
        //         1: Capture Group end position.
458
        //         2: Start of Match-in-progress.
459
        //    The first two locations are for a completed capture group, and are
460
        //     referred to by back references and the like.
461
        //    The third location stores the capture start position when an START_CAPTURE is
462
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
463
        //      END_CAPTURE is encountered.
464
307k
        {
465
307k
            fixLiterals();
466
307k
            appendOp(URX_NOP, 0);
467
307k
            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
468
307k
            appendOp(URX_START_CAPTURE, varsLoc);
469
307k
            appendOp(URX_NOP, 0);
470
471
            // On the Parentheses stack, start a new frame and add the positions
472
            //   of the two NOPs.  Depending on what follows in the pattern, the
473
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
474
            //   address of the end of the parenthesized group.
475
307k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
476
307k
            fParenStack.push(capturing, *fStatus);                        // Frame type.
477
307k
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
478
307k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
479
480
            // Save the mapping from group number to stack frame variable position.
481
307k
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482
483
            // If this is a named capture group, add the name->group number mapping.
484
307k
            if (fCaptureName != nullptr) {
485
582
                if (!fRXPat->initNamedCaptureMap()) {
486
0
                    if (U_SUCCESS(*fStatus)) {
487
0
                        error(fRXPat->fDeferredStatus);
488
0
                    }
489
0
                    break;
490
0
                }
491
582
                int32_t groupNumber = fRXPat->fGroupMap->size();
492
582
                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493
582
                fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
494
582
                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495
30
                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496
30
                }
497
582
            }
498
307k
        }
499
307k
        break;
500
501
307k
    case doOpenNonCaptureParen:
502
        // Open non-caputuring (grouping only) Paren.
503
        //   Compile to a
504
        //      - NOP, which later may be replaced by a save-state if the
505
        //         parenthesized group gets a * quantifier, followed by
506
        //      - NOP, which may later be replaced by a save-state if there
507
        //             is an '|' alternation within the parens.
508
12.1k
        {
509
12.1k
            fixLiterals();
510
12.1k
            appendOp(URX_NOP, 0);
511
12.1k
            appendOp(URX_NOP, 0);
512
513
            // On the Parentheses stack, start a new frame and add the positions
514
            //   of the two NOPs.
515
12.1k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
516
12.1k
            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
517
12.1k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
518
12.1k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
519
12.1k
        }
520
12.1k
         break;
521
522
523
614
    case doOpenAtomicParen:
524
        // Open Atomic Paren.  (?>
525
        //   Compile to a
526
        //      - NOP, which later may be replaced if the parenthesized group
527
        //         has a quantifier, followed by
528
        //      - STO_SP  save state stack position, so it can be restored at the ")"
529
        //      - NOP, which may later be replaced by a save-state if there
530
        //             is an '|' alternation within the parens.
531
614
        {
532
614
            fixLiterals();
533
614
            appendOp(URX_NOP, 0);
534
614
            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
535
614
            appendOp(URX_STO_SP, varLoc);
536
614
            appendOp(URX_NOP, 0);
537
538
            // On the Parentheses stack, start a new frame and add the positions
539
            //   of the two NOPs.  Depending on what follows in the pattern, the
540
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
541
            //   address of the end of the parenthesized group.
542
614
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
543
614
            fParenStack.push(atomic, *fStatus);                           // Frame type.
544
614
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
545
614
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
546
614
        }
547
614
        break;
548
549
550
60.1k
    case doOpenLookAhead:
551
        // Positive Look-ahead   (?=  stuff  )
552
        //
553
        //   Note:   Addition of transparent input regions, with the need to
554
        //           restore the original regions when failing out of a lookahead
555
        //           block, complicated this sequence.  Some combined opcodes
556
        //           might make sense - or might not, lookahead aren't that common.
557
        //
558
        //      Caution:  min match length optimization knows about this
559
        //               sequence; don't change without making updates there too.
560
        //
561
        // Compiles to
562
        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
563
        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
564
        //    3    JMP          6           continue ...
565
        //
566
        //    4.   LA_END                   Look Ahead failed.  Restore regions.
567
        //    5.   BACKTRACK                and back track again.
568
        //
569
        //    6.   NOP              reserved for use by quantifiers on the block.
570
        //                          Look-ahead can't have quantifiers, but paren stack
571
        //                             compile time conventions require the slot anyhow.
572
        //    7.   NOP              may be replaced if there is are '|' ops in the block.
573
        //    8.     code for parenthesized stuff.
574
        //    9.   LA_END
575
        //
576
        //  Four data slots are reserved, for saving state on entry to the look-around
577
        //    0:   stack pointer on entry.
578
        //    1:   input position on entry.
579
        //    2:   fActiveStart, the active bounds start on entry.
580
        //    3:   fActiveLimit, the active bounds limit on entry.
581
60.1k
        {
582
60.1k
            fixLiterals();
583
60.1k
            int32_t dataLoc = allocateData(4);
584
60.1k
            appendOp(URX_LA_START, dataLoc);
585
60.1k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586
60.1k
            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587
60.1k
            appendOp(URX_LA_END, dataLoc);
588
60.1k
            appendOp(URX_BACKTRACK, 0);
589
60.1k
            appendOp(URX_NOP, 0);
590
60.1k
            appendOp(URX_NOP, 0);
591
592
            // On the Parentheses stack, start a new frame and add the positions
593
            //   of the NOPs.
594
60.1k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
595
60.1k
            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
596
60.1k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
597
60.1k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
598
60.1k
        }
599
60.1k
        break;
600
601
919
    case doOpenLookAheadNeg:
602
        // Negated Lookahead.   (?! stuff )
603
        // Compiles to
604
        //    1.    LA_START    dataloc
605
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
606
        //                                //   which continues with the match.
607
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
608
        //    4.       code for parenthesized stuff.
609
        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
610
        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
611
        //    7.    END_LA                // Restore match region, in case look-ahead was using
612
        //                                        an alternate (transparent) region.
613
        //  Four data slots are reserved, for saving state on entry to the look-around
614
        //    0:   stack pointer on entry.
615
        //    1:   input position on entry.
616
        //    2:   fActiveStart, the active bounds start on entry.
617
        //    3:   fActiveLimit, the active bounds limit on entry.
618
919
        {
619
919
            fixLiterals();
620
919
            int32_t dataLoc = allocateData(4);
621
919
            appendOp(URX_LA_START, dataLoc);
622
919
            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
623
919
            appendOp(URX_NOP, 0);
624
625
            // On the Parentheses stack, start a new frame and add the positions
626
            //   of the StateSave and NOP.
627
919
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
628
919
            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
629
919
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
630
919
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
631
632
            // Instructions #5 - #7 will be added when the ')' is encountered.
633
919
        }
634
919
        break;
635
636
1.68k
    case doOpenLookBehind:
637
1.68k
        {
638
            //   Compile a (?<= look-behind open paren.
639
            //
640
            //          Compiles to
641
            //              0       URX_LB_START     dataLoc
642
            //              1       URX_LB_CONT      dataLoc
643
            //              2                        MinMatchLen
644
            //              3                        MaxMatchLen
645
            //              4       URX_NOP          Standard '(' boilerplate.
646
            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
647
            //              6         <code for LookBehind expression>
648
            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
649
            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
650
            //
651
            //          Allocate a block of matcher data, to contain (when running a match)
652
            //              0:    Stack ptr on entry
653
            //              1:    Input Index on entry
654
            //              2:    fActiveStart, the active bounds start on entry.
655
            //              3:    fActiveLimit, the active bounds limit on entry.
656
            //              4:    Start index of match current match attempt.
657
            //          The first four items must match the layout of data for LA_START / LA_END
658
659
            // Generate match code for any pending literals.
660
1.68k
            fixLiterals();
661
662
            // Allocate data space
663
1.68k
            int32_t dataLoc = allocateData(5);
664
665
            // Emit URX_LB_START
666
1.68k
            appendOp(URX_LB_START, dataLoc);
667
668
            // Emit URX_LB_CONT
669
1.68k
            appendOp(URX_LB_CONT, dataLoc);
670
1.68k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
671
1.68k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
672
673
            // Emit the NOPs
674
1.68k
            appendOp(URX_NOP, 0);
675
1.68k
            appendOp(URX_NOP, 0);
676
677
            // On the Parentheses stack, start a new frame and add the positions
678
            //   of the URX_LB_CONT and the NOP.
679
1.68k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
680
1.68k
            fParenStack.push(lookBehind, *fStatus);                       // Frame type
681
1.68k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
682
1.68k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
683
684
            // The final two instructions will be added when the ')' is encountered.
685
1.68k
        }
686
687
1.68k
        break;
688
689
1.82k
    case doOpenLookBehindNeg:
690
1.82k
        {
691
            //   Compile a (?<! negated look-behind open paren.
692
            //
693
            //          Compiles to
694
            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
695
            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
696
            //              2                        MinMatchLen
697
            //              3                        MaxMatchLen
698
            //              4                        continueLoc (9)
699
            //              5       URX_NOP          Standard '(' boilerplate.
700
            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
701
            //              7         <code for LookBehind expression>
702
            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
703
            //              9       ...
704
            //
705
            //          Allocate a block of matcher data, to contain (when running a match)
706
            //              0:    Stack ptr on entry
707
            //              1:    Input Index on entry
708
            //              2:    fActiveStart, the active bounds start on entry.
709
            //              3:    fActiveLimit, the active bounds limit on entry.
710
            //              4:    Start index of match current match attempt.
711
            //          The first four items must match the layout of data for LA_START / LA_END
712
713
            // Generate match code for any pending literals.
714
1.82k
            fixLiterals();
715
716
            // Allocate data space
717
1.82k
            int32_t dataLoc = allocateData(5);
718
719
            // Emit URX_LB_START
720
1.82k
            appendOp(URX_LB_START, dataLoc);
721
722
            // Emit URX_LBN_CONT
723
1.82k
            appendOp(URX_LBN_CONT, dataLoc);
724
1.82k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
725
1.82k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
726
1.82k
            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
727
728
            // Emit the NOPs
729
1.82k
            appendOp(URX_NOP, 0);
730
1.82k
            appendOp(URX_NOP, 0);
731
732
            // On the Parentheses stack, start a new frame and add the positions
733
            //   of the URX_LB_CONT and the NOP.
734
1.82k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
735
1.82k
            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
736
1.82k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
737
1.82k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
738
739
            // The final two instructions will be added when the ')' is encountered.
740
1.82k
        }
741
1.82k
        break;
742
743
1
    case doConditionalExpr:
744
        // Conditionals such as (?(1)a:b)
745
2
    case doPerlInline:
746
        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
747
2
        error(U_REGEX_UNIMPLEMENTED);
748
2
        break;
749
750
751
371k
    case doCloseParen:
752
371k
        handleCloseParen();
753
371k
        if (fParenStack.size() <= 0) {
754
            //  Extra close paren, or missing open paren.
755
40
            error(U_REGEX_MISMATCHED_PAREN);
756
40
        }
757
371k
        break;
758
759
42.7M
    case doNOP:
760
42.7M
        break;
761
762
763
22
    case doBadOpenParenType:
764
74
    case doRuleError:
765
74
        error(U_REGEX_RULE_SYNTAX);
766
74
        break;
767
768
769
8
    case doMismatchedParenErr:
770
8
        error(U_REGEX_MISMATCHED_PAREN);
771
8
        break;
772
773
12.1k
    case doPlus:
774
        //  Normal '+'  compiles to
775
        //     1.   stuff to be repeated  (already built)
776
        //     2.   jmp-sav 1
777
        //     3.   ...
778
        //
779
        //  Or, if the item to be repeated can match a zero length string,
780
        //     1.   STO_INP_LOC  data-loc
781
        //     2.      body of stuff to be repeated
782
        //     3.   JMP_SAV_X    2
783
        //     4.   ...
784
785
        //
786
        //  Or, if the item to be repeated is simple
787
        //     1.   Item to be repeated.
788
        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
789
        //     3.   LOOP_C       stack location
790
12.1k
        {
791
12.1k
            int32_t  topLoc = blockTopLoc(false);        // location of item #1
792
12.1k
            int32_t  frameLoc;
793
794
            // Check for simple constructs, which may get special optimized code.
795
12.1k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796
11.4k
                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
797
798
11.4k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
799
                    // Emit optimized code for [char set]+
800
2.49k
                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
801
2.49k
                    frameLoc = allocateStackData(1);
802
2.49k
                    appendOp(URX_LOOP_C, frameLoc);
803
2.49k
                    break;
804
2.49k
                }
805
806
8.96k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
807
8.96k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
808
8.96k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
809
                    // Emit Optimized code for .+ operations.
810
2.92k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811
2.92k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
812
                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813
1.37k
                        loopOpI |= 1;
814
1.37k
                    }
815
2.92k
                    if (fModeFlags & UREGEX_UNIX_LINES) {
816
303
                        loopOpI |= 2;
817
303
                    }
818
2.92k
                    appendOp(loopOpI);
819
2.92k
                    frameLoc = allocateStackData(1);
820
2.92k
                    appendOp(URX_LOOP_C, frameLoc);
821
2.92k
                    break;
822
2.92k
                }
823
824
8.96k
            }
825
826
            // General case.
827
828
            // Check for minimum match length of zero, which requires
829
            //    extra loop-breaking code.
830
6.77k
            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831
                // Zero length match is possible.
832
                // Emit the code sequence that can handle it.
833
1.01k
                insertOp(topLoc);
834
1.01k
                frameLoc = allocateStackData(1);
835
836
1.01k
                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837
1.01k
                fRXPat->fCompiledPat->setElementAt(op, topLoc);
838
839
1.01k
                appendOp(URX_JMP_SAV_X, topLoc+1);
840
5.76k
            } else {
841
                // Simpler code when the repeated body must match something non-empty
842
5.76k
                appendOp(URX_JMP_SAV, topLoc);
843
5.76k
            }
844
6.77k
        }
845
0
        break;
846
847
1.25k
    case doNGPlus:
848
        //  Non-greedy '+?'  compiles to
849
        //     1.   stuff to be repeated  (already built)
850
        //     2.   state-save  1
851
        //     3.   ...
852
1.25k
        {
853
1.25k
            int32_t topLoc      = blockTopLoc(false);
854
1.25k
            appendOp(URX_STATE_SAVE, topLoc);
855
1.25k
        }
856
1.25k
        break;
857
858
859
4.73k
    case doOpt:
860
        // Normal (greedy) ? quantifier.
861
        //  Compiles to
862
        //     1. state save 3
863
        //     2.    body of optional block
864
        //     3. ...
865
        // Insert the state save into the compiled pattern, and we're done.
866
4.73k
        {
867
4.73k
            int32_t   saveStateLoc = blockTopLoc(true);
868
4.73k
            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869
4.73k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870
4.73k
        }
871
4.73k
        break;
872
873
780
    case doNGOpt:
874
        // Non-greedy ?? quantifier
875
        //   compiles to
876
        //    1.  jmp   4
877
        //    2.     body of optional block
878
        //    3   jmp   5
879
        //    4.  state save 2
880
        //    5    ...
881
        //  This code is less than ideal, with two jmps instead of one, because we can only
882
        //  insert one instruction at the top of the block being iterated.
883
780
        {
884
780
            int32_t  jmp1_loc = blockTopLoc(true);
885
780
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
886
887
780
            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
888
780
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889
890
780
            appendOp(URX_JMP, jmp2_loc+2);
891
892
780
            appendOp(URX_STATE_SAVE, jmp1_loc+1);
893
780
        }
894
780
        break;
895
896
897
171k
    case doStar:
898
        // Normal (greedy) * quantifier.
899
        // Compiles to
900
        //       1.   STATE_SAVE   4
901
        //       2.      body of stuff being iterated over
902
        //       3.   JMP_SAV      2
903
        //       4.   ...
904
        //
905
        // Or, if the body is a simple [Set],
906
        //       1.   LOOP_SR_I    set number
907
        //       2.   LOOP_C       stack location
908
        //       ...
909
        //
910
        // Or if this is a .*
911
        //       1.   LOOP_DOT_I    (. matches all mode flag)
912
        //       2.   LOOP_C        stack location
913
        //
914
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
915
        //       1.   STATE_SAVE   5
916
        //       2.   STO_INP_LOC  data-loc
917
        //       3.      body of stuff
918
        //       4.   JMP_SAV_X    2
919
        //       5.   ...
920
171k
        {
921
            // location of item #1, the STATE_SAVE
922
171k
            int32_t   topLoc = blockTopLoc(false);
923
171k
            int32_t   dataLoc = -1;
924
925
            // Check for simple *, where the construct being repeated
926
            //   compiled to single opcode, and might be optimizable.
927
171k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928
21.2k
                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
929
930
21.2k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
931
                    // Emit optimized code for a [char set]*
932
355
                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
933
355
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934
355
                    dataLoc = allocateStackData(1);
935
355
                    appendOp(URX_LOOP_C, dataLoc);
936
355
                    break;
937
355
                }
938
939
20.9k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
940
20.9k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
941
20.9k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
942
                    // Emit Optimized code for .* operations.
943
1.38k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944
1.38k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
945
                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946
348
                        loopOpI |= 1;
947
348
                    }
948
1.38k
                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949
214
                        loopOpI |= 2;
950
214
                    }
951
1.38k
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952
1.38k
                    dataLoc = allocateStackData(1);
953
1.38k
                    appendOp(URX_LOOP_C, dataLoc);
954
1.38k
                    break;
955
1.38k
                }
956
20.9k
            }
957
958
            // Emit general case code for this *
959
            // The optimizations did not apply.
960
961
169k
            int32_t   saveStateLoc = blockTopLoc(true);
962
169k
            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
963
964
            // Check for minimum match length of zero, which requires
965
            //    extra loop-breaking code.
966
169k
            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967
1.99k
                insertOp(saveStateLoc);
968
1.99k
                dataLoc = allocateStackData(1);
969
970
1.99k
                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971
1.99k
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972
1.99k
                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973
1.99k
            }
974
975
            // Locate the position in the compiled pattern where the match will continue
976
            //   after completing the *.   (4 or 5 in the comment above)
977
169k
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978
979
            // Put together the save state op and store it into the compiled code.
980
169k
            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981
169k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982
983
            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984
169k
            appendOp(jmpOp);
985
169k
        }
986
0
        break;
987
988
221
    case doNGStar:
989
        // Non-greedy *? quantifier
990
        // compiles to
991
        //     1.   JMP    3
992
        //     2.      body of stuff being iterated over
993
        //     3.   STATE_SAVE  2
994
        //     4    ...
995
221
        {
996
221
            int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
997
221
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
998
221
            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
999
221
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000
221
            appendOp(URX_STATE_SAVE, jmpLoc+1);
1001
221
        }
1002
221
        break;
1003
1004
1005
110k
    case doIntervalInit:
1006
        // The '{' opening an interval quantifier was just scanned.
1007
        // Init the counter variables that will accumulate the values as the digits
1008
        //    are scanned.
1009
110k
        fIntervalLow = 0;
1010
110k
        fIntervalUpper = -1;
1011
110k
        break;
1012
1013
114k
    case doIntevalLowerDigit:
1014
        // Scanned a digit from the lower value of an {lower,upper} interval
1015
114k
        {
1016
114k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1017
114k
            U_ASSERT(digitValue >= 0);
1018
114k
            int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1019
114k
            if (val > INT32_MAX) {
1020
1
                error(U_REGEX_NUMBER_TOO_BIG);
1021
114k
            } else {
1022
114k
                fIntervalLow = (int32_t)val;
1023
114k
            }
1024
114k
        }
1025
114k
        break;
1026
1027
97.4k
    case doIntervalUpperDigit:
1028
        // Scanned a digit from the upper value of an {lower,upper} interval
1029
97.4k
        {
1030
97.4k
            if (fIntervalUpper < 0) {
1031
96.7k
                fIntervalUpper = 0;
1032
96.7k
            }
1033
97.4k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1034
97.4k
            U_ASSERT(digitValue >= 0);
1035
97.4k
            int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1036
97.4k
            if (val > INT32_MAX) {
1037
8
                error(U_REGEX_NUMBER_TOO_BIG);
1038
97.4k
            } else {
1039
97.4k
                fIntervalUpper = (int32_t)val;
1040
97.4k
            }
1041
97.4k
        }
1042
97.4k
        break;
1043
1044
12.3k
    case doIntervalSame:
1045
        // Scanned a single value interval like {27}.  Upper = Lower.
1046
12.3k
        fIntervalUpper = fIntervalLow;
1047
12.3k
        break;
1048
1049
104k
    case doInterval:
1050
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1051
104k
        if (compileInlineInterval() == false) {
1052
3.71k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053
3.71k
        }
1054
104k
        break;
1055
1056
2.72k
    case doPossessiveInterval:
1057
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1058
2.72k
        {
1059
            // Remember the loc for the top of the block being looped over.
1060
            //   (Can not reserve a slot in the compiled pattern at this time, because
1061
            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1062
            //    once per block.)
1063
2.72k
            int32_t topLoc = blockTopLoc(false);
1064
1065
            // Produce normal looping code.
1066
2.72k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067
1068
            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069
            //  just as if the loop was inclosed in atomic parentheses.
1070
1071
            // First the STO_SP before the start of the loop
1072
2.72k
            insertOp(topLoc);
1073
1074
2.72k
            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1075
2.72k
            int32_t  op     = buildOp(URX_STO_SP, varLoc);
1076
2.72k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077
1078
2.72k
            int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1079
2.72k
            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1080
2.72k
            loopOp++;     // point LoopOp after the just-inserted STO_SP
1081
2.72k
            fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082
1083
            // Then the LD_SP after the end of the loop
1084
2.72k
            appendOp(URX_LD_SP, varLoc);
1085
2.72k
        }
1086
1087
2.72k
        break;
1088
1089
2.44k
    case doNGInterval:
1090
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1091
2.44k
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092
2.44k
        break;
1093
1094
95
    case doIntervalError:
1095
95
        error(U_REGEX_BAD_INTERVAL);
1096
95
        break;
1097
1098
19.9M
    case doLiteralChar:
1099
        // We've just scanned a "normal" character from the pattern,
1100
19.9M
        literalChar(fC.fChar);
1101
19.9M
        break;
1102
1103
1104
4.30k
    case doEscapedLiteralChar:
1105
        // We've just scanned an backslashed escaped character with  no
1106
        //   special meaning.  It represents itself.
1107
4.30k
        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108
4.30k
            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1109
0
            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1110
0
               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111
0
             }
1112
4.30k
        literalChar(fC.fChar);
1113
4.30k
        break;
1114
1115
1116
9.80k
    case doDotAny:
1117
        // scanned a ".",  match any single character.
1118
9.80k
        {
1119
9.80k
            fixLiterals(false);
1120
9.80k
            if (fModeFlags & UREGEX_DOTALL) {
1121
2.54k
                appendOp(URX_DOTANY_ALL, 0);
1122
7.26k
            } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123
866
                appendOp(URX_DOTANY_UNIX, 0);
1124
6.39k
            } else {
1125
6.39k
                appendOp(URX_DOTANY, 0);
1126
6.39k
            }
1127
9.80k
        }
1128
9.80k
        break;
1129
1130
5.42k
    case doCaret:
1131
5.42k
        {
1132
5.42k
            fixLiterals(false);
1133
5.42k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134
4.23k
                appendOp(URX_CARET, 0);
1135
4.23k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136
584
                appendOp(URX_CARET_M, 0);
1137
598
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138
194
                appendOp(URX_CARET, 0);   // Only testing true start of input.
1139
404
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140
404
                appendOp(URX_CARET_M_UNIX, 0);
1141
404
            }
1142
5.42k
        }
1143
5.42k
        break;
1144
1145
2.02k
    case doDollar:
1146
2.02k
        {
1147
2.02k
            fixLiterals(false);
1148
2.02k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149
1.16k
                appendOp(URX_DOLLAR, 0);
1150
1.16k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151
269
                appendOp(URX_DOLLAR_M, 0);
1152
593
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153
345
                appendOp(URX_DOLLAR_D, 0);
1154
345
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155
248
                appendOp(URX_DOLLAR_MD, 0);
1156
248
            }
1157
2.02k
        }
1158
2.02k
        break;
1159
1160
194
    case doBackslashA:
1161
194
        fixLiterals(false);
1162
194
        appendOp(URX_CARET, 0);
1163
194
        break;
1164
1165
197
    case doBackslashB:
1166
197
        {
1167
            #if  UCONFIG_NO_BREAK_ITERATION==1
1168
            if (fModeFlags & UREGEX_UWORD) {
1169
                error(U_UNSUPPORTED_ERROR);
1170
            }
1171
            #endif
1172
197
            fixLiterals(false);
1173
197
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174
197
            appendOp(op, 1);
1175
197
        }
1176
197
        break;
1177
1178
797
    case doBackslashb:
1179
797
        {
1180
            #if  UCONFIG_NO_BREAK_ITERATION==1
1181
            if (fModeFlags & UREGEX_UWORD) {
1182
                error(U_UNSUPPORTED_ERROR);
1183
            }
1184
            #endif
1185
797
            fixLiterals(false);
1186
797
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187
797
            appendOp(op, 0);
1188
797
        }
1189
797
        break;
1190
1191
784
    case doBackslashD:
1192
784
        fixLiterals(false);
1193
784
        appendOp(URX_BACKSLASH_D, 1);
1194
784
        break;
1195
1196
549
    case doBackslashd:
1197
549
        fixLiterals(false);
1198
549
        appendOp(URX_BACKSLASH_D, 0);
1199
549
        break;
1200
1201
792
    case doBackslashG:
1202
792
        fixLiterals(false);
1203
792
        appendOp(URX_BACKSLASH_G, 0);
1204
792
        break;
1205
1206
1.16k
    case doBackslashH:
1207
1.16k
        fixLiterals(false);
1208
1.16k
        appendOp(URX_BACKSLASH_H, 1);
1209
1.16k
        break;
1210
1211
667
    case doBackslashh:
1212
667
        fixLiterals(false);
1213
667
        appendOp(URX_BACKSLASH_H, 0);
1214
667
        break;
1215
1216
473
    case doBackslashR:
1217
473
        fixLiterals(false);
1218
473
        appendOp(URX_BACKSLASH_R, 0);
1219
473
        break;
1220
1221
692
    case doBackslashS:
1222
692
        fixLiterals(false);
1223
692
        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224
692
        break;
1225
1226
1.71k
    case doBackslashs:
1227
1.71k
        fixLiterals(false);
1228
1.71k
        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229
1.71k
        break;
1230
1231
1.00k
    case doBackslashV:
1232
1.00k
        fixLiterals(false);
1233
1.00k
        appendOp(URX_BACKSLASH_V, 1);
1234
1.00k
        break;
1235
1236
1.02k
    case doBackslashv:
1237
1.02k
        fixLiterals(false);
1238
1.02k
        appendOp(URX_BACKSLASH_V, 0);
1239
1.02k
        break;
1240
1241
806
    case doBackslashW:
1242
806
        fixLiterals(false);
1243
806
        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244
806
        break;
1245
1246
383
    case doBackslashw:
1247
383
        fixLiterals(false);
1248
383
        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249
383
        break;
1250
1251
1.09k
    case doBackslashX:
1252
        #if  UCONFIG_NO_BREAK_ITERATION==1
1253
        // Grapheme Cluster Boundary requires ICU break iteration.
1254
        error(U_UNSUPPORTED_ERROR);
1255
        #endif
1256
1.09k
        fixLiterals(false);
1257
1.09k
        appendOp(URX_BACKSLASH_X, 0);
1258
1.09k
        break;
1259
1260
196
    case doBackslashZ:
1261
196
        fixLiterals(false);
1262
196
        appendOp(URX_DOLLAR, 0);
1263
196
        break;
1264
1265
411
    case doBackslashz:
1266
411
        fixLiterals(false);
1267
411
        appendOp(URX_BACKSLASH_Z, 0);
1268
411
        break;
1269
1270
25
    case doEscapeError:
1271
25
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272
25
        break;
1273
1274
0
    case doExit:
1275
0
        fixLiterals(false);
1276
0
        returnVal = false;
1277
0
        break;
1278
1279
868
    case doProperty:
1280
868
        {
1281
868
            fixLiterals(false);
1282
868
            UnicodeSet *theSet = scanProp();
1283
868
            compileSet(theSet);
1284
868
        }
1285
868
        break;
1286
1287
1.70k
    case doNamedChar:
1288
1.70k
        {
1289
1.70k
            UChar32 c = scanNamedChar();
1290
1.70k
            literalChar(c);
1291
1.70k
        }
1292
1.70k
        break;
1293
1294
1295
5.90k
    case doBackRef:
1296
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
1297
        //                 the regular expression, because the number of digits to be consumed
1298
        //                 depends on the number of capture groups that have been defined.  So
1299
        //                 we have to do it here instead.
1300
5.90k
        {
1301
5.90k
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1302
5.90k
            int32_t  groupNum = 0;
1303
5.90k
            UChar32  c        = fC.fChar;
1304
1305
6.19k
            for (;;) {
1306
                // Loop once per digit, for max allowed number of digits in a back reference.
1307
6.19k
                int32_t digit = u_charDigitValue(c);
1308
6.19k
                groupNum = groupNum * 10 + digit;
1309
6.19k
                if (groupNum >= numCaptureGroups) {
1310
1.64k
                    break;
1311
1.64k
                }
1312
4.55k
                c = peekCharLL();
1313
4.55k
                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
1314
4.25k
                    break;
1315
4.25k
                }
1316
293
                nextCharLL();
1317
293
            }
1318
1319
            // Scan of the back reference in the source regexp is complete.  Now generate
1320
            //  the compiled code for it.
1321
            // Because capture groups can be forward-referenced by back-references,
1322
            //  we fill the operand with the capture group number.  At the end
1323
            //  of compilation, it will be changed to the variable's location.
1324
5.90k
            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1325
                                     //    and shouldn't enter this code path at all.
1326
5.90k
            fixLiterals(false);
1327
5.90k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328
2.42k
                appendOp(URX_BACKREF_I, groupNum);
1329
3.48k
            } else {
1330
3.48k
                appendOp(URX_BACKREF, groupNum);
1331
3.48k
            }
1332
5.90k
        }
1333
5.90k
        break;
1334
1335
900
    case doBeginNamedBackRef:
1336
900
        U_ASSERT(fCaptureName == nullptr);
1337
900
        fCaptureName = new UnicodeString;
1338
900
        if (fCaptureName == nullptr) {
1339
0
            error(U_MEMORY_ALLOCATION_ERROR);
1340
0
        }
1341
900
        break;
1342
            
1343
3.16k
    case doContinueNamedBackRef:
1344
3.16k
        fCaptureName->append(fC.fChar);
1345
3.16k
        break;
1346
1347
884
    case doCompleteNamedBackRef:
1348
884
        {
1349
884
        int32_t groupNumber =
1350
884
            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351
884
        if (groupNumber == 0) {
1352
            // Group name has not been defined.
1353
            //   Could be a forward reference. If we choose to support them at some
1354
            //   future time, extra mechanism will be required at this point.
1355
42
            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356
842
        } else {
1357
            // Given the number, handle identically to a \n numbered back reference.
1358
            // See comments above, under doBackRef
1359
842
            fixLiterals(false);
1360
842
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361
197
                appendOp(URX_BACKREF_I, groupNumber);
1362
645
            } else {
1363
645
                appendOp(URX_BACKREF, groupNumber);
1364
645
            }
1365
842
        }
1366
884
        delete fCaptureName;
1367
884
        fCaptureName = nullptr;
1368
884
        break;
1369
171k
        }
1370
       
1371
2.19k
    case doPossessivePlus:
1372
        // Possessive ++ quantifier.
1373
        // Compiles to
1374
        //       1.   STO_SP
1375
        //       2.      body of stuff being iterated over
1376
        //       3.   STATE_SAVE 5
1377
        //       4.   JMP        2
1378
        //       5.   LD_SP
1379
        //       6.   ...
1380
        //
1381
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1382
        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1383
        //
1384
2.19k
        {
1385
            // Emit the STO_SP
1386
2.19k
            int32_t   topLoc = blockTopLoc(true);
1387
2.19k
            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1388
2.19k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1389
2.19k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390
1391
            // Emit the STATE_SAVE
1392
2.19k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393
1394
            // Emit the JMP
1395
2.19k
            appendOp(URX_JMP, topLoc+1);
1396
1397
            // Emit the LD_SP
1398
2.19k
            appendOp(URX_LD_SP, stoLoc);
1399
2.19k
        }
1400
2.19k
        break;
1401
1402
384
    case doPossessiveStar:
1403
        // Possessive *+ quantifier.
1404
        // Compiles to
1405
        //       1.   STO_SP       loc
1406
        //       2.   STATE_SAVE   5
1407
        //       3.      body of stuff being iterated over
1408
        //       4.   JMP          2
1409
        //       5.   LD_SP        loc
1410
        //       6    ...
1411
        // TODO:  do something to cut back the state stack each time through the loop.
1412
384
        {
1413
            // Reserve two slots at the top of the block.
1414
384
            int32_t   topLoc = blockTopLoc(true);
1415
384
            insertOp(topLoc);
1416
1417
            // emit   STO_SP     loc
1418
384
            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1419
384
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1420
384
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421
1422
            // Emit the SAVE_STATE   5
1423
384
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424
384
            op = buildOp(URX_STATE_SAVE, L7);
1425
384
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426
1427
            // Append the JMP operation.
1428
384
            appendOp(URX_JMP, topLoc+1);
1429
1430
            // Emit the LD_SP       loc
1431
384
            appendOp(URX_LD_SP, stoLoc);
1432
384
        }
1433
384
        break;
1434
1435
251
    case doPossessiveOpt:
1436
        // Possessive  ?+ quantifier.
1437
        //  Compiles to
1438
        //     1. STO_SP      loc
1439
        //     2. SAVE_STATE  5
1440
        //     3.    body of optional block
1441
        //     4. LD_SP       loc
1442
        //     5. ...
1443
        //
1444
251
        {
1445
            // Reserve two slots at the top of the block.
1446
251
            int32_t   topLoc = blockTopLoc(true);
1447
251
            insertOp(topLoc);
1448
1449
            // Emit the STO_SP
1450
251
            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1451
251
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1452
251
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453
1454
            // Emit the SAVE_STATE
1455
251
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1456
251
            op = buildOp(URX_STATE_SAVE, continueLoc);
1457
251
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458
1459
            // Emit the LD_SP
1460
251
            appendOp(URX_LD_SP, stoLoc);
1461
251
        }
1462
251
        break;
1463
1464
1465
8.33k
    case doBeginMatchMode:
1466
8.33k
        fNewModeFlags = fModeFlags;
1467
8.33k
        fSetModeFlag  = true;
1468
8.33k
        break;
1469
1470
9.64k
    case doMatchMode:   //  (?i)    and similar
1471
9.64k
        {
1472
9.64k
            int32_t  bit = 0;
1473
9.64k
            switch (fC.fChar) {
1474
3.69k
            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1475
304
            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1476
498
            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1477
258
            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1478
73
            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1479
213
            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1480
4.40k
            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1481
202
            case 0x2d: /* '-' */   fSetModeFlag = false;          break;
1482
0
            default:
1483
0
                UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
1484
                                        // by the scanner.
1485
9.64k
            }
1486
9.64k
            if (fSetModeFlag) {
1487
9.23k
                fNewModeFlags |= bit;
1488
9.23k
            } else {
1489
413
                fNewModeFlags &= ~bit;
1490
413
            }
1491
9.64k
        }
1492
0
        break;
1493
1494
2.67k
    case doSetMatchMode:
1495
        // Emit code to match any pending literals, using the not-yet changed match mode.
1496
2.67k
        fixLiterals();
1497
1498
        // We've got a (?i) or similar.  The match mode is being changed, but
1499
        //   the change is not scoped to a parenthesized block.
1500
2.67k
        U_ASSERT(fNewModeFlags < 0);
1501
2.67k
        fModeFlags = fNewModeFlags;
1502
1503
2.67k
        break;
1504
1505
1506
5.58k
    case doMatchModeParen:
1507
        // We've got a (?i: or similar.  Begin a parenthesized block, save old
1508
        //   mode flags so they can be restored at the close of the block.
1509
        //
1510
        //   Compile to a
1511
        //      - NOP, which later may be replaced by a save-state if the
1512
        //         parenthesized group gets a * quantifier, followed by
1513
        //      - NOP, which may later be replaced by a save-state if there
1514
        //             is an '|' alternation within the parens.
1515
5.58k
        {
1516
5.58k
            fixLiterals(false);
1517
5.58k
            appendOp(URX_NOP, 0);
1518
5.58k
            appendOp(URX_NOP, 0);
1519
1520
            // On the Parentheses stack, start a new frame and add the positions
1521
            //   of the two NOPs (a normal non-capturing () frame, except for the
1522
            //   saving of the original mode flags.)
1523
5.58k
            fParenStack.push(fModeFlags, *fStatus);
1524
5.58k
            fParenStack.push(flags, *fStatus);                            // Frame Marker
1525
5.58k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1526
5.58k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1527
1528
            // Set the current mode flags to the new values.
1529
5.58k
            U_ASSERT(fNewModeFlags < 0);
1530
5.58k
            fModeFlags = fNewModeFlags;
1531
5.58k
        }
1532
5.58k
        break;
1533
1534
68
    case doBadModeFlag:
1535
68
        error(U_REGEX_INVALID_FLAG);
1536
68
        break;
1537
1538
75.1k
    case doSuppressComments:
1539
        // We have just scanned a '(?'.  We now need to prevent the character scanner from
1540
        // treating a '#' as a to-the-end-of-line comment.
1541
        //   (This Perl compatibility just gets uglier and uglier to do...)
1542
75.1k
        fEOLComments = false;
1543
75.1k
        break;
1544
1545
1546
547
    case doSetAddAmp:
1547
547
        {
1548
547
          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1549
547
          set->add(chAmp);
1550
547
        }
1551
547
        break;
1552
1553
1.51k
    case doSetAddDash:
1554
1.51k
        {
1555
1.51k
          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1556
1.51k
          set->add(chDash);
1557
1.51k
        }
1558
1.51k
        break;
1559
1560
240
     case doSetBackslash_s:
1561
240
        {
1562
240
         UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563
240
         set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564
240
         break;
1565
9.64k
        }
1566
1567
697
     case doSetBackslash_S:
1568
697
        {
1569
697
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1570
697
            UnicodeSet SSet;
1571
697
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572
697
            set->addAll(SSet);
1573
697
            break;
1574
9.64k
        }
1575
1576
580
    case doSetBackslash_d:
1577
580
        {
1578
580
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579
            // TODO - make a static set, ticket 6058.
1580
580
            addCategory(set, U_GC_ND_MASK, *fStatus);
1581
580
            break;
1582
9.64k
        }
1583
1584
763
    case doSetBackslash_D:
1585
763
        {
1586
763
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1587
763
            UnicodeSet digits;
1588
            // TODO - make a static set, ticket 6058.
1589
763
            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1590
763
            digits.complement();
1591
763
            set->addAll(digits);
1592
763
            break;
1593
9.64k
        }
1594
1595
659
    case doSetBackslash_h:
1596
659
        {
1597
659
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1598
659
            UnicodeSet h;
1599
659
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1600
659
            h.add((UChar32)9);   // Tab
1601
659
            set->addAll(h);
1602
659
            break;
1603
9.64k
        }
1604
1605
878
    case doSetBackslash_H:
1606
878
        {
1607
878
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1608
878
            UnicodeSet h;
1609
878
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1610
878
            h.add((UChar32)9);   // Tab
1611
878
            h.complement();
1612
878
            set->addAll(h);
1613
878
            break;
1614
9.64k
        }
1615
1616
554
    case doSetBackslash_v:
1617
554
        {
1618
554
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1619
554
            set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1620
554
            set->add((UChar32)0x85);
1621
554
            set->add((UChar32)0x2028, (UChar32)0x2029);
1622
554
            break;
1623
9.64k
        }
1624
1625
360
    case doSetBackslash_V:
1626
360
        {
1627
360
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628
360
            UnicodeSet v;
1629
360
            v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1630
360
            v.add((UChar32)0x85);
1631
360
            v.add((UChar32)0x2028, (UChar32)0x2029);
1632
360
            v.complement();
1633
360
            set->addAll(v);
1634
360
            break;
1635
9.64k
        }
1636
1637
1.84k
    case doSetBackslash_w:
1638
1.84k
        {
1639
1.84k
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1640
1.84k
            set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641
1.84k
            break;
1642
9.64k
        }
1643
1644
2.12k
    case doSetBackslash_W:
1645
2.12k
        {
1646
2.12k
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1647
2.12k
            UnicodeSet SSet;
1648
2.12k
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649
2.12k
            set->addAll(SSet);
1650
2.12k
            break;
1651
9.64k
        }
1652
1653
576k
    case doSetBegin:
1654
576k
        {
1655
576k
            fixLiterals(false);
1656
576k
            LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657
576k
            fSetStack.push(lpSet.orphan(), *fStatus);
1658
576k
            fSetOpStack.push(setStart, *fStatus);
1659
576k
            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660
539k
                fSetOpStack.push(setCaseClose, *fStatus);
1661
539k
            }
1662
576k
            break;
1663
9.64k
        }
1664
1665
1.69k
    case doSetBeginDifference1:
1666
        //  We have scanned something like [[abc]-[
1667
        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668
        //  Push a Difference operator, which will cause the new set to be subtracted from what
1669
        //    went before once it is created.
1670
1.69k
        setPushOp(setDifference1);
1671
1.69k
        fSetOpStack.push(setStart, *fStatus);
1672
1.69k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673
469
            fSetOpStack.push(setCaseClose, *fStatus);
1674
469
        }
1675
1.69k
        break;
1676
1677
534
    case doSetBeginIntersection1:
1678
        //  We have scanned something like  [[abc]&[
1679
        //   Need both the '&' operator and the open '[' operator.
1680
534
        setPushOp(setIntersection1);
1681
534
        fSetOpStack.push(setStart, *fStatus);
1682
534
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683
223
            fSetOpStack.push(setCaseClose, *fStatus);
1684
223
        }
1685
534
        break;
1686
1687
77.7k
    case doSetBeginUnion:
1688
        //  We have scanned something like  [[abc][
1689
        //     Need to handle the union operation explicitly [[abc] | [
1690
77.7k
        setPushOp(setUnion);
1691
77.7k
        fSetOpStack.push(setStart, *fStatus);
1692
77.7k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693
21.4k
            fSetOpStack.push(setCaseClose, *fStatus);
1694
21.4k
        }
1695
77.7k
        break;
1696
1697
642
    case doSetDifference2:
1698
        // We have scanned something like [abc--
1699
        //   Consider this to unambiguously be a set difference operator.
1700
642
        setPushOp(setDifference2);
1701
642
        break;
1702
1703
645k
    case doSetEnd:
1704
        // Have encountered the ']' that closes a set.
1705
        //    Force the evaluation of any pending operations within this set,
1706
        //    leave the completed set on the top of the set stack.
1707
645k
        setEval(setEnd);
1708
645k
        U_ASSERT(fSetOpStack.peeki()==setStart);
1709
645k
        fSetOpStack.popi();
1710
645k
        break;
1711
1712
573k
    case doSetFinish:
1713
573k
        {
1714
        // Finished a complete set expression, including all nested sets.
1715
        //   The close bracket has already triggered clearing out pending set operators,
1716
        //    the operator stack should be empty and the operand stack should have just
1717
        //    one entry, the result set.
1718
573k
        U_ASSERT(fSetOpStack.empty());
1719
573k
        UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1720
573k
        U_ASSERT(fSetStack.empty());
1721
573k
        compileSet(theSet);
1722
573k
        break;
1723
9.64k
        }
1724
1725
1.63k
    case doSetIntersection2:
1726
        // Have scanned something like [abc&&
1727
1.63k
        setPushOp(setIntersection2);
1728
1.63k
        break;
1729
1730
8.14M
    case doSetLiteral:
1731
        // Union the just-scanned literal character into the set being built.
1732
        //    This operation is the highest precedence set operation, so we can always do
1733
        //    it immediately, without waiting to see what follows.  It is necessary to perform
1734
        //    any pending '-' or '&' operation first, because these have the same precedence
1735
        //    as union-ing in a literal'
1736
8.14M
        {
1737
8.14M
            setEval(setUnion);
1738
8.14M
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1739
8.14M
            s->add(fC.fChar);
1740
8.14M
            fLastSetLiteral = fC.fChar;
1741
8.14M
            break;
1742
9.64k
        }
1743
1744
38.8k
    case doSetLiteralEscaped:
1745
        // A back-slash escaped literal character was encountered.
1746
        // Processing is the same as with setLiteral, above, with the addition of
1747
        //  the optional check for errors on escaped ASCII letters.
1748
38.8k
        {
1749
38.8k
            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750
38.8k
                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1751
0
                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1752
0
                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753
0
            }
1754
38.8k
            setEval(setUnion);
1755
38.8k
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1756
38.8k
            s->add(fC.fChar);
1757
38.8k
            fLastSetLiteral = fC.fChar;
1758
38.8k
            break;
1759
9.64k
        }
1760
1761
1.14k
        case doSetNamedChar:
1762
        // Scanning a \N{UNICODE CHARACTER NAME}
1763
        //  Aside from the source of the character, the processing is identical to doSetLiteral,
1764
        //    above.
1765
1.14k
        {
1766
1.14k
            UChar32  c = scanNamedChar();
1767
1.14k
            setEval(setUnion);
1768
1.14k
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1769
1.14k
            s->add(c);
1770
1.14k
            fLastSetLiteral = c;
1771
1.14k
            break;
1772
9.64k
        }
1773
1774
222
    case doSetNamedRange:
1775
        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1776
        // The left character is already in the set, and is saved in fLastSetLiteral.
1777
        // The right side needs to be picked up, the scan is at the 'N'.
1778
        // Lower Limit > Upper limit being an error matches both Java
1779
        //        and ICU UnicodeSet behavior.
1780
222
        {
1781
222
            UChar32  c = scanNamedChar();
1782
222
            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1783
10
                error(U_REGEX_INVALID_RANGE);
1784
10
            }
1785
222
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1786
222
            s->add(fLastSetLiteral, c);
1787
222
            fLastSetLiteral = c;
1788
222
            break;
1789
9.64k
        }
1790
1791
1792
931
    case  doSetNegate:
1793
        // Scanned a '^' at the start of a set.
1794
        // Push the negation operator onto the set op stack.
1795
        // A twist for case-insensitive matching:
1796
        //   the case closure operation must happen _before_ negation.
1797
        //   But the case closure operation will already be on the stack if it's required.
1798
        //   This requires checking for case closure, and swapping the stack order
1799
        //    if it is present.
1800
931
        {
1801
931
            int32_t  tosOp = fSetOpStack.peeki();
1802
931
            if (tosOp == setCaseClose) {
1803
470
                fSetOpStack.popi();
1804
470
                fSetOpStack.push(setNegation, *fStatus);
1805
470
                fSetOpStack.push(setCaseClose, *fStatus);
1806
470
            } else {
1807
461
                fSetOpStack.push(setNegation, *fStatus);
1808
461
            }
1809
931
        }
1810
931
        break;
1811
1812
1.56k
    case doSetNoCloseError:
1813
1.56k
        error(U_REGEX_MISSING_CLOSE_BRACKET);
1814
1.56k
        break;
1815
1816
1
    case doSetOpError:
1817
1
        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1818
1
        break;
1819
1820
91.1k
    case doSetPosixProp:
1821
91.1k
        {
1822
91.1k
            UnicodeSet *s = scanPosixProp();
1823
91.1k
            if (s != nullptr) {
1824
67.7k
                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1825
67.7k
                tos->addAll(*s);
1826
67.7k
                delete s;
1827
67.7k
            }  // else error.  scanProp() reported the error status already.
1828
91.1k
        }
1829
91.1k
        break;
1830
1831
366
    case doSetProp:
1832
        //  Scanned a \p \P within [brackets].
1833
366
        {
1834
366
            UnicodeSet *s = scanProp();
1835
366
            if (s != nullptr) {
1836
357
                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1837
357
                tos->addAll(*s);
1838
357
                delete s;
1839
357
            }  // else error.  scanProp() reported the error status already.
1840
366
        }
1841
366
        break;
1842
1843
1844
1.53k
    case doSetRange:
1845
        // We have scanned literal-literal.  Add the range to the set.
1846
        // The left character is already in the set, and is saved in fLastSetLiteral.
1847
        // The right side is the current character.
1848
        // Lower Limit > Upper limit being an error matches both Java
1849
        //        and ICU UnicodeSet behavior.
1850
1.53k
        {
1851
1852
1.53k
        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1853
56
            error(U_REGEX_INVALID_RANGE);
1854
56
        }
1855
1.53k
        UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1856
1.53k
        s->add(fLastSetLiteral, fC.fChar);
1857
1.53k
        break;
1858
9.64k
        }
1859
1860
0
    default:
1861
0
        UPRV_UNREACHABLE_EXIT;
1862
100M
    }
1863
1864
100M
    if (U_FAILURE(*fStatus)) {
1865
6.70k
        returnVal = false;
1866
6.70k
    }
1867
1868
100M
    return returnVal;
1869
100M
}
1870
1871
1872
1873
//------------------------------------------------------------------------------
1874
//
1875
//   literalChar           We've encountered a literal character from the pattern,
1876
//                             or an escape sequence that reduces to a character.
1877
//                         Add it to the string containing all literal chars/strings from
1878
//                             the pattern.
1879
//
1880
//------------------------------------------------------------------------------
1881
20.2M
void RegexCompile::literalChar(UChar32 c)  {
1882
20.2M
    fLiteralChars.append(c);
1883
20.2M
}
1884
1885
1886
//------------------------------------------------------------------------------
1887
//
1888
//    fixLiterals           When compiling something that can follow a literal
1889
//                          string in a pattern, emit the code to match the
1890
//                          accumulated literal string.
1891
//
1892
//                          Optionally, split the last char of the string off into
1893
//                          a single "ONE_CHAR" operation, so that quantifiers can
1894
//                          apply to that char alone.  Example:   abc*
1895
//                          The * must apply to the 'c' only.
1896
//
1897
//------------------------------------------------------------------------------
1898
28.6M
void    RegexCompile::fixLiterals(UBool split) {
1899
1900
    // If no literal characters have been scanned but not yet had code generated
1901
    //   for them, nothing needs to be done.
1902
28.6M
    if (fLiteralChars.length() == 0) {
1903
27.6M
        return;
1904
27.6M
    }
1905
1906
1.05M
    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907
1.05M
    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908
1909
    // Split:  We need to  ensure that the last item in the compiled pattern
1910
    //     refers only to the last literal scanned in the pattern, so that
1911
    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1912
    //     Split before case folding for case insensitive matches.
1913
1914
1.05M
    if (split) {
1915
283k
        fLiteralChars.truncate(indexOfLastCodePoint);
1916
283k
        fixLiterals(false);   // Recursive call, emit code to match the first part of the string.
1917
                              //  Note that the truncated literal string may be empty, in which case
1918
                              //  nothing will be emitted.
1919
1920
283k
        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1921
283k
        fixLiterals(false);          // Second recursive call, code for the final code point.
1922
283k
        return;
1923
283k
    }
1924
1925
    // If we are doing case-insensitive matching, case fold the string.  This may expand
1926
    //   the string, e.g. the German sharp-s turns into "ss"
1927
769k
    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928
464k
        fLiteralChars.foldCase();
1929
464k
        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930
464k
        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931
464k
    }
1932
1933
769k
    if (indexOfLastCodePoint == 0) {
1934
        // Single character, emit a URX_ONECHAR op to match it.
1935
340k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936
340k
                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937
16.3k
            appendOp(URX_ONECHAR_I, lastCodePoint);
1938
324k
        } else {
1939
324k
            appendOp(URX_ONECHAR, lastCodePoint);
1940
324k
        }
1941
428k
    } else {
1942
        // Two or more chars, emit a URX_STRING to match them.
1943
428k
        if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944
0
            error(U_REGEX_PATTERN_TOO_BIG);
1945
0
        }
1946
428k
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947
323k
            appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948
323k
        } else {
1949
            // TODO here:  add optimization to split case sensitive strings of length two
1950
            //             into two single char ops, for efficiency.
1951
105k
            appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952
105k
        }
1953
428k
        appendOp(URX_STRING_LEN, fLiteralChars.length());
1954
1955
        // Add this string into the accumulated strings of the compiled pattern.
1956
428k
        fRXPat->fLiteralText.append(fLiteralChars);
1957
428k
    }
1958
1959
769k
    fLiteralChars.remove();
1960
769k
}
1961
1962
1963
115M
int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964
115M
    if (U_FAILURE(*fStatus)) {
1965
2.92k
        return 0;
1966
2.92k
    }
1967
115M
    if (type < 0 || type > 255) {
1968
0
        UPRV_UNREACHABLE_EXIT;
1969
0
    }
1970
115M
    if (val > 0x00ffffff) {
1971
0
        UPRV_UNREACHABLE_EXIT;
1972
0
    }
1973
115M
    if (val < 0) {
1974
0
        if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975
0
            UPRV_UNREACHABLE_EXIT;
1976
0
        }
1977
0
        if (URX_TYPE(val) != 0xff) {
1978
0
            UPRV_UNREACHABLE_EXIT;
1979
0
        }
1980
0
        type = URX_RESERVED_OP_N;
1981
0
    }
1982
115M
    return (type << 24) | val;
1983
115M
}
1984
1985
1986
//------------------------------------------------------------------------------
1987
//
1988
//   appendOp()             Append a new instruction onto the compiled pattern
1989
//                          Includes error checking, limiting the size of the
1990
//                          pattern to lengths that can be represented in the
1991
//                          24 bit operand field of an instruction.
1992
//
1993
//------------------------------------------------------------------------------
1994
57.8M
void RegexCompile::appendOp(int32_t op) {
1995
57.8M
    if (U_FAILURE(*fStatus)) {
1996
2.13k
        return;
1997
2.13k
    }
1998
57.8M
    fRXPat->fCompiledPat->addElement(op, *fStatus);
1999
57.8M
    if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000
0
        error(U_REGEX_PATTERN_TOO_BIG);
2001
0
    }
2002
57.8M
}
2003
2004
56.1M
void RegexCompile::appendOp(int32_t type, int32_t val) {
2005
56.1M
    appendOp(buildOp(type, val));
2006
56.1M
}
2007
2008
2009
//------------------------------------------------------------------------------
2010
//
2011
//   insertOp()             Insert a slot for a new opcode into the already
2012
//                          compiled pattern code.
2013
//
2014
//                          Fill the slot with a NOP.  Our caller will replace it
2015
//                          with what they really wanted.
2016
//
2017
//------------------------------------------------------------------------------
2018
128k
void   RegexCompile::insertOp(int32_t where) {
2019
128k
    UVector64 *code = fRXPat->fCompiledPat;
2020
128k
    U_ASSERT(where>0 && where < code->size());
2021
2022
128k
    int32_t  nop = buildOp(URX_NOP, 0);
2023
128k
    code->insertElementAt(nop, where, *fStatus);
2024
2025
    // Walk through the pattern, looking for any ops with targets that
2026
    //  were moved down by the insert.  Fix them.
2027
128k
    int32_t loc;
2028
3.54G
    for (loc=0; loc<code->size(); loc++) {
2029
3.54G
        int32_t op = (int32_t)code->elementAti(loc);
2030
3.54G
        int32_t opType = URX_TYPE(op);
2031
3.54G
        int32_t opValue = URX_VAL(op);
2032
3.54G
        if ((opType == URX_JMP         ||
2033
3.54G
            opType == URX_JMPX         ||
2034
3.54G
            opType == URX_STATE_SAVE   ||
2035
3.54G
            opType == URX_CTR_LOOP     ||
2036
3.54G
            opType == URX_CTR_LOOP_NG  ||
2037
3.54G
            opType == URX_JMP_SAV      ||
2038
3.54G
            opType == URX_JMP_SAV_X    ||
2039
3.54G
            opType == URX_RELOC_OPRND)    && opValue > where) {
2040
            // Target location for this opcode is after the insertion point and
2041
            //   needs to be incremented to adjust for the insertion.
2042
6.16M
            opValue++;
2043
6.16M
            op = buildOp(opType, opValue);
2044
6.16M
            code->setElementAt(op, loc);
2045
6.16M
        }
2046
3.54G
    }
2047
2048
    // Now fix up the parentheses stack.  All positive values in it are locations in
2049
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2050
897M
    for (loc=0; loc<fParenStack.size(); loc++) {
2051
897M
        int32_t x = fParenStack.elementAti(loc);
2052
897M
        U_ASSERT(x < code->size());
2053
897M
        if (x>where) {
2054
0
            x++;
2055
0
            fParenStack.setElementAt(x, loc);
2056
0
        }
2057
897M
    }
2058
2059
128k
    if (fMatchCloseParen > where) {
2060
10.2k
        fMatchCloseParen++;
2061
10.2k
    }
2062
128k
    if (fMatchOpenParen > where) {
2063
0
        fMatchOpenParen++;
2064
0
    }
2065
128k
}
2066
2067
2068
//------------------------------------------------------------------------------
2069
//
2070
//   allocateData()        Allocate storage in the matcher's static data area.
2071
//                         Return the index for the newly allocated data.
2072
//                         The storage won't actually exist until we are running a match
2073
//                         operation, but the storage indexes are inserted into various
2074
//                         opcodes while compiling the pattern.
2075
//
2076
//------------------------------------------------------------------------------
2077
70.7k
int32_t RegexCompile::allocateData(int32_t size) {
2078
70.7k
    if (U_FAILURE(*fStatus)) {
2079
12
        return 0;
2080
12
    }
2081
70.7k
    if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082
0
        error(U_REGEX_INTERNAL_ERROR);
2083
0
        return 0;
2084
0
    }
2085
70.7k
    int32_t dataIndex = fRXPat->fDataSize;
2086
70.7k
    fRXPat->fDataSize += size;
2087
70.7k
    if (fRXPat->fDataSize >= 0x00fffff0) {
2088
0
        error(U_REGEX_INTERNAL_ERROR);
2089
0
    }
2090
70.7k
    return dataIndex;
2091
70.7k
}
2092
2093
2094
//------------------------------------------------------------------------------
2095
//
2096
//   allocateStackData()   Allocate space in the back-tracking stack frame.
2097
//                         Return the index for the newly allocated data.
2098
//                         The frame indexes are inserted into various
2099
//                         opcodes while compiling the pattern, meaning that frame
2100
//                         size must be restricted to the size that will fit
2101
//                         as an operand (24 bits).
2102
//
2103
//------------------------------------------------------------------------------
2104
330k
int32_t RegexCompile::allocateStackData(int32_t size) {
2105
330k
    if (U_FAILURE(*fStatus)) {
2106
0
        return 0;
2107
0
    }
2108
330k
    if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109
0
        error(U_REGEX_INTERNAL_ERROR);
2110
0
        return 0;
2111
0
    }
2112
330k
    int32_t dataIndex = fRXPat->fFrameSize;
2113
330k
    fRXPat->fFrameSize += size;
2114
330k
    if (fRXPat->fFrameSize >= 0x00fffff0) {
2115
0
        error(U_REGEX_PATTERN_TOO_BIG);
2116
0
    }
2117
330k
    return dataIndex;
2118
330k
}
2119
2120
2121
//------------------------------------------------------------------------------
2122
//
2123
//   blockTopLoc()          Find or create a location in the compiled pattern
2124
//                          at the start of the operation or block that has
2125
//                          just been compiled.  Needed when a quantifier (* or
2126
//                          whatever) appears, and we need to add an operation
2127
//                          at the start of the thing being quantified.
2128
//
2129
//                          (Parenthesized Blocks) have a slot with a NOP that
2130
//                          is reserved for this purpose.  .* or similar don't
2131
//                          and a slot needs to be added.
2132
//
2133
//       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
2134
//                                         at the returned location.
2135
//                                 false - just return the address,
2136
//                                         do not reserve a location there.
2137
//
2138
//------------------------------------------------------------------------------
2139
476k
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2140
476k
    int32_t   theLoc;
2141
476k
    fixLiterals(true);  // Emit code for any pending literals.
2142
                        //   If last item was a string, emit separate op for the its last char.
2143
476k
    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144
8.04k
    {
2145
        // The item just processed is a parenthesized block.
2146
8.04k
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2147
8.04k
        U_ASSERT(theLoc > 0);
2148
8.04k
        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2149
8.04k
    }
2150
468k
    else {
2151
        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153
        // We need to make space now.
2154
468k
        theLoc = fRXPat->fCompiledPat->size()-1;
2155
468k
        int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2156
468k
        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2157
            // Strings take two opcode, we want the position of the first one.
2158
            // We can have a string at this point if a single character case-folded to two.
2159
299k
            theLoc--;
2160
299k
        }
2161
468k
        if (reserveLoc) {
2162
183k
            int32_t  nop = buildOp(URX_NOP, 0);
2163
183k
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164
183k
        }
2165
468k
    }
2166
476k
    return theLoc;
2167
476k
}
2168
2169
2170
2171
//------------------------------------------------------------------------------
2172
//
2173
//    handleCloseParen      When compiling a close paren, we need to go back
2174
//                          and fix up any JMP or SAVE operations within the
2175
//                          parenthesized block that need to target the end
2176
//                          of the block.  The locations of these are kept on
2177
//                          the paretheses stack.
2178
//
2179
//                          This function is called both when encountering a
2180
//                          real ) and at the end of the pattern.
2181
//
2182
//------------------------------------------------------------------------------
2183
378k
void  RegexCompile::handleCloseParen() {
2184
378k
    int32_t   patIdx;
2185
378k
    int32_t   patOp;
2186
378k
    if (fParenStack.size() <= 0) {
2187
0
        error(U_REGEX_MISMATCHED_PAREN);
2188
0
        return;
2189
0
    }
2190
2191
    // Emit code for any pending literals.
2192
378k
    fixLiterals(false);
2193
2194
    // Fixup any operations within the just-closed parenthesized group
2195
    //    that need to reference the end of the (block).
2196
    //    (The first one popped from the stack is an unused slot for
2197
    //     alternation (OR) state save, but applying the fixup to it does no harm.)
2198
18.8M
    for (;;) {
2199
18.8M
        patIdx = fParenStack.popi();
2200
18.8M
        if (patIdx < 0) {
2201
            // value < 0 flags the start of the frame on the paren stack.
2202
378k
            break;
2203
378k
        }
2204
18.4M
        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2205
18.4M
        patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2206
18.4M
        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2207
18.4M
        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2208
18.4M
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209
18.4M
        fMatchOpenParen     = patIdx;
2210
18.4M
    }
2211
2212
    //  At the close of any parenthesized block, restore the match mode flags  to
2213
    //  the value they had at the open paren.  Saved value is
2214
    //  at the top of the paren stack.
2215
378k
    fModeFlags = fParenStack.popi();
2216
378k
    U_ASSERT(fModeFlags < 0);
2217
2218
    // DO any additional fixups, depending on the specific kind of
2219
    // parentesized grouping this is
2220
2221
378k
    switch (patIdx) {
2222
5.29k
    case plain:
2223
9.97k
    case flags:
2224
        // No additional fixups required.
2225
        //   (Grouping-only parentheses)
2226
9.97k
        break;
2227
303k
    case capturing:
2228
        // Capturing Parentheses.
2229
        //   Insert a End Capture op into the pattern.
2230
        //   The frame offset of the variables for this cg is obtained from the
2231
        //       start capture op and put it into the end-capture op.
2232
303k
        {
2233
303k
            int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2234
303k
            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2235
2236
303k
            int32_t   frameVarLocation = URX_VAL(captureOp);
2237
303k
            appendOp(URX_END_CAPTURE, frameVarLocation);
2238
303k
        }
2239
303k
        break;
2240
581
    case atomic:
2241
        // Atomic Parenthesis.
2242
        //   Insert a LD_SP operation to restore the state stack to the position
2243
        //   it was when the atomic parens were entered.
2244
581
        {
2245
581
            int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2246
581
            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2247
581
            int32_t   stoLoc = URX_VAL(stoOp);
2248
581
            appendOp(URX_LD_SP, stoLoc);
2249
581
        }
2250
581
        break;
2251
2252
60.1k
    case lookAhead:
2253
60.1k
        {
2254
60.1k
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2255
60.1k
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2256
60.1k
            int32_t dataLoc  = URX_VAL(startOp);
2257
60.1k
            appendOp(URX_LA_END, dataLoc);
2258
60.1k
        }
2259
60.1k
        break;
2260
2261
851
    case negLookAhead:
2262
851
        {
2263
            // See comment at doOpenLookAheadNeg
2264
851
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2265
851
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2266
851
            int32_t dataLoc  = URX_VAL(startOp);
2267
851
            appendOp(URX_LA_END, dataLoc);
2268
851
            appendOp(URX_BACKTRACK, 0);
2269
851
            appendOp(URX_LA_END, dataLoc);
2270
2271
            // Patch the URX_SAVE near the top of the block.
2272
            // The destination of the SAVE is the final LA_END that was just added.
2273
851
            int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2274
851
            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2275
851
            int32_t dest     = fRXPat->fCompiledPat->size()-1;
2276
851
            saveOp           = buildOp(URX_STATE_SAVE, dest);
2277
851
            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278
851
        }
2279
851
        break;
2280
2281
1.52k
    case lookBehind:
2282
1.52k
        {
2283
            // See comment at doOpenLookBehind.
2284
2285
            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286
1.52k
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2287
1.52k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2288
1.52k
            int32_t dataLoc  = URX_VAL(startOp);
2289
1.52k
            appendOp(URX_LB_END, dataLoc);
2290
1.52k
            appendOp(URX_LA_END, dataLoc);
2291
2292
            // Determine the min and max bounds for the length of the
2293
            //  string that the pattern can match.
2294
            //  An unbounded upper limit is an error.
2295
1.52k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2296
1.52k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2297
1.52k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2298
1.52k
            if (URX_TYPE(maxML) != 0) {
2299
143
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2300
143
                break;
2301
143
            }
2302
1.37k
            if (maxML == INT32_MAX) {
2303
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2304
0
                break;
2305
0
            }
2306
1.37k
            if (minML == INT32_MAX) {
2307
                // This condition happens when no match is possible, such as with a
2308
                // [set] expression containing no elements.
2309
                // In principle, the generated code to evaluate the expression could be deleted,
2310
                // but it's probably not worth the complication.
2311
249
                minML = 0;
2312
249
            }
2313
1.37k
            U_ASSERT(minML <= maxML);
2314
2315
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2316
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2317
1.37k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2318
1.37k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2319
2320
1.37k
        }
2321
0
        break;
2322
2323
2324
2325
1.64k
    case lookBehindN:
2326
1.64k
        {
2327
            // See comment at doOpenLookBehindNeg.
2328
2329
            // Append the URX_LBN_END to the compiled pattern.
2330
1.64k
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2331
1.64k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2332
1.64k
            int32_t dataLoc  = URX_VAL(startOp);
2333
1.64k
            appendOp(URX_LBN_END, dataLoc);
2334
2335
            // Determine the min and max bounds for the length of the
2336
            //  string that the pattern can match.
2337
            //  An unbounded upper limit is an error.
2338
1.64k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2339
1.64k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2340
1.64k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2341
1.64k
            if (URX_TYPE(maxML) != 0) {
2342
180
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2343
180
                break;
2344
180
            }
2345
1.46k
            if (maxML == INT32_MAX) {
2346
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2347
0
                break;
2348
0
            }
2349
1.46k
            if (minML == INT32_MAX) {
2350
                // This condition happens when no match is possible, such as with a
2351
                // [set] expression containing no elements.
2352
                // In principle, the generated code to evaluate the expression could be deleted,
2353
                // but it's probably not worth the complication.
2354
216
                minML = 0;
2355
216
            }
2356
2357
1.46k
            U_ASSERT(minML <= maxML);
2358
2359
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2360
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2361
1.46k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2362
1.46k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2363
2364
            // Insert the pattern location to continue at after a successful match
2365
            //  as the last operand of the URX_LBN_CONT
2366
1.46k
            int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367
1.46k
            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2368
1.46k
        }
2369
0
        break;
2370
2371
2372
2373
0
    default:
2374
0
        UPRV_UNREACHABLE_EXIT;
2375
378k
    }
2376
2377
    // remember the next location in the compiled pattern.
2378
    // The compilation of Quantifiers will look at this to see whether its looping
2379
    //   over a parenthesized block or a single item
2380
378k
    fMatchCloseParen = fRXPat->fCompiledPat->size();
2381
378k
}
2382
2383
2384
2385
//------------------------------------------------------------------------------
2386
//
2387
//   compileSet       Compile the pattern operations for a reference to a
2388
//                    UnicodeSet.
2389
//
2390
//------------------------------------------------------------------------------
2391
void        RegexCompile::compileSet(UnicodeSet *theSet)
2392
573k
{
2393
573k
    if (theSet == nullptr) {
2394
110
        return;
2395
110
    }
2396
    //  Remove any strings from the set.
2397
    //  There shouldn't be any, but just in case.
2398
    //     (Case Closure can add them; if we had a simple case closure available that
2399
    //      ignored strings, that would be better.)
2400
573k
    theSet->removeAllStrings();
2401
573k
    int32_t  setSize = theSet->size();
2402
2403
573k
    switch (setSize) {
2404
4.47k
    case 0:
2405
4.47k
        {
2406
            // Set of no elements.   Always fails to match.
2407
4.47k
            appendOp(URX_BACKTRACK, 0);
2408
4.47k
            delete theSet;
2409
4.47k
        }
2410
4.47k
        break;
2411
2412
6.51k
    case 1:
2413
6.51k
        {
2414
            // The set contains only a single code point.  Put it into
2415
            //   the compiled pattern as a single char operation rather
2416
            //   than a set, and discard the set itself.
2417
6.51k
            literalChar(theSet->charAt(0));
2418
6.51k
            delete theSet;
2419
6.51k
        }
2420
6.51k
        break;
2421
2422
562k
    default:
2423
562k
        {
2424
            //  The set contains two or more chars.  (the normal case)
2425
            //  Put it into the compiled pattern as a set.
2426
562k
            theSet->freeze();
2427
562k
            int32_t setNumber = fRXPat->fSets->size();
2428
562k
            fRXPat->fSets->addElement(theSet, *fStatus);
2429
562k
            if (U_SUCCESS(*fStatus)) {
2430
562k
                appendOp(URX_SETREF, setNumber);
2431
562k
            } else {
2432
3
                delete theSet;
2433
3
            }
2434
562k
        }
2435
573k
    }
2436
573k
}
2437
2438
2439
//------------------------------------------------------------------------------
2440
//
2441
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
2442
//                      Except for the specific opcodes used, the code is the same
2443
//                      for all three types (greedy, non-greedy, possessive) of
2444
//                      intervals.  The opcodes are supplied as parameters.
2445
//                      (There are two sets of opcodes - greedy & possessive use the
2446
//                      same ones, while non-greedy has it's own.)
2447
//
2448
//                      The code for interval loops has this form:
2449
//                         0  CTR_INIT   counter loc (in stack frame)
2450
//                         1             5  patt address of CTR_LOOP at bottom of block
2451
//                         2             min count
2452
//                         3             max count   (-1 for unbounded)
2453
//                         4  ...        block to be iterated over
2454
//                         5  CTR_LOOP
2455
//
2456
//                       In
2457
//------------------------------------------------------------------------------
2458
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2459
8.88k
{
2460
    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461
    //   four slots in the compiled code.  Reserve them.
2462
8.88k
    int32_t   topOfBlock = blockTopLoc(true);
2463
8.88k
    insertOp(topOfBlock);
2464
8.88k
    insertOp(topOfBlock);
2465
8.88k
    insertOp(topOfBlock);
2466
2467
    // The operands for the CTR_INIT opcode include the index in the matcher data
2468
    //   of the counter.  Allocate it now. There are two data items
2469
    //        counterLoc   -->  Loop counter
2470
    //               +1    -->  Input index (for breaking non-progressing loops)
2471
    //                          (Only present if unbounded upper limit on loop)
2472
8.88k
    int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2473
8.88k
    int32_t   counterLoc = allocateStackData(dataSize);
2474
2475
8.88k
    int32_t   op = buildOp(InitOp, counterLoc);
2476
8.88k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477
2478
    // The second operand of CTR_INIT is the location following the end of the loop.
2479
    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480
    //   compilation of something later on causes the code to grow and the target
2481
    //   position to move.
2482
8.88k
    int32_t loopEnd = fRXPat->fCompiledPat->size();
2483
8.88k
    op = buildOp(URX_RELOC_OPRND, loopEnd);
2484
8.88k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485
2486
    // Followed by the min and max counts.
2487
8.88k
    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488
8.88k
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489
2490
    // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2491
    //   Goes at end of the block being looped over, so just append to the code so far.
2492
8.88k
    appendOp(LoopOp, topOfBlock);
2493
2494
8.88k
    if ((fIntervalLow & 0xff000000) != 0 ||
2495
8.88k
        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496
31
            error(U_REGEX_NUMBER_TOO_BIG);
2497
31
        }
2498
2499
8.88k
    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500
16
        error(U_REGEX_MAX_LT_MIN);
2501
16
    }
2502
8.88k
}
2503
2504
2505
2506
104k
UBool RegexCompile::compileInlineInterval() {
2507
104k
    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508
        // Too big to inline.  Fail, which will cause looping code to be generated.
2509
        //   (Upper < Lower picks up unbounded upper and errors, both.)
2510
2.87k
        return false;
2511
2.87k
    }
2512
2513
101k
    int32_t   topOfBlock = blockTopLoc(false);
2514
101k
    if (fIntervalUpper == 0) {
2515
        // Pathological case.  Attempt no matches, as if the block doesn't exist.
2516
        // Discard the generated code for the block.
2517
        // If the block included parens, discard the info pertaining to them as well.
2518
1.88k
        fRXPat->fCompiledPat->setSize(topOfBlock);
2519
1.88k
        if (fMatchOpenParen >= topOfBlock) {
2520
398
            fMatchOpenParen = -1;
2521
398
        }
2522
1.88k
        if (fMatchCloseParen >= topOfBlock) {
2523
398
            fMatchCloseParen = -1;
2524
398
        }
2525
1.88k
        return true;
2526
1.88k
    }
2527
2528
99.9k
    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529
        // The thing being repeated is not a single op, but some
2530
        //   more complex block.  Do it as a loop, not inlines.
2531
        //   Note that things "repeated" a max of once are handled as inline, because
2532
        //     the one copy of the code already generated is just fine.
2533
838
        return false;
2534
838
    }
2535
2536
    // Pick up the opcode that is to be repeated
2537
    //
2538
99.1k
    int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2539
2540
    // Compute the pattern location where the inline sequence
2541
    //   will end, and set up the state save op that will be needed.
2542
    //
2543
99.1k
    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544
99.1k
                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545
99.1k
    int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546
99.1k
    if (fIntervalLow == 0) {
2547
95.2k
        insertOp(topOfBlock);
2548
95.2k
        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549
95.2k
    }
2550
2551
2552
2553
    //  Loop, emitting the op for the thing being repeated each time.
2554
    //    Loop starts at 1 because one instance of the op already exists in the pattern,
2555
    //    it was put there when it was originally encountered.
2556
99.1k
    int32_t i;
2557
883k
    for (i=1; i<fIntervalUpper; i++ ) {
2558
784k
        if (i >= fIntervalLow) {
2559
766k
            appendOp(saveOp);
2560
766k
        }
2561
784k
        appendOp(op);
2562
784k
    }
2563
99.1k
    return true;
2564
99.9k
}
2565
2566
2567
2568
//------------------------------------------------------------------------------
2569
//
2570
//   caseInsensitiveStart  given a single code point from a pattern string, determine the 
2571
//                         set of characters that could potentially begin a case-insensitive 
2572
//                         match of a string beginning with that character, using full Unicode
2573
//                         case insensitive matching.
2574
//
2575
//          This is used in optimizing find().
2576
//
2577
//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578
//          misses cases like this:
2579
//             A string from the pattern begins with 'ss' (although all we know
2580
//                 in this context is that it begins with 's')
2581
//             The pattern could match a string beginning with a German sharp-s
2582
//
2583
//           To the ordinary case closure for a character c, we add all other
2584
//           characters cx where the case closure of cx includes a string form that begins
2585
//           with the original character c.
2586
//
2587
//           This function could be made smarter. The full pattern string is available
2588
//           and it would be possible to verify that the extra characters being added
2589
//           to the starting set fully match, rather than having just a first-char of the
2590
//           folded form match.
2591
//
2592
//------------------------------------------------------------------------------
2593
64.3k
void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594
2595
// Machine Generated below.
2596
// It may need updating with new versions of Unicode.
2597
// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598
// The update tool is here:
2599
// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600
2601
// Machine Generated Data. Do not hand edit.
2602
64.3k
    static const UChar32 RECaseFixCodePoints[] = {
2603
64.3k
        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 
2604
64.3k
        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 
2605
64.3k
        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 
2606
64.3k
        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 
2607
64.3k
        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608
2609
64.3k
    static const int16_t RECaseFixStringOffsets[] = {
2610
64.3k
        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 
2611
64.3k
        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 
2612
64.3k
        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 
2613
64.3k
        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 
2614
64.3k
        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615
2616
64.3k
    static const int16_t RECaseFixCounts[] = {
2617
64.3k
        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 
2618
64.3k
        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 
2619
64.3k
        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2620
64.3k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2621
64.3k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622
2623
64.3k
    static const char16_t RECaseFixData[] = {
2624
64.3k
        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 
2625
64.3k
        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 
2626
64.3k
        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 
2627
64.3k
        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 
2628
64.3k
        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 
2629
64.3k
        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 
2630
64.3k
        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 
2631
64.3k
        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 
2632
64.3k
        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 
2633
64.3k
        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 
2634
64.3k
        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635
2636
// End of machine generated data.
2637
2638
64.3k
    if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2639
        // This function should never be called with an invalid input character.
2640
0
        UPRV_UNREACHABLE_EXIT;
2641
64.3k
    } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2642
14.4k
        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2643
14.4k
        starterChars->set(caseFoldedC, caseFoldedC);
2644
2645
14.4k
        int32_t i;
2646
425k
        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647
            // Simple linear search through the sorted list of interesting code points.
2648
410k
        }
2649
2650
14.4k
        if (RECaseFixCodePoints[i] == c) {
2651
7.77k
            int32_t dataIndex = RECaseFixStringOffsets[i];
2652
7.77k
            int32_t numCharsToAdd = RECaseFixCounts[i];
2653
7.77k
            UChar32 cpToAdd = 0;
2654
42.8k
            for (int32_t j=0; j<numCharsToAdd; j++) {
2655
35.1k
                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2656
35.1k
                starterChars->add(cpToAdd);
2657
35.1k
            }
2658
7.77k
        }
2659
2660
14.4k
        starterChars->closeOver(USET_CASE_INSENSITIVE);
2661
14.4k
        starterChars->removeAllStrings();
2662
49.9k
    } else {
2663
        // Not a cased character. Just return it alone.
2664
49.9k
        starterChars->set(c, c);
2665
49.9k
    }
2666
64.3k
}
2667
2668
2669
// Increment with overflow check.
2670
// val and delta will both be positive.
2671
2672
3.38M
static int32_t safeIncrement(int32_t val, int32_t delta) {
2673
3.38M
    if (INT32_MAX - val > delta) {
2674
3.33M
        return val + delta;
2675
3.33M
    } else {
2676
49.1k
        return INT32_MAX;
2677
49.1k
    }
2678
3.38M
}
2679
2680
2681
//------------------------------------------------------------------------------
2682
//
2683
//   matchStartType    Determine how a match can start.
2684
//                     Used to optimize find() operations.
2685
//
2686
//                     Operation is very similar to minMatchLength().  Walk the compiled
2687
//                     pattern, keeping an on-going minimum-match-length.  For any
2688
//                     op where the min match coming in is zero, add that ops possible
2689
//                     starting matches to the possible starts for the overall pattern.
2690
//
2691
//------------------------------------------------------------------------------
2692
4.64k
void   RegexCompile::matchStartType() {
2693
4.64k
    if (U_FAILURE(*fStatus)) {
2694
45
        return;
2695
45
    }
2696
2697
2698
4.59k
    int32_t    loc;                    // Location in the pattern of the current op being processed.
2699
4.59k
    int32_t    op;                     // The op being processed
2700
4.59k
    int32_t    opType;                 // The opcode type of the op
2701
4.59k
    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2702
4.59k
    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2703
2704
4.59k
    UBool      atStart = true;         // True if no part of the pattern yet encountered
2705
                                       //   could have advanced the position in a match.
2706
                                       //   (Maximum match length so far == 0)
2707
2708
    // forwardedLength is a vector holding minimum-match-length values that
2709
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710
    //   It must be one longer than the pattern being checked because some  ops
2711
    //   will jmp to a end-of-block+1 location from within a block, and we must
2712
    //   count those when checking the block.
2713
4.59k
    int32_t end = fRXPat->fCompiledPat->size();
2714
4.59k
    UVector32  forwardedLength(end+1, *fStatus);
2715
4.59k
    forwardedLength.setSize(end+1);
2716
27.7M
    for (loc=3; loc<end; loc++) {
2717
27.7M
        forwardedLength.setElementAt(INT32_MAX, loc);
2718
27.7M
    }
2719
2720
19.4M
    for (loc = 3; loc<end; loc++) {
2721
19.4M
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2722
19.4M
        opType = URX_TYPE(op);
2723
2724
        // The loop is advancing linearly through the pattern.
2725
        // If the op we are now at was the destination of a branch in the pattern,
2726
        // and that path has a shorter minimum length than the current accumulated value,
2727
        // replace the current accumulated value.
2728
19.4M
        if (forwardedLength.elementAti(loc) < currentLen) {
2729
334k
            currentLen = forwardedLength.elementAti(loc);
2730
334k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2731
334k
        }
2732
2733
19.4M
        switch (opType) {
2734
            // Ops that don't change the total length matched
2735
0
        case URX_RESERVED_OP:
2736
4.59k
        case URX_END:
2737
4.59k
        case URX_FAIL:
2738
4.59k
        case URX_STRING_LEN:
2739
4.59k
        case URX_NOP:
2740
76.7k
        case URX_START_CAPTURE:
2741
148k
        case URX_END_CAPTURE:
2742
149k
        case URX_BACKSLASH_B:
2743
149k
        case URX_BACKSLASH_BU:
2744
149k
        case URX_BACKSLASH_G:
2745
149k
        case URX_BACKSLASH_Z:
2746
150k
        case URX_DOLLAR:
2747
151k
        case URX_DOLLAR_M:
2748
151k
        case URX_DOLLAR_D:
2749
151k
        case URX_DOLLAR_MD:
2750
151k
        case URX_RELOC_OPRND:
2751
153k
        case URX_STO_INP_LOC:
2752
154k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2753
154k
        case URX_BACKREF_I:
2754
2755
157k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2756
160k
        case URX_LD_SP:
2757
160k
            break;
2758
2759
3.37k
        case URX_CARET:
2760
3.37k
            if (atStart) {
2761
389
                fRXPat->fStartType = START_START;
2762
389
            }
2763
3.37k
            break;
2764
2765
586
        case URX_CARET_M:
2766
1.04k
        case URX_CARET_M_UNIX:
2767
1.04k
            if (atStart) {
2768
269
                fRXPat->fStartType = START_LINE;
2769
269
            }
2770
1.04k
            break;
2771
2772
709k
        case URX_ONECHAR:
2773
709k
            if (currentLen == 0) {
2774
                // This character could appear at the start of a match.
2775
                //   Add it to the set of possible starting characters.
2776
116k
                fRXPat->fInitialChars->add(URX_VAL(op));
2777
116k
                numInitialStrings += 2;
2778
116k
            }
2779
709k
            currentLen = safeIncrement(currentLen, 1);
2780
709k
            atStart = false;
2781
709k
            break;
2782
2783
2784
393k
        case URX_SETREF:
2785
393k
            if (currentLen == 0) {
2786
1.42k
                int32_t  sn = URX_VAL(op);
2787
1.42k
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2788
1.42k
                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2789
1.42k
                fRXPat->fInitialChars->addAll(*s);
2790
1.42k
                numInitialStrings += 2;
2791
1.42k
            }
2792
393k
            currentLen = safeIncrement(currentLen, 1);
2793
393k
            atStart = false;
2794
393k
            break;
2795
2796
2.59k
        case URX_LOOP_SR_I:
2797
            // [Set]*, like a SETREF, above, in what it can match,
2798
            //  but may not match at all, so currentLen is not incremented.
2799
2.59k
            if (currentLen == 0) {
2800
200
                int32_t  sn = URX_VAL(op);
2801
200
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2802
200
                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2803
200
                fRXPat->fInitialChars->addAll(*s);
2804
200
                numInitialStrings += 2;
2805
200
            }
2806
2.59k
            atStart = false;
2807
2.59k
            break;
2808
2809
1.82k
        case URX_LOOP_DOT_I:
2810
1.82k
            if (currentLen == 0) {
2811
                // .* at the start of a pattern.
2812
                //    Any character can begin the match.
2813
333
                fRXPat->fInitialChars->clear();
2814
333
                fRXPat->fInitialChars->complement();
2815
333
                numInitialStrings += 2;
2816
333
            }
2817
1.82k
            atStart = false;
2818
1.82k
            break;
2819
2820
2821
2.06k
        case URX_STATIC_SETREF:
2822
2.06k
            if (currentLen == 0) {
2823
338
                int32_t  sn = URX_VAL(op);
2824
338
                U_ASSERT(sn>0 && sn<URX_LAST_SET);
2825
338
                const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826
338
                fRXPat->fInitialChars->addAll(s);
2827
338
                numInitialStrings += 2;
2828
338
            }
2829
2.06k
            currentLen = safeIncrement(currentLen, 1);
2830
2.06k
            atStart = false;
2831
2.06k
            break;
2832
2833
2834
2835
1.31k
        case URX_STAT_SETREF_N:
2836
1.31k
            if (currentLen == 0) {
2837
244
                int32_t  sn = URX_VAL(op);
2838
244
                UnicodeSet sc;
2839
244
                sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840
244
                fRXPat->fInitialChars->addAll(sc);
2841
244
                numInitialStrings += 2;
2842
244
            }
2843
1.31k
            currentLen = safeIncrement(currentLen, 1);
2844
1.31k
            atStart = false;
2845
1.31k
            break;
2846
2847
2848
2849
1.22k
        case URX_BACKSLASH_D:
2850
            // Digit Char
2851
1.22k
             if (currentLen == 0) {
2852
416
                 UnicodeSet s;
2853
416
                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2854
416
                 if (URX_VAL(op) != 0) {
2855
204
                     s.complement();
2856
204
                 }
2857
416
                 fRXPat->fInitialChars->addAll(s);
2858
416
                 numInitialStrings += 2;
2859
416
            }
2860
1.22k
            currentLen = safeIncrement(currentLen, 1);
2861
1.22k
            atStart = false;
2862
1.22k
            break;
2863
2864
2865
1.82k
        case URX_BACKSLASH_H:
2866
            // Horiz white space
2867
1.82k
            if (currentLen == 0) {
2868
657
                UnicodeSet s;
2869
657
                s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2870
657
                s.add((UChar32)9);   // Tab
2871
657
                if (URX_VAL(op) != 0) {
2872
456
                    s.complement();
2873
456
                }
2874
657
                fRXPat->fInitialChars->addAll(s);
2875
657
                numInitialStrings += 2;
2876
657
            }
2877
1.82k
            currentLen = safeIncrement(currentLen, 1);
2878
1.82k
            atStart = false;
2879
1.82k
            break;
2880
2881
2882
387
        case URX_BACKSLASH_R:       // Any line ending sequence
2883
2.28k
        case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2884
2.28k
            if (currentLen == 0) {
2885
976
                UnicodeSet s;
2886
976
                s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2887
976
                s.add((UChar32)0x85);
2888
976
                s.add((UChar32)0x2028, (UChar32)0x2029);
2889
976
                if (URX_VAL(op) != 0) {
2890
                     // Complement option applies to URX_BACKSLASH_V only.
2891
390
                     s.complement();
2892
390
                }
2893
976
                fRXPat->fInitialChars->addAll(s);
2894
976
                numInitialStrings += 2;
2895
976
            }
2896
2.28k
            currentLen = safeIncrement(currentLen, 1);
2897
2.28k
            atStart = false;
2898
2.28k
            break;
2899
2900
2901
2902
14.1k
        case URX_ONECHAR_I:
2903
            // Case Insensitive Single Character.
2904
14.1k
            if (currentLen == 0) {
2905
2.06k
                UChar32  c = URX_VAL(op);
2906
2.06k
                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2907
2.06k
                    UnicodeSet starters(c, c);
2908
2.06k
                    starters.closeOver(USET_CASE_INSENSITIVE);
2909
                    // findCaseInsensitiveStarters(c, &starters);
2910
                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911
                    //   The expanded folding can't match the pattern.
2912
2.06k
                    fRXPat->fInitialChars->addAll(starters);
2913
2.06k
                } else {
2914
                    // Char has no case variants.  Just add it as-is to the
2915
                    //   set of possible starting chars.
2916
0
                    fRXPat->fInitialChars->add(c);
2917
0
                }
2918
2.06k
                numInitialStrings += 2;
2919
2.06k
            }
2920
14.1k
            currentLen = safeIncrement(currentLen, 1);
2921
14.1k
            atStart = false;
2922
14.1k
            break;
2923
2924
2925
422
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
2926
1.06k
        case URX_DOTANY_ALL:    // . matches one or two.
2927
4.27k
        case URX_DOTANY:
2928
4.67k
        case URX_DOTANY_UNIX:
2929
4.67k
            if (currentLen == 0) {
2930
                // These constructs are all bad news when they appear at the start
2931
                //   of a match.  Any character can begin the match.
2932
656
                fRXPat->fInitialChars->clear();
2933
656
                fRXPat->fInitialChars->complement();
2934
656
                numInitialStrings += 2;
2935
656
            }
2936
4.67k
            currentLen = safeIncrement(currentLen, 1);
2937
4.67k
            atStart = false;
2938
4.67k
            break;
2939
2940
2941
0
        case URX_JMPX:
2942
0
            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2943
0
            U_FALLTHROUGH;
2944
8.53M
        case URX_JMP:
2945
8.53M
            {
2946
8.53M
                int32_t  jmpDest = URX_VAL(op);
2947
8.53M
                if (jmpDest < loc) {
2948
                    // Loop of some kind.  Can safely ignore, the worst that will happen
2949
                    //  is that we understate the true minimum length
2950
2.33k
                    currentLen = forwardedLength.elementAti(loc+1);
2951
2952
8.53M
                } else {
2953
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2954
8.53M
                    U_ASSERT(jmpDest <= end+1);
2955
8.53M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956
1.92k
                        forwardedLength.setElementAt(currentLen, jmpDest);
2957
1.92k
                    }
2958
8.53M
                }
2959
8.53M
            }
2960
8.53M
            atStart = false;
2961
8.53M
            break;
2962
2963
122k
        case URX_JMP_SAV:
2964
124k
        case URX_JMP_SAV_X:
2965
            // Combo of state save to the next loc, + jmp backwards.
2966
            //   Net effect on min. length computation is nothing.
2967
124k
            atStart = false;
2968
124k
            break;
2969
2970
1.90k
        case URX_BACKTRACK:
2971
            // Fails are kind of like a branch, except that the min length was
2972
            //   propagated already, by the state save.
2973
1.90k
            currentLen = forwardedLength.elementAti(loc+1);
2974
1.90k
            atStart = false;
2975
1.90k
            break;
2976
2977
2978
9.28M
        case URX_STATE_SAVE:
2979
9.28M
            {
2980
                // State Save, for forward jumps, propagate the current minimum.
2981
                //             of the state save.
2982
9.28M
                int32_t  jmpDest = URX_VAL(op);
2983
9.28M
                if (jmpDest > loc) {
2984
9.28M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985
8.72M
                        forwardedLength.setElementAt(currentLen, jmpDest);
2986
8.72M
                    }
2987
9.28M
                }
2988
9.28M
            }
2989
9.28M
            atStart = false;
2990
9.28M
            break;
2991
2992
2993
2994
2995
26.2k
        case URX_STRING:
2996
26.2k
            {
2997
26.2k
                loc++;
2998
26.2k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2999
26.2k
                int32_t stringLen   = URX_VAL(stringLenOp);
3000
26.2k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3001
26.2k
                U_ASSERT(stringLenOp >= 2);
3002
26.2k
                if (currentLen == 0) {
3003
                    // Add the starting character of this string to the set of possible starting
3004
                    //   characters for this pattern.
3005
21.2k
                    int32_t stringStartIdx = URX_VAL(op);
3006
21.2k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007
21.2k
                    fRXPat->fInitialChars->add(c);
3008
3009
                    // Remember this string.  After the entire pattern has been checked,
3010
                    //  if nothing else is identified that can start a match, we'll use it.
3011
21.2k
                    numInitialStrings++;
3012
21.2k
                    fRXPat->fInitialStringIdx = stringStartIdx;
3013
21.2k
                    fRXPat->fInitialStringLen = stringLen;
3014
21.2k
                }
3015
3016
26.2k
                currentLen = safeIncrement(currentLen, stringLen);
3017
26.2k
                atStart = false;
3018
26.2k
            }
3019
26.2k
            break;
3020
3021
184k
        case URX_STRING_I:
3022
184k
            {
3023
                // Case-insensitive string.  Unlike exact-match strings, we won't
3024
                //   attempt a string search for possible match positions.  But we
3025
                //   do update the set of possible starting characters.
3026
184k
                loc++;
3027
184k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3028
184k
                int32_t stringLen   = URX_VAL(stringLenOp);
3029
184k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3030
184k
                U_ASSERT(stringLenOp >= 2);
3031
184k
                if (currentLen == 0) {
3032
                    // Add the starting character of this string to the set of possible starting
3033
                    //   characters for this pattern.
3034
64.3k
                    int32_t stringStartIdx = URX_VAL(op);
3035
64.3k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036
64.3k
                    UnicodeSet s;
3037
64.3k
                    findCaseInsensitiveStarters(c, &s);
3038
64.3k
                    fRXPat->fInitialChars->addAll(s);
3039
64.3k
                    numInitialStrings += 2;  // Matching on an initial string not possible.
3040
64.3k
                }
3041
184k
                currentLen = safeIncrement(currentLen, stringLen);
3042
184k
                atStart = false;
3043
184k
            }
3044
184k
            break;
3045
3046
1.99k
        case URX_CTR_INIT:
3047
2.54k
        case URX_CTR_INIT_NG:
3048
2.54k
            {
3049
                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3050
                //   so location must be updated accordingly.
3051
                // Loop Init Ops.
3052
                //   If the min loop count == 0
3053
                //      move loc forwards to the end of the loop, skipping over the body.
3054
                //   If the min count is > 0,
3055
                //      continue normal processing of the body of the loop.
3056
2.54k
                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3057
2.54k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3058
2.54k
                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3059
2.54k
                if (minLoopCount == 0) {
3060
                    // Min Loop Count of 0, treat like a forward branch and
3061
                    //   move the current minimum length up to the target
3062
                    //   (end of loop) location.
3063
1.17k
                    U_ASSERT(loopEndLoc <= end+1);
3064
1.17k
                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065
869
                        forwardedLength.setElementAt(currentLen, loopEndLoc);
3066
869
                    }
3067
1.17k
                }
3068
2.54k
                loc+=3;  // Skips over operands of CTR_INIT
3069
2.54k
            }
3070
2.54k
            atStart = false;
3071
2.54k
            break;
3072
3073
3074
1.99k
        case URX_CTR_LOOP:
3075
2.54k
        case URX_CTR_LOOP_NG:
3076
            // Loop ops.
3077
            //  The jump is conditional, backwards only.
3078
2.54k
            atStart = false;
3079
2.54k
            break;
3080
3081
4.41k
        case URX_LOOP_C:
3082
            // More loop ops.  These state-save to themselves.
3083
            //   don't change the minimum match
3084
4.41k
            atStart = false;
3085
4.41k
            break;
3086
3087
3088
522
        case URX_LA_START:
3089
1.02k
        case URX_LB_START:
3090
1.02k
            {
3091
                // Look-around.  Scan forward until the matching look-ahead end,
3092
                //   without processing the look-around block.  This is overly pessimistic.
3093
3094
                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3095
                //   lookahead contains two LA_END instructions, so count goes up by two
3096
                //   for each LA_START.
3097
1.02k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3098
8.06M
                for (;;) {
3099
8.06M
                    loc++;
3100
8.06M
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3101
8.06M
                    if (URX_TYPE(op) == URX_LA_START) {
3102
9.33k
                        depth+=2;
3103
9.33k
                    }
3104
8.06M
                    if (URX_TYPE(op) == URX_LB_START) {
3105
598
                        depth++;
3106
598
                    }
3107
8.06M
                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3108
20.8k
                        depth--;
3109
20.8k
                        if (depth == 0) {
3110
1.02k
                            break;
3111
1.02k
                        }
3112
20.8k
                    }
3113
8.06M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3114
                        // Need this because neg lookahead blocks will FAIL to outside
3115
                        //   of the block.
3116
3.95M
                        int32_t  jmpDest = URX_VAL(op);
3117
3.95M
                        if (jmpDest > loc) {
3118
3.95M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119
3.94M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3120
3.94M
                            }
3121
3.95M
                        }
3122
3.95M
                    }
3123
8.06M
                    U_ASSERT(loc <= end);
3124
8.06M
                }
3125
1.02k
            }
3126
1.02k
            break;
3127
3128
0
        case URX_LA_END:
3129
0
        case URX_LB_CONT:
3130
0
        case URX_LB_END:
3131
0
        case URX_LBN_CONT:
3132
0
        case URX_LBN_END:
3133
0
            UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
3134
                                    //  consumed by the scan in URX_LA_START and LB_START
3135
0
        default:
3136
0
            UPRV_UNREACHABLE_EXIT;
3137
19.4M
            }
3138
3139
19.4M
        }
3140
3141
3142
    // We have finished walking through the ops.  Check whether some forward jump
3143
    //   propagated a shorter length to location end+1.
3144
4.59k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3145
3.62k
        currentLen = forwardedLength.elementAti(end+1);
3146
3.62k
    }
3147
3148
3149
4.59k
    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150
3151
3152
    // Sort out what we should check for when looking for candidate match start positions.
3153
    // In order of preference,
3154
    //     1.   Start of input text buffer.
3155
    //     2.   A literal string.
3156
    //     3.   Start of line in multi-line mode.
3157
    //     4.   A single literal character.
3158
    //     5.   A character from a set of characters.
3159
    //
3160
4.59k
    if (fRXPat->fStartType == START_START) {
3161
        // Match only at the start of an input text string.
3162
        //    start type is already set.  We're done.
3163
4.56k
    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164
        // Match beginning only with a literal string.
3165
486
        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166
486
        U_ASSERT(fRXPat->fInitialChars->contains(c));
3167
486
        fRXPat->fStartType   = START_STRING;
3168
486
        fRXPat->fInitialChar = c;
3169
4.08k
    } else if (fRXPat->fStartType == START_LINE) {
3170
        // Match at start of line in Multi-Line mode.
3171
        // Nothing to do here; everything is already set.
3172
4.06k
    } else if (fRXPat->fMinMatchLen == 0) {
3173
        // Zero length match possible.  We could start anywhere.
3174
926
        fRXPat->fStartType = START_NO_INFO;
3175
3.13k
    } else if (fRXPat->fInitialChars->size() == 1) {
3176
        // All matches begin with the same char.
3177
825
        fRXPat->fStartType   = START_CHAR;
3178
825
        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179
825
        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3180
2.31k
    } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == false &&
3181
2.31k
        fRXPat->fMinMatchLen > 0) {
3182
        // Matches start with a set of character smaller than the set of all chars.
3183
2.21k
        fRXPat->fStartType = START_SET;
3184
2.21k
    } else {
3185
        // Matches can start with anything
3186
101
        fRXPat->fStartType = START_NO_INFO;
3187
101
    }
3188
3189
4.59k
    return;
3190
4.59k
}
3191
3192
3193
3194
//------------------------------------------------------------------------------
3195
//
3196
//   minMatchLength    Calculate the length of the shortest string that could
3197
//                     match the specified pattern.
3198
//                     Length is in 16 bit code units, not code points.
3199
//
3200
//                     The calculated length may not be exact.  The returned
3201
//                     value may be shorter than the actual minimum; it must
3202
//                     never be longer.
3203
//
3204
//                     start and end are the range of p-code operations to be
3205
//                     examined.  The endpoints are included in the range.
3206
//
3207
//------------------------------------------------------------------------------
3208
184k
int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3209
184k
    if (U_FAILURE(*fStatus)) {
3210
45
        return 0;
3211
45
    }
3212
3213
184k
    U_ASSERT(start <= end);
3214
184k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3215
3216
3217
184k
    int32_t    loc;
3218
184k
    int32_t    op;
3219
184k
    int32_t    opType;
3220
184k
    int32_t    currentLen = 0;
3221
3222
3223
    // forwardedLength is a vector holding minimum-match-length values that
3224
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3225
    //   It must be one longer than the pattern being checked because some  ops
3226
    //   will jmp to a end-of-block+1 location from within a block, and we must
3227
    //   count those when checking the block.
3228
184k
    UVector32  forwardedLength(end+2, *fStatus);
3229
184k
    forwardedLength.setSize(end+2);
3230
47.8M
    for (loc=start; loc<=end+1; loc++) {
3231
47.6M
        forwardedLength.setElementAt(INT32_MAX, loc);
3232
47.6M
    }
3233
3234
32.3M
    for (loc = start; loc<=end; loc++) {
3235
32.1M
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3236
32.1M
        opType = URX_TYPE(op);
3237
3238
        // The loop is advancing linearly through the pattern.
3239
        // If the op we are now at was the destination of a branch in the pattern,
3240
        // and that path has a shorter minimum length than the current accumulated value,
3241
        // replace the current accumulated value.
3242
        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3243
                                                               //   no-match-possible cases.
3244
32.1M
        if (forwardedLength.elementAti(loc) < currentLen) {
3245
449k
            currentLen = forwardedLength.elementAti(loc);
3246
449k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3247
449k
        }
3248
3249
32.1M
        switch (opType) {
3250
            // Ops that don't change the total length matched
3251
0
        case URX_RESERVED_OP:
3252
4.59k
        case URX_END:
3253
4.59k
        case URX_STRING_LEN:
3254
196k
        case URX_NOP:
3255
277k
        case URX_START_CAPTURE:
3256
357k
        case URX_END_CAPTURE:
3257
358k
        case URX_BACKSLASH_B:
3258
358k
        case URX_BACKSLASH_BU:
3259
359k
        case URX_BACKSLASH_G:
3260
359k
        case URX_BACKSLASH_Z:
3261
364k
        case URX_CARET:
3262
366k
        case URX_DOLLAR:
3263
366k
        case URX_DOLLAR_M:
3264
367k
        case URX_DOLLAR_D:
3265
367k
        case URX_DOLLAR_MD:
3266
367k
        case URX_RELOC_OPRND:
3267
371k
        case URX_STO_INP_LOC:
3268
371k
        case URX_CARET_M:
3269
372k
        case URX_CARET_M_UNIX:
3270
373k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3271
373k
        case URX_BACKREF_I:
3272
3273
379k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3274
385k
        case URX_LD_SP:
3275
3276
511k
        case URX_JMP_SAV:
3277
514k
        case URX_JMP_SAV_X:
3278
514k
            break;
3279
3280
3281
            // Ops that match a minimum of one character (one or two 16 bit code units.)
3282
            //
3283
931k
        case URX_ONECHAR:
3284
934k
        case URX_STATIC_SETREF:
3285
935k
        case URX_STAT_SETREF_N:
3286
1.33M
        case URX_SETREF:
3287
1.33M
        case URX_BACKSLASH_D:
3288
1.33M
        case URX_BACKSLASH_H:
3289
1.33M
        case URX_BACKSLASH_R:
3290
1.33M
        case URX_BACKSLASH_V:
3291
1.36M
        case URX_ONECHAR_I:
3292
1.36M
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3293
1.36M
        case URX_DOTANY_ALL:    // . matches one or two.
3294
1.36M
        case URX_DOTANY:
3295
1.36M
        case URX_DOTANY_UNIX:
3296
1.36M
            currentLen = safeIncrement(currentLen, 1);
3297
1.36M
            break;
3298
3299
3300
0
        case URX_JMPX:
3301
0
            loc++;              // URX_JMPX has an extra operand, ignored here,
3302
                                //   otherwise processed identically to URX_JMP.
3303
0
            U_FALLTHROUGH;
3304
14.4M
        case URX_JMP:
3305
14.4M
            {
3306
14.4M
                int32_t  jmpDest = URX_VAL(op);
3307
14.4M
                if (jmpDest < loc) {
3308
                    // Loop of some kind.  Can safely ignore, the worst that will happen
3309
                    //  is that we understate the true minimum length
3310
3.62k
                    currentLen = forwardedLength.elementAti(loc+1);
3311
14.4M
                } else {
3312
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3313
14.4M
                    U_ASSERT(jmpDest <= end+1);
3314
14.4M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
3315
3.54k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3316
3.54k
                    }
3317
14.4M
                }
3318
14.4M
            }
3319
14.4M
            break;
3320
3321
2.81k
        case URX_BACKTRACK:
3322
2.81k
            {
3323
                // Back-tracks are kind of like a branch, except that the min length was
3324
                //   propagated already, by the state save.
3325
2.81k
                currentLen = forwardedLength.elementAti(loc+1);
3326
2.81k
            }
3327
2.81k
            break;
3328
3329
3330
15.3M
        case URX_STATE_SAVE:
3331
15.3M
            {
3332
                // State Save, for forward jumps, propagate the current minimum.
3333
                //             of the state save.
3334
15.3M
                int32_t  jmpDest = URX_VAL(op);
3335
15.3M
                if (jmpDest > loc) {
3336
15.3M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
3337
14.6M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3338
14.6M
                    }
3339
15.3M
                }
3340
15.3M
            }
3341
15.3M
            break;
3342
3343
3344
62.5k
        case URX_STRING:
3345
62.5k
            {
3346
62.5k
                loc++;
3347
62.5k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3348
62.5k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3349
62.5k
            }
3350
62.5k
            break;
3351
3352
3353
356k
        case URX_STRING_I:
3354
356k
            {
3355
356k
                loc++;
3356
                // TODO: with full case folding, matching input text may be shorter than
3357
                //       the string we have here.  More smarts could put some bounds on it.
3358
                //       Assume a min length of one for now.  A min length of zero causes
3359
                //        optimization failures for a pattern like "string"+
3360
                // currentLen += URX_VAL(stringLenOp);
3361
356k
                currentLen = safeIncrement(currentLen, 1);
3362
356k
            }
3363
356k
            break;
3364
3365
4.11k
        case URX_CTR_INIT:
3366
6.46k
        case URX_CTR_INIT_NG:
3367
6.46k
            {
3368
                // Loop Init Ops.
3369
                //   If the min loop count == 0
3370
                //      move loc forwards to the end of the loop, skipping over the body.
3371
                //   If the min count is > 0,
3372
                //      continue normal processing of the body of the loop.
3373
6.46k
                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3374
6.46k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3375
6.46k
                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3376
6.46k
                if (minLoopCount == 0) {
3377
3.80k
                    loc = loopEndLoc;
3378
3.80k
                } else {
3379
2.66k
                    loc+=3;  // Skips over operands of CTR_INIT
3380
2.66k
                }
3381
6.46k
            }
3382
6.46k
            break;
3383
3384
3385
1.91k
        case URX_CTR_LOOP:
3386
2.66k
        case URX_CTR_LOOP_NG:
3387
            // Loop ops.
3388
            //  The jump is conditional, backwards only.
3389
2.66k
            break;
3390
3391
2.59k
        case URX_LOOP_SR_I:
3392
4.66k
        case URX_LOOP_DOT_I:
3393
9.32k
        case URX_LOOP_C:
3394
            // More loop ops.  These state-save to themselves.
3395
            //   don't change the minimum match - could match nothing at all.
3396
9.32k
            break;
3397
3398
3399
3.73k
        case URX_LA_START:
3400
5.45k
        case URX_LB_START:
3401
5.45k
            {
3402
                // Look-around.  Scan forward until the matching look-ahead end,
3403
                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3404
                //   it assumes that the look-ahead match might be zero-length.
3405
                //   TODO:  Positive lookahead could recursively do the block, then continue
3406
                //          with the longer of the block or the value coming in.  Ticket 6060
3407
5.45k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3408
14.5M
                for (;;) {
3409
14.5M
                    loc++;
3410
14.5M
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3411
14.5M
                    if (URX_TYPE(op) == URX_LA_START) {
3412
                        // The boilerplate for look-ahead includes two LA_END instructions,
3413
                        //    Depth will be decremented by each one when it is seen.
3414
116k
                        depth += 2;
3415
116k
                    }
3416
14.5M
                    if (URX_TYPE(op) == URX_LB_START) {
3417
2.01k
                        depth++;
3418
2.01k
                    }
3419
14.5M
                    if (URX_TYPE(op) == URX_LA_END) {
3420
243k
                        depth--;
3421
243k
                        if (depth == 0) {
3422
4.60k
                            break;
3423
4.60k
                        }
3424
243k
                    }
3425
14.5M
                    if (URX_TYPE(op)==URX_LBN_END) {
3426
1.57k
                        depth--;
3427
1.57k
                        if (depth == 0) {
3428
851
                            break;
3429
851
                        }
3430
1.57k
                    }
3431
14.5M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3432
                        // Need this because neg lookahead blocks will FAIL to outside
3433
                        //   of the block.
3434
6.82M
                        int32_t  jmpDest = URX_VAL(op);
3435
6.82M
                        if (jmpDest > loc) {
3436
6.82M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3437
6.81M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3438
6.81M
                            }
3439
6.82M
                        }
3440
6.82M
                    }
3441
14.5M
                    U_ASSERT(loc <= end);
3442
14.5M
                }
3443
5.45k
            }
3444
5.45k
            break;
3445
3446
1.52k
        case URX_LA_END:
3447
1.52k
        case URX_LB_CONT:
3448
3.04k
        case URX_LB_END:
3449
3.04k
        case URX_LBN_CONT:
3450
4.68k
        case URX_LBN_END:
3451
            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3452
            //   range being sized, which happens when measuring size of look-behind blocks.
3453
4.68k
            break;
3454
3455
0
        default:
3456
0
            UPRV_UNREACHABLE_EXIT;
3457
32.1M
            }
3458
3459
32.1M
        }
3460
3461
    // We have finished walking through the ops.  Check whether some forward jump
3462
    //   propagated a shorter length to location end+1.
3463
184k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3464
320
        currentLen = forwardedLength.elementAti(end+1);
3465
320
        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3466
320
    }
3467
3468
184k
    return currentLen;
3469
184k
}
3470
3471
//------------------------------------------------------------------------------
3472
//
3473
//   maxMatchLength    Calculate the length of the longest string that could
3474
//                     match the specified pattern.
3475
//                     Length is in 16 bit code units, not code points.
3476
//
3477
//                     The calculated length may not be exact.  The returned
3478
//                     value may be longer than the actual maximum; it must
3479
//                     never be shorter.
3480
//
3481
//                     start, end: the range of the pattern to check.
3482
//                     end is inclusive.
3483
//
3484
//------------------------------------------------------------------------------
3485
7.28k
int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3486
7.28k
    if (U_FAILURE(*fStatus)) {
3487
0
        return 0;
3488
0
    }
3489
7.28k
    U_ASSERT(start <= end);
3490
7.28k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3491
3492
7.28k
    int32_t    loc;
3493
7.28k
    int32_t    op;
3494
7.28k
    int32_t    opType;
3495
7.28k
    int32_t    currentLen = 0;
3496
7.28k
    UVector32  forwardedLength(end+1, *fStatus);
3497
7.28k
    forwardedLength.setSize(end+1);
3498
3499
18.8M
    for (loc=start; loc<=end; loc++) {
3500
18.7M
        forwardedLength.setElementAt(0, loc);
3501
18.7M
    }
3502
3503
11.7M
    for (loc = start; loc<=end; loc++) {
3504
11.6M
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3505
11.6M
        opType = URX_TYPE(op);
3506
3507
        // The loop is advancing linearly through the pattern.
3508
        // If the op we are now at was the destination of a branch in the pattern,
3509
        // and that path has a longer maximum length than the current accumulated value,
3510
        // replace the current accumulated value.
3511
11.6M
        if (forwardedLength.elementAti(loc) > currentLen) {
3512
1.57M
            currentLen = forwardedLength.elementAti(loc);
3513
1.57M
        }
3514
3515
11.6M
        switch (opType) {
3516
            // Ops that don't change the total length matched
3517
0
        case URX_RESERVED_OP:
3518
0
        case URX_END:
3519
0
        case URX_STRING_LEN:
3520
133k
        case URX_NOP:
3521
141k
        case URX_START_CAPTURE:
3522
147k
        case URX_END_CAPTURE:
3523
147k
        case URX_BACKSLASH_B:
3524
147k
        case URX_BACKSLASH_BU:
3525
148k
        case URX_BACKSLASH_G:
3526
148k
        case URX_BACKSLASH_Z:
3527
148k
        case URX_CARET:
3528
149k
        case URX_DOLLAR:
3529
149k
        case URX_DOLLAR_M:
3530
149k
        case URX_DOLLAR_D:
3531
149k
        case URX_DOLLAR_MD:
3532
149k
        case URX_RELOC_OPRND:
3533
149k
        case URX_STO_INP_LOC:
3534
149k
        case URX_CARET_M:
3535
150k
        case URX_CARET_M_UNIX:
3536
3537
151k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3538
153k
        case URX_LD_SP:
3539
3540
154k
        case URX_LB_END:
3541
154k
        case URX_LB_CONT:
3542
154k
        case URX_LBN_CONT:
3543
156k
        case URX_LBN_END:
3544
156k
            break;
3545
3546
3547
            // Ops that increase that cause an unbounded increase in the length
3548
            //   of a matched string, or that increase it a hard to characterize way.
3549
            //   Call the max length unbounded, and stop further checking.
3550
266
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3551
719
        case URX_BACKREF_I:
3552
1.10k
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3553
1.10k
            currentLen = INT32_MAX;
3554
1.10k
            break;
3555
3556
3557
            // Ops that match a max of one character (possibly two 16 bit code units.)
3558
            //
3559
266
        case URX_STATIC_SETREF:
3560
617
        case URX_STAT_SETREF_N:
3561
1.17k
        case URX_SETREF:
3562
1.43k
        case URX_BACKSLASH_D:
3563
1.70k
        case URX_BACKSLASH_H:
3564
1.89k
        case URX_BACKSLASH_R:
3565
2.14k
        case URX_BACKSLASH_V:
3566
2.78k
        case URX_ONECHAR_I:
3567
2.97k
        case URX_DOTANY_ALL:
3568
4.70k
        case URX_DOTANY:
3569
4.91k
        case URX_DOTANY_UNIX:
3570
4.91k
            currentLen = safeIncrement(currentLen, 2);
3571
4.91k
            break;
3572
3573
            // Single literal character.  Increase current max length by one or two,
3574
            //       depending on whether the char is in the supplementary range.
3575
199k
        case URX_ONECHAR:
3576
199k
            currentLen = safeIncrement(currentLen, 1);
3577
199k
            if (URX_VAL(op) > 0x10000) {
3578
286
                currentLen = safeIncrement(currentLen, 1);
3579
286
            }
3580
199k
            break;
3581
3582
            // Jumps.
3583
            //
3584
5.44M
        case URX_JMP:
3585
5.44M
        case URX_JMPX:
3586
5.44M
        case URX_JMP_SAV:
3587
5.44M
        case URX_JMP_SAV_X:
3588
5.44M
            {
3589
5.44M
                int32_t  jmpDest = URX_VAL(op);
3590
5.44M
                if (jmpDest < loc) {
3591
                    // Loop of some kind.  Max match length is unbounded.
3592
610
                    currentLen = INT32_MAX;
3593
5.44M
                } else {
3594
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3595
5.44M
                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
3596
64.9k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3597
64.9k
                    }
3598
5.44M
                    currentLen = 0;
3599
5.44M
                }
3600
5.44M
            }
3601
5.44M
            break;
3602
3603
57.5k
        case URX_BACKTRACK:
3604
            // back-tracks are kind of like a branch, except that the max length was
3605
            //   propagated already, by the state save.
3606
57.5k
            currentLen = forwardedLength.elementAti(loc+1);
3607
57.5k
            break;
3608
3609
3610
5.60M
        case URX_STATE_SAVE:
3611
5.60M
            {
3612
                // State Save, for forward jumps, propagate the current minimum.
3613
                //               of the state save.
3614
                //             For backwards jumps, they create a loop, maximum
3615
                //               match length is unbounded.
3616
5.60M
                int32_t  jmpDest = URX_VAL(op);
3617
5.60M
                if (jmpDest > loc) {
3618
5.60M
                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
3619
1.71M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3620
1.71M
                    }
3621
5.60M
                } else {
3622
68
                    currentLen = INT32_MAX;
3623
68
                }
3624
5.60M
            }
3625
5.60M
            break;
3626
3627
3628
3629
3630
38.7k
        case URX_STRING:
3631
38.7k
            {
3632
38.7k
                loc++;
3633
38.7k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3634
38.7k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3635
38.7k
                break;
3636
5.44M
            }
3637
3638
11.9k
        case URX_STRING_I:
3639
            // TODO:  This code assumes that any user string that matches will be no longer
3640
            //        than our compiled string, with case insensitive matching.
3641
            //        Our compiled string has been case-folded already.
3642
            //
3643
            //        Any matching user string will have no more code points than our
3644
            //        compiled (folded) string.  Folding may add code points, but
3645
            //        not remove them.
3646
            //
3647
            //        There is a potential problem if a supplemental code point
3648
            //        case-folds to a BMP code point.  In this case our compiled string
3649
            //        could be shorter (in code units) than a matching user string.
3650
            //
3651
            //        At this time (Unicode 6.1) there are no such characters, and this case
3652
            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3653
            //        any problematic characters are added to Unicode.
3654
            //
3655
            //        If this happens, we can make a set of the BMP chars that the
3656
            //        troublesome supplementals fold to, scan our string, and bump the
3657
            //        currentLen one extra for each that is found.
3658
            //
3659
11.9k
            {
3660
11.9k
                loc++;
3661
11.9k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3662
11.9k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3663
11.9k
            }
3664
11.9k
            break;
3665
3666
2.48k
        case URX_CTR_INIT:
3667
4.23k
        case URX_CTR_INIT_NG:
3668
            // For Loops, recursively call this function on the pattern for the loop body,
3669
            //   then multiply the result by the maximum loop count.
3670
4.23k
            {
3671
4.23k
                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3672
4.23k
                if (loopEndLoc == loc+4) {
3673
                    // Loop has an empty body. No affect on max match length.
3674
                    // Continue processing with code after the loop end.
3675
0
                    loc = loopEndLoc;
3676
0
                    break;
3677
0
                }
3678
3679
4.23k
                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3680
4.23k
                if (maxLoopCount == -1) {
3681
                    // Unbounded Loop. No upper bound on match length.
3682
109
                    currentLen = INT32_MAX;
3683
109
                    break;
3684
109
                }
3685
3686
4.12k
                U_ASSERT(loopEndLoc >= loc+4);
3687
4.12k
                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3688
4.12k
                int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount; 
3689
4.12k
                if (updatedLen >= INT32_MAX) {
3690
139
                    currentLen = INT32_MAX;
3691
139
                    break;
3692
139
                }
3693
3.98k
                currentLen = (int32_t)updatedLen;
3694
3.98k
                loc = loopEndLoc;
3695
3.98k
                break;
3696
4.12k
            }
3697
3698
0
        case URX_CTR_LOOP:
3699
0
        case URX_CTR_LOOP_NG:
3700
            // These opcodes will be skipped over by code for URX_CTR_INIT.
3701
            // We shouldn't encounter them here.
3702
0
            UPRV_UNREACHABLE_EXIT;
3703
3704
98
        case URX_LOOP_SR_I:
3705
643
        case URX_LOOP_DOT_I:
3706
643
        case URX_LOOP_C:
3707
            // For anything to do with loops, make the match length unbounded.
3708
643
            currentLen = INT32_MAX;
3709
643
            break;
3710
3711
3712
3713
56.7k
        case URX_LA_START:
3714
171k
        case URX_LA_END:
3715
            // Look-ahead.  Just ignore, treat the look-ahead block as if
3716
            // it were normal pattern.  Gives a too-long match length,
3717
            //  but good enough for now.
3718
171k
            break;
3719
3720
            // End of look-ahead ops should always be consumed by the processing at
3721
            //  the URX_LA_START op.
3722
            // UPRV_UNREACHABLE_EXIT;
3723
3724
1.23k
        case URX_LB_START:
3725
1.23k
            {
3726
                // Look-behind.  Scan forward until the matching look-around end,
3727
                //   without processing the look-behind block.
3728
1.23k
                int32_t dataLoc = URX_VAL(op);
3729
3.15M
                for (loc = loc + 1; loc <= end; ++loc) {
3730
3.15M
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3731
3.15M
                    int32_t opType = URX_TYPE(op);
3732
3.15M
                    if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3733
1.23k
                        break;
3734
1.23k
                    }
3735
3.15M
                }
3736
1.23k
                U_ASSERT(loc <= end);
3737
1.23k
            }
3738
1.23k
            break;
3739
3740
0
        default:
3741
0
            UPRV_UNREACHABLE_EXIT;
3742
11.6M
        }
3743
3744
3745
11.6M
        if (currentLen == INT32_MAX) {
3746
            //  The maximum length is unbounded.
3747
            //  Stop further processing of the pattern.
3748
2.68k
            break;
3749
2.68k
        }
3750
3751
11.6M
    }
3752
7.28k
    return currentLen;
3753
3754
7.28k
}
3755
3756
3757
//------------------------------------------------------------------------------
3758
//
3759
//   stripNOPs    Remove any NOP operations from the compiled pattern code.
3760
//                Extra NOPs are inserted for some constructs during the initial
3761
//                code generation to provide locations that may be patched later.
3762
//                Many end up unneeded, and are removed by this function.
3763
//
3764
//                In order to minimize the number of passes through the pattern,
3765
//                back-reference fixup is also performed here (adjusting
3766
//                back-reference operands to point to the correct frame offsets).
3767
//
3768
//------------------------------------------------------------------------------
3769
4.64k
void RegexCompile::stripNOPs() {
3770
3771
4.64k
    if (U_FAILURE(*fStatus)) {
3772
0
        return;
3773
0
    }
3774
3775
4.64k
    int32_t    end = fRXPat->fCompiledPat->size();
3776
4.64k
    UVector32  deltas(end, *fStatus);
3777
3778
    // Make a first pass over the code, computing the amount that things
3779
    //   will be offset at each location in the original code.
3780
4.64k
    int32_t   loc;
3781
4.64k
    int32_t   d = 0;
3782
28.0M
    for (loc=0; loc<end; loc++) {
3783
28.0M
        deltas.addElement(d, *fStatus);
3784
28.0M
        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3785
28.0M
        if (URX_TYPE(op) == URX_NOP) {
3786
175k
            d++;
3787
175k
        }
3788
28.0M
    }
3789
3790
4.64k
    UnicodeString caseStringBuffer;
3791
3792
    // Make a second pass over the code, removing the NOPs by moving following
3793
    //  code up, and patching operands that refer to code locations that
3794
    //  are being moved.  The array of offsets from the first step is used
3795
    //  to compute the new operand values.
3796
4.64k
    int32_t src;
3797
4.64k
    int32_t dst = 0;
3798
28.0M
    for (src=0; src<end; src++) {
3799
28.0M
        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3800
28.0M
        int32_t opType = URX_TYPE(op);
3801
28.0M
        switch (opType) {
3802
175k
        case URX_NOP:
3803
175k
            break;
3804
3805
13.3M
        case URX_STATE_SAVE:
3806
25.8M
        case URX_JMP:
3807
25.8M
        case URX_CTR_LOOP:
3808
25.8M
        case URX_CTR_LOOP_NG:
3809
25.8M
        case URX_RELOC_OPRND:
3810
25.8M
        case URX_JMPX:
3811
25.9M
        case URX_JMP_SAV:
3812
25.9M
        case URX_JMP_SAV_X:
3813
            // These are instructions with operands that refer to code locations.
3814
25.9M
            {
3815
25.9M
                int32_t  operandAddress = URX_VAL(op);
3816
25.9M
                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3817
25.9M
                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3818
25.9M
                op = buildOp(opType, fixedOperandAddress);
3819
25.9M
                fRXPat->fCompiledPat->setElementAt(op, dst);
3820
25.9M
                dst++;
3821
25.9M
                break;
3822
25.9M
            }
3823
3824
757
        case URX_BACKREF:
3825
1.31k
        case URX_BACKREF_I:
3826
1.31k
            {
3827
1.31k
                int32_t where = URX_VAL(op);
3828
1.31k
                if (where > fRXPat->fGroupMap->size()) {
3829
369
                    error(U_REGEX_INVALID_BACK_REF);
3830
369
                    break;
3831
369
                }
3832
947
                where = fRXPat->fGroupMap->elementAti(where-1);
3833
947
                op    = buildOp(opType, where);
3834
947
                fRXPat->fCompiledPat->setElementAt(op, dst);
3835
947
                dst++;
3836
3837
947
                fRXPat->fNeedsAltInput = true;
3838
947
                break;
3839
1.31k
            }
3840
7.29k
        case URX_RESERVED_OP:
3841
7.90k
        case URX_RESERVED_OP_N:
3842
19.8k
        case URX_BACKTRACK:
3843
24.4k
        case URX_END:
3844
770k
        case URX_ONECHAR:
3845
826k
        case URX_STRING:
3846
1.07M
        case URX_STRING_LEN:
3847
1.14M
        case URX_START_CAPTURE:
3848
1.21M
        case URX_END_CAPTURE:
3849
1.22M
        case URX_STATIC_SETREF:
3850
1.22M
        case URX_STAT_SETREF_N:
3851
1.61M
        case URX_SETREF:
3852
1.62M
        case URX_DOTANY:
3853
1.62M
        case URX_FAIL:
3854
1.62M
        case URX_BACKSLASH_B:
3855
1.62M
        case URX_BACKSLASH_BU:
3856
1.62M
        case URX_BACKSLASH_G:
3857
1.62M
        case URX_BACKSLASH_X:
3858
1.62M
        case URX_BACKSLASH_Z:
3859
1.62M
        case URX_DOTANY_ALL:
3860
1.62M
        case URX_BACKSLASH_D:
3861
1.63M
        case URX_CARET:
3862
1.63M
        case URX_DOLLAR:
3863
1.63M
        case URX_CTR_INIT:
3864
1.63M
        case URX_CTR_INIT_NG:
3865
1.63M
        case URX_DOTANY_UNIX:
3866
1.63M
        case URX_STO_SP:
3867
1.64M
        case URX_LD_SP:
3868
1.64M
        case URX_STO_INP_LOC:
3869
1.65M
        case URX_LA_START:
3870
1.67M
        case URX_LA_END:
3871
1.68M
        case URX_ONECHAR_I:
3872
1.88M
        case URX_STRING_I:
3873
1.88M
        case URX_DOLLAR_M:
3874
1.88M
        case URX_CARET_M:
3875
1.88M
        case URX_CARET_M_UNIX:
3876
1.88M
        case URX_LB_START:
3877
1.88M
        case URX_LB_CONT:
3878
1.88M
        case URX_LB_END:
3879
1.88M
        case URX_LBN_CONT:
3880
1.88M
        case URX_LBN_END:
3881
1.88M
        case URX_LOOP_SR_I:
3882
1.89M
        case URX_LOOP_DOT_I:
3883
1.89M
        case URX_LOOP_C:
3884
1.89M
        case URX_DOLLAR_D:
3885
1.89M
        case URX_DOLLAR_MD:
3886
1.89M
        case URX_BACKSLASH_H:
3887
1.89M
        case URX_BACKSLASH_R:
3888
1.89M
        case URX_BACKSLASH_V:
3889
            // These instructions are unaltered by the relocation.
3890
1.89M
            fRXPat->fCompiledPat->setElementAt(op, dst);
3891
1.89M
            dst++;
3892
1.89M
            break;
3893
3894
0
        default:
3895
            // Some op is unaccounted for.
3896
0
            UPRV_UNREACHABLE_EXIT;
3897
28.0M
        }
3898
28.0M
    }
3899
3900
4.64k
    fRXPat->fCompiledPat->setSize(dst);
3901
4.64k
}
3902
3903
3904
3905
3906
//------------------------------------------------------------------------------
3907
//
3908
//  Error         Report a rule parse error.
3909
//                Only report it if no previous error has been recorded.
3910
//
3911
//------------------------------------------------------------------------------
3912
7.56k
void RegexCompile::error(UErrorCode e) {
3913
7.56k
    if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3914
6.65k
        *fStatus = e;
3915
        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3916
        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3917
        // int64_t. If the values of the latter are out of range for the former,
3918
        // set them to the appropriate "field not supported" values.
3919
6.65k
        if (fLineNum > 0x7FFFFFFF) {
3920
0
            fParseErr->line   = 0;
3921
0
            fParseErr->offset = -1;
3922
6.65k
        } else if (fCharNum > 0x7FFFFFFF) {
3923
0
            fParseErr->line   = (int32_t)fLineNum;
3924
0
            fParseErr->offset = -1;
3925
6.65k
        } else {
3926
6.65k
            fParseErr->line   = (int32_t)fLineNum;
3927
6.65k
            fParseErr->offset = (int32_t)fCharNum;
3928
6.65k
        }
3929
3930
6.65k
        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3931
3932
        // Fill in the context.
3933
        //   Note: extractBetween() pins supplied indices to the string bounds.
3934
6.65k
        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3935
6.65k
        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3936
6.65k
        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3937
6.65k
        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3938
6.65k
    }
3939
7.56k
}
3940
3941
3942
//
3943
//  Assorted Unicode character constants.
3944
//     Numeric because there is no portable way to enter them as literals.
3945
//     (Think EBCDIC).
3946
//
3947
static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
3948
static const char16_t   chLF        = 0x0a;      // Line Feed
3949
static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
3950
static const char16_t   chDigit0    = 0x30;      // '0'
3951
static const char16_t   chDigit7    = 0x37;      // '9'
3952
static const char16_t   chColon     = 0x3A;      // ':'
3953
static const char16_t   chE         = 0x45;      // 'E'
3954
static const char16_t   chQ         = 0x51;      // 'Q'
3955
//static const char16_t   chN         = 0x4E;      // 'N'
3956
static const char16_t   chP         = 0x50;      // 'P'
3957
static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
3958
//static const char16_t   chLBracket  = 0x5b;      // '['
3959
static const char16_t   chRBracket  = 0x5d;      // ']'
3960
static const char16_t   chUp        = 0x5e;      // '^'
3961
static const char16_t   chLowerP    = 0x70;
3962
static const char16_t   chLBrace    = 0x7b;      // '{'
3963
static const char16_t   chRBrace    = 0x7d;      // '}'
3964
static const char16_t   chNEL       = 0x85;      //    NEL newline variant
3965
static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
3966
3967
3968
//------------------------------------------------------------------------------
3969
//
3970
//  nextCharLL    Low Level Next Char from the regex pattern.
3971
//                Get a char from the string, keep track of input position
3972
//                     for error reporting.
3973
//
3974
//------------------------------------------------------------------------------
3975
103M
UChar32  RegexCompile::nextCharLL() {
3976
103M
    UChar32       ch;
3977
3978
103M
    if (fPeekChar != -1) {
3979
373k
        ch = fPeekChar;
3980
373k
        fPeekChar = -1;
3981
373k
        return ch;
3982
373k
    }
3983
3984
    // assume we're already in the right place
3985
102M
    ch = UTEXT_NEXT32(fRXPat->fPattern);
3986
102M
    if (ch == U_SENTINEL) {
3987
14.6k
        return ch;
3988
14.6k
    }
3989
3990
102M
    if (ch == chCR ||
3991
102M
        ch == chNEL ||
3992
102M
        ch == chLS   ||
3993
102M
        (ch == chLF && fLastChar != chCR)) {
3994
        // Character is starting a new line.  Bump up the line number, and
3995
        //  reset the column to 0.
3996
377k
        fLineNum++;
3997
377k
        fCharNum=0;
3998
377k
    }
3999
102M
    else {
4000
        // Character is not starting a new line.  Except in the case of a
4001
        //   LF following a CR, increment the column position.
4002
102M
        if (ch != chLF) {
4003
102M
            fCharNum++;
4004
102M
        }
4005
102M
    }
4006
102M
    fLastChar = ch;
4007
102M
    return ch;
4008
102M
}
4009
4010
//------------------------------------------------------------------------------
4011
//
4012
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4013
//                 character without actually getting it.
4014
//
4015
//------------------------------------------------------------------------------
4016
624k
UChar32  RegexCompile::peekCharLL() {
4017
624k
    if (fPeekChar == -1) {
4018
376k
        fPeekChar = nextCharLL();
4019
376k
    }
4020
624k
    return fPeekChar;
4021
624k
}
4022
4023
4024
//------------------------------------------------------------------------------
4025
//
4026
//   nextChar     for pattern scanning.  At this level, we handle stripping
4027
//                out comments and processing some backslash character escapes.
4028
//                The rest of the pattern grammar is handled at the next level up.
4029
//
4030
//------------------------------------------------------------------------------
4031
98.6M
void RegexCompile::nextChar(RegexPatternChar &c) {
4032
98.6M
  tailRecursion:
4033
98.6M
    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4034
98.6M
    c.fChar    = nextCharLL();
4035
98.6M
    c.fQuoted  = false;
4036
4037
98.6M
    if (fQuoteMode) {
4038
263k
        c.fQuoted = true;
4039
263k
        if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4040
263k
            c.fChar == (UChar32)-1) {
4041
539
            fQuoteMode = false;  //  Exit quote mode,
4042
539
            nextCharLL();        // discard the E
4043
            // nextChar(c);      // recurse to get the real next char
4044
539
            goto tailRecursion;  // Note: fuzz testing produced testcases that
4045
                                 //       resulted in stack overflow here.
4046
539
        }
4047
263k
    }
4048
98.4M
    else if (fInBackslashQuote) {
4049
        // The current character immediately follows a '\'
4050
        // Don't check for any further escapes, just return it as-is.
4051
        // Don't set c.fQuoted, because that would prevent the state machine from
4052
        //    dispatching on the character.
4053
122k
        fInBackslashQuote = false;
4054
122k
    }
4055
98.3M
    else
4056
98.3M
    {
4057
        // We are not in a \Q quoted region \E of the source.
4058
        //
4059
98.3M
        if (fModeFlags & UREGEX_COMMENTS) {
4060
            //
4061
            // We are in free-spacing and comments mode.
4062
            //  Scan through any white space and comments, until we
4063
            //  reach a significant character or the end of input.
4064
11.4M
            for (;;) {
4065
11.4M
                if (c.fChar == (UChar32)-1) {
4066
1.28k
                    break;     // End of Input
4067
1.28k
                }
4068
11.4M
                if  (c.fChar == chPound && fEOLComments) {
4069
                    // Start of a comment.  Consume the rest of it, until EOF or a new line
4070
3.54M
                    for (;;) {
4071
3.54M
                        c.fChar = nextCharLL();
4072
3.54M
                        if (c.fChar == (UChar32)-1 ||  // EOF
4073
3.54M
                            c.fChar == chCR        ||
4074
3.54M
                            c.fChar == chLF        ||
4075
3.54M
                            c.fChar == chNEL       ||
4076
3.54M
                            c.fChar == chLS)       {
4077
5.13k
                            break;
4078
5.13k
                        }
4079
3.54M
                    }
4080
5.13k
                }
4081
                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4082
11.4M
                if (PatternProps::isWhiteSpace(c.fChar) == false) {
4083
11.1M
                    break;
4084
11.1M
                }
4085
277k
                c.fChar = nextCharLL();
4086
277k
            }
4087
11.1M
        }
4088
4089
        //
4090
        //  check for backslash escaped characters.
4091
        //
4092
98.3M
        if (c.fChar == chBackSlash) {
4093
363k
            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4094
363k
            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4095
                //
4096
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
4097
                //   Includes \uxxxx, \n, \r, many others.
4098
                //   Return the single equivalent character.
4099
                //
4100
237k
                nextCharLL();                 // get & discard the peeked char.
4101
237k
                c.fQuoted = true;
4102
4103
237k
                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4104
237k
                    int32_t endIndex = (int32_t)pos;
4105
237k
                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4106
4107
237k
                    if (endIndex == pos) {
4108
148
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4109
148
                    }
4110
237k
                    fCharNum += endIndex - pos;
4111
237k
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4112
237k
                } else {
4113
0
                    int32_t offset = 0;
4114
0
                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4115
4116
0
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4117
0
                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4118
4119
0
                    if (offset == 0) {
4120
0
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4121
0
                    } else if (context.lastOffset == offset) {
4122
0
                        UTEXT_PREVIOUS32(fRXPat->fPattern);
4123
0
                    } else if (context.lastOffset != offset-1) {
4124
0
                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4125
0
                    }
4126
0
                    fCharNum += offset;
4127
0
                }
4128
237k
            }
4129
125k
            else if (peekCharLL() == chDigit0) {
4130
                //  Octal Escape, using Java Regexp Conventions
4131
                //    which are \0 followed by 1-3 octal digits.
4132
                //    Different from ICU Unescape handling of Octal, which does not
4133
                //    require the leading 0.
4134
                //  Java also has the convention of only consuming 2 octal digits if
4135
                //    the three digit number would be > 0xff
4136
                //
4137
2.22k
                c.fChar = 0;
4138
2.22k
                nextCharLL();    // Consume the initial 0.
4139
2.22k
                int index;
4140
5.18k
                for (index=0; index<3; index++) {
4141
4.72k
                    int32_t ch = peekCharLL();
4142
4.72k
                    if (ch<chDigit0 || ch>chDigit7) {
4143
1.75k
                        if (index==0) {
4144
                           // \0 is not followed by any octal digits.
4145
263
                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4146
263
                        }
4147
1.75k
                        break;
4148
1.75k
                    }
4149
2.96k
                    c.fChar <<= 3;
4150
2.96k
                    c.fChar += ch&7;
4151
2.96k
                    if (c.fChar <= 255) {
4152
2.72k
                        nextCharLL();
4153
2.72k
                    } else {
4154
                        // The last digit made the number too big.  Forget we saw it.
4155
248
                        c.fChar >>= 3;
4156
248
                    }
4157
2.96k
                }
4158
2.22k
                c.fQuoted = true;
4159
2.22k
            }
4160
123k
            else if (peekCharLL() == chQ) {
4161
                //  "\Q"  enter quote mode, which will continue until "\E"
4162
610
                fQuoteMode = true;
4163
610
                nextCharLL();        // discard the 'Q'.
4164
                // nextChar(c);      // recurse to get the real next char.
4165
610
                goto tailRecursion;  // Note: fuzz testing produced test cases that
4166
                //                            resulted in stack overflow here.
4167
610
            }
4168
122k
            else
4169
122k
            {
4170
                // We are in a '\' escape that will be handled by the state table scanner.
4171
                // Just return the backslash, but remember that the following char is to
4172
                //  be taken literally.
4173
122k
                fInBackslashQuote = true;
4174
122k
            }
4175
363k
        }
4176
98.3M
    }
4177
4178
    // re-enable # to end-of-line comments, in case they were disabled.
4179
    // They are disabled by the parser upon seeing '(?', but this lasts for
4180
    //  the fetching of the next character only.
4181
98.6M
    fEOLComments = true;
4182
4183
    // putc(c.fChar, stdout);
4184
98.6M
}
4185
4186
4187
4188
//------------------------------------------------------------------------------
4189
//
4190
//  scanNamedChar
4191
//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4192
//
4193
//             The scan position will be at the 'N'.  On return
4194
//             the scan position should be just after the '}'
4195
//
4196
//             Return the UChar32
4197
//
4198
//------------------------------------------------------------------------------
4199
3.07k
UChar32  RegexCompile::scanNamedChar() {
4200
3.07k
    if (U_FAILURE(*fStatus)) {
4201
0
        return 0;
4202
0
    }
4203
4204
3.07k
    nextChar(fC);
4205
3.07k
    if (fC.fChar != chLBrace) {
4206
37
        error(U_REGEX_PROPERTY_SYNTAX);
4207
37
        return 0;
4208
37
    }
4209
4210
3.03k
    UnicodeString  charName;
4211
23.4k
    for (;;) {
4212
23.4k
        nextChar(fC);
4213
23.4k
        if (fC.fChar == chRBrace) {
4214
2.99k
            break;
4215
2.99k
        }
4216
20.4k
        if (fC.fChar == -1) {
4217
37
            error(U_REGEX_PROPERTY_SYNTAX);
4218
37
            return 0;
4219
37
        }
4220
20.4k
        charName.append(fC.fChar);
4221
20.4k
    }
4222
4223
2.99k
    char name[100];
4224
2.99k
    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4225
2.99k
         (uint32_t)charName.length()>=sizeof(name)) {
4226
        // All Unicode character names have only invariant characters.
4227
        // The API to get a character, given a name, accepts only char *, forcing us to convert,
4228
        //   which requires this error check
4229
7
        error(U_REGEX_PROPERTY_SYNTAX);
4230
7
        return 0;
4231
7
    }
4232
2.98k
    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4233
4234
2.98k
    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4235
2.98k
    if (U_FAILURE(*fStatus)) {
4236
102
        error(U_REGEX_PROPERTY_SYNTAX);
4237
102
    }
4238
4239
2.98k
    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4240
2.98k
    return theChar;
4241
2.99k
}
4242
4243
//------------------------------------------------------------------------------
4244
//
4245
//  scanProp   Construct a UnicodeSet from the text at the current scan
4246
//             position, which will be of the form \p{whaterver}
4247
//
4248
//             The scan position will be at the 'p' or 'P'.  On return
4249
//             the scan position should be just after the '}'
4250
//
4251
//             Return a UnicodeSet, constructed from the \P pattern,
4252
//             or nullptr if the pattern is invalid.
4253
//
4254
//------------------------------------------------------------------------------
4255
1.23k
UnicodeSet *RegexCompile::scanProp() {
4256
1.23k
    UnicodeSet    *uset = nullptr;
4257
4258
1.23k
    if (U_FAILURE(*fStatus)) {
4259
0
        return nullptr;
4260
0
    }
4261
1.23k
    (void)chLowerP;   // Suppress compiler unused variable warning.
4262
1.23k
    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4263
1.23k
    UBool negated = (fC.fChar == chP);
4264
4265
1.23k
    UnicodeString propertyName;
4266
1.23k
    nextChar(fC);
4267
1.23k
    if (fC.fChar != chLBrace) {
4268
29
        error(U_REGEX_PROPERTY_SYNTAX);
4269
29
        return nullptr;
4270
29
    }
4271
82.4k
    for (;;) {
4272
82.4k
        nextChar(fC);
4273
82.4k
        if (fC.fChar == chRBrace) {
4274
1.16k
            break;
4275
1.16k
        }
4276
81.2k
        if (fC.fChar == -1) {
4277
            // Hit the end of the input string without finding the closing '}'
4278
41
            error(U_REGEX_PROPERTY_SYNTAX);
4279
41
            return nullptr;
4280
41
        }
4281
81.2k
        propertyName.append(fC.fChar);
4282
81.2k
    }
4283
1.16k
    uset = createSetForProperty(propertyName, negated);
4284
1.16k
    nextChar(fC);    // Move input scan to position following the closing '}'
4285
1.16k
    return uset;
4286
1.20k
}
4287
4288
//------------------------------------------------------------------------------
4289
//
4290
//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4291
//             position, which is expected be of the form [:property expression:]
4292
//
4293
//             The scan position will be at the opening ':'.  On return
4294
//             the scan position must be on the closing ']'
4295
//
4296
//             Return a UnicodeSet constructed from the pattern,
4297
//             or nullptr if this is not a valid POSIX-style set expression.
4298
//             If not a property expression, restore the initial scan position
4299
//                (to the opening ':')
4300
//
4301
//               Note:  the opening '[:' is not sufficient to guarantee that
4302
//                      this is a [:property:] expression.
4303
//                      [:'+=,] is a perfectly good ordinary set expression that
4304
//                              happens to include ':' as one of its characters.
4305
//
4306
//------------------------------------------------------------------------------
4307
91.1k
UnicodeSet *RegexCompile::scanPosixProp() {
4308
91.1k
    UnicodeSet    *uset = nullptr;
4309
4310
91.1k
    if (U_FAILURE(*fStatus)) {
4311
0
        return nullptr;
4312
0
    }
4313
4314
91.1k
    U_ASSERT(fC.fChar == chColon);
4315
4316
    // Save the scanner state.
4317
    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4318
91.1k
    int64_t     savedScanIndex        = fScanIndex;
4319
91.1k
    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4320
91.1k
    UBool       savedQuoteMode        = fQuoteMode;
4321
91.1k
    UBool       savedInBackslashQuote = fInBackslashQuote;
4322
91.1k
    UBool       savedEOLComments      = fEOLComments;
4323
91.1k
    int64_t     savedLineNum          = fLineNum;
4324
91.1k
    int64_t     savedCharNum          = fCharNum;
4325
91.1k
    UChar32     savedLastChar         = fLastChar;
4326
91.1k
    UChar32     savedPeekChar         = fPeekChar;
4327
91.1k
    RegexPatternChar savedfC          = fC;
4328
4329
    // Scan for a closing ].   A little tricky because there are some perverse
4330
    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4331
    //   ending on the second closing ].
4332
4333
91.1k
    UnicodeString propName;
4334
91.1k
    UBool         negated  = false;
4335
4336
    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4337
91.1k
    nextChar(fC);
4338
91.1k
    if (fC.fChar == chUp) {
4339
78
       negated = true;
4340
78
       nextChar(fC);
4341
78
    }
4342
4343
    // Scan for the closing ":]", collecting the property name along the way.
4344
91.1k
    UBool  sawPropSetTerminator = false;
4345
40.9M
    for (;;) {
4346
40.9M
        propName.append(fC.fChar);
4347
40.9M
        nextChar(fC);
4348
40.9M
        if (fC.fQuoted || fC.fChar == -1) {
4349
            // Escaped characters or end of input - either says this isn't a [:Property:]
4350
8.88k
            break;
4351
8.88k
        }
4352
40.9M
        if (fC.fChar == chColon) {
4353
82.2k
            nextChar(fC);
4354
82.2k
            if (fC.fChar == chRBracket) {
4355
69.6k
                sawPropSetTerminator = true;
4356
69.6k
            }
4357
82.2k
            break;
4358
82.2k
        }
4359
40.9M
    }
4360
4361
91.1k
    if (sawPropSetTerminator) {
4362
69.6k
        uset = createSetForProperty(propName, negated);
4363
69.6k
    }
4364
21.4k
    else
4365
21.4k
    {
4366
        // No closing ":]".
4367
        //  Restore the original scan position.
4368
        //  The main scanner will retry the input as a normal set expression,
4369
        //    not a [:Property:] expression.
4370
21.4k
        fScanIndex        = savedScanIndex;
4371
21.4k
        fQuoteMode        = savedQuoteMode;
4372
21.4k
        fInBackslashQuote = savedInBackslashQuote;
4373
21.4k
        fEOLComments      = savedEOLComments;
4374
21.4k
        fLineNum          = savedLineNum;
4375
21.4k
        fCharNum          = savedCharNum;
4376
21.4k
        fLastChar         = savedLastChar;
4377
21.4k
        fPeekChar         = savedPeekChar;
4378
21.4k
        fC                = savedfC;
4379
21.4k
        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4380
21.4k
    }
4381
91.1k
    return uset;
4382
91.1k
}
4383
4384
0
static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4385
0
    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4386
0
    addCategory(set, U_GC_CF_MASK, ec);
4387
0
}
4388
4389
//
4390
//  Create a Unicode Set from a Unicode Property expression.
4391
//     This is common code underlying both \p{...} and [:...:] expressions.
4392
//     Includes trying the Java "properties" that aren't supported as
4393
//     normal ICU UnicodeSet properties
4394
//
4395
70.8k
UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4396
4397
70.8k
    if (U_FAILURE(*fStatus)) {
4398
1
        return nullptr;
4399
1
    }
4400
70.8k
    LocalPointer<UnicodeSet> set;
4401
70.8k
    UErrorCode status = U_ZERO_ERROR;
4402
4403
70.8k
    do {      // non-loop, exists to allow breaks from the block.
4404
        //
4405
        //  First try the property as we received it
4406
        //
4407
70.8k
        UnicodeString   setExpr;
4408
70.8k
        uint32_t        usetFlags = 0;
4409
70.8k
        setExpr.append(u"[\\p{", -1);
4410
70.8k
        setExpr.append(propName);
4411
70.8k
        setExpr.append(u"}]", -1);
4412
70.8k
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4413
9.58k
            usetFlags |= USET_CASE_INSENSITIVE;
4414
9.58k
        }
4415
70.8k
        set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status);
4416
70.8k
        if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4417
55.4k
            break;
4418
55.4k
        }
4419
4420
        //
4421
        //  The incoming property wasn't directly recognized by ICU.
4422
4423
        //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4424
        //     Java accepts 'word' with mixed case.
4425
        //     Java accepts 'all' only in all lower case.
4426
4427
15.4k
        status = U_ZERO_ERROR;
4428
15.4k
        if (propName.caseCompare(u"word", -1, 0) == 0) {
4429
430
            set.adoptInsteadAndCheckErrorCode(
4430
430
                RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4431
430
            break;
4432
430
        }
4433
15.0k
        if (propName.compare(u"all", -1) == 0) {
4434
280
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4435
280
            break;
4436
280
        }
4437
4438
4439
        //    Do Java InBlock expressions
4440
        //
4441
14.7k
        UnicodeString mPropName = propName;
4442
14.7k
        if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4443
7.79k
            status = U_ZERO_ERROR;
4444
7.79k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4445
7.79k
            if (U_FAILURE(status)) {
4446
0
                break;
4447
0
            }
4448
7.79k
            UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4449
7.79k
            set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4450
7.79k
            break;
4451
7.79k
        }
4452
4453
        //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4454
        //  as "BooleanPropertyValue". The property value can be either a
4455
        //  a General Category or a Script Name.
4456
4457
6.92k
        if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4458
5.03k
            mPropName.remove(0, 2);      // Strip the "Is"
4459
5.03k
            if (mPropName.indexOf(u'=') >= 0) {
4460
                // Reject any "Is..." property expression containing an '=', that is,
4461
                // any non-binary property expression.
4462
16
                status = U_REGEX_PROPERTY_SYNTAX;
4463
16
                break;
4464
16
            }
4465
4466
5.01k
            if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4467
0
                mPropName.setTo(u"unassigned", -1);
4468
0
                negated = !negated;
4469
5.01k
            } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4470
0
                mPropName.setTo(u"Titlecase_Letter", -1);
4471
0
            }
4472
4473
5.01k
            mPropName.insert(0, u"[\\p{", -1);
4474
5.01k
            mPropName.append(u"}]", -1);
4475
5.01k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4476
4477
5.01k
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4478
1.04k
                set->closeOver(USET_CASE_INSENSITIVE);
4479
1.04k
            }
4480
5.01k
            break;
4481
4482
5.03k
        }
4483
4484
1.89k
        if (propName.startsWith(u"java", -1)) {
4485
0
            status = U_ZERO_ERROR;
4486
0
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4487
0
            if (U_FAILURE(status)) {
4488
0
                break;
4489
0
            }
4490
            //
4491
            //  Try the various Java specific properties.
4492
            //   These all begin with "java"
4493
            //
4494
0
            if (propName.compare(u"javaDefined", -1) == 0) {
4495
0
                addCategory(set.getAlias(), U_GC_CN_MASK, status);
4496
0
                set->complement();
4497
0
            }
4498
0
            else if (propName.compare(u"javaDigit", -1) == 0) {
4499
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4500
0
            }
4501
0
            else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4502
0
                addIdentifierIgnorable(set.getAlias(), status);
4503
0
            }
4504
0
            else if (propName.compare(u"javaISOControl", -1) == 0) {
4505
0
                set->add(0, 0x1F).add(0x7F, 0x9F);
4506
0
            }
4507
0
            else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4508
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4509
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4510
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4511
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4512
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4513
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4514
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4515
0
                addIdentifierIgnorable(set.getAlias(), status);
4516
0
            }
4517
0
            else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4518
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4519
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4520
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4521
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4522
0
            }
4523
0
            else if (propName.compare(u"javaLetter", -1) == 0) {
4524
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4525
0
            }
4526
0
            else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4527
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4528
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4529
0
            }
4530
0
            else if (propName.compare(u"javaLowerCase", -1) == 0) {
4531
0
                addCategory(set.getAlias(), U_GC_LL_MASK, status);
4532
0
            }
4533
0
            else if (propName.compare(u"javaMirrored", -1) == 0) {
4534
0
                set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4535
0
            }
4536
0
            else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4537
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4538
0
            }
4539
0
            else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4540
0
                set->add(0x10000, UnicodeSet::MAX_VALUE);
4541
0
            }
4542
0
            else if (propName.compare(u"javaTitleCase", -1) == 0) {
4543
0
                addCategory(set.getAlias(), U_GC_LT_MASK, status);
4544
0
            }
4545
0
            else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4546
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4547
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4548
0
            }
4549
0
            else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4550
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4551
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4552
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4553
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4554
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4555
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4556
0
                addIdentifierIgnorable(set.getAlias(), status);
4557
0
            }
4558
0
            else if (propName.compare(u"javaUpperCase", -1) == 0) {
4559
0
                addCategory(set.getAlias(), U_GC_LU_MASK, status);
4560
0
            }
4561
0
            else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4562
0
                set->add(0, UnicodeSet::MAX_VALUE);
4563
0
            }
4564
0
            else if (propName.compare(u"javaWhitespace", -1) == 0) {
4565
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4566
0
                set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4567
0
                set->add(9, 0x0d).add(0x1c, 0x1f);
4568
0
            } else {
4569
0
                status = U_REGEX_PROPERTY_SYNTAX;
4570
0
            }
4571
4572
0
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4573
0
                set->closeOver(USET_CASE_INSENSITIVE);
4574
0
            }
4575
0
            break;
4576
0
        }
4577
4578
        // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4579
        // extensions matched it.
4580
1.89k
        status = U_REGEX_PROPERTY_SYNTAX;
4581
1.89k
    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4582
4583
70.8k
    if (U_SUCCESS(status)) {
4584
        // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4585
        // deal with properties of strings and character classes with strings, we ignore them.
4586
        // Just in case something downstream might stumble over the strings,
4587
        // we remove them from the set.
4588
        // Note that when we support strings, the complement of a property (as with \P)
4589
        // should be implemented as .complement().removeAllStrings() (code point complement).
4590
68.8k
        set->removeAllStrings();
4591
68.8k
        U_ASSERT(set.isValid());
4592
68.8k
        if (negated) {
4593
965
            set->complement();
4594
965
        }
4595
68.8k
        return set.orphan();
4596
68.8k
    } else {
4597
1.98k
        if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4598
71
            status = U_REGEX_PROPERTY_SYNTAX;
4599
71
        }
4600
1.98k
        error(status);
4601
1.98k
        return nullptr;
4602
1.98k
    }
4603
70.8k
}
4604
4605
4606
//
4607
//  SetEval   Part of the evaluation of [set expressions].
4608
//            Perform any pending (stacked) operations with precedence
4609
//            equal or greater to that of the next operator encountered
4610
//            in the expression.
4611
//
4612
8.91M
void RegexCompile::setEval(int32_t nextOp) {
4613
8.91M
    UnicodeSet *rightOperand = nullptr;
4614
8.91M
    UnicodeSet *leftOperand  = nullptr;
4615
9.54M
    for (;;) {
4616
9.54M
        U_ASSERT(fSetOpStack.empty()==false);
4617
9.54M
        int32_t pendingSetOperation = fSetOpStack.peeki();
4618
9.54M
        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4619
8.91M
            break;
4620
8.91M
        }
4621
634k
        fSetOpStack.popi();
4622
634k
        U_ASSERT(fSetStack.empty() == false);
4623
634k
        rightOperand = (UnicodeSet *)fSetStack.peek();
4624
        // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4625
        // (see comments there).
4626
        // We also do not yet support string literals in character classes,
4627
        // so there should not be any strings.
4628
        // Note that when we support strings, the complement of a set (as with ^ or \P)
4629
        // should be implemented as .complement().removeAllStrings() (code point complement).
4630
634k
        U_ASSERT(!rightOperand->hasStrings());
4631
634k
        switch (pendingSetOperation) {
4632
671
            case setNegation:
4633
671
                rightOperand->complement();
4634
671
                break;
4635
559k
            case setCaseClose:
4636
                // TODO: need a simple close function.  Ticket 6065
4637
559k
                rightOperand->closeOver(USET_CASE_INSENSITIVE);
4638
559k
                rightOperand->removeAllStrings();
4639
559k
                break;
4640
1.67k
            case setDifference1:
4641
2.19k
            case setDifference2:
4642
2.19k
                fSetStack.pop();
4643
2.19k
                leftOperand = (UnicodeSet *)fSetStack.peek();
4644
2.19k
                leftOperand->removeAll(*rightOperand);
4645
2.19k
                delete rightOperand;
4646
2.19k
                break;
4647
515
            case setIntersection1:
4648
2.11k
            case setIntersection2:
4649
2.11k
                fSetStack.pop();
4650
2.11k
                leftOperand = (UnicodeSet *)fSetStack.peek();
4651
2.11k
                leftOperand->retainAll(*rightOperand);
4652
2.11k
                delete rightOperand;
4653
2.11k
                break;
4654
70.2k
            case setUnion:
4655
70.2k
                fSetStack.pop();
4656
70.2k
                leftOperand = (UnicodeSet *)fSetStack.peek();
4657
70.2k
                leftOperand->addAll(*rightOperand);
4658
70.2k
                delete rightOperand;
4659
70.2k
                break;
4660
0
            default:
4661
0
                UPRV_UNREACHABLE_EXIT;
4662
634k
            }
4663
634k
        }
4664
8.91M
    }
4665
4666
82.2k
void RegexCompile::setPushOp(int32_t op) {
4667
82.2k
    setEval(op);
4668
82.2k
    fSetOpStack.push(op, *fStatus);
4669
82.2k
    LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4670
82.2k
    fSetStack.push(lpSet.orphan(), *fStatus);
4671
82.2k
}
4672
4673
U_NAMESPACE_END
4674
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4675