Coverage Report

Created: 2023-03-29 06:15

/src/icu/icu4c/source/i18n/regexcmp.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  regexcmp.cpp
5
//
6
//  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the ICU regular expression compiler, which is responsible
10
//  for processing a regular expression pattern into the compiled form that
11
//  is used by the match finding engine.
12
//
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18
#include "unicode/ustring.h"
19
#include "unicode/unistr.h"
20
#include "unicode/uniset.h"
21
#include "unicode/uchar.h"
22
#include "unicode/uchriter.h"
23
#include "unicode/parsepos.h"
24
#include "unicode/parseerr.h"
25
#include "unicode/regex.h"
26
#include "unicode/utf.h"
27
#include "unicode/utf16.h"
28
#include "patternprops.h"
29
#include "putilimp.h"
30
#include "cmemory.h"
31
#include "cstr.h"
32
#include "cstring.h"
33
#include "uvectr32.h"
34
#include "uvectr64.h"
35
#include "uassert.h"
36
#include "uinvchar.h"
37
38
#include "regeximp.h"
39
#include "regexcst.h"   // Contains state table for the regex pattern parser.
40
                        //   generated by a Perl script.
41
#include "regexcmp.h"
42
#include "regexst.h"
43
#include "regextxt.h"
44
45
46
47
U_NAMESPACE_BEGIN
48
49
50
//------------------------------------------------------------------------------
51
//
52
//  Constructor.
53
//
54
//------------------------------------------------------------------------------
55
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56
   fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
57
11.5k
{
58
    // Lazy init of all shared global sets (needed for init()'s empty text)
59
11.5k
    RegexStaticSets::initGlobals(&status);
60
61
11.5k
    fStatus           = &status;
62
63
11.5k
    fRXPat            = rxp;
64
11.5k
    fScanIndex        = 0;
65
11.5k
    fLastChar         = -1;
66
11.5k
    fPeekChar         = -1;
67
11.5k
    fLineNum          = 1;
68
11.5k
    fCharNum          = 0;
69
11.5k
    fQuoteMode        = false;
70
11.5k
    fInBackslashQuote = false;
71
11.5k
    fModeFlags        = fRXPat->fFlags | 0x80000000;
72
11.5k
    fEOLComments      = true;
73
74
11.5k
    fMatchOpenParen   = -1;
75
11.5k
    fMatchCloseParen  = -1;
76
11.5k
    fCaptureName      = nullptr;
77
11.5k
    fLastSetLiteral   = U_SENTINEL;
78
79
11.5k
    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80
0
        status = rxp->fDeferredStatus;
81
0
    }
82
11.5k
}
83
84
static const char16_t   chAmp       = 0x26;      // '&'
85
static const char16_t   chDash      = 0x2d;      // '-'
86
87
88
//------------------------------------------------------------------------------
89
//
90
//  Destructor
91
//
92
//------------------------------------------------------------------------------
93
11.5k
RegexCompile::~RegexCompile() {
94
11.5k
    delete fCaptureName;         // Normally will be nullptr, but can exist if pattern
95
                                 //   compilation stops with a syntax error.
96
11.5k
}
97
98
519
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99
519
    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100
519
}
101
102
//------------------------------------------------------------------------------
103
//
104
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105
//                           The state tables are hand-written in the file regexcst.txt,
106
//                           and converted to the form used here by a perl
107
//                           script regexcst.pl
108
//
109
//------------------------------------------------------------------------------
110
void    RegexCompile::compile(
111
                         const UnicodeString &pat,   // Source pat to be compiled.
112
                         UParseError &pp,            // Error position info
113
                         UErrorCode &e)              // Error Code
114
0
{
115
0
    fRXPat->fPatternString = new UnicodeString(pat);
116
0
    UText patternText = UTEXT_INITIALIZER;
117
0
    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119
0
    if (U_SUCCESS(e)) {
120
0
        compile(&patternText, pp, e);
121
0
        utext_close(&patternText);
122
0
    }
123
0
}
124
125
//
126
//   compile, UText mode
127
//     All the work is actually done here.
128
//
129
void    RegexCompile::compile(
130
                         UText *pat,                 // Source pat to be compiled.
131
                         UParseError &pp,            // Error position info
132
                         UErrorCode &e)              // Error Code
133
11.5k
{
134
11.5k
    fStatus             = &e;
135
11.5k
    fParseErr           = &pp;
136
11.5k
    fStackPtr           = 0;
137
11.5k
    fStack[fStackPtr]   = 0;
138
139
11.5k
    if (U_FAILURE(*fStatus)) {
140
0
        return;
141
0
    }
142
143
    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144
11.5k
    U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0);
145
146
    // Prepare the RegexPattern object to receive the compiled pattern.
147
11.5k
    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, false, true, fStatus);
148
11.5k
    if (U_FAILURE(*fStatus)) {
149
0
        return;
150
0
    }
151
152
    // Initialize the pattern scanning state machine
153
11.5k
    fPatternLength = utext_nativeLength(pat);
154
11.5k
    uint16_t                state = 1;
155
11.5k
    const RegexTableEl      *tableEl;
156
157
    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158
11.5k
    if (fModeFlags & UREGEX_LITERAL) {
159
0
        fQuoteMode = true;
160
0
    }
161
162
11.5k
    nextChar(fC);                        // Fetch the first char from the pattern string.
163
164
    //
165
    // Main loop for the regex pattern parsing state machine.
166
    //   Runs once per state transition.
167
    //   Each time through optionally performs, depending on the state table,
168
    //      - an advance to the the next pattern char
169
    //      - an action to be performed.
170
    //      - pushing or popping a state to/from the local state return stack.
171
    //   file regexcst.txt is the source for the state table.  The logic behind
172
    //     recongizing the pattern syntax is there, not here.
173
    //
174
105M
    for (;;) {
175
        //  Bail out if anything has gone wrong.
176
        //  Regex pattern parsing stops on the first error encountered.
177
105M
        if (U_FAILURE(*fStatus)) {
178
234
            break;
179
234
        }
180
181
105M
        U_ASSERT(state != 0);
182
183
        // Find the state table element that matches the input char from the pattern, or the
184
        //    class of the input character.  Start with the first table row for this
185
        //    state, then linearly scan forward until we find a row that matches the
186
        //    character.  The last row for each state always matches all characters, so
187
        //    the search will stop there, if not before.
188
        //
189
105M
        tableEl = &gRuleParseStateTable[state];
190
105M
        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191
105M
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193
544M
        for (;;) {    // loop through table rows belonging to this state, looking for one
194
                      //   that matches the current input char.
195
544M
            REGEX_SCAN_DEBUG_PRINTF(("."));
196
544M
            if (tableEl->fCharClass < 127 && fC.fQuoted == false &&   tableEl->fCharClass == fC.fChar) {
197
                // Table row specified an individual character, not a set, and
198
                //   the input character is not quoted, and
199
                //   the input character matched it.
200
28.3M
                break;
201
28.3M
            }
202
515M
            if (tableEl->fCharClass == 255) {
203
                // Table row specified default, match anything character class.
204
54.0M
                break;
205
54.0M
            }
206
461M
            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207
                // Table row specified "quoted" and the char was quoted.
208
1.26M
                break;
209
1.26M
            }
210
460M
            if (tableEl->fCharClass == 253 && fC.fChar == (UChar32)-1)  {
211
                // Table row specified eof and we hit eof on the input.
212
8.30k
                break;
213
8.30k
            }
214
215
460M
            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216
460M
                fC.fQuoted == false &&                                       //   char is not escaped &&
217
460M
                fC.fChar != (UChar32)-1) {                                   //   char is not EOF
218
48.6M
                U_ASSERT(tableEl->fCharClass <= 137);
219
48.6M
                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220
                    // Table row specified a character class, or set of characters,
221
                    //   and the current char matches it.
222
22.1M
                    break;
223
22.1M
                }
224
48.6M
            }
225
226
            // No match on this row, advance to the next  row for this state,
227
438M
            tableEl++;
228
438M
        }
229
105M
        REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231
        //
232
        // We've found the row of the state table that matches the current input
233
        //   character from the rules string.
234
        // Perform any action specified  by this row in the state table.
235
105M
        if (doParseActions(tableEl->fAction) == false) {
236
            // Break out of the state machine loop if the
237
            //   the action signalled some kind of error, or
238
            //   the action was to exit, occurs on normal end-of-rules-input.
239
11.2k
            break;
240
11.2k
        }
241
242
105M
        if (tableEl->fPushState != 0) {
243
644k
            fStackPtr++;
244
644k
            if (fStackPtr >= kStackSize) {
245
3
                error(U_REGEX_INTERNAL_ERROR);
246
3
                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247
3
                fStackPtr--;
248
3
            }
249
644k
            fStack[fStackPtr] = tableEl->fPushState;
250
644k
        }
251
252
        //
253
        //  NextChar.  This is where characters are actually fetched from the pattern.
254
        //             Happens under control of the 'n' tag in the state table.
255
        //
256
105M
        if (tableEl->fNextChar) {
257
58.1M
            nextChar(fC);
258
58.1M
        }
259
260
        // Get the next state from the table entry, or from the
261
        //   state stack if the next state was specified as "pop".
262
105M
        if (tableEl->fNextState != 255) {
263
105M
            state = tableEl->fNextState;
264
105M
        } else {
265
625k
            state = fStack[fStackPtr];
266
625k
            fStackPtr--;
267
625k
            if (fStackPtr < 0) {
268
                // state stack underflow
269
                // This will occur if the user pattern has mis-matched parentheses,
270
                //   with extra close parens.
271
                //
272
0
                fStackPtr++;
273
0
                error(U_REGEX_MISMATCHED_PAREN);
274
0
            }
275
625k
        }
276
277
105M
    }
278
279
11.5k
    if (U_FAILURE(*fStatus)) {
280
        // Bail out if the pattern had errors.
281
7.03k
        return;
282
7.03k
    }
283
284
    //
285
    // The pattern has now been read and processed, and the compiled code generated.
286
    //
287
288
    //
289
    // The pattern's fFrameSize so far has accumulated the requirements for
290
    //   storage for capture parentheses, counters, etc. that are encountered
291
    //   in the pattern.  Add space for the two variables that are always
292
    //   present in the saved state:  the input string position (int64_t) and
293
    //   the position in the compiled pattern.
294
    //
295
4.49k
    allocateStackData(RESTACKFRAME_HDRCOUNT);
296
297
    //
298
    // Optimization pass 1: NOPs, back-references, and case-folding
299
    //
300
4.49k
    stripNOPs();
301
302
    //
303
    // Get bounds for the minimum and maximum length of a string that this
304
    //   pattern can match.  Used to avoid looking for matches in strings that
305
    //   are too short.
306
    //
307
4.49k
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308
309
    //
310
    // Optimization pass 2: match start type
311
    //
312
4.49k
    matchStartType();
313
314
    //
315
    // Set up fast latin-1 range sets
316
    //
317
4.49k
    int32_t numSets = fRXPat->fSets->size();
318
4.49k
    fRXPat->fSets8 = new Regex8BitSet[numSets];
319
    // Null pointer check.
320
4.49k
    if (fRXPat->fSets8 == nullptr) {
321
0
        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322
0
        return;
323
0
    }
324
4.49k
    int32_t i;
325
116k
    for (i=0; i<numSets; i++) {
326
111k
        UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(i);
327
111k
        fRXPat->fSets8[i].init(s);
328
111k
    }
329
330
4.49k
}
331
332
333
334
335
336
//------------------------------------------------------------------------------
337
//
338
//  doParseAction        Do some action during regex pattern parsing.
339
//                       Called by the parse state machine.
340
//
341
//                       Generation of the match engine PCode happens here, or
342
//                       in functions called from the parse actions defined here.
343
//
344
//
345
//------------------------------------------------------------------------------
346
UBool RegexCompile::doParseActions(int32_t action)
347
105M
{
348
105M
    UBool   returnVal = true;
349
350
105M
    switch ((Regex_PatternParseAction)action) {
351
352
11.3k
    case doPatStart:
353
        // Start of pattern compiles to:
354
        //0   SAVE   2        Fall back to position of FAIL
355
        //1   jmp    3
356
        //2   FAIL            Stop if we ever reach here.
357
        //3   NOP             Dummy, so start of pattern looks the same as
358
        //                    the start of an ( grouping.
359
        //4   NOP             Resreved, will be replaced by a save if there are
360
        //                    OR | operators at the top level
361
11.3k
        appendOp(URX_STATE_SAVE, 2);
362
11.3k
        appendOp(URX_JMP,  3);
363
11.3k
        appendOp(URX_FAIL, 0);
364
365
        // Standard open nonCapture paren action emits the two NOPs and
366
        //   sets up the paren stack frame.
367
11.3k
        doParseActions(doOpenNonCaptureParen);
368
11.3k
        break;
369
370
6.68k
    case doPatFinish:
371
        // We've scanned to the end of the pattern
372
        //  The end of pattern compiles to:
373
        //        URX_END
374
        //    which will stop the runtime match engine.
375
        //  Encountering end of pattern also behaves like a close paren,
376
        //   and forces fixups of the State Save at the beginning of the compiled pattern
377
        //   and of any OR operations at the top level.
378
        //
379
6.68k
        handleCloseParen();
380
6.68k
        if (fParenStack.size() > 0) {
381
            // Missing close paren in pattern.
382
2.19k
            error(U_REGEX_MISMATCHED_PAREN);
383
2.19k
        }
384
385
        // add the END operation to the compiled pattern.
386
6.68k
        appendOp(URX_END, 0);
387
388
        // Terminate the pattern compilation state machine.
389
6.68k
        returnVal = false;
390
6.68k
        break;
391
392
393
394
26.1M
    case doOrOperator:
395
        // Scanning a '|', as in (A|B)
396
26.1M
        {
397
            // Generate code for any pending literals preceding the '|'
398
26.1M
            fixLiterals(false);
399
400
            // Insert a SAVE operation at the start of the pattern section preceding
401
            //   this OR at this level.  This SAVE will branch the match forward
402
            //   to the right hand side of the OR in the event that the left hand
403
            //   side fails to match and backtracks.  Locate the position for the
404
            //   save from the location on the top of the parentheses stack.
405
26.1M
            int32_t savePosition = fParenStack.popi();
406
26.1M
            int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(savePosition);
407
26.1M
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
408
26.1M
            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409
26.1M
            fRXPat->fCompiledPat->setElementAt(op, savePosition);
410
411
            // Append an JMP operation into the compiled pattern.  The operand for
412
            //  the JMP will eventually be the location following the ')' for the
413
            //  group.  This will be patched in later, when the ')' is encountered.
414
26.1M
            appendOp(URX_JMP, 0);
415
416
            // Push the position of the newly added JMP op onto the parentheses stack.
417
            // This registers if for fixup when this block's close paren is encountered.
418
26.1M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419
420
            // Append a NOP to the compiled pattern.  This is the slot reserved
421
            //   for a SAVE in the event that there is yet another '|' following
422
            //   this one.
423
26.1M
            appendOp(URX_NOP, 0);
424
26.1M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425
26.1M
        }
426
26.1M
        break;
427
428
429
620
    case doBeginNamedCapture:
430
        // Scanning (?<letter.
431
        //   The first letter of the name will come through again under doConinueNamedCapture.
432
620
        fCaptureName = new UnicodeString();
433
620
        if (fCaptureName == nullptr) {
434
0
            error(U_MEMORY_ALLOCATION_ERROR);
435
0
        }
436
620
        break;
437
438
2.19k
    case  doContinueNamedCapture:
439
2.19k
        fCaptureName->append(fC.fChar);
440
2.19k
        break;
441
442
27
    case doBadNamedCapture:
443
27
        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444
27
        break;
445
        
446
329k
    case doOpenCaptureParen:
447
        // Open Capturing Paren, possibly named.
448
        //   Compile to a
449
        //      - NOP, which later may be replaced by a save-state if the
450
        //         parenthesized group gets a * quantifier, followed by
451
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
452
        //      - NOP, which may later be replaced by a save-state if there
453
        //             is an '|' alternation within the parens.
454
        //
455
        //    Each capture group gets three slots in the save stack frame:
456
        //         0: Capture Group start position (in input string being matched.)
457
        //         1: Capture Group end position.
458
        //         2: Start of Match-in-progress.
459
        //    The first two locations are for a completed capture group, and are
460
        //     referred to by back references and the like.
461
        //    The third location stores the capture start position when an START_CAPTURE is
462
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
463
        //      END_CAPTURE is encountered.
464
329k
        {
465
329k
            fixLiterals();
466
329k
            appendOp(URX_NOP, 0);
467
329k
            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
468
329k
            appendOp(URX_START_CAPTURE, varsLoc);
469
329k
            appendOp(URX_NOP, 0);
470
471
            // On the Parentheses stack, start a new frame and add the positions
472
            //   of the two NOPs.  Depending on what follows in the pattern, the
473
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
474
            //   address of the end of the parenthesized group.
475
329k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
476
329k
            fParenStack.push(capturing, *fStatus);                        // Frame type.
477
329k
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
478
329k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
479
480
            // Save the mapping from group number to stack frame variable position.
481
329k
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482
483
            // If this is a named capture group, add the name->group number mapping.
484
329k
            if (fCaptureName != nullptr) {
485
611
                if (!fRXPat->initNamedCaptureMap()) {
486
0
                    if (U_SUCCESS(*fStatus)) {
487
0
                        error(fRXPat->fDeferredStatus);
488
0
                    }
489
0
                    break;
490
0
                }
491
611
                int32_t groupNumber = fRXPat->fGroupMap->size();
492
611
                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493
611
                fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
494
611
                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495
35
                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496
35
                }
497
611
            }
498
329k
        }
499
329k
        break;
500
501
329k
    case doOpenNonCaptureParen:
502
        // Open non-caputuring (grouping only) Paren.
503
        //   Compile to a
504
        //      - NOP, which later may be replaced by a save-state if the
505
        //         parenthesized group gets a * quantifier, followed by
506
        //      - NOP, which may later be replaced by a save-state if there
507
        //             is an '|' alternation within the parens.
508
12.6k
        {
509
12.6k
            fixLiterals();
510
12.6k
            appendOp(URX_NOP, 0);
511
12.6k
            appendOp(URX_NOP, 0);
512
513
            // On the Parentheses stack, start a new frame and add the positions
514
            //   of the two NOPs.
515
12.6k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
516
12.6k
            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
517
12.6k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
518
12.6k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
519
12.6k
        }
520
12.6k
         break;
521
522
523
560
    case doOpenAtomicParen:
524
        // Open Atomic Paren.  (?>
525
        //   Compile to a
526
        //      - NOP, which later may be replaced if the parenthesized group
527
        //         has a quantifier, followed by
528
        //      - STO_SP  save state stack position, so it can be restored at the ")"
529
        //      - NOP, which may later be replaced by a save-state if there
530
        //             is an '|' alternation within the parens.
531
560
        {
532
560
            fixLiterals();
533
560
            appendOp(URX_NOP, 0);
534
560
            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
535
560
            appendOp(URX_STO_SP, varLoc);
536
560
            appendOp(URX_NOP, 0);
537
538
            // On the Parentheses stack, start a new frame and add the positions
539
            //   of the two NOPs.  Depending on what follows in the pattern, the
540
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
541
            //   address of the end of the parenthesized group.
542
560
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
543
560
            fParenStack.push(atomic, *fStatus);                           // Frame type.
544
560
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
545
560
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
546
560
        }
547
560
        break;
548
549
550
41.4k
    case doOpenLookAhead:
551
        // Positive Look-ahead   (?=  stuff  )
552
        //
553
        //   Note:   Addition of transparent input regions, with the need to
554
        //           restore the original regions when failing out of a lookahead
555
        //           block, complicated this sequence.  Some combined opcodes
556
        //           might make sense - or might not, lookahead aren't that common.
557
        //
558
        //      Caution:  min match length optimization knows about this
559
        //               sequence; don't change without making updates there too.
560
        //
561
        // Compiles to
562
        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
563
        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
564
        //    3    JMP          6           continue ...
565
        //
566
        //    4.   LA_END                   Look Ahead failed.  Restore regions.
567
        //    5.   BACKTRACK                and back track again.
568
        //
569
        //    6.   NOP              reserved for use by quantifiers on the block.
570
        //                          Look-ahead can't have quantifiers, but paren stack
571
        //                             compile time conventions require the slot anyhow.
572
        //    7.   NOP              may be replaced if there is are '|' ops in the block.
573
        //    8.     code for parenthesized stuff.
574
        //    9.   LA_END
575
        //
576
        //  Four data slots are reserved, for saving state on entry to the look-around
577
        //    0:   stack pointer on entry.
578
        //    1:   input position on entry.
579
        //    2:   fActiveStart, the active bounds start on entry.
580
        //    3:   fActiveLimit, the active bounds limit on entry.
581
41.4k
        {
582
41.4k
            fixLiterals();
583
41.4k
            int32_t dataLoc = allocateData(4);
584
41.4k
            appendOp(URX_LA_START, dataLoc);
585
41.4k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586
41.4k
            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587
41.4k
            appendOp(URX_LA_END, dataLoc);
588
41.4k
            appendOp(URX_BACKTRACK, 0);
589
41.4k
            appendOp(URX_NOP, 0);
590
41.4k
            appendOp(URX_NOP, 0);
591
592
            // On the Parentheses stack, start a new frame and add the positions
593
            //   of the NOPs.
594
41.4k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
595
41.4k
            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
596
41.4k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
597
41.4k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
598
41.4k
        }
599
41.4k
        break;
600
601
657
    case doOpenLookAheadNeg:
602
        // Negated Lookahead.   (?! stuff )
603
        // Compiles to
604
        //    1.    LA_START    dataloc
605
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
606
        //                                //   which continues with the match.
607
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
608
        //    4.       code for parenthesized stuff.
609
        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
610
        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
611
        //    7.    END_LA                // Restore match region, in case look-ahead was using
612
        //                                        an alternate (transparent) region.
613
        //  Four data slots are reserved, for saving state on entry to the look-around
614
        //    0:   stack pointer on entry.
615
        //    1:   input position on entry.
616
        //    2:   fActiveStart, the active bounds start on entry.
617
        //    3:   fActiveLimit, the active bounds limit on entry.
618
657
        {
619
657
            fixLiterals();
620
657
            int32_t dataLoc = allocateData(4);
621
657
            appendOp(URX_LA_START, dataLoc);
622
657
            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
623
657
            appendOp(URX_NOP, 0);
624
625
            // On the Parentheses stack, start a new frame and add the positions
626
            //   of the StateSave and NOP.
627
657
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
628
657
            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
629
657
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
630
657
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
631
632
            // Instructions #5 - #7 will be added when the ')' is encountered.
633
657
        }
634
657
        break;
635
636
2.49k
    case doOpenLookBehind:
637
2.49k
        {
638
            //   Compile a (?<= look-behind open paren.
639
            //
640
            //          Compiles to
641
            //              0       URX_LB_START     dataLoc
642
            //              1       URX_LB_CONT      dataLoc
643
            //              2                        MinMatchLen
644
            //              3                        MaxMatchLen
645
            //              4       URX_NOP          Standard '(' boilerplate.
646
            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
647
            //              6         <code for LookBehind expression>
648
            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
649
            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
650
            //
651
            //          Allocate a block of matcher data, to contain (when running a match)
652
            //              0:    Stack ptr on entry
653
            //              1:    Input Index on entry
654
            //              2:    fActiveStart, the active bounds start on entry.
655
            //              3:    fActiveLimit, the active bounds limit on entry.
656
            //              4:    Start index of match current match attempt.
657
            //          The first four items must match the layout of data for LA_START / LA_END
658
659
            // Generate match code for any pending literals.
660
2.49k
            fixLiterals();
661
662
            // Allocate data space
663
2.49k
            int32_t dataLoc = allocateData(5);
664
665
            // Emit URX_LB_START
666
2.49k
            appendOp(URX_LB_START, dataLoc);
667
668
            // Emit URX_LB_CONT
669
2.49k
            appendOp(URX_LB_CONT, dataLoc);
670
2.49k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
671
2.49k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
672
673
            // Emit the NOPs
674
2.49k
            appendOp(URX_NOP, 0);
675
2.49k
            appendOp(URX_NOP, 0);
676
677
            // On the Parentheses stack, start a new frame and add the positions
678
            //   of the URX_LB_CONT and the NOP.
679
2.49k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
680
2.49k
            fParenStack.push(lookBehind, *fStatus);                       // Frame type
681
2.49k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
682
2.49k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
683
684
            // The final two instructions will be added when the ')' is encountered.
685
2.49k
        }
686
687
2.49k
        break;
688
689
2.95k
    case doOpenLookBehindNeg:
690
2.95k
        {
691
            //   Compile a (?<! negated look-behind open paren.
692
            //
693
            //          Compiles to
694
            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
695
            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
696
            //              2                        MinMatchLen
697
            //              3                        MaxMatchLen
698
            //              4                        continueLoc (9)
699
            //              5       URX_NOP          Standard '(' boilerplate.
700
            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
701
            //              7         <code for LookBehind expression>
702
            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
703
            //              9       ...
704
            //
705
            //          Allocate a block of matcher data, to contain (when running a match)
706
            //              0:    Stack ptr on entry
707
            //              1:    Input Index on entry
708
            //              2:    fActiveStart, the active bounds start on entry.
709
            //              3:    fActiveLimit, the active bounds limit on entry.
710
            //              4:    Start index of match current match attempt.
711
            //          The first four items must match the layout of data for LA_START / LA_END
712
713
            // Generate match code for any pending literals.
714
2.95k
            fixLiterals();
715
716
            // Allocate data space
717
2.95k
            int32_t dataLoc = allocateData(5);
718
719
            // Emit URX_LB_START
720
2.95k
            appendOp(URX_LB_START, dataLoc);
721
722
            // Emit URX_LBN_CONT
723
2.95k
            appendOp(URX_LBN_CONT, dataLoc);
724
2.95k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
725
2.95k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
726
2.95k
            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
727
728
            // Emit the NOPs
729
2.95k
            appendOp(URX_NOP, 0);
730
2.95k
            appendOp(URX_NOP, 0);
731
732
            // On the Parentheses stack, start a new frame and add the positions
733
            //   of the URX_LB_CONT and the NOP.
734
2.95k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
735
2.95k
            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
736
2.95k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
737
2.95k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
738
739
            // The final two instructions will be added when the ')' is encountered.
740
2.95k
        }
741
2.95k
        break;
742
743
1
    case doConditionalExpr:
744
        // Conditionals such as (?(1)a:b)
745
3
    case doPerlInline:
746
        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
747
3
        error(U_REGEX_UNIMPLEMENTED);
748
3
        break;
749
750
751
376k
    case doCloseParen:
752
376k
        handleCloseParen();
753
376k
        if (fParenStack.size() <= 0) {
754
            //  Extra close paren, or missing open paren.
755
50
            error(U_REGEX_MISMATCHED_PAREN);
756
50
        }
757
376k
        break;
758
759
47.6M
    case doNOP:
760
47.6M
        break;
761
762
763
38
    case doBadOpenParenType:
764
84
    case doRuleError:
765
84
        error(U_REGEX_RULE_SYNTAX);
766
84
        break;
767
768
769
7
    case doMismatchedParenErr:
770
7
        error(U_REGEX_MISMATCHED_PAREN);
771
7
        break;
772
773
21.2k
    case doPlus:
774
        //  Normal '+'  compiles to
775
        //     1.   stuff to be repeated  (already built)
776
        //     2.   jmp-sav 1
777
        //     3.   ...
778
        //
779
        //  Or, if the item to be repeated can match a zero length string,
780
        //     1.   STO_INP_LOC  data-loc
781
        //     2.      body of stuff to be repeated
782
        //     3.   JMP_SAV_X    2
783
        //     4.   ...
784
785
        //
786
        //  Or, if the item to be repeated is simple
787
        //     1.   Item to be repeated.
788
        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
789
        //     3.   LOOP_C       stack location
790
21.2k
        {
791
21.2k
            int32_t  topLoc = blockTopLoc(false);        // location of item #1
792
21.2k
            int32_t  frameLoc;
793
794
            // Check for simple constructs, which may get special optimized code.
795
21.2k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796
15.8k
                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
797
798
15.8k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
799
                    // Emit optimized code for [char set]+
800
4.02k
                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
801
4.02k
                    frameLoc = allocateStackData(1);
802
4.02k
                    appendOp(URX_LOOP_C, frameLoc);
803
4.02k
                    break;
804
4.02k
                }
805
806
11.7k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
807
11.7k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
808
11.7k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
809
                    // Emit Optimized code for .+ operations.
810
2.77k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811
2.77k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
812
                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813
511
                        loopOpI |= 1;
814
511
                    }
815
2.77k
                    if (fModeFlags & UREGEX_UNIX_LINES) {
816
609
                        loopOpI |= 2;
817
609
                    }
818
2.77k
                    appendOp(loopOpI);
819
2.77k
                    frameLoc = allocateStackData(1);
820
2.77k
                    appendOp(URX_LOOP_C, frameLoc);
821
2.77k
                    break;
822
2.77k
                }
823
824
11.7k
            }
825
826
            // General case.
827
828
            // Check for minimum match length of zero, which requires
829
            //    extra loop-breaking code.
830
14.4k
            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831
                // Zero length match is possible.
832
                // Emit the code sequence that can handle it.
833
6.31k
                insertOp(topLoc);
834
6.31k
                frameLoc = allocateStackData(1);
835
836
6.31k
                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837
6.31k
                fRXPat->fCompiledPat->setElementAt(op, topLoc);
838
839
6.31k
                appendOp(URX_JMP_SAV_X, topLoc+1);
840
8.15k
            } else {
841
                // Simpler code when the repeated body must match something non-empty
842
8.15k
                appendOp(URX_JMP_SAV, topLoc);
843
8.15k
            }
844
14.4k
        }
845
0
        break;
846
847
1.15k
    case doNGPlus:
848
        //  Non-greedy '+?'  compiles to
849
        //     1.   stuff to be repeated  (already built)
850
        //     2.   state-save  1
851
        //     3.   ...
852
1.15k
        {
853
1.15k
            int32_t topLoc      = blockTopLoc(false);
854
1.15k
            appendOp(URX_STATE_SAVE, topLoc);
855
1.15k
        }
856
1.15k
        break;
857
858
859
8.66k
    case doOpt:
860
        // Normal (greedy) ? quantifier.
861
        //  Compiles to
862
        //     1. state save 3
863
        //     2.    body of optional block
864
        //     3. ...
865
        // Insert the state save into the compiled pattern, and we're done.
866
8.66k
        {
867
8.66k
            int32_t   saveStateLoc = blockTopLoc(true);
868
8.66k
            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869
8.66k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870
8.66k
        }
871
8.66k
        break;
872
873
712
    case doNGOpt:
874
        // Non-greedy ?? quantifier
875
        //   compiles to
876
        //    1.  jmp   4
877
        //    2.     body of optional block
878
        //    3   jmp   5
879
        //    4.  state save 2
880
        //    5    ...
881
        //  This code is less than ideal, with two jmps instead of one, because we can only
882
        //  insert one instruction at the top of the block being iterated.
883
712
        {
884
712
            int32_t  jmp1_loc = blockTopLoc(true);
885
712
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
886
887
712
            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
888
712
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889
890
712
            appendOp(URX_JMP, jmp2_loc+2);
891
892
712
            appendOp(URX_STATE_SAVE, jmp1_loc+1);
893
712
        }
894
712
        break;
895
896
897
144k
    case doStar:
898
        // Normal (greedy) * quantifier.
899
        // Compiles to
900
        //       1.   STATE_SAVE   4
901
        //       2.      body of stuff being iterated over
902
        //       3.   JMP_SAV      2
903
        //       4.   ...
904
        //
905
        // Or, if the body is a simple [Set],
906
        //       1.   LOOP_SR_I    set number
907
        //       2.   LOOP_C       stack location
908
        //       ...
909
        //
910
        // Or if this is a .*
911
        //       1.   LOOP_DOT_I    (. matches all mode flag)
912
        //       2.   LOOP_C        stack location
913
        //
914
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
915
        //       1.   STATE_SAVE   5
916
        //       2.   STO_INP_LOC  data-loc
917
        //       3.      body of stuff
918
        //       4.   JMP_SAV_X    2
919
        //       5.   ...
920
144k
        {
921
            // location of item #1, the STATE_SAVE
922
144k
            int32_t   topLoc = blockTopLoc(false);
923
144k
            int32_t   dataLoc = -1;
924
925
            // Check for simple *, where the construct being repeated
926
            //   compiled to single opcode, and might be optimizable.
927
144k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928
18.1k
                int32_t repeatedOp = (int32_t)fRXPat->fCompiledPat->elementAti(topLoc);
929
930
18.1k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
931
                    // Emit optimized code for a [char set]*
932
227
                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
933
227
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934
227
                    dataLoc = allocateStackData(1);
935
227
                    appendOp(URX_LOOP_C, dataLoc);
936
227
                    break;
937
227
                }
938
939
17.9k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
940
17.9k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
941
17.9k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
942
                    // Emit Optimized code for .* operations.
943
1.15k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944
1.15k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
945
                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946
287
                        loopOpI |= 1;
947
287
                    }
948
1.15k
                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949
252
                        loopOpI |= 2;
950
252
                    }
951
1.15k
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952
1.15k
                    dataLoc = allocateStackData(1);
953
1.15k
                    appendOp(URX_LOOP_C, dataLoc);
954
1.15k
                    break;
955
1.15k
                }
956
17.9k
            }
957
958
            // Emit general case code for this *
959
            // The optimizations did not apply.
960
961
143k
            int32_t   saveStateLoc = blockTopLoc(true);
962
143k
            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
963
964
            // Check for minimum match length of zero, which requires
965
            //    extra loop-breaking code.
966
143k
            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967
1.92k
                insertOp(saveStateLoc);
968
1.92k
                dataLoc = allocateStackData(1);
969
970
1.92k
                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971
1.92k
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972
1.92k
                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973
1.92k
            }
974
975
            // Locate the position in the compiled pattern where the match will continue
976
            //   after completing the *.   (4 or 5 in the comment above)
977
143k
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978
979
            // Put together the save state op and store it into the compiled code.
980
143k
            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981
143k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982
983
            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984
143k
            appendOp(jmpOp);
985
143k
        }
986
0
        break;
987
988
233
    case doNGStar:
989
        // Non-greedy *? quantifier
990
        // compiles to
991
        //     1.   JMP    3
992
        //     2.      body of stuff being iterated over
993
        //     3.   STATE_SAVE  2
994
        //     4    ...
995
233
        {
996
233
            int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
997
233
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
998
233
            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
999
233
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000
233
            appendOp(URX_STATE_SAVE, jmpLoc+1);
1001
233
        }
1002
233
        break;
1003
1004
1005
102k
    case doIntervalInit:
1006
        // The '{' opening an interval quantifier was just scanned.
1007
        // Init the counter variables that will accumulate the values as the digits
1008
        //    are scanned.
1009
102k
        fIntervalLow = 0;
1010
102k
        fIntervalUpper = -1;
1011
102k
        break;
1012
1013
107k
    case doIntevalLowerDigit:
1014
        // Scanned a digit from the lower value of an {lower,upper} interval
1015
107k
        {
1016
107k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1017
107k
            U_ASSERT(digitValue >= 0);
1018
107k
            int64_t val = (int64_t)fIntervalLow*10 + digitValue;
1019
107k
            if (val > INT32_MAX) {
1020
2
                error(U_REGEX_NUMBER_TOO_BIG);
1021
107k
            } else {
1022
107k
                fIntervalLow = (int32_t)val;
1023
107k
            }
1024
107k
        }
1025
107k
        break;
1026
1027
86.2k
    case doIntervalUpperDigit:
1028
        // Scanned a digit from the upper value of an {lower,upper} interval
1029
86.2k
        {
1030
86.2k
            if (fIntervalUpper < 0) {
1031
85.2k
                fIntervalUpper = 0;
1032
85.2k
            }
1033
86.2k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1034
86.2k
            U_ASSERT(digitValue >= 0);
1035
86.2k
            int64_t val = (int64_t)fIntervalUpper*10 + digitValue;
1036
86.2k
            if (val > INT32_MAX) {
1037
11
                error(U_REGEX_NUMBER_TOO_BIG);
1038
86.2k
            } else {
1039
86.2k
                fIntervalUpper = (int32_t)val;
1040
86.2k
            }
1041
86.2k
        }
1042
86.2k
        break;
1043
1044
16.0k
    case doIntervalSame:
1045
        // Scanned a single value interval like {27}.  Upper = Lower.
1046
16.0k
        fIntervalUpper = fIntervalLow;
1047
16.0k
        break;
1048
1049
94.6k
    case doInterval:
1050
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1051
94.6k
        if (compileInlineInterval() == false) {
1052
4.28k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053
4.28k
        }
1054
94.6k
        break;
1055
1056
3.89k
    case doPossessiveInterval:
1057
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1058
3.89k
        {
1059
            // Remember the loc for the top of the block being looped over.
1060
            //   (Can not reserve a slot in the compiled pattern at this time, because
1061
            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1062
            //    once per block.)
1063
3.89k
            int32_t topLoc = blockTopLoc(false);
1064
1065
            // Produce normal looping code.
1066
3.89k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067
1068
            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069
            //  just as if the loop was inclosed in atomic parentheses.
1070
1071
            // First the STO_SP before the start of the loop
1072
3.89k
            insertOp(topLoc);
1073
1074
3.89k
            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1075
3.89k
            int32_t  op     = buildOp(URX_STO_SP, varLoc);
1076
3.89k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077
1078
3.89k
            int32_t loopOp = (int32_t)fRXPat->fCompiledPat->popi();
1079
3.89k
            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1080
3.89k
            loopOp++;     // point LoopOp after the just-inserted STO_SP
1081
3.89k
            fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082
1083
            // Then the LD_SP after the end of the loop
1084
3.89k
            appendOp(URX_LD_SP, varLoc);
1085
3.89k
        }
1086
1087
3.89k
        break;
1088
1089
3.63k
    case doNGInterval:
1090
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1091
3.63k
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092
3.63k
        break;
1093
1094
93
    case doIntervalError:
1095
93
        error(U_REGEX_BAD_INTERVAL);
1096
93
        break;
1097
1098
23.0M
    case doLiteralChar:
1099
        // We've just scanned a "normal" character from the pattern,
1100
23.0M
        literalChar(fC.fChar);
1101
23.0M
        break;
1102
1103
1104
8.59k
    case doEscapedLiteralChar:
1105
        // We've just scanned an backslashed escaped character with  no
1106
        //   special meaning.  It represents itself.
1107
8.59k
        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108
8.59k
            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1109
0
            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1110
0
               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111
0
             }
1112
8.59k
        literalChar(fC.fChar);
1113
8.59k
        break;
1114
1115
1116
11.1k
    case doDotAny:
1117
        // scanned a ".",  match any single character.
1118
11.1k
        {
1119
11.1k
            fixLiterals(false);
1120
11.1k
            if (fModeFlags & UREGEX_DOTALL) {
1121
1.27k
                appendOp(URX_DOTANY_ALL, 0);
1122
9.83k
            } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123
859
                appendOp(URX_DOTANY_UNIX, 0);
1124
8.98k
            } else {
1125
8.98k
                appendOp(URX_DOTANY, 0);
1126
8.98k
            }
1127
11.1k
        }
1128
11.1k
        break;
1129
1130
10.6k
    case doCaret:
1131
10.6k
        {
1132
10.6k
            fixLiterals(false);
1133
10.6k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134
9.19k
                appendOp(URX_CARET, 0);
1135
9.19k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136
660
                appendOp(URX_CARET_M, 0);
1137
837
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138
239
                appendOp(URX_CARET, 0);   // Only testing true start of input.
1139
598
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140
598
                appendOp(URX_CARET_M_UNIX, 0);
1141
598
            }
1142
10.6k
        }
1143
10.6k
        break;
1144
1145
2.66k
    case doDollar:
1146
2.66k
        {
1147
2.66k
            fixLiterals(false);
1148
2.66k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149
1.64k
                appendOp(URX_DOLLAR, 0);
1150
1.64k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151
511
                appendOp(URX_DOLLAR_M, 0);
1152
511
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153
256
                appendOp(URX_DOLLAR_D, 0);
1154
256
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155
253
                appendOp(URX_DOLLAR_MD, 0);
1156
253
            }
1157
2.66k
        }
1158
2.66k
        break;
1159
1160
200
    case doBackslashA:
1161
200
        fixLiterals(false);
1162
200
        appendOp(URX_CARET, 0);
1163
200
        break;
1164
1165
384
    case doBackslashB:
1166
384
        {
1167
            #if  UCONFIG_NO_BREAK_ITERATION==1
1168
            if (fModeFlags & UREGEX_UWORD) {
1169
                error(U_UNSUPPORTED_ERROR);
1170
            }
1171
            #endif
1172
384
            fixLiterals(false);
1173
384
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174
384
            appendOp(op, 1);
1175
384
        }
1176
384
        break;
1177
1178
1.89k
    case doBackslashb:
1179
1.89k
        {
1180
            #if  UCONFIG_NO_BREAK_ITERATION==1
1181
            if (fModeFlags & UREGEX_UWORD) {
1182
                error(U_UNSUPPORTED_ERROR);
1183
            }
1184
            #endif
1185
1.89k
            fixLiterals(false);
1186
1.89k
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187
1.89k
            appendOp(op, 0);
1188
1.89k
        }
1189
1.89k
        break;
1190
1191
1.11k
    case doBackslashD:
1192
1.11k
        fixLiterals(false);
1193
1.11k
        appendOp(URX_BACKSLASH_D, 1);
1194
1.11k
        break;
1195
1196
528
    case doBackslashd:
1197
528
        fixLiterals(false);
1198
528
        appendOp(URX_BACKSLASH_D, 0);
1199
528
        break;
1200
1201
716
    case doBackslashG:
1202
716
        fixLiterals(false);
1203
716
        appendOp(URX_BACKSLASH_G, 0);
1204
716
        break;
1205
1206
732
    case doBackslashH:
1207
732
        fixLiterals(false);
1208
732
        appendOp(URX_BACKSLASH_H, 1);
1209
732
        break;
1210
1211
509
    case doBackslashh:
1212
509
        fixLiterals(false);
1213
509
        appendOp(URX_BACKSLASH_H, 0);
1214
509
        break;
1215
1216
4.32k
    case doBackslashR:
1217
4.32k
        fixLiterals(false);
1218
4.32k
        appendOp(URX_BACKSLASH_R, 0);
1219
4.32k
        break;
1220
1221
704
    case doBackslashS:
1222
704
        fixLiterals(false);
1223
704
        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224
704
        break;
1225
1226
4.03k
    case doBackslashs:
1227
4.03k
        fixLiterals(false);
1228
4.03k
        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229
4.03k
        break;
1230
1231
5.37k
    case doBackslashV:
1232
5.37k
        fixLiterals(false);
1233
5.37k
        appendOp(URX_BACKSLASH_V, 1);
1234
5.37k
        break;
1235
1236
1.11k
    case doBackslashv:
1237
1.11k
        fixLiterals(false);
1238
1.11k
        appendOp(URX_BACKSLASH_V, 0);
1239
1.11k
        break;
1240
1241
1.08k
    case doBackslashW:
1242
1.08k
        fixLiterals(false);
1243
1.08k
        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244
1.08k
        break;
1245
1246
421
    case doBackslashw:
1247
421
        fixLiterals(false);
1248
421
        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249
421
        break;
1250
1251
1.14k
    case doBackslashX:
1252
        #if  UCONFIG_NO_BREAK_ITERATION==1
1253
        // Grapheme Cluster Boundary requires ICU break iteration.
1254
        error(U_UNSUPPORTED_ERROR);
1255
        #endif
1256
1.14k
        fixLiterals(false);
1257
1.14k
        appendOp(URX_BACKSLASH_X, 0);
1258
1.14k
        break;
1259
1260
1.86k
    case doBackslashZ:
1261
1.86k
        fixLiterals(false);
1262
1.86k
        appendOp(URX_DOLLAR, 0);
1263
1.86k
        break;
1264
1265
2.40k
    case doBackslashz:
1266
2.40k
        fixLiterals(false);
1267
2.40k
        appendOp(URX_BACKSLASH_Z, 0);
1268
2.40k
        break;
1269
1270
40
    case doEscapeError:
1271
40
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272
40
        break;
1273
1274
0
    case doExit:
1275
0
        fixLiterals(false);
1276
0
        returnVal = false;
1277
0
        break;
1278
1279
738
    case doProperty:
1280
738
        {
1281
738
            fixLiterals(false);
1282
738
            UnicodeSet *theSet = scanProp();
1283
738
            compileSet(theSet);
1284
738
        }
1285
738
        break;
1286
1287
1.67k
    case doNamedChar:
1288
1.67k
        {
1289
1.67k
            UChar32 c = scanNamedChar();
1290
1.67k
            literalChar(c);
1291
1.67k
        }
1292
1.67k
        break;
1293
1294
1295
60.5k
    case doBackRef:
1296
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
1297
        //                 the regular expression, because the number of digits to be consumed
1298
        //                 depends on the number of capture groups that have been defined.  So
1299
        //                 we have to do it here instead.
1300
60.5k
        {
1301
60.5k
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1302
60.5k
            int32_t  groupNum = 0;
1303
60.5k
            UChar32  c        = fC.fChar;
1304
1305
61.0k
            for (;;) {
1306
                // Loop once per digit, for max allowed number of digits in a back reference.
1307
61.0k
                int32_t digit = u_charDigitValue(c);
1308
61.0k
                groupNum = groupNum * 10 + digit;
1309
61.0k
                if (groupNum >= numCaptureGroups) {
1310
2.89k
                    break;
1311
2.89k
                }
1312
58.1k
                c = peekCharLL();
1313
58.1k
                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
1314
57.6k
                    break;
1315
57.6k
                }
1316
454
                nextCharLL();
1317
454
            }
1318
1319
            // Scan of the back reference in the source regexp is complete.  Now generate
1320
            //  the compiled code for it.
1321
            // Because capture groups can be forward-referenced by back-references,
1322
            //  we fill the operand with the capture group number.  At the end
1323
            //  of compilation, it will be changed to the variable's location.
1324
60.5k
            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1325
                                     //    and shouldn't enter this code path at all.
1326
60.5k
            fixLiterals(false);
1327
60.5k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328
10.1k
                appendOp(URX_BACKREF_I, groupNum);
1329
50.4k
            } else {
1330
50.4k
                appendOp(URX_BACKREF, groupNum);
1331
50.4k
            }
1332
60.5k
        }
1333
60.5k
        break;
1334
1335
1.10k
    case doBeginNamedBackRef:
1336
1.10k
        U_ASSERT(fCaptureName == nullptr);
1337
1.10k
        fCaptureName = new UnicodeString;
1338
1.10k
        if (fCaptureName == nullptr) {
1339
0
            error(U_MEMORY_ALLOCATION_ERROR);
1340
0
        }
1341
1.10k
        break;
1342
            
1343
2.51k
    case doContinueNamedBackRef:
1344
2.51k
        fCaptureName->append(fC.fChar);
1345
2.51k
        break;
1346
1347
1.09k
    case doCompleteNamedBackRef:
1348
1.09k
        {
1349
1.09k
        int32_t groupNumber =
1350
1.09k
            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351
1.09k
        if (groupNumber == 0) {
1352
            // Group name has not been defined.
1353
            //   Could be a forward reference. If we choose to support them at some
1354
            //   future time, extra mechanism will be required at this point.
1355
53
            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356
1.04k
        } else {
1357
            // Given the number, handle identically to a \n numbered back reference.
1358
            // See comments above, under doBackRef
1359
1.04k
            fixLiterals(false);
1360
1.04k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361
197
                appendOp(URX_BACKREF_I, groupNumber);
1362
843
            } else {
1363
843
                appendOp(URX_BACKREF, groupNumber);
1364
843
            }
1365
1.04k
        }
1366
1.09k
        delete fCaptureName;
1367
1.09k
        fCaptureName = nullptr;
1368
1.09k
        break;
1369
144k
        }
1370
       
1371
985
    case doPossessivePlus:
1372
        // Possessive ++ quantifier.
1373
        // Compiles to
1374
        //       1.   STO_SP
1375
        //       2.      body of stuff being iterated over
1376
        //       3.   STATE_SAVE 5
1377
        //       4.   JMP        2
1378
        //       5.   LD_SP
1379
        //       6.   ...
1380
        //
1381
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1382
        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1383
        //
1384
985
        {
1385
            // Emit the STO_SP
1386
985
            int32_t   topLoc = blockTopLoc(true);
1387
985
            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1388
985
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1389
985
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390
1391
            // Emit the STATE_SAVE
1392
985
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393
1394
            // Emit the JMP
1395
985
            appendOp(URX_JMP, topLoc+1);
1396
1397
            // Emit the LD_SP
1398
985
            appendOp(URX_LD_SP, stoLoc);
1399
985
        }
1400
985
        break;
1401
1402
388
    case doPossessiveStar:
1403
        // Possessive *+ quantifier.
1404
        // Compiles to
1405
        //       1.   STO_SP       loc
1406
        //       2.   STATE_SAVE   5
1407
        //       3.      body of stuff being iterated over
1408
        //       4.   JMP          2
1409
        //       5.   LD_SP        loc
1410
        //       6    ...
1411
        // TODO:  do something to cut back the state stack each time through the loop.
1412
388
        {
1413
            // Reserve two slots at the top of the block.
1414
388
            int32_t   topLoc = blockTopLoc(true);
1415
388
            insertOp(topLoc);
1416
1417
            // emit   STO_SP     loc
1418
388
            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1419
388
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1420
388
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421
1422
            // Emit the SAVE_STATE   5
1423
388
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424
388
            op = buildOp(URX_STATE_SAVE, L7);
1425
388
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426
1427
            // Append the JMP operation.
1428
388
            appendOp(URX_JMP, topLoc+1);
1429
1430
            // Emit the LD_SP       loc
1431
388
            appendOp(URX_LD_SP, stoLoc);
1432
388
        }
1433
388
        break;
1434
1435
205
    case doPossessiveOpt:
1436
        // Possessive  ?+ quantifier.
1437
        //  Compiles to
1438
        //     1. STO_SP      loc
1439
        //     2. SAVE_STATE  5
1440
        //     3.    body of optional block
1441
        //     4. LD_SP       loc
1442
        //     5. ...
1443
        //
1444
205
        {
1445
            // Reserve two slots at the top of the block.
1446
205
            int32_t   topLoc = blockTopLoc(true);
1447
205
            insertOp(topLoc);
1448
1449
            // Emit the STO_SP
1450
205
            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1451
205
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1452
205
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453
1454
            // Emit the SAVE_STATE
1455
205
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1456
205
            op = buildOp(URX_STATE_SAVE, continueLoc);
1457
205
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458
1459
            // Emit the LD_SP
1460
205
            appendOp(URX_LD_SP, stoLoc);
1461
205
        }
1462
205
        break;
1463
1464
1465
8.94k
    case doBeginMatchMode:
1466
8.94k
        fNewModeFlags = fModeFlags;
1467
8.94k
        fSetModeFlag  = true;
1468
8.94k
        break;
1469
1470
10.2k
    case doMatchMode:   //  (?i)    and similar
1471
10.2k
        {
1472
10.2k
            int32_t  bit = 0;
1473
10.2k
            switch (fC.fChar) {
1474
3.76k
            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1475
459
            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1476
418
            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1477
253
            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1478
67
            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1479
347
            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1480
4.76k
            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1481
196
            case 0x2d: /* '-' */   fSetModeFlag = false;          break;
1482
0
            default:
1483
0
                UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
1484
                                        // by the scanner.
1485
10.2k
            }
1486
10.2k
            if (fSetModeFlag) {
1487
10.0k
                fNewModeFlags |= bit;
1488
10.0k
            } else {
1489
230
                fNewModeFlags &= ~bit;
1490
230
            }
1491
10.2k
        }
1492
0
        break;
1493
1494
2.51k
    case doSetMatchMode:
1495
        // Emit code to match any pending literals, using the not-yet changed match mode.
1496
2.51k
        fixLiterals();
1497
1498
        // We've got a (?i) or similar.  The match mode is being changed, but
1499
        //   the change is not scoped to a parenthesized block.
1500
2.51k
        U_ASSERT(fNewModeFlags < 0);
1501
2.51k
        fModeFlags = fNewModeFlags;
1502
1503
2.51k
        break;
1504
1505
1506
6.35k
    case doMatchModeParen:
1507
        // We've got a (?i: or similar.  Begin a parenthesized block, save old
1508
        //   mode flags so they can be restored at the close of the block.
1509
        //
1510
        //   Compile to a
1511
        //      - NOP, which later may be replaced by a save-state if the
1512
        //         parenthesized group gets a * quantifier, followed by
1513
        //      - NOP, which may later be replaced by a save-state if there
1514
        //             is an '|' alternation within the parens.
1515
6.35k
        {
1516
6.35k
            fixLiterals(false);
1517
6.35k
            appendOp(URX_NOP, 0);
1518
6.35k
            appendOp(URX_NOP, 0);
1519
1520
            // On the Parentheses stack, start a new frame and add the positions
1521
            //   of the two NOPs (a normal non-capturing () frame, except for the
1522
            //   saving of the original mode flags.)
1523
6.35k
            fParenStack.push(fModeFlags, *fStatus);
1524
6.35k
            fParenStack.push(flags, *fStatus);                            // Frame Marker
1525
6.35k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1526
6.35k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1527
1528
            // Set the current mode flags to the new values.
1529
6.35k
            U_ASSERT(fNewModeFlags < 0);
1530
6.35k
            fModeFlags = fNewModeFlags;
1531
6.35k
        }
1532
6.35k
        break;
1533
1534
74
    case doBadModeFlag:
1535
74
        error(U_REGEX_INVALID_FLAG);
1536
74
        break;
1537
1538
59.1k
    case doSuppressComments:
1539
        // We have just scanned a '(?'.  We now need to prevent the character scanner from
1540
        // treating a '#' as a to-the-end-of-line comment.
1541
        //   (This Perl compatibility just gets uglier and uglier to do...)
1542
59.1k
        fEOLComments = false;
1543
59.1k
        break;
1544
1545
1546
616
    case doSetAddAmp:
1547
616
        {
1548
616
          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1549
616
          set->add(chAmp);
1550
616
        }
1551
616
        break;
1552
1553
1.25k
    case doSetAddDash:
1554
1.25k
        {
1555
1.25k
          UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1556
1.25k
          set->add(chDash);
1557
1.25k
        }
1558
1.25k
        break;
1559
1560
602
     case doSetBackslash_s:
1561
602
        {
1562
602
         UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1563
602
         set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564
602
         break;
1565
10.2k
        }
1566
1567
526
     case doSetBackslash_S:
1568
526
        {
1569
526
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1570
526
            UnicodeSet SSet;
1571
526
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572
526
            set->addAll(SSet);
1573
526
            break;
1574
10.2k
        }
1575
1576
519
    case doSetBackslash_d:
1577
519
        {
1578
519
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1579
            // TODO - make a static set, ticket 6058.
1580
519
            addCategory(set, U_GC_ND_MASK, *fStatus);
1581
519
            break;
1582
10.2k
        }
1583
1584
950
    case doSetBackslash_D:
1585
950
        {
1586
950
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1587
950
            UnicodeSet digits;
1588
            // TODO - make a static set, ticket 6058.
1589
950
            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1590
950
            digits.complement();
1591
950
            set->addAll(digits);
1592
950
            break;
1593
10.2k
        }
1594
1595
530
    case doSetBackslash_h:
1596
530
        {
1597
530
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1598
530
            UnicodeSet h;
1599
530
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1600
530
            h.add((UChar32)9);   // Tab
1601
530
            set->addAll(h);
1602
530
            break;
1603
10.2k
        }
1604
1605
1.13k
    case doSetBackslash_H:
1606
1.13k
        {
1607
1.13k
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1608
1.13k
            UnicodeSet h;
1609
1.13k
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1610
1.13k
            h.add((UChar32)9);   // Tab
1611
1.13k
            h.complement();
1612
1.13k
            set->addAll(h);
1613
1.13k
            break;
1614
10.2k
        }
1615
1616
580
    case doSetBackslash_v:
1617
580
        {
1618
580
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1619
580
            set->add((UChar32)0x0a, (UChar32)0x0d);  // add range
1620
580
            set->add((UChar32)0x85);
1621
580
            set->add((UChar32)0x2028, (UChar32)0x2029);
1622
580
            break;
1623
10.2k
        }
1624
1625
738
    case doSetBackslash_V:
1626
738
        {
1627
738
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1628
738
            UnicodeSet v;
1629
738
            v.add((UChar32)0x0a, (UChar32)0x0d);  // add range
1630
738
            v.add((UChar32)0x85);
1631
738
            v.add((UChar32)0x2028, (UChar32)0x2029);
1632
738
            v.complement();
1633
738
            set->addAll(v);
1634
738
            break;
1635
10.2k
        }
1636
1637
1.45k
    case doSetBackslash_w:
1638
1.45k
        {
1639
1.45k
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1640
1.45k
            set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641
1.45k
            break;
1642
10.2k
        }
1643
1644
2.78k
    case doSetBackslash_W:
1645
2.78k
        {
1646
2.78k
            UnicodeSet *set = (UnicodeSet *)fSetStack.peek();
1647
2.78k
            UnicodeSet SSet;
1648
2.78k
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649
2.78k
            set->addAll(SSet);
1650
2.78k
            break;
1651
10.2k
        }
1652
1653
190k
    case doSetBegin:
1654
190k
        {
1655
190k
            fixLiterals(false);
1656
190k
            LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657
190k
            fSetStack.push(lpSet.orphan(), *fStatus);
1658
190k
            fSetOpStack.push(setStart, *fStatus);
1659
190k
            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660
153k
                fSetOpStack.push(setCaseClose, *fStatus);
1661
153k
            }
1662
190k
            break;
1663
10.2k
        }
1664
1665
1.37k
    case doSetBeginDifference1:
1666
        //  We have scanned something like [[abc]-[
1667
        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668
        //  Push a Difference operator, which will cause the new set to be subtracted from what
1669
        //    went before once it is created.
1670
1.37k
        setPushOp(setDifference1);
1671
1.37k
        fSetOpStack.push(setStart, *fStatus);
1672
1.37k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673
525
            fSetOpStack.push(setCaseClose, *fStatus);
1674
525
        }
1675
1.37k
        break;
1676
1677
695
    case doSetBeginIntersection1:
1678
        //  We have scanned something like  [[abc]&[
1679
        //   Need both the '&' operator and the open '[' operator.
1680
695
        setPushOp(setIntersection1);
1681
695
        fSetOpStack.push(setStart, *fStatus);
1682
695
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683
202
            fSetOpStack.push(setCaseClose, *fStatus);
1684
202
        }
1685
695
        break;
1686
1687
67.0k
    case doSetBeginUnion:
1688
        //  We have scanned something like  [[abc][
1689
        //     Need to handle the union operation explicitly [[abc] | [
1690
67.0k
        setPushOp(setUnion);
1691
67.0k
        fSetOpStack.push(setStart, *fStatus);
1692
67.0k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693
22.6k
            fSetOpStack.push(setCaseClose, *fStatus);
1694
22.6k
        }
1695
67.0k
        break;
1696
1697
457
    case doSetDifference2:
1698
        // We have scanned something like [abc--
1699
        //   Consider this to unambiguously be a set difference operator.
1700
457
        setPushOp(setDifference2);
1701
457
        break;
1702
1703
248k
    case doSetEnd:
1704
        // Have encountered the ']' that closes a set.
1705
        //    Force the evaluation of any pending operations within this set,
1706
        //    leave the completed set on the top of the set stack.
1707
248k
        setEval(setEnd);
1708
248k
        U_ASSERT(fSetOpStack.peeki()==setStart);
1709
248k
        fSetOpStack.popi();
1710
248k
        break;
1711
1712
186k
    case doSetFinish:
1713
186k
        {
1714
        // Finished a complete set expression, including all nested sets.
1715
        //   The close bracket has already triggered clearing out pending set operators,
1716
        //    the operator stack should be empty and the operand stack should have just
1717
        //    one entry, the result set.
1718
186k
        U_ASSERT(fSetOpStack.empty());
1719
186k
        UnicodeSet *theSet = (UnicodeSet *)fSetStack.pop();
1720
186k
        U_ASSERT(fSetStack.empty());
1721
186k
        compileSet(theSet);
1722
186k
        break;
1723
10.2k
        }
1724
1725
1.80k
    case doSetIntersection2:
1726
        // Have scanned something like [abc&&
1727
1.80k
        setPushOp(setIntersection2);
1728
1.80k
        break;
1729
1730
6.54M
    case doSetLiteral:
1731
        // Union the just-scanned literal character into the set being built.
1732
        //    This operation is the highest precedence set operation, so we can always do
1733
        //    it immediately, without waiting to see what follows.  It is necessary to perform
1734
        //    any pending '-' or '&' operation first, because these have the same precedence
1735
        //    as union-ing in a literal'
1736
6.54M
        {
1737
6.54M
            setEval(setUnion);
1738
6.54M
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1739
6.54M
            s->add(fC.fChar);
1740
6.54M
            fLastSetLiteral = fC.fChar;
1741
6.54M
            break;
1742
10.2k
        }
1743
1744
33.4k
    case doSetLiteralEscaped:
1745
        // A back-slash escaped literal character was encountered.
1746
        // Processing is the same as with setLiteral, above, with the addition of
1747
        //  the optional check for errors on escaped ASCII letters.
1748
33.4k
        {
1749
33.4k
            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750
33.4k
                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1751
0
                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1752
0
                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753
0
            }
1754
33.4k
            setEval(setUnion);
1755
33.4k
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1756
33.4k
            s->add(fC.fChar);
1757
33.4k
            fLastSetLiteral = fC.fChar;
1758
33.4k
            break;
1759
10.2k
        }
1760
1761
852
        case doSetNamedChar:
1762
        // Scanning a \N{UNICODE CHARACTER NAME}
1763
        //  Aside from the source of the character, the processing is identical to doSetLiteral,
1764
        //    above.
1765
852
        {
1766
852
            UChar32  c = scanNamedChar();
1767
852
            setEval(setUnion);
1768
852
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1769
852
            s->add(c);
1770
852
            fLastSetLiteral = c;
1771
852
            break;
1772
10.2k
        }
1773
1774
80
    case doSetNamedRange:
1775
        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1776
        // The left character is already in the set, and is saved in fLastSetLiteral.
1777
        // The right side needs to be picked up, the scan is at the 'N'.
1778
        // Lower Limit > Upper limit being an error matches both Java
1779
        //        and ICU UnicodeSet behavior.
1780
80
        {
1781
80
            UChar32  c = scanNamedChar();
1782
80
            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1783
3
                error(U_REGEX_INVALID_RANGE);
1784
3
            }
1785
80
            UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1786
80
            s->add(fLastSetLiteral, c);
1787
80
            fLastSetLiteral = c;
1788
80
            break;
1789
10.2k
        }
1790
1791
1792
1.00k
    case  doSetNegate:
1793
        // Scanned a '^' at the start of a set.
1794
        // Push the negation operator onto the set op stack.
1795
        // A twist for case-insensitive matching:
1796
        //   the case closure operation must happen _before_ negation.
1797
        //   But the case closure operation will already be on the stack if it's required.
1798
        //   This requires checking for case closure, and swapping the stack order
1799
        //    if it is present.
1800
1.00k
        {
1801
1.00k
            int32_t  tosOp = fSetOpStack.peeki();
1802
1.00k
            if (tosOp == setCaseClose) {
1803
559
                fSetOpStack.popi();
1804
559
                fSetOpStack.push(setNegation, *fStatus);
1805
559
                fSetOpStack.push(setCaseClose, *fStatus);
1806
559
            } else {
1807
444
                fSetOpStack.push(setNegation, *fStatus);
1808
444
            }
1809
1.00k
        }
1810
1.00k
        break;
1811
1812
1.57k
    case doSetNoCloseError:
1813
1.57k
        error(U_REGEX_MISSING_CLOSE_BRACKET);
1814
1.57k
        break;
1815
1816
1
    case doSetOpError:
1817
1
        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1818
1
        break;
1819
1820
82.4k
    case doSetPosixProp:
1821
82.4k
        {
1822
82.4k
            UnicodeSet *s = scanPosixProp();
1823
82.4k
            if (s != nullptr) {
1824
61.3k
                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1825
61.3k
                tos->addAll(*s);
1826
61.3k
                delete s;
1827
61.3k
            }  // else error.  scanProp() reported the error status already.
1828
82.4k
        }
1829
82.4k
        break;
1830
1831
366
    case doSetProp:
1832
        //  Scanned a \p \P within [brackets].
1833
366
        {
1834
366
            UnicodeSet *s = scanProp();
1835
366
            if (s != nullptr) {
1836
359
                UnicodeSet *tos = (UnicodeSet *)fSetStack.peek();
1837
359
                tos->addAll(*s);
1838
359
                delete s;
1839
359
            }  // else error.  scanProp() reported the error status already.
1840
366
        }
1841
366
        break;
1842
1843
1844
1.62k
    case doSetRange:
1845
        // We have scanned literal-literal.  Add the range to the set.
1846
        // The left character is already in the set, and is saved in fLastSetLiteral.
1847
        // The right side is the current character.
1848
        // Lower Limit > Upper limit being an error matches both Java
1849
        //        and ICU UnicodeSet behavior.
1850
1.62k
        {
1851
1852
1.62k
        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1853
55
            error(U_REGEX_INVALID_RANGE);
1854
55
        }
1855
1.62k
        UnicodeSet *s = (UnicodeSet *)fSetStack.peek();
1856
1.62k
        s->add(fLastSetLiteral, fC.fChar);
1857
1.62k
        break;
1858
10.2k
        }
1859
1860
0
    default:
1861
0
        UPRV_UNREACHABLE_EXIT;
1862
105M
    }
1863
1864
105M
    if (U_FAILURE(*fStatus)) {
1865
6.80k
        returnVal = false;
1866
6.80k
    }
1867
1868
105M
    return returnVal;
1869
105M
}
1870
1871
1872
1873
//------------------------------------------------------------------------------
1874
//
1875
//   literalChar           We've encountered a literal character from the pattern,
1876
//                             or an escape sequence that reduces to a character.
1877
//                         Add it to the string containing all literal chars/strings from
1878
//                             the pattern.
1879
//
1880
//------------------------------------------------------------------------------
1881
23.2M
void RegexCompile::literalChar(UChar32 c)  {
1882
23.2M
    fLiteralChars.append(c);
1883
23.2M
}
1884
1885
1886
//------------------------------------------------------------------------------
1887
//
1888
//    fixLiterals           When compiling something that can follow a literal
1889
//                          string in a pattern, emit the code to match the
1890
//                          accumulated literal string.
1891
//
1892
//                          Optionally, split the last char of the string off into
1893
//                          a single "ONE_CHAR" operation, so that quantifiers can
1894
//                          apply to that char alone.  Example:   abc*
1895
//                          The * must apply to the 'c' only.
1896
//
1897
//------------------------------------------------------------------------------
1898
28.1M
void    RegexCompile::fixLiterals(UBool split) {
1899
1900
    // If no literal characters have been scanned but not yet had code generated
1901
    //   for them, nothing needs to be done.
1902
28.1M
    if (fLiteralChars.length() == 0) {
1903
27.0M
        return;
1904
27.0M
    }
1905
1906
1.03M
    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907
1.03M
    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908
1909
    // Split:  We need to  ensure that the last item in the compiled pattern
1910
    //     refers only to the last literal scanned in the pattern, so that
1911
    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1912
    //     Split before case folding for case insensitive matches.
1913
1914
1.03M
    if (split) {
1915
251k
        fLiteralChars.truncate(indexOfLastCodePoint);
1916
251k
        fixLiterals(false);   // Recursive call, emit code to match the first part of the string.
1917
                              //  Note that the truncated literal string may be empty, in which case
1918
                              //  nothing will be emitted.
1919
1920
251k
        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1921
251k
        fixLiterals(false);          // Second recursive call, code for the final code point.
1922
251k
        return;
1923
251k
    }
1924
1925
    // If we are doing case-insensitive matching, case fold the string.  This may expand
1926
    //   the string, e.g. the German sharp-s turns into "ss"
1927
783k
    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928
438k
        fLiteralChars.foldCase();
1929
438k
        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930
438k
        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931
438k
    }
1932
1933
783k
    if (indexOfLastCodePoint == 0) {
1934
        // Single character, emit a URX_ONECHAR op to match it.
1935
368k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936
368k
                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937
15.2k
            appendOp(URX_ONECHAR_I, lastCodePoint);
1938
352k
        } else {
1939
352k
            appendOp(URX_ONECHAR, lastCodePoint);
1940
352k
        }
1941
415k
    } else {
1942
        // Two or more chars, emit a URX_STRING to match them.
1943
415k
        if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944
0
            error(U_REGEX_PATTERN_TOO_BIG);
1945
0
        }
1946
415k
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947
294k
            appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948
294k
        } else {
1949
            // TODO here:  add optimization to split case sensitive strings of length two
1950
            //             into two single char ops, for efficiency.
1951
121k
            appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952
121k
        }
1953
415k
        appendOp(URX_STRING_LEN, fLiteralChars.length());
1954
1955
        // Add this string into the accumulated strings of the compiled pattern.
1956
415k
        fRXPat->fLiteralText.append(fLiteralChars);
1957
415k
    }
1958
1959
783k
    fLiteralChars.remove();
1960
783k
}
1961
1962
1963
116M
int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964
116M
    if (U_FAILURE(*fStatus)) {
1965
3.30k
        return 0;
1966
3.30k
    }
1967
116M
    if (type < 0 || type > 255) {
1968
0
        UPRV_UNREACHABLE_EXIT;
1969
0
    }
1970
116M
    if (val > 0x00ffffff) {
1971
0
        UPRV_UNREACHABLE_EXIT;
1972
0
    }
1973
116M
    if (val < 0) {
1974
0
        if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975
0
            UPRV_UNREACHABLE_EXIT;
1976
0
        }
1977
0
        if (URX_TYPE(val) != 0xff) {
1978
0
            UPRV_UNREACHABLE_EXIT;
1979
0
        }
1980
0
        type = URX_RESERVED_OP_N;
1981
0
    }
1982
116M
    return (type << 24) | val;
1983
116M
}
1984
1985
1986
//------------------------------------------------------------------------------
1987
//
1988
//   appendOp()             Append a new instruction onto the compiled pattern
1989
//                          Includes error checking, limiting the size of the
1990
//                          pattern to lengths that can be represented in the
1991
//                          24 bit operand field of an instruction.
1992
//
1993
//------------------------------------------------------------------------------
1994
57.0M
void RegexCompile::appendOp(int32_t op) {
1995
57.0M
    if (U_FAILURE(*fStatus)) {
1996
2.20k
        return;
1997
2.20k
    }
1998
57.0M
    fRXPat->fCompiledPat->addElement(op, *fStatus);
1999
57.0M
    if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000
0
        error(U_REGEX_PATTERN_TOO_BIG);
2001
0
    }
2002
57.0M
}
2003
2004
55.5M
void RegexCompile::appendOp(int32_t type, int32_t val) {
2005
55.5M
    appendOp(buildOp(type, val));
2006
55.5M
}
2007
2008
2009
//------------------------------------------------------------------------------
2010
//
2011
//   insertOp()             Insert a slot for a new opcode into the already
2012
//                          compiled pattern code.
2013
//
2014
//                          Fill the slot with a NOP.  Our caller will replace it
2015
//                          with what they really wanted.
2016
//
2017
//------------------------------------------------------------------------------
2018
132k
void   RegexCompile::insertOp(int32_t where) {
2019
132k
    UVector64 *code = fRXPat->fCompiledPat;
2020
132k
    U_ASSERT(where>0 && where < code->size());
2021
2022
132k
    int32_t  nop = buildOp(URX_NOP, 0);
2023
132k
    code->insertElementAt(nop, where, *fStatus);
2024
2025
    // Walk through the pattern, looking for any ops with targets that
2026
    //  were moved down by the insert.  Fix them.
2027
132k
    int32_t loc;
2028
3.65G
    for (loc=0; loc<code->size(); loc++) {
2029
3.65G
        int32_t op = (int32_t)code->elementAti(loc);
2030
3.65G
        int32_t opType = URX_TYPE(op);
2031
3.65G
        int32_t opValue = URX_VAL(op);
2032
3.65G
        if ((opType == URX_JMP         ||
2033
3.65G
            opType == URX_JMPX         ||
2034
3.65G
            opType == URX_STATE_SAVE   ||
2035
3.65G
            opType == URX_CTR_LOOP     ||
2036
3.65G
            opType == URX_CTR_LOOP_NG  ||
2037
3.65G
            opType == URX_JMP_SAV      ||
2038
3.65G
            opType == URX_JMP_SAV_X    ||
2039
3.65G
            opType == URX_RELOC_OPRND)    && opValue > where) {
2040
            // Target location for this opcode is after the insertion point and
2041
            //   needs to be incremented to adjust for the insertion.
2042
12.4M
            opValue++;
2043
12.4M
            op = buildOp(opType, opValue);
2044
12.4M
            code->setElementAt(op, loc);
2045
12.4M
        }
2046
3.65G
    }
2047
2048
    // Now fix up the parentheses stack.  All positive values in it are locations in
2049
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2050
921M
    for (loc=0; loc<fParenStack.size(); loc++) {
2051
921M
        int32_t x = fParenStack.elementAti(loc);
2052
921M
        U_ASSERT(x < code->size());
2053
921M
        if (x>where) {
2054
0
            x++;
2055
0
            fParenStack.setElementAt(x, loc);
2056
0
        }
2057
921M
    }
2058
2059
132k
    if (fMatchCloseParen > where) {
2060
19.5k
        fMatchCloseParen++;
2061
19.5k
    }
2062
132k
    if (fMatchOpenParen > where) {
2063
0
        fMatchOpenParen++;
2064
0
    }
2065
132k
}
2066
2067
2068
//------------------------------------------------------------------------------
2069
//
2070
//   allocateData()        Allocate storage in the matcher's static data area.
2071
//                         Return the index for the newly allocated data.
2072
//                         The storage won't actually exist until we are running a match
2073
//                         operation, but the storage indexes are inserted into various
2074
//                         opcodes while compiling the pattern.
2075
//
2076
//------------------------------------------------------------------------------
2077
53.5k
int32_t RegexCompile::allocateData(int32_t size) {
2078
53.5k
    if (U_FAILURE(*fStatus)) {
2079
18
        return 0;
2080
18
    }
2081
53.5k
    if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082
0
        error(U_REGEX_INTERNAL_ERROR);
2083
0
        return 0;
2084
0
    }
2085
53.5k
    int32_t dataIndex = fRXPat->fDataSize;
2086
53.5k
    fRXPat->fDataSize += size;
2087
53.5k
    if (fRXPat->fDataSize >= 0x00fffff0) {
2088
0
        error(U_REGEX_INTERNAL_ERROR);
2089
0
    }
2090
53.5k
    return dataIndex;
2091
53.5k
}
2092
2093
2094
//------------------------------------------------------------------------------
2095
//
2096
//   allocateStackData()   Allocate space in the back-tracking stack frame.
2097
//                         Return the index for the newly allocated data.
2098
//                         The frame indexes are inserted into various
2099
//                         opcodes while compiling the pattern, meaning that frame
2100
//                         size must be restricted to the size that will fit
2101
//                         as an operand (24 bits).
2102
//
2103
//------------------------------------------------------------------------------
2104
362k
int32_t RegexCompile::allocateStackData(int32_t size) {
2105
362k
    if (U_FAILURE(*fStatus)) {
2106
0
        return 0;
2107
0
    }
2108
362k
    if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109
0
        error(U_REGEX_INTERNAL_ERROR);
2110
0
        return 0;
2111
0
    }
2112
362k
    int32_t dataIndex = fRXPat->fFrameSize;
2113
362k
    fRXPat->fFrameSize += size;
2114
362k
    if (fRXPat->fFrameSize >= 0x00fffff0) {
2115
0
        error(U_REGEX_PATTERN_TOO_BIG);
2116
0
    }
2117
362k
    return dataIndex;
2118
362k
}
2119
2120
2121
//------------------------------------------------------------------------------
2122
//
2123
//   blockTopLoc()          Find or create a location in the compiled pattern
2124
//                          at the start of the operation or block that has
2125
//                          just been compiled.  Needed when a quantifier (* or
2126
//                          whatever) appears, and we need to add an operation
2127
//                          at the start of the thing being quantified.
2128
//
2129
//                          (Parenthesized Blocks) have a slot with a NOP that
2130
//                          is reserved for this purpose.  .* or similar don't
2131
//                          and a slot needs to be added.
2132
//
2133
//       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
2134
//                                         at the returned location.
2135
//                                 false - just return the address,
2136
//                                         do not reserve a location there.
2137
//
2138
//------------------------------------------------------------------------------
2139
428k
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2140
428k
    int32_t   theLoc;
2141
428k
    fixLiterals(true);  // Emit code for any pending literals.
2142
                        //   If last item was a string, emit separate op for the its last char.
2143
428k
    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144
16.8k
    {
2145
        // The item just processed is a parenthesized block.
2146
16.8k
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2147
16.8k
        U_ASSERT(theLoc > 0);
2148
16.8k
        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2149
16.8k
    }
2150
411k
    else {
2151
        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153
        // We need to make space now.
2154
411k
        theLoc = fRXPat->fCompiledPat->size()-1;
2155
411k
        int32_t opAtTheLoc = (int32_t)fRXPat->fCompiledPat->elementAti(theLoc);
2156
411k
        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2157
            // Strings take two opcode, we want the position of the first one.
2158
            // We can have a string at this point if a single character case-folded to two.
2159
252k
            theLoc--;
2160
252k
        }
2161
411k
        if (reserveLoc) {
2162
161k
            int32_t  nop = buildOp(URX_NOP, 0);
2163
161k
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164
161k
        }
2165
411k
    }
2166
428k
    return theLoc;
2167
428k
}
2168
2169
2170
2171
//------------------------------------------------------------------------------
2172
//
2173
//    handleCloseParen      When compiling a close paren, we need to go back
2174
//                          and fix up any JMP or SAVE operations within the
2175
//                          parenthesized block that need to target the end
2176
//                          of the block.  The locations of these are kept on
2177
//                          the paretheses stack.
2178
//
2179
//                          This function is called both when encountering a
2180
//                          real ) and at the end of the pattern.
2181
//
2182
//------------------------------------------------------------------------------
2183
382k
void  RegexCompile::handleCloseParen() {
2184
382k
    int32_t   patIdx;
2185
382k
    int32_t   patOp;
2186
382k
    if (fParenStack.size() <= 0) {
2187
0
        error(U_REGEX_MISMATCHED_PAREN);
2188
0
        return;
2189
0
    }
2190
2191
    // Emit code for any pending literals.
2192
382k
    fixLiterals(false);
2193
2194
    // Fixup any operations within the just-closed parenthesized group
2195
    //    that need to reference the end of the (block).
2196
    //    (The first one popped from the stack is an unused slot for
2197
    //     alternation (OR) state save, but applying the fixup to it does no harm.)
2198
19.9M
    for (;;) {
2199
19.9M
        patIdx = fParenStack.popi();
2200
19.9M
        if (patIdx < 0) {
2201
            // value < 0 flags the start of the frame on the paren stack.
2202
382k
            break;
2203
382k
        }
2204
19.5M
        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2205
19.5M
        patOp = (int32_t)fRXPat->fCompiledPat->elementAti(patIdx);
2206
19.5M
        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2207
19.5M
        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2208
19.5M
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209
19.5M
        fMatchOpenParen     = patIdx;
2210
19.5M
    }
2211
2212
    //  At the close of any parenthesized block, restore the match mode flags  to
2213
    //  the value they had at the open paren.  Saved value is
2214
    //  at the top of the paren stack.
2215
382k
    fModeFlags = fParenStack.popi();
2216
382k
    U_ASSERT(fModeFlags < 0);
2217
2218
    // DO any additional fixups, depending on the specific kind of
2219
    // parentesized grouping this is
2220
2221
382k
    switch (patIdx) {
2222
5.61k
    case plain:
2223
10.9k
    case flags:
2224
        // No additional fixups required.
2225
        //   (Grouping-only parentheses)
2226
10.9k
        break;
2227
324k
    case capturing:
2228
        // Capturing Parentheses.
2229
        //   Insert a End Capture op into the pattern.
2230
        //   The frame offset of the variables for this cg is obtained from the
2231
        //       start capture op and put it into the end-capture op.
2232
324k
        {
2233
324k
            int32_t   captureOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2234
324k
            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2235
2236
324k
            int32_t   frameVarLocation = URX_VAL(captureOp);
2237
324k
            appendOp(URX_END_CAPTURE, frameVarLocation);
2238
324k
        }
2239
324k
        break;
2240
522
    case atomic:
2241
        // Atomic Parenthesis.
2242
        //   Insert a LD_SP operation to restore the state stack to the position
2243
        //   it was when the atomic parens were entered.
2244
522
        {
2245
522
            int32_t   stoOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen+1);
2246
522
            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2247
522
            int32_t   stoLoc = URX_VAL(stoOp);
2248
522
            appendOp(URX_LD_SP, stoLoc);
2249
522
        }
2250
522
        break;
2251
2252
41.3k
    case lookAhead:
2253
41.3k
        {
2254
41.3k
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2255
41.3k
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2256
41.3k
            int32_t dataLoc  = URX_VAL(startOp);
2257
41.3k
            appendOp(URX_LA_END, dataLoc);
2258
41.3k
        }
2259
41.3k
        break;
2260
2261
609
    case negLookAhead:
2262
609
        {
2263
            // See comment at doOpenLookAheadNeg
2264
609
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-1);
2265
609
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2266
609
            int32_t dataLoc  = URX_VAL(startOp);
2267
609
            appendOp(URX_LA_END, dataLoc);
2268
609
            appendOp(URX_BACKTRACK, 0);
2269
609
            appendOp(URX_LA_END, dataLoc);
2270
2271
            // Patch the URX_SAVE near the top of the block.
2272
            // The destination of the SAVE is the final LA_END that was just added.
2273
609
            int32_t saveOp   = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen);
2274
609
            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2275
609
            int32_t dest     = fRXPat->fCompiledPat->size()-1;
2276
609
            saveOp           = buildOp(URX_STATE_SAVE, dest);
2277
609
            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278
609
        }
2279
609
        break;
2280
2281
2.21k
    case lookBehind:
2282
2.21k
        {
2283
            // See comment at doOpenLookBehind.
2284
2285
            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286
2.21k
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-4);
2287
2.21k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2288
2.21k
            int32_t dataLoc  = URX_VAL(startOp);
2289
2.21k
            appendOp(URX_LB_END, dataLoc);
2290
2.21k
            appendOp(URX_LA_END, dataLoc);
2291
2292
            // Determine the min and max bounds for the length of the
2293
            //  string that the pattern can match.
2294
            //  An unbounded upper limit is an error.
2295
2.21k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2296
2.21k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2297
2.21k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2298
2.21k
            if (URX_TYPE(maxML) != 0) {
2299
145
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2300
145
                break;
2301
145
            }
2302
2.07k
            if (maxML == INT32_MAX) {
2303
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2304
0
                break;
2305
0
            }
2306
2.07k
            if (minML == INT32_MAX) {
2307
                // This condition happens when no match is possible, such as with a
2308
                // [set] expression containing no elements.
2309
                // In principle, the generated code to evaluate the expression could be deleted,
2310
                // but it's probably not worth the complication.
2311
257
                minML = 0;
2312
257
            }
2313
2.07k
            U_ASSERT(minML <= maxML);
2314
2315
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2316
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2317
2.07k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2318
2.07k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2319
2320
2.07k
        }
2321
0
        break;
2322
2323
2324
2325
2.76k
    case lookBehindN:
2326
2.76k
        {
2327
            // See comment at doOpenLookBehindNeg.
2328
2329
            // Append the URX_LBN_END to the compiled pattern.
2330
2.76k
            int32_t  startOp = (int32_t)fRXPat->fCompiledPat->elementAti(fMatchOpenParen-5);
2331
2.76k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2332
2.76k
            int32_t dataLoc  = URX_VAL(startOp);
2333
2.76k
            appendOp(URX_LBN_END, dataLoc);
2334
2335
            // Determine the min and max bounds for the length of the
2336
            //  string that the pattern can match.
2337
            //  An unbounded upper limit is an error.
2338
2.76k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2339
2.76k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2340
2.76k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2341
2.76k
            if (URX_TYPE(maxML) != 0) {
2342
203
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2343
203
                break;
2344
203
            }
2345
2.56k
            if (maxML == INT32_MAX) {
2346
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2347
0
                break;
2348
0
            }
2349
2.56k
            if (minML == INT32_MAX) {
2350
                // This condition happens when no match is possible, such as with a
2351
                // [set] expression containing no elements.
2352
                // In principle, the generated code to evaluate the expression could be deleted,
2353
                // but it's probably not worth the complication.
2354
367
                minML = 0;
2355
367
            }
2356
2357
2.56k
            U_ASSERT(minML <= maxML);
2358
2359
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2360
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2361
2.56k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2362
2.56k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2363
2364
            // Insert the pattern location to continue at after a successful match
2365
            //  as the last operand of the URX_LBN_CONT
2366
2.56k
            int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367
2.56k
            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2368
2.56k
        }
2369
0
        break;
2370
2371
2372
2373
0
    default:
2374
0
        UPRV_UNREACHABLE_EXIT;
2375
382k
    }
2376
2377
    // remember the next location in the compiled pattern.
2378
    // The compilation of Quantifiers will look at this to see whether its looping
2379
    //   over a parenthesized block or a single item
2380
382k
    fMatchCloseParen = fRXPat->fCompiledPat->size();
2381
382k
}
2382
2383
2384
2385
//------------------------------------------------------------------------------
2386
//
2387
//   compileSet       Compile the pattern operations for a reference to a
2388
//                    UnicodeSet.
2389
//
2390
//------------------------------------------------------------------------------
2391
void        RegexCompile::compileSet(UnicodeSet *theSet)
2392
187k
{
2393
187k
    if (theSet == nullptr) {
2394
90
        return;
2395
90
    }
2396
    //  Remove any strings from the set.
2397
    //  There shouldn't be any, but just in case.
2398
    //     (Case Closure can add them; if we had a simple case closure available that
2399
    //      ignored strings, that would be better.)
2400
187k
    theSet->removeAllStrings();
2401
187k
    int32_t  setSize = theSet->size();
2402
2403
187k
    switch (setSize) {
2404
3.59k
    case 0:
2405
3.59k
        {
2406
            // Set of no elements.   Always fails to match.
2407
3.59k
            appendOp(URX_BACKTRACK, 0);
2408
3.59k
            delete theSet;
2409
3.59k
        }
2410
3.59k
        break;
2411
2412
6.46k
    case 1:
2413
6.46k
        {
2414
            // The set contains only a single code point.  Put it into
2415
            //   the compiled pattern as a single char operation rather
2416
            //   than a set, and discard the set itself.
2417
6.46k
            literalChar(theSet->charAt(0));
2418
6.46k
            delete theSet;
2419
6.46k
        }
2420
6.46k
        break;
2421
2422
177k
    default:
2423
177k
        {
2424
            //  The set contains two or more chars.  (the normal case)
2425
            //  Put it into the compiled pattern as a set.
2426
177k
            theSet->freeze();
2427
177k
            int32_t setNumber = fRXPat->fSets->size();
2428
177k
            fRXPat->fSets->addElement(theSet, *fStatus);
2429
177k
            if (U_SUCCESS(*fStatus)) {
2430
177k
                appendOp(URX_SETREF, setNumber);
2431
177k
            } else {
2432
4
                delete theSet;
2433
4
            }
2434
177k
        }
2435
187k
    }
2436
187k
}
2437
2438
2439
//------------------------------------------------------------------------------
2440
//
2441
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
2442
//                      Except for the specific opcodes used, the code is the same
2443
//                      for all three types (greedy, non-greedy, possessive) of
2444
//                      intervals.  The opcodes are supplied as parameters.
2445
//                      (There are two sets of opcodes - greedy & possessive use the
2446
//                      same ones, while non-greedy has it's own.)
2447
//
2448
//                      The code for interval loops has this form:
2449
//                         0  CTR_INIT   counter loc (in stack frame)
2450
//                         1             5  patt address of CTR_LOOP at bottom of block
2451
//                         2             min count
2452
//                         3             max count   (-1 for unbounded)
2453
//                         4  ...        block to be iterated over
2454
//                         5  CTR_LOOP
2455
//
2456
//                       In
2457
//------------------------------------------------------------------------------
2458
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2459
11.8k
{
2460
    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461
    //   four slots in the compiled code.  Reserve them.
2462
11.8k
    int32_t   topOfBlock = blockTopLoc(true);
2463
11.8k
    insertOp(topOfBlock);
2464
11.8k
    insertOp(topOfBlock);
2465
11.8k
    insertOp(topOfBlock);
2466
2467
    // The operands for the CTR_INIT opcode include the index in the matcher data
2468
    //   of the counter.  Allocate it now. There are two data items
2469
    //        counterLoc   -->  Loop counter
2470
    //               +1    -->  Input index (for breaking non-progressing loops)
2471
    //                          (Only present if unbounded upper limit on loop)
2472
11.8k
    int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2473
11.8k
    int32_t   counterLoc = allocateStackData(dataSize);
2474
2475
11.8k
    int32_t   op = buildOp(InitOp, counterLoc);
2476
11.8k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477
2478
    // The second operand of CTR_INIT is the location following the end of the loop.
2479
    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480
    //   compilation of something later on causes the code to grow and the target
2481
    //   position to move.
2482
11.8k
    int32_t loopEnd = fRXPat->fCompiledPat->size();
2483
11.8k
    op = buildOp(URX_RELOC_OPRND, loopEnd);
2484
11.8k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485
2486
    // Followed by the min and max counts.
2487
11.8k
    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488
11.8k
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489
2490
    // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2491
    //   Goes at end of the block being looped over, so just append to the code so far.
2492
11.8k
    appendOp(LoopOp, topOfBlock);
2493
2494
11.8k
    if ((fIntervalLow & 0xff000000) != 0 ||
2495
11.8k
        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496
40
            error(U_REGEX_NUMBER_TOO_BIG);
2497
40
        }
2498
2499
11.8k
    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500
19
        error(U_REGEX_MAX_LT_MIN);
2501
19
    }
2502
11.8k
}
2503
2504
2505
2506
94.6k
UBool RegexCompile::compileInlineInterval() {
2507
94.6k
    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508
        // Too big to inline.  Fail, which will cause looping code to be generated.
2509
        //   (Upper < Lower picks up unbounded upper and errors, both.)
2510
3.25k
        return false;
2511
3.25k
    }
2512
2513
91.3k
    int32_t   topOfBlock = blockTopLoc(false);
2514
91.3k
    if (fIntervalUpper == 0) {
2515
        // Pathological case.  Attempt no matches, as if the block doesn't exist.
2516
        // Discard the generated code for the block.
2517
        // If the block included parens, discard the info pertaining to them as well.
2518
1.95k
        fRXPat->fCompiledPat->setSize(topOfBlock);
2519
1.95k
        if (fMatchOpenParen >= topOfBlock) {
2520
454
            fMatchOpenParen = -1;
2521
454
        }
2522
1.95k
        if (fMatchCloseParen >= topOfBlock) {
2523
454
            fMatchCloseParen = -1;
2524
454
        }
2525
1.95k
        return true;
2526
1.95k
    }
2527
2528
89.4k
    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529
        // The thing being repeated is not a single op, but some
2530
        //   more complex block.  Do it as a loop, not inlines.
2531
        //   Note that things "repeated" a max of once are handled as inline, because
2532
        //     the one copy of the code already generated is just fine.
2533
1.03k
        return false;
2534
1.03k
    }
2535
2536
    // Pick up the opcode that is to be repeated
2537
    //
2538
88.3k
    int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(topOfBlock);
2539
2540
    // Compute the pattern location where the inline sequence
2541
    //   will end, and set up the state save op that will be needed.
2542
    //
2543
88.3k
    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544
88.3k
                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545
88.3k
    int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546
88.3k
    if (fIntervalLow == 0) {
2547
83.8k
        insertOp(topOfBlock);
2548
83.8k
        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549
83.8k
    }
2550
2551
2552
2553
    //  Loop, emitting the op for the thing being repeated each time.
2554
    //    Loop starts at 1 because one instance of the op already exists in the pattern,
2555
    //    it was put there when it was originally encountered.
2556
88.3k
    int32_t i;
2557
780k
    for (i=1; i<fIntervalUpper; i++ ) {
2558
692k
        if (i >= fIntervalLow) {
2559
674k
            appendOp(saveOp);
2560
674k
        }
2561
692k
        appendOp(op);
2562
692k
    }
2563
88.3k
    return true;
2564
89.4k
}
2565
2566
2567
2568
//------------------------------------------------------------------------------
2569
//
2570
//   caseInsensitiveStart  given a single code point from a pattern string, determine the 
2571
//                         set of characters that could potentially begin a case-insensitive 
2572
//                         match of a string beginning with that character, using full Unicode
2573
//                         case insensitive matching.
2574
//
2575
//          This is used in optimizing find().
2576
//
2577
//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578
//          misses cases like this:
2579
//             A string from the pattern begins with 'ss' (although all we know
2580
//                 in this context is that it begins with 's')
2581
//             The pattern could match a string beginning with a German sharp-s
2582
//
2583
//           To the ordinary case closure for a character c, we add all other
2584
//           characters cx where the case closure of cx includes a string form that begins
2585
//           with the original character c.
2586
//
2587
//           This function could be made smarter. The full pattern string is available
2588
//           and it would be possible to verify that the extra characters being added
2589
//           to the starting set fully match, rather than having just a first-char of the
2590
//           folded form match.
2591
//
2592
//------------------------------------------------------------------------------
2593
43.4k
void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594
2595
// Machine Generated below.
2596
// It may need updating with new versions of Unicode.
2597
// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598
// The update tool is here:
2599
// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600
2601
// Machine Generated Data. Do not hand edit.
2602
43.4k
    static const UChar32 RECaseFixCodePoints[] = {
2603
43.4k
        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 
2604
43.4k
        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 
2605
43.4k
        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 
2606
43.4k
        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 
2607
43.4k
        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608
2609
43.4k
    static const int16_t RECaseFixStringOffsets[] = {
2610
43.4k
        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 
2611
43.4k
        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 
2612
43.4k
        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 
2613
43.4k
        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 
2614
43.4k
        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615
2616
43.4k
    static const int16_t RECaseFixCounts[] = {
2617
43.4k
        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 
2618
43.4k
        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 
2619
43.4k
        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2620
43.4k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2621
43.4k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622
2623
43.4k
    static const char16_t RECaseFixData[] = {
2624
43.4k
        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 
2625
43.4k
        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 
2626
43.4k
        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 
2627
43.4k
        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 
2628
43.4k
        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 
2629
43.4k
        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 
2630
43.4k
        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 
2631
43.4k
        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 
2632
43.4k
        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 
2633
43.4k
        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 
2634
43.4k
        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635
2636
// End of machine generated data.
2637
2638
43.4k
    if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2639
        // This function should never be called with an invalid input character.
2640
0
        UPRV_UNREACHABLE_EXIT;
2641
43.4k
    } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2642
6.75k
        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2643
6.75k
        starterChars->set(caseFoldedC, caseFoldedC);
2644
2645
6.75k
        int32_t i;
2646
108k
        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647
            // Simple linear search through the sorted list of interesting code points.
2648
101k
        }
2649
2650
6.75k
        if (RECaseFixCodePoints[i] == c) {
2651
5.90k
            int32_t dataIndex = RECaseFixStringOffsets[i];
2652
5.90k
            int32_t numCharsToAdd = RECaseFixCounts[i];
2653
5.90k
            UChar32 cpToAdd = 0;
2654
32.0k
            for (int32_t j=0; j<numCharsToAdd; j++) {
2655
26.1k
                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2656
26.1k
                starterChars->add(cpToAdd);
2657
26.1k
            }
2658
5.90k
        }
2659
2660
6.75k
        starterChars->closeOver(USET_CASE_INSENSITIVE);
2661
6.75k
        starterChars->removeAllStrings();
2662
36.6k
    } else {
2663
        // Not a cased character. Just return it alone.
2664
36.6k
        starterChars->set(c, c);
2665
36.6k
    }
2666
43.4k
}
2667
2668
2669
// Increment with overflow check.
2670
// val and delta will both be positive.
2671
2672
2.67M
static int32_t safeIncrement(int32_t val, int32_t delta) {
2673
2.67M
    if (INT32_MAX - val > delta) {
2674
2.65M
        return val + delta;
2675
2.65M
    } else {
2676
15.9k
        return INT32_MAX;
2677
15.9k
    }
2678
2.67M
}
2679
2680
2681
//------------------------------------------------------------------------------
2682
//
2683
//   matchStartType    Determine how a match can start.
2684
//                     Used to optimize find() operations.
2685
//
2686
//                     Operation is very similar to minMatchLength().  Walk the compiled
2687
//                     pattern, keeping an on-going minimum-match-length.  For any
2688
//                     op where the min match coming in is zero, add that ops possible
2689
//                     starting matches to the possible starts for the overall pattern.
2690
//
2691
//------------------------------------------------------------------------------
2692
4.49k
void   RegexCompile::matchStartType() {
2693
4.49k
    if (U_FAILURE(*fStatus)) {
2694
52
        return;
2695
52
    }
2696
2697
2698
4.44k
    int32_t    loc;                    // Location in the pattern of the current op being processed.
2699
4.44k
    int32_t    op;                     // The op being processed
2700
4.44k
    int32_t    opType;                 // The opcode type of the op
2701
4.44k
    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2702
4.44k
    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2703
2704
4.44k
    UBool      atStart = true;         // True if no part of the pattern yet encountered
2705
                                       //   could have advanced the position in a match.
2706
                                       //   (Maximum match length so far == 0)
2707
2708
    // forwardedLength is a vector holding minimum-match-length values that
2709
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710
    //   It must be one longer than the pattern being checked because some  ops
2711
    //   will jmp to a end-of-block+1 location from within a block, and we must
2712
    //   count those when checking the block.
2713
4.44k
    int32_t end = fRXPat->fCompiledPat->size();
2714
4.44k
    UVector32  forwardedLength(end+1, *fStatus);
2715
4.44k
    forwardedLength.setSize(end+1);
2716
22.7M
    for (loc=3; loc<end; loc++) {
2717
22.7M
        forwardedLength.setElementAt(INT32_MAX, loc);
2718
22.7M
    }
2719
2720
15.0M
    for (loc = 3; loc<end; loc++) {
2721
15.0M
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2722
15.0M
        opType = URX_TYPE(op);
2723
2724
        // The loop is advancing linearly through the pattern.
2725
        // If the op we are now at was the destination of a branch in the pattern,
2726
        // and that path has a shorter minimum length than the current accumulated value,
2727
        // replace the current accumulated value.
2728
15.0M
        if (forwardedLength.elementAti(loc) < currentLen) {
2729
272k
            currentLen = forwardedLength.elementAti(loc);
2730
272k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2731
272k
        }
2732
2733
15.0M
        switch (opType) {
2734
            // Ops that don't change the total length matched
2735
0
        case URX_RESERVED_OP:
2736
4.44k
        case URX_END:
2737
4.44k
        case URX_FAIL:
2738
4.44k
        case URX_STRING_LEN:
2739
4.44k
        case URX_NOP:
2740
8.81k
        case URX_START_CAPTURE:
2741
13.1k
        case URX_END_CAPTURE:
2742
13.6k
        case URX_BACKSLASH_B:
2743
13.8k
        case URX_BACKSLASH_BU:
2744
14.0k
        case URX_BACKSLASH_G:
2745
14.3k
        case URX_BACKSLASH_Z:
2746
16.7k
        case URX_DOLLAR:
2747
17.3k
        case URX_DOLLAR_M:
2748
17.7k
        case URX_DOLLAR_D:
2749
18.1k
        case URX_DOLLAR_MD:
2750
18.1k
        case URX_RELOC_OPRND:
2751
19.5k
        case URX_STO_INP_LOC:
2752
19.8k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2753
20.1k
        case URX_BACKREF_I:
2754
2755
22.3k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2756
24.5k
        case URX_LD_SP:
2757
24.5k
            break;
2758
2759
3.35k
        case URX_CARET:
2760
3.35k
            if (atStart) {
2761
344
                fRXPat->fStartType = START_START;
2762
344
            }
2763
3.35k
            break;
2764
2765
530
        case URX_CARET_M:
2766
1.19k
        case URX_CARET_M_UNIX:
2767
1.19k
            if (atStart) {
2768
290
                fRXPat->fStartType = START_LINE;
2769
290
            }
2770
1.19k
            break;
2771
2772
606k
        case URX_ONECHAR:
2773
606k
            if (currentLen == 0) {
2774
                // This character could appear at the start of a match.
2775
                //   Add it to the set of possible starting characters.
2776
87.0k
                fRXPat->fInitialChars->add(URX_VAL(op));
2777
87.0k
                numInitialStrings += 2;
2778
87.0k
            }
2779
606k
            currentLen = safeIncrement(currentLen, 1);
2780
606k
            atStart = false;
2781
606k
            break;
2782
2783
2784
104k
        case URX_SETREF:
2785
104k
            if (currentLen == 0) {
2786
1.36k
                int32_t  sn = URX_VAL(op);
2787
1.36k
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2788
1.36k
                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2789
1.36k
                fRXPat->fInitialChars->addAll(*s);
2790
1.36k
                numInitialStrings += 2;
2791
1.36k
            }
2792
104k
            currentLen = safeIncrement(currentLen, 1);
2793
104k
            atStart = false;
2794
104k
            break;
2795
2796
4.17k
        case URX_LOOP_SR_I:
2797
            // [Set]*, like a SETREF, above, in what it can match,
2798
            //  but may not match at all, so currentLen is not incremented.
2799
4.17k
            if (currentLen == 0) {
2800
92
                int32_t  sn = URX_VAL(op);
2801
92
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2802
92
                const UnicodeSet *s = (UnicodeSet *)fRXPat->fSets->elementAt(sn);
2803
92
                fRXPat->fInitialChars->addAll(*s);
2804
92
                numInitialStrings += 2;
2805
92
            }
2806
4.17k
            atStart = false;
2807
4.17k
            break;
2808
2809
1.69k
        case URX_LOOP_DOT_I:
2810
1.69k
            if (currentLen == 0) {
2811
                // .* at the start of a pattern.
2812
                //    Any character can begin the match.
2813
227
                fRXPat->fInitialChars->clear();
2814
227
                fRXPat->fInitialChars->complement();
2815
227
                numInitialStrings += 2;
2816
227
            }
2817
1.69k
            atStart = false;
2818
1.69k
            break;
2819
2820
2821
4.17k
        case URX_STATIC_SETREF:
2822
4.17k
            if (currentLen == 0) {
2823
212
                int32_t  sn = URX_VAL(op);
2824
212
                U_ASSERT(sn>0 && sn<URX_LAST_SET);
2825
212
                const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826
212
                fRXPat->fInitialChars->addAll(s);
2827
212
                numInitialStrings += 2;
2828
212
            }
2829
4.17k
            currentLen = safeIncrement(currentLen, 1);
2830
4.17k
            atStart = false;
2831
4.17k
            break;
2832
2833
2834
2835
1.37k
        case URX_STAT_SETREF_N:
2836
1.37k
            if (currentLen == 0) {
2837
249
                int32_t  sn = URX_VAL(op);
2838
249
                UnicodeSet sc;
2839
249
                sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840
249
                fRXPat->fInitialChars->addAll(sc);
2841
249
                numInitialStrings += 2;
2842
249
            }
2843
1.37k
            currentLen = safeIncrement(currentLen, 1);
2844
1.37k
            atStart = false;
2845
1.37k
            break;
2846
2847
2848
2849
1.36k
        case URX_BACKSLASH_D:
2850
            // Digit Char
2851
1.36k
             if (currentLen == 0) {
2852
417
                 UnicodeSet s;
2853
417
                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2854
417
                 if (URX_VAL(op) != 0) {
2855
201
                     s.complement();
2856
201
                 }
2857
417
                 fRXPat->fInitialChars->addAll(s);
2858
417
                 numInitialStrings += 2;
2859
417
            }
2860
1.36k
            currentLen = safeIncrement(currentLen, 1);
2861
1.36k
            atStart = false;
2862
1.36k
            break;
2863
2864
2865
1.19k
        case URX_BACKSLASH_H:
2866
            // Horiz white space
2867
1.19k
            if (currentLen == 0) {
2868
460
                UnicodeSet s;
2869
460
                s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2870
460
                s.add((UChar32)9);   // Tab
2871
460
                if (URX_VAL(op) != 0) {
2872
241
                    s.complement();
2873
241
                }
2874
460
                fRXPat->fInitialChars->addAll(s);
2875
460
                numInitialStrings += 2;
2876
460
            }
2877
1.19k
            currentLen = safeIncrement(currentLen, 1);
2878
1.19k
            atStart = false;
2879
1.19k
            break;
2880
2881
2882
1.61k
        case URX_BACKSLASH_R:       // Any line ending sequence
2883
7.17k
        case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2884
7.17k
            if (currentLen == 0) {
2885
3.21k
                UnicodeSet s;
2886
3.21k
                s.add((UChar32)0x0a, (UChar32)0x0d);  // add range
2887
3.21k
                s.add((UChar32)0x85);
2888
3.21k
                s.add((UChar32)0x2028, (UChar32)0x2029);
2889
3.21k
                if (URX_VAL(op) != 0) {
2890
                     // Complement option applies to URX_BACKSLASH_V only.
2891
2.98k
                     s.complement();
2892
2.98k
                }
2893
3.21k
                fRXPat->fInitialChars->addAll(s);
2894
3.21k
                numInitialStrings += 2;
2895
3.21k
            }
2896
7.17k
            currentLen = safeIncrement(currentLen, 1);
2897
7.17k
            atStart = false;
2898
7.17k
            break;
2899
2900
2901
2902
13.8k
        case URX_ONECHAR_I:
2903
            // Case Insensitive Single Character.
2904
13.8k
            if (currentLen == 0) {
2905
1.31k
                UChar32  c = URX_VAL(op);
2906
1.31k
                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2907
1.31k
                    UnicodeSet starters(c, c);
2908
1.31k
                    starters.closeOver(USET_CASE_INSENSITIVE);
2909
                    // findCaseInsensitiveStarters(c, &starters);
2910
                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911
                    //   The expanded folding can't match the pattern.
2912
1.31k
                    fRXPat->fInitialChars->addAll(starters);
2913
1.31k
                } else {
2914
                    // Char has no case variants.  Just add it as-is to the
2915
                    //   set of possible starting chars.
2916
0
                    fRXPat->fInitialChars->add(c);
2917
0
                }
2918
1.31k
                numInitialStrings += 2;
2919
1.31k
            }
2920
13.8k
            currentLen = safeIncrement(currentLen, 1);
2921
13.8k
            atStart = false;
2922
13.8k
            break;
2923
2924
2925
693
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
2926
939
        case URX_DOTANY_ALL:    // . matches one or two.
2927
5.64k
        case URX_DOTANY:
2928
5.89k
        case URX_DOTANY_UNIX:
2929
5.89k
            if (currentLen == 0) {
2930
                // These constructs are all bad news when they appear at the start
2931
                //   of a match.  Any character can begin the match.
2932
650
                fRXPat->fInitialChars->clear();
2933
650
                fRXPat->fInitialChars->complement();
2934
650
                numInitialStrings += 2;
2935
650
            }
2936
5.89k
            currentLen = safeIncrement(currentLen, 1);
2937
5.89k
            atStart = false;
2938
5.89k
            break;
2939
2940
2941
0
        case URX_JMPX:
2942
0
            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2943
0
            U_FALLTHROUGH;
2944
6.68M
        case URX_JMP:
2945
6.68M
            {
2946
6.68M
                int32_t  jmpDest = URX_VAL(op);
2947
6.68M
                if (jmpDest < loc) {
2948
                    // Loop of some kind.  Can safely ignore, the worst that will happen
2949
                    //  is that we understate the true minimum length
2950
917
                    currentLen = forwardedLength.elementAti(loc+1);
2951
2952
6.68M
                } else {
2953
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2954
6.68M
                    U_ASSERT(jmpDest <= end+1);
2955
6.68M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956
1.76k
                        forwardedLength.setElementAt(currentLen, jmpDest);
2957
1.76k
                    }
2958
6.68M
                }
2959
6.68M
            }
2960
6.68M
            atStart = false;
2961
6.68M
            break;
2962
2963
102k
        case URX_JMP_SAV:
2964
104k
        case URX_JMP_SAV_X:
2965
            // Combo of state save to the next loc, + jmp backwards.
2966
            //   Net effect on min. length computation is nothing.
2967
104k
            atStart = false;
2968
104k
            break;
2969
2970
1.55k
        case URX_BACKTRACK:
2971
            // Fails are kind of like a branch, except that the min length was
2972
            //   propagated already, by the state save.
2973
1.55k
            currentLen = forwardedLength.elementAti(loc+1);
2974
1.55k
            atStart = false;
2975
1.55k
            break;
2976
2977
2978
7.32M
        case URX_STATE_SAVE:
2979
7.32M
            {
2980
                // State Save, for forward jumps, propagate the current minimum.
2981
                //             of the state save.
2982
7.32M
                int32_t  jmpDest = URX_VAL(op);
2983
7.32M
                if (jmpDest > loc) {
2984
7.32M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985
6.82M
                        forwardedLength.setElementAt(currentLen, jmpDest);
2986
6.82M
                    }
2987
7.32M
                }
2988
7.32M
            }
2989
7.32M
            atStart = false;
2990
7.32M
            break;
2991
2992
2993
2994
2995
13.6k
        case URX_STRING:
2996
13.6k
            {
2997
13.6k
                loc++;
2998
13.6k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
2999
13.6k
                int32_t stringLen   = URX_VAL(stringLenOp);
3000
13.6k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3001
13.6k
                U_ASSERT(stringLenOp >= 2);
3002
13.6k
                if (currentLen == 0) {
3003
                    // Add the starting character of this string to the set of possible starting
3004
                    //   characters for this pattern.
3005
8.04k
                    int32_t stringStartIdx = URX_VAL(op);
3006
8.04k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007
8.04k
                    fRXPat->fInitialChars->add(c);
3008
3009
                    // Remember this string.  After the entire pattern has been checked,
3010
                    //  if nothing else is identified that can start a match, we'll use it.
3011
8.04k
                    numInitialStrings++;
3012
8.04k
                    fRXPat->fInitialStringIdx = stringStartIdx;
3013
8.04k
                    fRXPat->fInitialStringLen = stringLen;
3014
8.04k
                }
3015
3016
13.6k
                currentLen = safeIncrement(currentLen, stringLen);
3017
13.6k
                atStart = false;
3018
13.6k
            }
3019
13.6k
            break;
3020
3021
154k
        case URX_STRING_I:
3022
154k
            {
3023
                // Case-insensitive string.  Unlike exact-match strings, we won't
3024
                //   attempt a string search for possible match positions.  But we
3025
                //   do update the set of possible starting characters.
3026
154k
                loc++;
3027
154k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3028
154k
                int32_t stringLen   = URX_VAL(stringLenOp);
3029
154k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3030
154k
                U_ASSERT(stringLenOp >= 2);
3031
154k
                if (currentLen == 0) {
3032
                    // Add the starting character of this string to the set of possible starting
3033
                    //   characters for this pattern.
3034
43.4k
                    int32_t stringStartIdx = URX_VAL(op);
3035
43.4k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036
43.4k
                    UnicodeSet s;
3037
43.4k
                    findCaseInsensitiveStarters(c, &s);
3038
43.4k
                    fRXPat->fInitialChars->addAll(s);
3039
43.4k
                    numInitialStrings += 2;  // Matching on an initial string not possible.
3040
43.4k
                }
3041
154k
                currentLen = safeIncrement(currentLen, stringLen);
3042
154k
                atStart = false;
3043
154k
            }
3044
154k
            break;
3045
3046
2.44k
        case URX_CTR_INIT:
3047
2.73k
        case URX_CTR_INIT_NG:
3048
2.73k
            {
3049
                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3050
                //   so location must be updated accordingly.
3051
                // Loop Init Ops.
3052
                //   If the min loop count == 0
3053
                //      move loc forwards to the end of the loop, skipping over the body.
3054
                //   If the min count is > 0,
3055
                //      continue normal processing of the body of the loop.
3056
2.73k
                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3057
2.73k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3058
2.73k
                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3059
2.73k
                if (minLoopCount == 0) {
3060
                    // Min Loop Count of 0, treat like a forward branch and
3061
                    //   move the current minimum length up to the target
3062
                    //   (end of loop) location.
3063
1.65k
                    U_ASSERT(loopEndLoc <= end+1);
3064
1.65k
                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065
1.34k
                        forwardedLength.setElementAt(currentLen, loopEndLoc);
3066
1.34k
                    }
3067
1.65k
                }
3068
2.73k
                loc+=3;  // Skips over operands of CTR_INIT
3069
2.73k
            }
3070
2.73k
            atStart = false;
3071
2.73k
            break;
3072
3073
3074
2.44k
        case URX_CTR_LOOP:
3075
2.73k
        case URX_CTR_LOOP_NG:
3076
            // Loop ops.
3077
            //  The jump is conditional, backwards only.
3078
2.73k
            atStart = false;
3079
2.73k
            break;
3080
3081
5.86k
        case URX_LOOP_C:
3082
            // More loop ops.  These state-save to themselves.
3083
            //   don't change the minimum match
3084
5.86k
            atStart = false;
3085
5.86k
            break;
3086
3087
3088
483
        case URX_LA_START:
3089
849
        case URX_LB_START:
3090
849
            {
3091
                // Look-around.  Scan forward until the matching look-ahead end,
3092
                //   without processing the look-around block.  This is overly pessimistic.
3093
3094
                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3095
                //   lookahead contains two LA_END instructions, so count goes up by two
3096
                //   for each LA_START.
3097
849
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3098
7.48M
                for (;;) {
3099
7.48M
                    loc++;
3100
7.48M
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3101
7.48M
                    if (URX_TYPE(op) == URX_LA_START) {
3102
978
                        depth+=2;
3103
978
                    }
3104
7.48M
                    if (URX_TYPE(op) == URX_LB_START) {
3105
556
                        depth++;
3106
556
                    }
3107
7.48M
                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3108
3.84k
                        depth--;
3109
3.84k
                        if (depth == 0) {
3110
849
                            break;
3111
849
                        }
3112
3.84k
                    }
3113
7.48M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3114
                        // Need this because neg lookahead blocks will FAIL to outside
3115
                        //   of the block.
3116
3.68M
                        int32_t  jmpDest = URX_VAL(op);
3117
3.68M
                        if (jmpDest > loc) {
3118
3.68M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119
3.66M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3120
3.66M
                            }
3121
3.68M
                        }
3122
3.68M
                    }
3123
7.48M
                    U_ASSERT(loc <= end);
3124
7.48M
                }
3125
849
            }
3126
849
            break;
3127
3128
0
        case URX_LA_END:
3129
0
        case URX_LB_CONT:
3130
0
        case URX_LB_END:
3131
0
        case URX_LBN_CONT:
3132
0
        case URX_LBN_END:
3133
0
            UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
3134
                                    //  consumed by the scan in URX_LA_START and LB_START
3135
0
        default:
3136
0
            UPRV_UNREACHABLE_EXIT;
3137
15.0M
            }
3138
3139
15.0M
        }
3140
3141
3142
    // We have finished walking through the ops.  Check whether some forward jump
3143
    //   propagated a shorter length to location end+1.
3144
4.44k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3145
3.55k
        currentLen = forwardedLength.elementAti(end+1);
3146
3.55k
    }
3147
3148
3149
4.44k
    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150
3151
3152
    // Sort out what we should check for when looking for candidate match start positions.
3153
    // In order of preference,
3154
    //     1.   Start of input text buffer.
3155
    //     2.   A literal string.
3156
    //     3.   Start of line in multi-line mode.
3157
    //     4.   A single literal character.
3158
    //     5.   A character from a set of characters.
3159
    //
3160
4.44k
    if (fRXPat->fStartType == START_START) {
3161
        // Match only at the start of an input text string.
3162
        //    start type is already set.  We're done.
3163
4.41k
    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164
        // Match beginning only with a literal string.
3165
491
        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166
491
        U_ASSERT(fRXPat->fInitialChars->contains(c));
3167
491
        fRXPat->fStartType   = START_STRING;
3168
491
        fRXPat->fInitialChar = c;
3169
3.91k
    } else if (fRXPat->fStartType == START_LINE) {
3170
        // Match at start of line in Multi-Line mode.
3171
        // Nothing to do here; everything is already set.
3172
3.90k
    } else if (fRXPat->fMinMatchLen == 0) {
3173
        // Zero length match possible.  We could start anywhere.
3174
842
        fRXPat->fStartType = START_NO_INFO;
3175
3.05k
    } else if (fRXPat->fInitialChars->size() == 1) {
3176
        // All matches begin with the same char.
3177
842
        fRXPat->fStartType   = START_CHAR;
3178
842
        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179
842
        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3180
2.21k
    } else if (fRXPat->fInitialChars->contains((UChar32)0, (UChar32)0x10ffff) == false &&
3181
2.21k
        fRXPat->fMinMatchLen > 0) {
3182
        // Matches start with a set of character smaller than the set of all chars.
3183
2.11k
        fRXPat->fStartType = START_SET;
3184
2.11k
    } else {
3185
        // Matches can start with anything
3186
98
        fRXPat->fStartType = START_NO_INFO;
3187
98
    }
3188
3189
4.44k
    return;
3190
4.44k
}
3191
3192
3193
3194
//------------------------------------------------------------------------------
3195
//
3196
//   minMatchLength    Calculate the length of the shortest string that could
3197
//                     match the specified pattern.
3198
//                     Length is in 16 bit code units, not code points.
3199
//
3200
//                     The calculated length may not be exact.  The returned
3201
//                     value may be shorter than the actual minimum; it must
3202
//                     never be longer.
3203
//
3204
//                     start and end are the range of p-code operations to be
3205
//                     examined.  The endpoints are included in the range.
3206
//
3207
//------------------------------------------------------------------------------
3208
167k
int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3209
167k
    if (U_FAILURE(*fStatus)) {
3210
52
        return 0;
3211
52
    }
3212
3213
167k
    U_ASSERT(start <= end);
3214
167k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3215
3216
3217
167k
    int32_t    loc;
3218
167k
    int32_t    op;
3219
167k
    int32_t    opType;
3220
167k
    int32_t    currentLen = 0;
3221
3222
3223
    // forwardedLength is a vector holding minimum-match-length values that
3224
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3225
    //   It must be one longer than the pattern being checked because some  ops
3226
    //   will jmp to a end-of-block+1 location from within a block, and we must
3227
    //   count those when checking the block.
3228
167k
    UVector32  forwardedLength(end+2, *fStatus);
3229
167k
    forwardedLength.setSize(end+2);
3230
45.8M
    for (loc=start; loc<=end+1; loc++) {
3231
45.6M
        forwardedLength.setElementAt(INT32_MAX, loc);
3232
45.6M
    }
3233
3234
28.7M
    for (loc = start; loc<=end; loc++) {
3235
28.5M
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3236
28.5M
        opType = URX_TYPE(op);
3237
3238
        // The loop is advancing linearly through the pattern.
3239
        // If the op we are now at was the destination of a branch in the pattern,
3240
        // and that path has a shorter minimum length than the current accumulated value,
3241
        // replace the current accumulated value.
3242
        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3243
                                                               //   no-match-possible cases.
3244
28.5M
        if (forwardedLength.elementAti(loc) < currentLen) {
3245
403k
            currentLen = forwardedLength.elementAti(loc);
3246
403k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3247
403k
        }
3248
3249
28.5M
        switch (opType) {
3250
            // Ops that don't change the total length matched
3251
0
        case URX_RESERVED_OP:
3252
4.44k
        case URX_END:
3253
4.44k
        case URX_STRING_LEN:
3254
196k
        case URX_NOP:
3255
219k
        case URX_START_CAPTURE:
3256
243k
        case URX_END_CAPTURE:
3257
246k
        case URX_BACKSLASH_B:
3258
246k
        case URX_BACKSLASH_BU:
3259
247k
        case URX_BACKSLASH_G:
3260
251k
        case URX_BACKSLASH_Z:
3261
262k
        case URX_CARET:
3262
266k
        case URX_DOLLAR:
3263
267k
        case URX_DOLLAR_M:
3264
267k
        case URX_DOLLAR_D:
3265
268k
        case URX_DOLLAR_MD:
3266
268k
        case URX_RELOC_OPRND:
3267
279k
        case URX_STO_INP_LOC:
3268
280k
        case URX_CARET_M:
3269
280k
        case URX_CARET_M_UNIX:
3270
387k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3271
389k
        case URX_BACKREF_I:
3272
3273
394k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3274
399k
        case URX_LD_SP:
3275
3276
513k
        case URX_JMP_SAV:
3277
524k
        case URX_JMP_SAV_X:
3278
524k
            break;
3279
3280
3281
            // Ops that match a minimum of one character (one or two 16 bit code units.)
3282
            //
3283
917k
        case URX_ONECHAR:
3284
921k
        case URX_STATIC_SETREF:
3285
924k
        case URX_STAT_SETREF_N:
3286
1.03M
        case URX_SETREF:
3287
1.03M
        case URX_BACKSLASH_D:
3288
1.03M
        case URX_BACKSLASH_H:
3289
1.04M
        case URX_BACKSLASH_R:
3290
1.04M
        case URX_BACKSLASH_V:
3291
1.07M
        case URX_ONECHAR_I:
3292
1.07M
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3293
1.07M
        case URX_DOTANY_ALL:    // . matches one or two.
3294
1.08M
        case URX_DOTANY:
3295
1.08M
        case URX_DOTANY_UNIX:
3296
1.08M
            currentLen = safeIncrement(currentLen, 1);
3297
1.08M
            break;
3298
3299
3300
0
        case URX_JMPX:
3301
0
            loc++;              // URX_JMPX has an extra operand, ignored here,
3302
                                //   otherwise processed identically to URX_JMP.
3303
0
            U_FALLTHROUGH;
3304
12.8M
        case URX_JMP:
3305
12.8M
            {
3306
12.8M
                int32_t  jmpDest = URX_VAL(op);
3307
12.8M
                if (jmpDest < loc) {
3308
                    // Loop of some kind.  Can safely ignore, the worst that will happen
3309
                    //  is that we understate the true minimum length
3310
1.67k
                    currentLen = forwardedLength.elementAti(loc+1);
3311
12.8M
                } else {
3312
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3313
12.8M
                    U_ASSERT(jmpDest <= end+1);
3314
12.8M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
3315
6.27k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3316
6.27k
                    }
3317
12.8M
                }
3318
12.8M
            }
3319
12.8M
            break;
3320
3321
2.58k
        case URX_BACKTRACK:
3322
2.58k
            {
3323
                // Back-tracks are kind of like a branch, except that the min length was
3324
                //   propagated already, by the state save.
3325
2.58k
                currentLen = forwardedLength.elementAti(loc+1);
3326
2.58k
            }
3327
2.58k
            break;
3328
3329
3330
13.6M
        case URX_STATE_SAVE:
3331
13.6M
            {
3332
                // State Save, for forward jumps, propagate the current minimum.
3333
                //             of the state save.
3334
13.6M
                int32_t  jmpDest = URX_VAL(op);
3335
13.6M
                if (jmpDest > loc) {
3336
13.6M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
3337
13.0M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3338
13.0M
                    }
3339
13.6M
                }
3340
13.6M
            }
3341
13.6M
            break;
3342
3343
3344
85.9k
        case URX_STRING:
3345
85.9k
            {
3346
85.9k
                loc++;
3347
85.9k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3348
85.9k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3349
85.9k
            }
3350
85.9k
            break;
3351
3352
3353
309k
        case URX_STRING_I:
3354
309k
            {
3355
309k
                loc++;
3356
                // TODO: with full case folding, matching input text may be shorter than
3357
                //       the string we have here.  More smarts could put some bounds on it.
3358
                //       Assume a min length of one for now.  A min length of zero causes
3359
                //        optimization failures for a pattern like "string"+
3360
                // currentLen += URX_VAL(stringLenOp);
3361
309k
                currentLen = safeIncrement(currentLen, 1);
3362
309k
            }
3363
309k
            break;
3364
3365
5.31k
        case URX_CTR_INIT:
3366
7.19k
        case URX_CTR_INIT_NG:
3367
7.19k
            {
3368
                // Loop Init Ops.
3369
                //   If the min loop count == 0
3370
                //      move loc forwards to the end of the loop, skipping over the body.
3371
                //   If the min count is > 0,
3372
                //      continue normal processing of the body of the loop.
3373
7.19k
                int32_t loopEndLoc   = (int32_t)fRXPat->fCompiledPat->elementAti(loc+1);
3374
7.19k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3375
7.19k
                int32_t minLoopCount = (int32_t)fRXPat->fCompiledPat->elementAti(loc+2);
3376
7.19k
                if (minLoopCount == 0) {
3377
4.61k
                    loc = loopEndLoc;
3378
4.61k
                } else {
3379
2.58k
                    loc+=3;  // Skips over operands of CTR_INIT
3380
2.58k
                }
3381
7.19k
            }
3382
7.19k
            break;
3383
3384
3385
1.89k
        case URX_CTR_LOOP:
3386
2.58k
        case URX_CTR_LOOP_NG:
3387
            // Loop ops.
3388
            //  The jump is conditional, backwards only.
3389
2.58k
            break;
3390
3391
4.36k
        case URX_LOOP_SR_I:
3392
6.50k
        case URX_LOOP_DOT_I:
3393
13.0k
        case URX_LOOP_C:
3394
            // More loop ops.  These state-save to themselves.
3395
            //   don't change the minimum match - could match nothing at all.
3396
13.0k
            break;
3397
3398
3399
2.39k
        case URX_LA_START:
3400
4.94k
        case URX_LB_START:
3401
4.94k
            {
3402
                // Look-around.  Scan forward until the matching look-ahead end,
3403
                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3404
                //   it assumes that the look-ahead match might be zero-length.
3405
                //   TODO:  Positive lookahead could recursively do the block, then continue
3406
                //          with the longer of the block or the value coming in.  Ticket 6060
3407
4.94k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3408
14.6M
                for (;;) {
3409
14.6M
                    loc++;
3410
14.6M
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3411
14.6M
                    if (URX_TYPE(op) == URX_LA_START) {
3412
                        // The boilerplate for look-ahead includes two LA_END instructions,
3413
                        //    Depth will be decremented by each one when it is seen.
3414
24.4k
                        depth += 2;
3415
24.4k
                    }
3416
14.6M
                    if (URX_TYPE(op) == URX_LB_START) {
3417
3.14k
                        depth++;
3418
3.14k
                    }
3419
14.6M
                    if (URX_TYPE(op) == URX_LA_END) {
3420
56.0k
                        depth--;
3421
56.0k
                        if (depth == 0) {
3422
3.61k
                            break;
3423
3.61k
                        }
3424
56.0k
                    }
3425
14.6M
                    if (URX_TYPE(op)==URX_LBN_END) {
3426
3.24k
                        depth--;
3427
3.24k
                        if (depth == 0) {
3428
1.33k
                            break;
3429
1.33k
                        }
3430
3.24k
                    }
3431
14.6M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3432
                        // Need this because neg lookahead blocks will FAIL to outside
3433
                        //   of the block.
3434
7.11M
                        int32_t  jmpDest = URX_VAL(op);
3435
7.11M
                        if (jmpDest > loc) {
3436
7.11M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3437
7.08M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3438
7.08M
                            }
3439
7.11M
                        }
3440
7.11M
                    }
3441
14.6M
                    U_ASSERT(loc <= end);
3442
14.6M
                }
3443
4.94k
            }
3444
4.94k
            break;
3445
3446
2.21k
        case URX_LA_END:
3447
2.21k
        case URX_LB_CONT:
3448
4.43k
        case URX_LB_END:
3449
4.43k
        case URX_LBN_CONT:
3450
7.19k
        case URX_LBN_END:
3451
            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3452
            //   range being sized, which happens when measuring size of look-behind blocks.
3453
7.19k
            break;
3454
3455
0
        default:
3456
0
            UPRV_UNREACHABLE_EXIT;
3457
28.5M
            }
3458
3459
28.5M
        }
3460
3461
    // We have finished walking through the ops.  Check whether some forward jump
3462
    //   propagated a shorter length to location end+1.
3463
167k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3464
339
        currentLen = forwardedLength.elementAti(end+1);
3465
339
        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3466
339
    }
3467
3468
167k
    return currentLen;
3469
167k
}
3470
3471
//------------------------------------------------------------------------------
3472
//
3473
//   maxMatchLength    Calculate the length of the longest string that could
3474
//                     match the specified pattern.
3475
//                     Length is in 16 bit code units, not code points.
3476
//
3477
//                     The calculated length may not be exact.  The returned
3478
//                     value may be longer than the actual maximum; it must
3479
//                     never be shorter.
3480
//
3481
//                     start, end: the range of the pattern to check.
3482
//                     end is inclusive.
3483
//
3484
//------------------------------------------------------------------------------
3485
9.21k
int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3486
9.21k
    if (U_FAILURE(*fStatus)) {
3487
0
        return 0;
3488
0
    }
3489
9.21k
    U_ASSERT(start <= end);
3490
9.21k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3491
3492
9.21k
    int32_t    loc;
3493
9.21k
    int32_t    op;
3494
9.21k
    int32_t    opType;
3495
9.21k
    int32_t    currentLen = 0;
3496
9.21k
    UVector32  forwardedLength(end+1, *fStatus);
3497
9.21k
    forwardedLength.setSize(end+1);
3498
3499
22.1M
    for (loc=start; loc<=end; loc++) {
3500
22.1M
        forwardedLength.setElementAt(0, loc);
3501
22.1M
    }
3502
3503
14.2M
    for (loc = start; loc<=end; loc++) {
3504
14.2M
        op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3505
14.2M
        opType = URX_TYPE(op);
3506
3507
        // The loop is advancing linearly through the pattern.
3508
        // If the op we are now at was the destination of a branch in the pattern,
3509
        // and that path has a longer maximum length than the current accumulated value,
3510
        // replace the current accumulated value.
3511
14.2M
        if (forwardedLength.elementAti(loc) > currentLen) {
3512
2.17M
            currentLen = forwardedLength.elementAti(loc);
3513
2.17M
        }
3514
3515
14.2M
        switch (opType) {
3516
            // Ops that don't change the total length matched
3517
0
        case URX_RESERVED_OP:
3518
0
        case URX_END:
3519
0
        case URX_STRING_LEN:
3520
43.9k
        case URX_NOP:
3521
47.1k
        case URX_START_CAPTURE:
3522
48.9k
        case URX_END_CAPTURE:
3523
49.2k
        case URX_BACKSLASH_B:
3524
49.4k
        case URX_BACKSLASH_BU:
3525
49.6k
        case URX_BACKSLASH_G:
3526
49.9k
        case URX_BACKSLASH_Z:
3527
50.6k
        case URX_CARET:
3528
51.5k
        case URX_DOLLAR:
3529
51.7k
        case URX_DOLLAR_M:
3530
51.9k
        case URX_DOLLAR_D:
3531
52.1k
        case URX_DOLLAR_MD:
3532
52.1k
        case URX_RELOC_OPRND:
3533
52.4k
        case URX_STO_INP_LOC:
3534
52.7k
        case URX_CARET_M:
3535
52.9k
        case URX_CARET_M_UNIX:
3536
3537
54.9k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3538
56.8k
        case URX_LD_SP:
3539
3540
58.9k
        case URX_LB_END:
3541
58.9k
        case URX_LB_CONT:
3542
58.9k
        case URX_LBN_CONT:
3543
61.6k
        case URX_LBN_END:
3544
61.6k
            break;
3545
3546
3547
            // Ops that increase that cause an unbounded increase in the length
3548
            //   of a matched string, or that increase it a hard to characterize way.
3549
            //   Call the max length unbounded, and stop further checking.
3550
362
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3551
622
        case URX_BACKREF_I:
3552
947
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3553
947
            currentLen = INT32_MAX;
3554
947
            break;
3555
3556
3557
            // Ops that match a max of one character (possibly two 16 bit code units.)
3558
            //
3559
232
        case URX_STATIC_SETREF:
3560
1.86k
        case URX_STAT_SETREF_N:
3561
2.57k
        case URX_SETREF:
3562
3.03k
        case URX_BACKSLASH_D:
3563
3.29k
        case URX_BACKSLASH_H:
3564
5.01k
        case URX_BACKSLASH_R:
3565
5.80k
        case URX_BACKSLASH_V:
3566
6.70k
        case URX_ONECHAR_I:
3567
6.90k
        case URX_DOTANY_ALL:
3568
8.98k
        case URX_DOTANY:
3569
9.18k
        case URX_DOTANY_UNIX:
3570
9.18k
            currentLen = safeIncrement(currentLen, 2);
3571
9.18k
            break;
3572
3573
            // Single literal character.  Increase current max length by one or two,
3574
            //       depending on whether the char is in the supplementary range.
3575
207k
        case URX_ONECHAR:
3576
207k
            currentLen = safeIncrement(currentLen, 1);
3577
207k
            if (URX_VAL(op) > 0x10000) {
3578
507
                currentLen = safeIncrement(currentLen, 1);
3579
507
            }
3580
207k
            break;
3581
3582
            // Jumps.
3583
            //
3584
6.83M
        case URX_JMP:
3585
6.83M
        case URX_JMPX:
3586
6.83M
        case URX_JMP_SAV:
3587
6.83M
        case URX_JMP_SAV_X:
3588
6.83M
            {
3589
6.83M
                int32_t  jmpDest = URX_VAL(op);
3590
6.83M
                if (jmpDest < loc) {
3591
                    // Loop of some kind.  Max match length is unbounded.
3592
996
                    currentLen = INT32_MAX;
3593
6.83M
                } else {
3594
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3595
6.83M
                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
3596
22.1k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3597
22.1k
                    }
3598
6.83M
                    currentLen = 0;
3599
6.83M
                }
3600
6.83M
            }
3601
6.83M
            break;
3602
3603
15.3k
        case URX_BACKTRACK:
3604
            // back-tracks are kind of like a branch, except that the max length was
3605
            //   propagated already, by the state save.
3606
15.3k
            currentLen = forwardedLength.elementAti(loc+1);
3607
15.3k
            break;
3608
3609
3610
6.99M
        case URX_STATE_SAVE:
3611
6.99M
            {
3612
                // State Save, for forward jumps, propagate the current minimum.
3613
                //               of the state save.
3614
                //             For backwards jumps, they create a loop, maximum
3615
                //               match length is unbounded.
3616
6.99M
                int32_t  jmpDest = URX_VAL(op);
3617
6.99M
                if (jmpDest > loc) {
3618
6.99M
                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
3619
2.31M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3620
2.31M
                    }
3621
6.99M
                } else {
3622
23
                    currentLen = INT32_MAX;
3623
23
                }
3624
6.99M
            }
3625
6.99M
            break;
3626
3627
3628
3629
3630
41.9k
        case URX_STRING:
3631
41.9k
            {
3632
41.9k
                loc++;
3633
41.9k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3634
41.9k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3635
41.9k
                break;
3636
6.83M
            }
3637
3638
23.8k
        case URX_STRING_I:
3639
            // TODO:  This code assumes that any user string that matches will be no longer
3640
            //        than our compiled string, with case insensitive matching.
3641
            //        Our compiled string has been case-folded already.
3642
            //
3643
            //        Any matching user string will have no more code points than our
3644
            //        compiled (folded) string.  Folding may add code points, but
3645
            //        not remove them.
3646
            //
3647
            //        There is a potential problem if a supplemental code point
3648
            //        case-folds to a BMP code point.  In this case our compiled string
3649
            //        could be shorter (in code units) than a matching user string.
3650
            //
3651
            //        At this time (Unicode 6.1) there are no such characters, and this case
3652
            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3653
            //        any problematic characters are added to Unicode.
3654
            //
3655
            //        If this happens, we can make a set of the BMP chars that the
3656
            //        troublesome supplementals fold to, scan our string, and bump the
3657
            //        currentLen one extra for each that is found.
3658
            //
3659
23.8k
            {
3660
23.8k
                loc++;
3661
23.8k
                int32_t stringLenOp = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3662
23.8k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3663
23.8k
            }
3664
23.8k
            break;
3665
3666
3.00k
        case URX_CTR_INIT:
3667
4.44k
        case URX_CTR_INIT_NG:
3668
            // For Loops, recursively call this function on the pattern for the loop body,
3669
            //   then multiply the result by the maximum loop count.
3670
4.44k
            {
3671
4.44k
                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3672
4.44k
                if (loopEndLoc == loc+4) {
3673
                    // Loop has an empty body. No affect on max match length.
3674
                    // Continue processing with code after the loop end.
3675
0
                    loc = loopEndLoc;
3676
0
                    break;
3677
0
                }
3678
3679
4.44k
                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3680
4.44k
                if (maxLoopCount == -1) {
3681
                    // Unbounded Loop. No upper bound on match length.
3682
216
                    currentLen = INT32_MAX;
3683
216
                    break;
3684
216
                }
3685
3686
4.23k
                U_ASSERT(loopEndLoc >= loc+4);
3687
4.23k
                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3688
4.23k
                int64_t updatedLen = (int64_t)currentLen + blockLen * maxLoopCount; 
3689
4.23k
                if (updatedLen >= INT32_MAX) {
3690
90
                    currentLen = INT32_MAX;
3691
90
                    break;
3692
90
                }
3693
4.14k
                currentLen = (int32_t)updatedLen;
3694
4.14k
                loc = loopEndLoc;
3695
4.14k
                break;
3696
4.23k
            }
3697
3698
0
        case URX_CTR_LOOP:
3699
0
        case URX_CTR_LOOP_NG:
3700
            // These opcodes will be skipped over by code for URX_CTR_INIT.
3701
            // We shouldn't encounter them here.
3702
0
            UPRV_UNREACHABLE_EXIT;
3703
3704
44
        case URX_LOOP_SR_I:
3705
516
        case URX_LOOP_DOT_I:
3706
516
        case URX_LOOP_C:
3707
            // For anything to do with loops, make the match length unbounded.
3708
516
            currentLen = INT32_MAX;
3709
516
            break;
3710
3711
3712
3713
14.5k
        case URX_LA_START:
3714
45.6k
        case URX_LA_END:
3715
            // Look-ahead.  Just ignore, treat the look-ahead block as if
3716
            // it were normal pattern.  Gives a too-long match length,
3717
            //  but good enough for now.
3718
45.6k
            break;
3719
3720
            // End of look-ahead ops should always be consumed by the processing at
3721
            //  the URX_LA_START op.
3722
            // UPRV_UNREACHABLE_EXIT;
3723
3724
2.15k
        case URX_LB_START:
3725
2.15k
            {
3726
                // Look-behind.  Scan forward until the matching look-around end,
3727
                //   without processing the look-behind block.
3728
2.15k
                int32_t dataLoc = URX_VAL(op);
3729
3.02M
                for (loc = loc + 1; loc <= end; ++loc) {
3730
3.02M
                    op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3731
3.02M
                    int32_t opType = URX_TYPE(op);
3732
3.02M
                    if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3733
2.15k
                        break;
3734
2.15k
                    }
3735
3.02M
                }
3736
2.15k
                U_ASSERT(loc <= end);
3737
2.15k
            }
3738
2.15k
            break;
3739
3740
0
        default:
3741
0
            UPRV_UNREACHABLE_EXIT;
3742
14.2M
        }
3743
3744
3745
14.2M
        if (currentLen == INT32_MAX) {
3746
            //  The maximum length is unbounded.
3747
            //  Stop further processing of the pattern.
3748
2.79k
            break;
3749
2.79k
        }
3750
3751
14.2M
    }
3752
9.21k
    return currentLen;
3753
3754
9.21k
}
3755
3756
3757
//------------------------------------------------------------------------------
3758
//
3759
//   stripNOPs    Remove any NOP operations from the compiled pattern code.
3760
//                Extra NOPs are inserted for some constructs during the initial
3761
//                code generation to provide locations that may be patched later.
3762
//                Many end up unneeded, and are removed by this function.
3763
//
3764
//                In order to minimize the number of passes through the pattern,
3765
//                back-reference fixup is also performed here (adjusting
3766
//                back-reference operands to point to the correct frame offsets).
3767
//
3768
//------------------------------------------------------------------------------
3769
4.49k
void RegexCompile::stripNOPs() {
3770
3771
4.49k
    if (U_FAILURE(*fStatus)) {
3772
0
        return;
3773
0
    }
3774
3775
4.49k
    int32_t    end = fRXPat->fCompiledPat->size();
3776
4.49k
    UVector32  deltas(end, *fStatus);
3777
3778
    // Make a first pass over the code, computing the amount that things
3779
    //   will be offset at each location in the original code.
3780
4.49k
    int32_t   loc;
3781
4.49k
    int32_t   d = 0;
3782
23.3M
    for (loc=0; loc<end; loc++) {
3783
23.3M
        deltas.addElement(d, *fStatus);
3784
23.3M
        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(loc);
3785
23.3M
        if (URX_TYPE(op) == URX_NOP) {
3786
22.8k
            d++;
3787
22.8k
        }
3788
23.3M
    }
3789
3790
4.49k
    UnicodeString caseStringBuffer;
3791
3792
    // Make a second pass over the code, removing the NOPs by moving following
3793
    //  code up, and patching operands that refer to code locations that
3794
    //  are being moved.  The array of offsets from the first step is used
3795
    //  to compute the new operand values.
3796
4.49k
    int32_t src;
3797
4.49k
    int32_t dst = 0;
3798
23.3M
    for (src=0; src<end; src++) {
3799
23.3M
        int32_t op = (int32_t)fRXPat->fCompiledPat->elementAti(src);
3800
23.3M
        int32_t opType = URX_TYPE(op);
3801
23.3M
        switch (opType) {
3802
22.8k
        case URX_NOP:
3803
22.8k
            break;
3804
3805
11.2M
        case URX_STATE_SAVE:
3806
21.8M
        case URX_JMP:
3807
21.8M
        case URX_CTR_LOOP:
3808
21.8M
        case URX_CTR_LOOP_NG:
3809
21.9M
        case URX_RELOC_OPRND:
3810
21.9M
        case URX_JMPX:
3811
22.0M
        case URX_JMP_SAV:
3812
22.0M
        case URX_JMP_SAV_X:
3813
            // These are instructions with operands that refer to code locations.
3814
22.0M
            {
3815
22.0M
                int32_t  operandAddress = URX_VAL(op);
3816
22.0M
                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3817
22.0M
                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3818
22.0M
                op = buildOp(opType, fixedOperandAddress);
3819
22.0M
                fRXPat->fCompiledPat->setElementAt(op, dst);
3820
22.0M
                dst++;
3821
22.0M
                break;
3822
22.0M
            }
3823
3824
1.04k
        case URX_BACKREF:
3825
1.35k
        case URX_BACKREF_I:
3826
1.35k
            {
3827
1.35k
                int32_t where = URX_VAL(op);
3828
1.35k
                if (where > fRXPat->fGroupMap->size()) {
3829
583
                    error(U_REGEX_INVALID_BACK_REF);
3830
583
                    break;
3831
583
                }
3832
773
                where = fRXPat->fGroupMap->elementAti(where-1);
3833
773
                op    = buildOp(opType, where);
3834
773
                fRXPat->fCompiledPat->setElementAt(op, dst);
3835
773
                dst++;
3836
3837
773
                fRXPat->fNeedsAltInput = true;
3838
773
                break;
3839
1.35k
            }
3840
7.61k
        case URX_RESERVED_OP:
3841
8.17k
        case URX_RESERVED_OP_N:
3842
11.2k
        case URX_BACKTRACK:
3843
15.7k
        case URX_END:
3844
658k
        case URX_ONECHAR:
3845
691k
        case URX_STRING:
3846
899k
        case URX_STRING_LEN:
3847
904k
        case URX_START_CAPTURE:
3848
908k
        case URX_END_CAPTURE:
3849
913k
        case URX_STATIC_SETREF:
3850
915k
        case URX_STAT_SETREF_N:
3851
1.02M
        case URX_SETREF:
3852
1.02M
        case URX_DOTANY:
3853
1.03M
        case URX_FAIL:
3854
1.03M
        case URX_BACKSLASH_B:
3855
1.03M
        case URX_BACKSLASH_BU:
3856
1.03M
        case URX_BACKSLASH_G:
3857
1.03M
        case URX_BACKSLASH_X:
3858
1.03M
        case URX_BACKSLASH_Z:
3859
1.03M
        case URX_DOTANY_ALL:
3860
1.03M
        case URX_BACKSLASH_D:
3861
1.03M
        case URX_CARET:
3862
1.04M
        case URX_DOLLAR:
3863
1.04M
        case URX_CTR_INIT:
3864
1.04M
        case URX_CTR_INIT_NG:
3865
1.04M
        case URX_DOTANY_UNIX:
3866
1.04M
        case URX_STO_SP:
3867
1.04M
        case URX_LD_SP:
3868
1.05M
        case URX_STO_INP_LOC:
3869
1.05M
        case URX_LA_START:
3870
1.05M
        case URX_LA_END:
3871
1.06M
        case URX_ONECHAR_I:
3872
1.24M
        case URX_STRING_I:
3873
1.24M
        case URX_DOLLAR_M:
3874
1.24M
        case URX_CARET_M:
3875
1.24M
        case URX_CARET_M_UNIX:
3876
1.24M
        case URX_LB_START:
3877
1.24M
        case URX_LB_CONT:
3878
1.24M
        case URX_LB_END:
3879
1.24M
        case URX_LBN_CONT:
3880
1.24M
        case URX_LBN_END:
3881
1.25M
        case URX_LOOP_SR_I:
3882
1.25M
        case URX_LOOP_DOT_I:
3883
1.26M
        case URX_LOOP_C:
3884
1.26M
        case URX_DOLLAR_D:
3885
1.26M
        case URX_DOLLAR_MD:
3886
1.26M
        case URX_BACKSLASH_H:
3887
1.26M
        case URX_BACKSLASH_R:
3888
1.27M
        case URX_BACKSLASH_V:
3889
            // These instructions are unaltered by the relocation.
3890
1.27M
            fRXPat->fCompiledPat->setElementAt(op, dst);
3891
1.27M
            dst++;
3892
1.27M
            break;
3893
3894
0
        default:
3895
            // Some op is unaccounted for.
3896
0
            UPRV_UNREACHABLE_EXIT;
3897
23.3M
        }
3898
23.3M
    }
3899
3900
4.49k
    fRXPat->fCompiledPat->setSize(dst);
3901
4.49k
}
3902
3903
3904
3905
3906
//------------------------------------------------------------------------------
3907
//
3908
//  Error         Report a rule parse error.
3909
//                Only report it if no previous error has been recorded.
3910
//
3911
//------------------------------------------------------------------------------
3912
7.94k
void RegexCompile::error(UErrorCode e) {
3913
7.94k
    if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3914
6.80k
        *fStatus = e;
3915
        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3916
        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3917
        // int64_t. If the values of the latter are out of range for the former,
3918
        // set them to the appropriate "field not supported" values.
3919
6.80k
        if (fLineNum > 0x7FFFFFFF) {
3920
0
            fParseErr->line   = 0;
3921
0
            fParseErr->offset = -1;
3922
6.80k
        } else if (fCharNum > 0x7FFFFFFF) {
3923
0
            fParseErr->line   = (int32_t)fLineNum;
3924
0
            fParseErr->offset = -1;
3925
6.80k
        } else {
3926
6.80k
            fParseErr->line   = (int32_t)fLineNum;
3927
6.80k
            fParseErr->offset = (int32_t)fCharNum;
3928
6.80k
        }
3929
3930
6.80k
        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3931
3932
        // Fill in the context.
3933
        //   Note: extractBetween() pins supplied indices to the string bounds.
3934
6.80k
        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3935
6.80k
        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3936
6.80k
        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3937
6.80k
        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3938
6.80k
    }
3939
7.94k
}
3940
3941
3942
//
3943
//  Assorted Unicode character constants.
3944
//     Numeric because there is no portable way to enter them as literals.
3945
//     (Think EBCDIC).
3946
//
3947
static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
3948
static const char16_t   chLF        = 0x0a;      // Line Feed
3949
static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
3950
static const char16_t   chDigit0    = 0x30;      // '0'
3951
static const char16_t   chDigit7    = 0x37;      // '9'
3952
static const char16_t   chColon     = 0x3A;      // ':'
3953
static const char16_t   chE         = 0x45;      // 'E'
3954
static const char16_t   chQ         = 0x51;      // 'Q'
3955
//static const char16_t   chN         = 0x4E;      // 'N'
3956
static const char16_t   chP         = 0x50;      // 'P'
3957
static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
3958
//static const char16_t   chLBracket  = 0x5b;      // '['
3959
static const char16_t   chRBracket  = 0x5d;      // ']'
3960
static const char16_t   chUp        = 0x5e;      // '^'
3961
static const char16_t   chLowerP    = 0x70;
3962
static const char16_t   chLBrace    = 0x7b;      // '{'
3963
static const char16_t   chRBrace    = 0x7d;      // '}'
3964
static const char16_t   chNEL       = 0x85;      //    NEL newline variant
3965
static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
3966
3967
3968
//------------------------------------------------------------------------------
3969
//
3970
//  nextCharLL    Low Level Next Char from the regex pattern.
3971
//                Get a char from the string, keep track of input position
3972
//                     for error reporting.
3973
//
3974
//------------------------------------------------------------------------------
3975
85.1M
UChar32  RegexCompile::nextCharLL() {
3976
85.1M
    UChar32       ch;
3977
3978
85.1M
    if (fPeekChar != -1) {
3979
454k
        ch = fPeekChar;
3980
454k
        fPeekChar = -1;
3981
454k
        return ch;
3982
454k
    }
3983
3984
    // assume we're already in the right place
3985
84.7M
    ch = UTEXT_NEXT32(fRXPat->fPattern);
3986
84.7M
    if (ch == U_SENTINEL) {
3987
14.8k
        return ch;
3988
14.8k
    }
3989
3990
84.6M
    if (ch == chCR ||
3991
84.6M
        ch == chNEL ||
3992
84.6M
        ch == chLS   ||
3993
84.6M
        (ch == chLF && fLastChar != chCR)) {
3994
        // Character is starting a new line.  Bump up the line number, and
3995
        //  reset the column to 0.
3996
212k
        fLineNum++;
3997
212k
        fCharNum=0;
3998
212k
    }
3999
84.4M
    else {
4000
        // Character is not starting a new line.  Except in the case of a
4001
        //   LF following a CR, increment the column position.
4002
84.4M
        if (ch != chLF) {
4003
84.4M
            fCharNum++;
4004
84.4M
        }
4005
84.4M
    }
4006
84.6M
    fLastChar = ch;
4007
84.6M
    return ch;
4008
84.7M
}
4009
4010
//------------------------------------------------------------------------------
4011
//
4012
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4013
//                 character without actually getting it.
4014
//
4015
//------------------------------------------------------------------------------
4016
846k
UChar32  RegexCompile::peekCharLL() {
4017
846k
    if (fPeekChar == -1) {
4018
457k
        fPeekChar = nextCharLL();
4019
457k
    }
4020
846k
    return fPeekChar;
4021
846k
}
4022
4023
4024
//------------------------------------------------------------------------------
4025
//
4026
//   nextChar     for pattern scanning.  At this level, we handle stripping
4027
//                out comments and processing some backslash character escapes.
4028
//                The rest of the pattern grammar is handled at the next level up.
4029
//
4030
//------------------------------------------------------------------------------
4031
81.6M
void RegexCompile::nextChar(RegexPatternChar &c) {
4032
81.6M
  tailRecursion:
4033
81.6M
    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4034
81.6M
    c.fChar    = nextCharLL();
4035
81.6M
    c.fQuoted  = false;
4036
4037
81.6M
    if (fQuoteMode) {
4038
1.24M
        c.fQuoted = true;
4039
1.24M
        if ((c.fChar==chBackSlash && peekCharLL()==chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4040
1.24M
            c.fChar == (UChar32)-1) {
4041
691
            fQuoteMode = false;  //  Exit quote mode,
4042
691
            nextCharLL();        // discard the E
4043
            // nextChar(c);      // recurse to get the real next char
4044
691
            goto tailRecursion;  // Note: fuzz testing produced testcases that
4045
                                 //       resulted in stack overflow here.
4046
691
        }
4047
1.24M
    }
4048
80.4M
    else if (fInBackslashQuote) {
4049
        // The current character immediately follows a '\'
4050
        // Don't check for any further escapes, just return it as-is.
4051
        // Don't set c.fQuoted, because that would prevent the state machine from
4052
        //    dispatching on the character.
4053
191k
        fInBackslashQuote = false;
4054
191k
    }
4055
80.2M
    else
4056
80.2M
    {
4057
        // We are not in a \Q quoted region \E of the source.
4058
        //
4059
80.2M
        if (fModeFlags & UREGEX_COMMENTS) {
4060
            //
4061
            // We are in free-spacing and comments mode.
4062
            //  Scan through any white space and comments, until we
4063
            //  reach a significant character or the end of input.
4064
1.25M
            for (;;) {
4065
1.25M
                if (c.fChar == (UChar32)-1) {
4066
1.33k
                    break;     // End of Input
4067
1.33k
                }
4068
1.25M
                if  (c.fChar == chPound && fEOLComments) {
4069
                    // Start of a comment.  Consume the rest of it, until EOF or a new line
4070
2.82M
                    for (;;) {
4071
2.82M
                        c.fChar = nextCharLL();
4072
2.82M
                        if (c.fChar == (UChar32)-1 ||  // EOF
4073
2.82M
                            c.fChar == chCR        ||
4074
2.82M
                            c.fChar == chLF        ||
4075
2.82M
                            c.fChar == chNEL       ||
4076
2.82M
                            c.fChar == chLS)       {
4077
6.33k
                            break;
4078
6.33k
                        }
4079
2.82M
                    }
4080
6.33k
                }
4081
                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4082
1.25M
                if (PatternProps::isWhiteSpace(c.fChar) == false) {
4083
1.22M
                    break;
4084
1.22M
                }
4085
25.0k
                c.fChar = nextCharLL();
4086
25.0k
            }
4087
1.22M
        }
4088
4089
        //
4090
        //  check for backslash escaped characters.
4091
        //
4092
80.2M
        if (c.fChar == chBackSlash) {
4093
376k
            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4094
376k
            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4095
                //
4096
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
4097
                //   Includes \uxxxx, \n, \r, many others.
4098
                //   Return the single equivalent character.
4099
                //
4100
178k
                nextCharLL();                 // get & discard the peeked char.
4101
178k
                c.fQuoted = true;
4102
4103
178k
                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4104
178k
                    int32_t endIndex = (int32_t)pos;
4105
178k
                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, (int32_t)fPatternLength, (void *)fRXPat->fPattern->chunkContents);
4106
4107
178k
                    if (endIndex == pos) {
4108
174
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4109
174
                    }
4110
178k
                    fCharNum += endIndex - pos;
4111
178k
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4112
178k
                } else {
4113
0
                    int32_t offset = 0;
4114
0
                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4115
4116
0
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4117
0
                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4118
4119
0
                    if (offset == 0) {
4120
0
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4121
0
                    } else if (context.lastOffset == offset) {
4122
0
                        UTEXT_PREVIOUS32(fRXPat->fPattern);
4123
0
                    } else if (context.lastOffset != offset-1) {
4124
0
                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4125
0
                    }
4126
0
                    fCharNum += offset;
4127
0
                }
4128
178k
            }
4129
197k
            else if (peekCharLL() == chDigit0) {
4130
                //  Octal Escape, using Java Regexp Conventions
4131
                //    which are \0 followed by 1-3 octal digits.
4132
                //    Different from ICU Unescape handling of Octal, which does not
4133
                //    require the leading 0.
4134
                //  Java also has the convention of only consuming 2 octal digits if
4135
                //    the three digit number would be > 0xff
4136
                //
4137
4.82k
                c.fChar = 0;
4138
4.82k
                nextCharLL();    // Consume the initial 0.
4139
4.82k
                int index;
4140
10.1k
                for (index=0; index<3; index++) {
4141
9.81k
                    int32_t ch = peekCharLL();
4142
9.81k
                    if (ch<chDigit0 || ch>chDigit7) {
4143
4.44k
                        if (index==0) {
4144
                           // \0 is not followed by any octal digits.
4145
261
                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4146
261
                        }
4147
4.44k
                        break;
4148
4.44k
                    }
4149
5.36k
                    c.fChar <<= 3;
4150
5.36k
                    c.fChar += ch&7;
4151
5.36k
                    if (c.fChar <= 255) {
4152
5.04k
                        nextCharLL();
4153
5.04k
                    } else {
4154
                        // The last digit made the number too big.  Forget we saw it.
4155
317
                        c.fChar >>= 3;
4156
317
                    }
4157
5.36k
                }
4158
4.82k
                c.fQuoted = true;
4159
4.82k
            }
4160
192k
            else if (peekCharLL() == chQ) {
4161
                //  "\Q"  enter quote mode, which will continue until "\E"
4162
888
                fQuoteMode = true;
4163
888
                nextCharLL();        // discard the 'Q'.
4164
                // nextChar(c);      // recurse to get the real next char.
4165
888
                goto tailRecursion;  // Note: fuzz testing produced test cases that
4166
                //                            resulted in stack overflow here.
4167
888
            }
4168
191k
            else
4169
191k
            {
4170
                // We are in a '\' escape that will be handled by the state table scanner.
4171
                // Just return the backslash, but remember that the following char is to
4172
                //  be taken literally.
4173
191k
                fInBackslashQuote = true;
4174
191k
            }
4175
376k
        }
4176
80.2M
    }
4177
4178
    // re-enable # to end-of-line comments, in case they were disabled.
4179
    // They are disabled by the parser upon seeing '(?', but this lasts for
4180
    //  the fetching of the next character only.
4181
81.6M
    fEOLComments = true;
4182
4183
    // putc(c.fChar, stdout);
4184
81.6M
}
4185
4186
4187
4188
//------------------------------------------------------------------------------
4189
//
4190
//  scanNamedChar
4191
//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4192
//
4193
//             The scan position will be at the 'N'.  On return
4194
//             the scan position should be just after the '}'
4195
//
4196
//             Return the UChar32
4197
//
4198
//------------------------------------------------------------------------------
4199
2.60k
UChar32  RegexCompile::scanNamedChar() {
4200
2.60k
    if (U_FAILURE(*fStatus)) {
4201
0
        return 0;
4202
0
    }
4203
4204
2.60k
    nextChar(fC);
4205
2.60k
    if (fC.fChar != chLBrace) {
4206
40
        error(U_REGEX_PROPERTY_SYNTAX);
4207
40
        return 0;
4208
40
    }
4209
4210
2.56k
    UnicodeString  charName;
4211
218k
    for (;;) {
4212
218k
        nextChar(fC);
4213
218k
        if (fC.fChar == chRBrace) {
4214
2.52k
            break;
4215
2.52k
        }
4216
215k
        if (fC.fChar == -1) {
4217
38
            error(U_REGEX_PROPERTY_SYNTAX);
4218
38
            return 0;
4219
38
        }
4220
215k
        charName.append(fC.fChar);
4221
215k
    }
4222
4223
2.52k
    char name[100];
4224
2.52k
    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4225
2.52k
         (uint32_t)charName.length()>=sizeof(name)) {
4226
        // All Unicode character names have only invariant characters.
4227
        // The API to get a character, given a name, accepts only char *, forcing us to convert,
4228
        //   which requires this error check
4229
9
        error(U_REGEX_PROPERTY_SYNTAX);
4230
9
        return 0;
4231
9
    }
4232
2.51k
    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4233
4234
2.51k
    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4235
2.51k
    if (U_FAILURE(*fStatus)) {
4236
92
        error(U_REGEX_PROPERTY_SYNTAX);
4237
92
    }
4238
4239
2.51k
    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4240
2.51k
    return theChar;
4241
2.52k
}
4242
4243
//------------------------------------------------------------------------------
4244
//
4245
//  scanProp   Construct a UnicodeSet from the text at the current scan
4246
//             position, which will be of the form \p{whaterver}
4247
//
4248
//             The scan position will be at the 'p' or 'P'.  On return
4249
//             the scan position should be just after the '}'
4250
//
4251
//             Return a UnicodeSet, constructed from the \P pattern,
4252
//             or nullptr if the pattern is invalid.
4253
//
4254
//------------------------------------------------------------------------------
4255
1.10k
UnicodeSet *RegexCompile::scanProp() {
4256
1.10k
    UnicodeSet    *uset = nullptr;
4257
4258
1.10k
    if (U_FAILURE(*fStatus)) {
4259
0
        return nullptr;
4260
0
    }
4261
1.10k
    (void)chLowerP;   // Suppress compiler unused variable warning.
4262
1.10k
    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4263
1.10k
    UBool negated = (fC.fChar == chP);
4264
4265
1.10k
    UnicodeString propertyName;
4266
1.10k
    nextChar(fC);
4267
1.10k
    if (fC.fChar != chLBrace) {
4268
11
        error(U_REGEX_PROPERTY_SYNTAX);
4269
11
        return nullptr;
4270
11
    }
4271
16.8k
    for (;;) {
4272
16.8k
        nextChar(fC);
4273
16.8k
        if (fC.fChar == chRBrace) {
4274
1.05k
            break;
4275
1.05k
        }
4276
15.8k
        if (fC.fChar == -1) {
4277
            // Hit the end of the input string without finding the closing '}'
4278
41
            error(U_REGEX_PROPERTY_SYNTAX);
4279
41
            return nullptr;
4280
41
        }
4281
15.7k
        propertyName.append(fC.fChar);
4282
15.7k
    }
4283
1.05k
    uset = createSetForProperty(propertyName, negated);
4284
1.05k
    nextChar(fC);    // Move input scan to position following the closing '}'
4285
1.05k
    return uset;
4286
1.09k
}
4287
4288
//------------------------------------------------------------------------------
4289
//
4290
//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4291
//             position, which is expected be of the form [:property expression:]
4292
//
4293
//             The scan position will be at the opening ':'.  On return
4294
//             the scan position must be on the closing ']'
4295
//
4296
//             Return a UnicodeSet constructed from the pattern,
4297
//             or nullptr if this is not a valid POSIX-style set expression.
4298
//             If not a property expression, restore the initial scan position
4299
//                (to the opening ':')
4300
//
4301
//               Note:  the opening '[:' is not sufficient to guarantee that
4302
//                      this is a [:property:] expression.
4303
//                      [:'+=,] is a perfectly good ordinary set expression that
4304
//                              happens to include ':' as one of its characters.
4305
//
4306
//------------------------------------------------------------------------------
4307
82.4k
UnicodeSet *RegexCompile::scanPosixProp() {
4308
82.4k
    UnicodeSet    *uset = nullptr;
4309
4310
82.4k
    if (U_FAILURE(*fStatus)) {
4311
0
        return nullptr;
4312
0
    }
4313
4314
82.4k
    U_ASSERT(fC.fChar == chColon);
4315
4316
    // Save the scanner state.
4317
    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4318
82.4k
    int64_t     savedScanIndex        = fScanIndex;
4319
82.4k
    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4320
82.4k
    UBool       savedQuoteMode        = fQuoteMode;
4321
82.4k
    UBool       savedInBackslashQuote = fInBackslashQuote;
4322
82.4k
    UBool       savedEOLComments      = fEOLComments;
4323
82.4k
    int64_t     savedLineNum          = fLineNum;
4324
82.4k
    int64_t     savedCharNum          = fCharNum;
4325
82.4k
    UChar32     savedLastChar         = fLastChar;
4326
82.4k
    UChar32     savedPeekChar         = fPeekChar;
4327
82.4k
    RegexPatternChar savedfC          = fC;
4328
4329
    // Scan for a closing ].   A little tricky because there are some perverse
4330
    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4331
    //   ending on the second closing ].
4332
4333
82.4k
    UnicodeString propName;
4334
82.4k
    UBool         negated  = false;
4335
4336
    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4337
82.4k
    nextChar(fC);
4338
82.4k
    if (fC.fChar == chUp) {
4339
218
       negated = true;
4340
218
       nextChar(fC);
4341
218
    }
4342
4343
    // Scan for the closing ":]", collecting the property name along the way.
4344
82.4k
    UBool  sawPropSetTerminator = false;
4345
23.1M
    for (;;) {
4346
23.1M
        propName.append(fC.fChar);
4347
23.1M
        nextChar(fC);
4348
23.1M
        if (fC.fQuoted || fC.fChar == -1) {
4349
            // Escaped characters or end of input - either says this isn't a [:Property:]
4350
8.02k
            break;
4351
8.02k
        }
4352
23.1M
        if (fC.fChar == chColon) {
4353
74.4k
            nextChar(fC);
4354
74.4k
            if (fC.fChar == chRBracket) {
4355
63.2k
                sawPropSetTerminator = true;
4356
63.2k
            }
4357
74.4k
            break;
4358
74.4k
        }
4359
23.1M
    }
4360
4361
82.4k
    if (sawPropSetTerminator) {
4362
63.2k
        uset = createSetForProperty(propName, negated);
4363
63.2k
    }
4364
19.1k
    else
4365
19.1k
    {
4366
        // No closing ":]".
4367
        //  Restore the original scan position.
4368
        //  The main scanner will retry the input as a normal set expression,
4369
        //    not a [:Property:] expression.
4370
19.1k
        fScanIndex        = savedScanIndex;
4371
19.1k
        fQuoteMode        = savedQuoteMode;
4372
19.1k
        fInBackslashQuote = savedInBackslashQuote;
4373
19.1k
        fEOLComments      = savedEOLComments;
4374
19.1k
        fLineNum          = savedLineNum;
4375
19.1k
        fCharNum          = savedCharNum;
4376
19.1k
        fLastChar         = savedLastChar;
4377
19.1k
        fPeekChar         = savedPeekChar;
4378
19.1k
        fC                = savedfC;
4379
19.1k
        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4380
19.1k
    }
4381
82.4k
    return uset;
4382
82.4k
}
4383
4384
0
static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4385
0
    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4386
0
    addCategory(set, U_GC_CF_MASK, ec);
4387
0
}
4388
4389
//
4390
//  Create a Unicode Set from a Unicode Property expression.
4391
//     This is common code underlying both \p{...} and [:...:] expressions.
4392
//     Includes trying the Java "properties" that aren't supported as
4393
//     normal ICU UnicodeSet properties
4394
//
4395
64.3k
UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4396
4397
64.3k
    if (U_FAILURE(*fStatus)) {
4398
1
        return nullptr;
4399
1
    }
4400
64.3k
    LocalPointer<UnicodeSet> set;
4401
64.3k
    UErrorCode status = U_ZERO_ERROR;
4402
4403
64.3k
    do {      // non-loop, exists to allow breaks from the block.
4404
        //
4405
        //  First try the property as we received it
4406
        //
4407
64.3k
        UnicodeString   setExpr;
4408
64.3k
        uint32_t        usetFlags = 0;
4409
64.3k
        setExpr.append(u"[\\p{", -1);
4410
64.3k
        setExpr.append(propName);
4411
64.3k
        setExpr.append(u"}]", -1);
4412
64.3k
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4413
13.3k
            usetFlags |= USET_CASE_INSENSITIVE;
4414
13.3k
        }
4415
64.3k
        set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status);
4416
64.3k
        if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4417
52.0k
            break;
4418
52.0k
        }
4419
4420
        //
4421
        //  The incoming property wasn't directly recognized by ICU.
4422
4423
        //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4424
        //     Java accepts 'word' with mixed case.
4425
        //     Java accepts 'all' only in all lower case.
4426
4427
12.3k
        status = U_ZERO_ERROR;
4428
12.3k
        if (propName.caseCompare(u"word", -1, 0) == 0) {
4429
179
            set.adoptInsteadAndCheckErrorCode(
4430
179
                RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4431
179
            break;
4432
179
        }
4433
12.1k
        if (propName.compare(u"all", -1) == 0) {
4434
246
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4435
246
            break;
4436
246
        }
4437
4438
4439
        //    Do Java InBlock expressions
4440
        //
4441
11.8k
        UnicodeString mPropName = propName;
4442
11.8k
        if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4443
5.65k
            status = U_ZERO_ERROR;
4444
5.65k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4445
5.65k
            if (U_FAILURE(status)) {
4446
0
                break;
4447
0
            }
4448
5.65k
            UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4449
5.65k
            set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4450
5.65k
            break;
4451
5.65k
        }
4452
4453
        //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4454
        //  as "BooleanPropertyValue". The property value can be either a
4455
        //  a General Category or a Script Name.
4456
4457
6.22k
        if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4458
4.31k
            mPropName.remove(0, 2);      // Strip the "Is"
4459
4.31k
            if (mPropName.indexOf(u'=') >= 0) {
4460
                // Reject any "Is..." property expression containing an '=', that is,
4461
                // any non-binary property expression.
4462
8
                status = U_REGEX_PROPERTY_SYNTAX;
4463
8
                break;
4464
8
            }
4465
4466
4.30k
            if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4467
0
                mPropName.setTo(u"unassigned", -1);
4468
0
                negated = !negated;
4469
4.30k
            } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4470
0
                mPropName.setTo(u"Titlecase_Letter", -1);
4471
0
            }
4472
4473
4.30k
            mPropName.insert(0, u"[\\p{", -1);
4474
4.30k
            mPropName.append(u"}]", -1);
4475
4.30k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4476
4477
4.30k
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4478
1.59k
                set->closeOver(USET_CASE_INSENSITIVE);
4479
1.59k
            }
4480
4.30k
            break;
4481
4482
4.31k
        }
4483
4484
1.91k
        if (propName.startsWith(u"java", -1)) {
4485
0
            status = U_ZERO_ERROR;
4486
0
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4487
0
            if (U_FAILURE(status)) {
4488
0
                break;
4489
0
            }
4490
            //
4491
            //  Try the various Java specific properties.
4492
            //   These all begin with "java"
4493
            //
4494
0
            if (propName.compare(u"javaDefined", -1) == 0) {
4495
0
                addCategory(set.getAlias(), U_GC_CN_MASK, status);
4496
0
                set->complement();
4497
0
            }
4498
0
            else if (propName.compare(u"javaDigit", -1) == 0) {
4499
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4500
0
            }
4501
0
            else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4502
0
                addIdentifierIgnorable(set.getAlias(), status);
4503
0
            }
4504
0
            else if (propName.compare(u"javaISOControl", -1) == 0) {
4505
0
                set->add(0, 0x1F).add(0x7F, 0x9F);
4506
0
            }
4507
0
            else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4508
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4509
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4510
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4511
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4512
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4513
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4514
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4515
0
                addIdentifierIgnorable(set.getAlias(), status);
4516
0
            }
4517
0
            else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4518
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4519
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4520
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4521
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4522
0
            }
4523
0
            else if (propName.compare(u"javaLetter", -1) == 0) {
4524
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4525
0
            }
4526
0
            else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4527
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4528
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4529
0
            }
4530
0
            else if (propName.compare(u"javaLowerCase", -1) == 0) {
4531
0
                addCategory(set.getAlias(), U_GC_LL_MASK, status);
4532
0
            }
4533
0
            else if (propName.compare(u"javaMirrored", -1) == 0) {
4534
0
                set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4535
0
            }
4536
0
            else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4537
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4538
0
            }
4539
0
            else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4540
0
                set->add(0x10000, UnicodeSet::MAX_VALUE);
4541
0
            }
4542
0
            else if (propName.compare(u"javaTitleCase", -1) == 0) {
4543
0
                addCategory(set.getAlias(), U_GC_LT_MASK, status);
4544
0
            }
4545
0
            else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4546
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4547
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4548
0
            }
4549
0
            else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4550
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4551
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4552
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4553
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4554
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4555
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4556
0
                addIdentifierIgnorable(set.getAlias(), status);
4557
0
            }
4558
0
            else if (propName.compare(u"javaUpperCase", -1) == 0) {
4559
0
                addCategory(set.getAlias(), U_GC_LU_MASK, status);
4560
0
            }
4561
0
            else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4562
0
                set->add(0, UnicodeSet::MAX_VALUE);
4563
0
            }
4564
0
            else if (propName.compare(u"javaWhitespace", -1) == 0) {
4565
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4566
0
                set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4567
0
                set->add(9, 0x0d).add(0x1c, 0x1f);
4568
0
            } else {
4569
0
                status = U_REGEX_PROPERTY_SYNTAX;
4570
0
            }
4571
4572
0
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4573
0
                set->closeOver(USET_CASE_INSENSITIVE);
4574
0
            }
4575
0
            break;
4576
0
        }
4577
4578
        // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4579
        // extensions matched it.
4580
1.91k
        status = U_REGEX_PROPERTY_SYNTAX;
4581
1.91k
    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4582
4583
64.3k
    if (U_SUCCESS(status)) {
4584
        // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4585
        // deal with properties of strings and character classes with strings, we ignore them.
4586
        // Just in case something downstream might stumble over the strings,
4587
        // we remove them from the set.
4588
        // Note that when we support strings, the complement of a property (as with \P)
4589
        // should be implemented as .complement().removeAllStrings() (code point complement).
4590
62.3k
        set->removeAllStrings();
4591
62.3k
        U_ASSERT(set.isValid());
4592
62.3k
        if (negated) {
4593
906
            set->complement();
4594
906
        }
4595
62.3k
        return set.orphan();
4596
62.3k
    } else {
4597
1.98k
        if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4598
62
            status = U_REGEX_PROPERTY_SYNTAX;
4599
62
        }
4600
1.98k
        error(status);
4601
1.98k
        return nullptr;
4602
1.98k
    }
4603
64.3k
}
4604
4605
4606
//
4607
//  SetEval   Part of the evaluation of [set expressions].
4608
//            Perform any pending (stacked) operations with precedence
4609
//            equal or greater to that of the next operator encountered
4610
//            in the expression.
4611
//
4612
6.90M
void RegexCompile::setEval(int32_t nextOp) {
4613
6.90M
    UnicodeSet *rightOperand = nullptr;
4614
6.90M
    UnicodeSet *leftOperand  = nullptr;
4615
7.14M
    for (;;) {
4616
7.14M
        U_ASSERT(fSetOpStack.empty()==false);
4617
7.14M
        int32_t pendingSetOperation = fSetOpStack.peeki();
4618
7.14M
        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4619
6.90M
            break;
4620
6.90M
        }
4621
239k
        fSetOpStack.popi();
4622
239k
        U_ASSERT(fSetStack.empty() == false);
4623
239k
        rightOperand = (UnicodeSet *)fSetStack.peek();
4624
        // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4625
        // (see comments there).
4626
        // We also do not yet support string literals in character classes,
4627
        // so there should not be any strings.
4628
        // Note that when we support strings, the complement of a set (as with ^ or \P)
4629
        // should be implemented as .complement().removeAllStrings() (code point complement).
4630
239k
        U_ASSERT(!rightOperand->hasStrings());
4631
239k
        switch (pendingSetOperation) {
4632
731
            case setNegation:
4633
731
                rightOperand->complement();
4634
731
                break;
4635
174k
            case setCaseClose:
4636
                // TODO: need a simple close function.  Ticket 6065
4637
174k
                rightOperand->closeOver(USET_CASE_INSENSITIVE);
4638
174k
                rightOperand->removeAllStrings();
4639
174k
                break;
4640
1.34k
            case setDifference1:
4641
1.71k
            case setDifference2:
4642
1.71k
                fSetStack.pop();
4643
1.71k
                leftOperand = (UnicodeSet *)fSetStack.peek();
4644
1.71k
                leftOperand->removeAll(*rightOperand);
4645
1.71k
                delete rightOperand;
4646
1.71k
                break;
4647
676
            case setIntersection1:
4648
2.44k
            case setIntersection2:
4649
2.44k
                fSetStack.pop();
4650
2.44k
                leftOperand = (UnicodeSet *)fSetStack.peek();
4651
2.44k
                leftOperand->retainAll(*rightOperand);
4652
2.44k
                delete rightOperand;
4653
2.44k
                break;
4654
60.2k
            case setUnion:
4655
60.2k
                fSetStack.pop();
4656
60.2k
                leftOperand = (UnicodeSet *)fSetStack.peek();
4657
60.2k
                leftOperand->addAll(*rightOperand);
4658
60.2k
                delete rightOperand;
4659
60.2k
                break;
4660
0
            default:
4661
0
                UPRV_UNREACHABLE_EXIT;
4662
239k
            }
4663
239k
        }
4664
6.90M
    }
4665
4666
71.3k
void RegexCompile::setPushOp(int32_t op) {
4667
71.3k
    setEval(op);
4668
71.3k
    fSetOpStack.push(op, *fStatus);
4669
71.3k
    LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4670
71.3k
    fSetStack.push(lpSet.orphan(), *fStatus);
4671
71.3k
}
4672
4673
U_NAMESPACE_END
4674
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4675