Coverage Report

Created: 2026-05-06 06:16

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/regexcmp.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  regexcmp.cpp
5
//
6
//  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the ICU regular expression compiler, which is responsible
10
//  for processing a regular expression pattern into the compiled form that
11
//  is used by the match finding engine.
12
//
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18
#include "unicode/ustring.h"
19
#include "unicode/unistr.h"
20
#include "unicode/uniset.h"
21
#include "unicode/uchar.h"
22
#include "unicode/uchriter.h"
23
#include "unicode/parsepos.h"
24
#include "unicode/parseerr.h"
25
#include "unicode/regex.h"
26
#include "unicode/utf.h"
27
#include "unicode/utf16.h"
28
#include "patternprops.h"
29
#include "putilimp.h"
30
#include "cmemory.h"
31
#include "cstr.h"
32
#include "cstring.h"
33
#include "uvectr32.h"
34
#include "uvectr64.h"
35
#include "uassert.h"
36
#include "uinvchar.h"
37
38
#include "regeximp.h"
39
#include "regexcst.h"   // Contains state table for the regex pattern parser.
40
                        //   generated by a Perl script.
41
#include "regexcmp.h"
42
#include "regexst.h"
43
#include "regextxt.h"
44
45
46
47
U_NAMESPACE_BEGIN
48
49
50
//------------------------------------------------------------------------------
51
//
52
//  Constructor.
53
//
54
//------------------------------------------------------------------------------
55
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56
21.2k
   fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
57
21.2k
{
58
    // Lazy init of all shared global sets (needed for init()'s empty text)
59
21.2k
    RegexStaticSets::initGlobals(&status);
60
61
21.2k
    fStatus           = &status;
62
63
21.2k
    fRXPat            = rxp;
64
21.2k
    fScanIndex        = 0;
65
21.2k
    fLastChar         = -1;
66
21.2k
    fPeekChar         = -1;
67
21.2k
    fLineNum          = 1;
68
21.2k
    fCharNum          = 0;
69
21.2k
    fQuoteMode        = false;
70
21.2k
    fInBackslashQuote = false;
71
21.2k
    fModeFlags        = fRXPat->fFlags | 0x80000000;
72
21.2k
    fEOLComments      = true;
73
74
21.2k
    fMatchOpenParen   = -1;
75
21.2k
    fMatchCloseParen  = -1;
76
21.2k
    fCaptureName      = nullptr;
77
21.2k
    fLastSetLiteral   = U_SENTINEL;
78
79
21.2k
    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80
0
        status = rxp->fDeferredStatus;
81
0
    }
82
21.2k
}
83
84
static const char16_t   chAmp       = 0x26;      // '&'
85
static const char16_t   chDash      = 0x2d;      // '-'
86
87
88
//------------------------------------------------------------------------------
89
//
90
//  Destructor
91
//
92
//------------------------------------------------------------------------------
93
21.2k
RegexCompile::~RegexCompile() {
94
21.2k
    delete fCaptureName;         // Normally will be nullptr, but can exist if pattern
95
                                 //   compilation stops with a syntax error.
96
21.2k
}
97
98
977
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99
977
    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100
977
}
101
102
//------------------------------------------------------------------------------
103
//
104
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105
//                           The state tables are hand-written in the file regexcst.txt,
106
//                           and converted to the form used here by a perl
107
//                           script regexcst.pl
108
//
109
//------------------------------------------------------------------------------
110
void    RegexCompile::compile(
111
                         const UnicodeString &pat,   // Source pat to be compiled.
112
                         UParseError &pp,            // Error position info
113
                         UErrorCode &e)              // Error Code
114
10.2k
{
115
10.2k
    fRXPat->fPatternString = new UnicodeString(pat);
116
10.2k
    UText patternText = UTEXT_INITIALIZER;
117
10.2k
    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119
10.2k
    if (U_SUCCESS(e)) {
120
10.2k
        compile(&patternText, pp, e);
121
10.2k
        utext_close(&patternText);
122
10.2k
    }
123
10.2k
}
124
125
//
126
//   compile, UText mode
127
//     All the work is actually done here.
128
//
129
void    RegexCompile::compile(
130
                         UText *pat,                 // Source pat to be compiled.
131
                         UParseError &pp,            // Error position info
132
                         UErrorCode &e)              // Error Code
133
21.2k
{
134
21.2k
    fStatus             = &e;
135
21.2k
    fParseErr           = &pp;
136
21.2k
    fStackPtr           = 0;
137
21.2k
    fStack[fStackPtr]   = 0;
138
139
21.2k
    if (U_FAILURE(*fStatus)) {
140
0
        return;
141
0
    }
142
143
    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144
21.2k
    U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0);
145
146
    // Prepare the RegexPattern object to receive the compiled pattern.
147
21.2k
    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, false, true, fStatus);
148
21.2k
    if (U_FAILURE(*fStatus)) {
149
0
        return;
150
0
    }
151
152
    // Initialize the pattern scanning state machine
153
21.2k
    fPatternLength = utext_nativeLength(pat);
154
21.2k
    uint16_t                state = 1;
155
21.2k
    const RegexTableEl      *tableEl;
156
157
    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158
21.2k
    if (fModeFlags & UREGEX_LITERAL) {
159
0
        fQuoteMode = true;
160
0
    }
161
162
21.2k
    nextChar(fC);                        // Fetch the first char from the pattern string.
163
164
    //
165
    // Main loop for the regex pattern parsing state machine.
166
    //   Runs once per state transition.
167
    //   Each time through optionally performs, depending on the state table,
168
    //      - an advance to the next pattern char
169
    //      - an action to be performed.
170
    //      - pushing or popping a state to/from the local state return stack.
171
    //   file regexcst.txt is the source for the state table.  The logic behind
172
    //     recongizing the pattern syntax is there, not here.
173
    //
174
175M
    for (;;) {
175
        //  Bail out if anything has gone wrong.
176
        //  Regex pattern parsing stops on the first error encountered.
177
175M
        if (U_FAILURE(*fStatus)) {
178
275
            break;
179
275
        }
180
181
175M
        U_ASSERT(state != 0);
182
183
        // Find the state table element that matches the input char from the pattern, or the
184
        //    class of the input character.  Start with the first table row for this
185
        //    state, then linearly scan forward until we find a row that matches the
186
        //    character.  The last row for each state always matches all characters, so
187
        //    the search will stop there, if not before.
188
        //
189
175M
        tableEl = &gRuleParseStateTable[state];
190
175M
        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191
175M
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193
784M
        for (;;) {    // loop through table rows belonging to this state, looking for one
194
                      //   that matches the current input char.
195
784M
            REGEX_SCAN_DEBUG_PRINTF(("."));
196
784M
            if (tableEl->fCharClass < 127 && fC.fQuoted == false &&   tableEl->fCharClass == fC.fChar) {
197
                // Table row specified an individual character, not a set, and
198
                //   the input character is not quoted, and
199
                //   the input character matched it.
200
24.6M
                break;
201
24.6M
            }
202
759M
            if (tableEl->fCharClass == 255) {
203
                // Table row specified default, match anything character class.
204
105M
                break;
205
105M
            }
206
653M
            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207
                // Table row specified "quoted" and the char was quoted.
208
4.57M
                break;
209
4.57M
            }
210
649M
            if (tableEl->fCharClass == 253 && fC.fChar == static_cast<UChar32>(-1)) {
211
                // Table row specified eof and we hit eof on the input.
212
17.4k
                break;
213
17.4k
            }
214
215
649M
            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216
62.6M
                fC.fQuoted == false &&                                       //   char is not escaped &&
217
62.6M
                fC.fChar != static_cast<UChar32>(-1)) {                      //   char is not EOF
218
62.6M
                U_ASSERT(tableEl->fCharClass <= 137);
219
62.6M
                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220
                    // Table row specified a character class, or set of characters,
221
                    //   and the current char matches it.
222
40.8M
                    break;
223
40.8M
                }
224
62.6M
            }
225
226
            // No match on this row, advance to the next  row for this state,
227
608M
            tableEl++;
228
608M
        }
229
175M
        REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231
        //
232
        // We've found the row of the state table that matches the current input
233
        //   character from the rules string.
234
        // Perform any action specified  by this row in the state table.
235
175M
        if (doParseActions(tableEl->fAction) == false) {
236
            // Break out of the state machine loop if the
237
            //   the action signalled some kind of error, or
238
            //   the action was to exit, occurs on normal end-of-rules-input.
239
20.9k
            break;
240
20.9k
        }
241
242
175M
        if (tableEl->fPushState != 0) {
243
927k
            fStackPtr++;
244
927k
            if (fStackPtr >= kStackSize) {
245
11
                error(U_REGEX_INTERNAL_ERROR);
246
11
                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247
11
                fStackPtr--;
248
11
            }
249
927k
            fStack[fStackPtr] = tableEl->fPushState;
250
927k
        }
251
252
        //
253
        //  NextChar.  This is where characters are actually fetched from the pattern.
254
        //             Happens under control of the 'n' tag in the state table.
255
        //
256
175M
        if (tableEl->fNextChar) {
257
81.9M
            nextChar(fC);
258
81.9M
        }
259
260
        // Get the next state from the table entry, or from the
261
        //   state stack if the next state was specified as "pop".
262
175M
        if (tableEl->fNextState != 255) {
263
174M
            state = tableEl->fNextState;
264
174M
        } else {
265
902k
            state = fStack[fStackPtr];
266
902k
            fStackPtr--;
267
902k
            if (fStackPtr < 0) {
268
                // state stack underflow
269
                // This will occur if the user pattern has mis-matched parentheses,
270
                //   with extra close parens.
271
                //
272
0
                fStackPtr++;
273
0
                error(U_REGEX_MISMATCHED_PAREN);
274
0
            }
275
902k
        }
276
277
175M
    }
278
279
21.2k
    if (U_FAILURE(*fStatus)) {
280
        // Bail out if the pattern had errors.
281
8.07k
        return;
282
8.07k
    }
283
284
    //
285
    // The pattern has now been read and processed, and the compiled code generated.
286
    //
287
288
    //
289
    // The pattern's fFrameSize so far has accumulated the requirements for
290
    //   storage for capture parentheses, counters, etc. that are encountered
291
    //   in the pattern.  Add space for the two variables that are always
292
    //   present in the saved state:  the input string position (int64_t) and
293
    //   the position in the compiled pattern.
294
    //
295
13.1k
    allocateStackData(RESTACKFRAME_HDRCOUNT);
296
297
    //
298
    // Optimization pass 1: NOPs, back-references, and case-folding
299
    //
300
13.1k
    stripNOPs();
301
302
    //
303
    // Get bounds for the minimum and maximum length of a string that this
304
    //   pattern can match.  Used to avoid looking for matches in strings that
305
    //   are too short.
306
    //
307
13.1k
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308
309
    //
310
    // Optimization pass 2: match start type
311
    //
312
13.1k
    matchStartType();
313
314
    //
315
    // Set up fast latin-1 range sets
316
    //
317
13.1k
    int32_t numSets = fRXPat->fSets->size();
318
13.1k
    fRXPat->fSets8 = new Regex8BitSet[numSets];
319
    // Null pointer check.
320
13.1k
    if (fRXPat->fSets8 == nullptr) {
321
0
        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322
0
        return;
323
0
    }
324
13.1k
    int32_t i;
325
106k
    for (i=0; i<numSets; i++) {
326
93.5k
        UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(i));
327
93.5k
        fRXPat->fSets8[i].init(s);
328
93.5k
    }
329
330
13.1k
}
331
332
333
334
335
336
//------------------------------------------------------------------------------
337
//
338
//  doParseAction        Do some action during regex pattern parsing.
339
//                       Called by the parse state machine.
340
//
341
//                       Generation of the match engine PCode happens here, or
342
//                       in functions called from the parse actions defined here.
343
//
344
//
345
//------------------------------------------------------------------------------
346
UBool RegexCompile::doParseActions(int32_t action)
347
175M
{
348
175M
    UBool   returnVal = true;
349
350
175M
    switch (static_cast<Regex_PatternParseAction>(action)) {
351
352
21.0k
    case doPatStart:
353
        // Start of pattern compiles to:
354
        //0   SAVE   2        Fall back to position of FAIL
355
        //1   jmp    3
356
        //2   FAIL            Stop if we ever reach here.
357
        //3   NOP             Dummy, so start of pattern looks the same as
358
        //                    the start of an ( grouping.
359
        //4   NOP             Resreved, will be replaced by a save if there are
360
        //                    OR | operators at the top level
361
21.0k
        appendOp(URX_STATE_SAVE, 2);
362
21.0k
        appendOp(URX_JMP,  3);
363
21.0k
        appendOp(URX_FAIL, 0);
364
365
        // Standard open nonCapture paren action emits the two NOPs and
366
        //   sets up the paren stack frame.
367
21.0k
        doParseActions(doOpenNonCaptureParen);
368
21.0k
        break;
369
370
15.4k
    case doPatFinish:
371
        // We've scanned to the end of the pattern
372
        //  The end of pattern compiles to:
373
        //        URX_END
374
        //    which will stop the runtime match engine.
375
        //  Encountering end of pattern also behaves like a close paren,
376
        //   and forces fixups of the State Save at the beginning of the compiled pattern
377
        //   and of any OR operations at the top level.
378
        //
379
15.4k
        handleCloseParen();
380
15.4k
        if (fParenStack.size() > 0) {
381
            // Missing close paren in pattern.
382
2.31k
            error(U_REGEX_MISMATCHED_PAREN);
383
2.31k
        }
384
385
        // add the END operation to the compiled pattern.
386
15.4k
        appendOp(URX_END, 0);
387
388
        // Terminate the pattern compilation state machine.
389
15.4k
        returnVal = false;
390
15.4k
        break;
391
392
393
394
20.7M
    case doOrOperator:
395
        // Scanning a '|', as in (A|B)
396
20.7M
        {
397
            // Generate code for any pending literals preceding the '|'
398
20.7M
            fixLiterals(false);
399
400
            // Insert a SAVE operation at the start of the pattern section preceding
401
            //   this OR at this level.  This SAVE will branch the match forward
402
            //   to the right hand side of the OR in the event that the left hand
403
            //   side fails to match and backtracks.  Locate the position for the
404
            //   save from the location on the top of the parentheses stack.
405
20.7M
            int32_t savePosition = fParenStack.popi();
406
20.7M
            int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(savePosition));
407
20.7M
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
408
20.7M
            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409
20.7M
            fRXPat->fCompiledPat->setElementAt(op, savePosition);
410
411
            // Append an JMP operation into the compiled pattern.  The operand for
412
            //  the JMP will eventually be the location following the ')' for the
413
            //  group.  This will be patched in later, when the ')' is encountered.
414
20.7M
            appendOp(URX_JMP, 0);
415
416
            // Push the position of the newly added JMP op onto the parentheses stack.
417
            // This registers if for fixup when this block's close paren is encountered.
418
20.7M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419
420
            // Append a NOP to the compiled pattern.  This is the slot reserved
421
            //   for a SAVE in the event that there is yet another '|' following
422
            //   this one.
423
20.7M
            appendOp(URX_NOP, 0);
424
20.7M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425
20.7M
        }
426
20.7M
        break;
427
428
429
721
    case doBeginNamedCapture:
430
        // Scanning (?<letter.
431
        //   The first letter of the name will come through again under doConinueNamedCapture.
432
721
        fCaptureName = new UnicodeString();
433
721
        if (fCaptureName == nullptr) {
434
0
            error(U_MEMORY_ALLOCATION_ERROR);
435
0
        }
436
721
        break;
437
438
2.50k
    case  doContinueNamedCapture:
439
2.50k
        fCaptureName->append(fC.fChar);
440
2.50k
        break;
441
442
43
    case doBadNamedCapture:
443
43
        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444
43
        break;
445
        
446
414k
    case doOpenCaptureParen:
447
        // Open Capturing Paren, possibly named.
448
        //   Compile to a
449
        //      - NOP, which later may be replaced by a save-state if the
450
        //         parenthesized group gets a * quantifier, followed by
451
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
452
        //      - NOP, which may later be replaced by a save-state if there
453
        //             is an '|' alternation within the parens.
454
        //
455
        //    Each capture group gets three slots in the save stack frame:
456
        //         0: Capture Group start position (in input string being matched.)
457
        //         1: Capture Group end position.
458
        //         2: Start of Match-in-progress.
459
        //    The first two locations are for a completed capture group, and are
460
        //     referred to by back references and the like.
461
        //    The third location stores the capture start position when an START_CAPTURE is
462
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
463
        //      END_CAPTURE is encountered.
464
414k
        {
465
414k
            fixLiterals();
466
414k
            appendOp(URX_NOP, 0);
467
414k
            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
468
414k
            appendOp(URX_START_CAPTURE, varsLoc);
469
414k
            appendOp(URX_NOP, 0);
470
471
            // On the Parentheses stack, start a new frame and add the positions
472
            //   of the two NOPs.  Depending on what follows in the pattern, the
473
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
474
            //   address of the end of the parenthesized group.
475
414k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
476
414k
            fParenStack.push(capturing, *fStatus);                        // Frame type.
477
414k
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
478
414k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
479
480
            // Save the mapping from group number to stack frame variable position.
481
414k
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482
483
            // If this is a named capture group, add the name->group number mapping.
484
414k
            if (fCaptureName != nullptr) {
485
706
                if (!fRXPat->initNamedCaptureMap()) {
486
0
                    if (U_SUCCESS(*fStatus)) {
487
0
                        error(fRXPat->fDeferredStatus);
488
0
                    }
489
0
                    break;
490
0
                }
491
706
                int32_t groupNumber = fRXPat->fGroupMap->size();
492
706
                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493
706
                fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
494
706
                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495
39
                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496
39
                }
497
706
            }
498
414k
        }
499
414k
        break;
500
501
414k
    case doOpenNonCaptureParen:
502
        // Open non-caputuring (grouping only) Paren.
503
        //   Compile to a
504
        //      - NOP, which later may be replaced by a save-state if the
505
        //         parenthesized group gets a * quantifier, followed by
506
        //      - NOP, which may later be replaced by a save-state if there
507
        //             is an '|' alternation within the parens.
508
22.1k
        {
509
22.1k
            fixLiterals();
510
22.1k
            appendOp(URX_NOP, 0);
511
22.1k
            appendOp(URX_NOP, 0);
512
513
            // On the Parentheses stack, start a new frame and add the positions
514
            //   of the two NOPs.
515
22.1k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
516
22.1k
            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
517
22.1k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
518
22.1k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
519
22.1k
        }
520
22.1k
         break;
521
522
523
1.10k
    case doOpenAtomicParen:
524
        // Open Atomic Paren.  (?>
525
        //   Compile to a
526
        //      - NOP, which later may be replaced if the parenthesized group
527
        //         has a quantifier, followed by
528
        //      - STO_SP  save state stack position, so it can be restored at the ")"
529
        //      - NOP, which may later be replaced by a save-state if there
530
        //             is an '|' alternation within the parens.
531
1.10k
        {
532
1.10k
            fixLiterals();
533
1.10k
            appendOp(URX_NOP, 0);
534
1.10k
            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
535
1.10k
            appendOp(URX_STO_SP, varLoc);
536
1.10k
            appendOp(URX_NOP, 0);
537
538
            // On the Parentheses stack, start a new frame and add the positions
539
            //   of the two NOPs.  Depending on what follows in the pattern, the
540
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
541
            //   address of the end of the parenthesized group.
542
1.10k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
543
1.10k
            fParenStack.push(atomic, *fStatus);                           // Frame type.
544
1.10k
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
545
1.10k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
546
1.10k
        }
547
1.10k
        break;
548
549
550
1.88k
    case doOpenLookAhead:
551
        // Positive Look-ahead   (?=  stuff  )
552
        //
553
        //   Note:   Addition of transparent input regions, with the need to
554
        //           restore the original regions when failing out of a lookahead
555
        //           block, complicated this sequence.  Some combined opcodes
556
        //           might make sense - or might not, lookahead aren't that common.
557
        //
558
        //      Caution:  min match length optimization knows about this
559
        //               sequence; don't change without making updates there too.
560
        //
561
        // Compiles to
562
        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
563
        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
564
        //    3    JMP          6           continue ...
565
        //
566
        //    4.   LA_END                   Look Ahead failed.  Restore regions.
567
        //    5.   BACKTRACK                and back track again.
568
        //
569
        //    6.   NOP              reserved for use by quantifiers on the block.
570
        //                          Look-ahead can't have quantifiers, but paren stack
571
        //                             compile time conventions require the slot anyhow.
572
        //    7.   NOP              may be replaced if there is are '|' ops in the block.
573
        //    8.     code for parenthesized stuff.
574
        //    9.   LA_END
575
        //
576
        //  Four data slots are reserved, for saving state on entry to the look-around
577
        //    0:   stack pointer on entry.
578
        //    1:   input position on entry.
579
        //    2:   fActiveStart, the active bounds start on entry.
580
        //    3:   fActiveLimit, the active bounds limit on entry.
581
1.88k
        {
582
1.88k
            fixLiterals();
583
1.88k
            int32_t dataLoc = allocateData(4);
584
1.88k
            appendOp(URX_LA_START, dataLoc);
585
1.88k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586
1.88k
            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587
1.88k
            appendOp(URX_LA_END, dataLoc);
588
1.88k
            appendOp(URX_BACKTRACK, 0);
589
1.88k
            appendOp(URX_NOP, 0);
590
1.88k
            appendOp(URX_NOP, 0);
591
592
            // On the Parentheses stack, start a new frame and add the positions
593
            //   of the NOPs.
594
1.88k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
595
1.88k
            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
596
1.88k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
597
1.88k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
598
1.88k
        }
599
1.88k
        break;
600
601
7.13k
    case doOpenLookAheadNeg:
602
        // Negated Lookahead.   (?! stuff )
603
        // Compiles to
604
        //    1.    LA_START    dataloc
605
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
606
        //                                //   which continues with the match.
607
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
608
        //    4.       code for parenthesized stuff.
609
        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
610
        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
611
        //    7.    END_LA                // Restore match region, in case look-ahead was using
612
        //                                        an alternate (transparent) region.
613
        //  Four data slots are reserved, for saving state on entry to the look-around
614
        //    0:   stack pointer on entry.
615
        //    1:   input position on entry.
616
        //    2:   fActiveStart, the active bounds start on entry.
617
        //    3:   fActiveLimit, the active bounds limit on entry.
618
7.13k
        {
619
7.13k
            fixLiterals();
620
7.13k
            int32_t dataLoc = allocateData(4);
621
7.13k
            appendOp(URX_LA_START, dataLoc);
622
7.13k
            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
623
7.13k
            appendOp(URX_NOP, 0);
624
625
            // On the Parentheses stack, start a new frame and add the positions
626
            //   of the StateSave and NOP.
627
7.13k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
628
7.13k
            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
629
7.13k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
630
7.13k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
631
632
            // Instructions #5 - #7 will be added when the ')' is encountered.
633
7.13k
        }
634
7.13k
        break;
635
636
6.99k
    case doOpenLookBehind:
637
6.99k
        {
638
            //   Compile a (?<= look-behind open paren.
639
            //
640
            //          Compiles to
641
            //              0       URX_LB_START     dataLoc
642
            //              1       URX_LB_CONT      dataLoc
643
            //              2                        MinMatchLen
644
            //              3                        MaxMatchLen
645
            //              4       URX_NOP          Standard '(' boilerplate.
646
            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
647
            //              6         <code for LookBehind expression>
648
            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
649
            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
650
            //
651
            //          Allocate a block of matcher data, to contain (when running a match)
652
            //              0:    Stack ptr on entry
653
            //              1:    Input Index on entry
654
            //              2:    fActiveStart, the active bounds start on entry.
655
            //              3:    fActiveLimit, the active bounds limit on entry.
656
            //              4:    Start index of match current match attempt.
657
            //          The first four items must match the layout of data for LA_START / LA_END
658
659
            // Generate match code for any pending literals.
660
6.99k
            fixLiterals();
661
662
            // Allocate data space
663
6.99k
            int32_t dataLoc = allocateData(5);
664
665
            // Emit URX_LB_START
666
6.99k
            appendOp(URX_LB_START, dataLoc);
667
668
            // Emit URX_LB_CONT
669
6.99k
            appendOp(URX_LB_CONT, dataLoc);
670
6.99k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
671
6.99k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
672
673
            // Emit the NOPs
674
6.99k
            appendOp(URX_NOP, 0);
675
6.99k
            appendOp(URX_NOP, 0);
676
677
            // On the Parentheses stack, start a new frame and add the positions
678
            //   of the URX_LB_CONT and the NOP.
679
6.99k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
680
6.99k
            fParenStack.push(lookBehind, *fStatus);                       // Frame type
681
6.99k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
682
6.99k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
683
684
            // The final two instructions will be added when the ')' is encountered.
685
6.99k
        }
686
687
6.99k
        break;
688
689
4.54k
    case doOpenLookBehindNeg:
690
4.54k
        {
691
            //   Compile a (?<! negated look-behind open paren.
692
            //
693
            //          Compiles to
694
            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
695
            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
696
            //              2                        MinMatchLen
697
            //              3                        MaxMatchLen
698
            //              4                        continueLoc (9)
699
            //              5       URX_NOP          Standard '(' boilerplate.
700
            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
701
            //              7         <code for LookBehind expression>
702
            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
703
            //              9       ...
704
            //
705
            //          Allocate a block of matcher data, to contain (when running a match)
706
            //              0:    Stack ptr on entry
707
            //              1:    Input Index on entry
708
            //              2:    fActiveStart, the active bounds start on entry.
709
            //              3:    fActiveLimit, the active bounds limit on entry.
710
            //              4:    Start index of match current match attempt.
711
            //          The first four items must match the layout of data for LA_START / LA_END
712
713
            // Generate match code for any pending literals.
714
4.54k
            fixLiterals();
715
716
            // Allocate data space
717
4.54k
            int32_t dataLoc = allocateData(5);
718
719
            // Emit URX_LB_START
720
4.54k
            appendOp(URX_LB_START, dataLoc);
721
722
            // Emit URX_LBN_CONT
723
4.54k
            appendOp(URX_LBN_CONT, dataLoc);
724
4.54k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
725
4.54k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
726
4.54k
            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
727
728
            // Emit the NOPs
729
4.54k
            appendOp(URX_NOP, 0);
730
4.54k
            appendOp(URX_NOP, 0);
731
732
            // On the Parentheses stack, start a new frame and add the positions
733
            //   of the URX_LB_CONT and the NOP.
734
4.54k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
735
4.54k
            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
736
4.54k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
737
4.54k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
738
739
            // The final two instructions will be added when the ')' is encountered.
740
4.54k
        }
741
4.54k
        break;
742
743
2
    case doConditionalExpr:
744
        // Conditionals such as (?(1)a:b)
745
3
    case doPerlInline:
746
        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
747
3
        error(U_REGEX_UNIMPLEMENTED);
748
3
        break;
749
750
751
432k
    case doCloseParen:
752
432k
        handleCloseParen();
753
432k
        if (fParenStack.size() <= 0) {
754
            //  Extra close paren, or missing open paren.
755
29
            error(U_REGEX_MISMATCHED_PAREN);
756
29
        }
757
432k
        break;
758
759
93.7M
    case doNOP:
760
93.7M
        break;
761
762
763
39
    case doBadOpenParenType:
764
84
    case doRuleError:
765
84
        error(U_REGEX_RULE_SYNTAX);
766
84
        break;
767
768
769
13
    case doMismatchedParenErr:
770
13
        error(U_REGEX_MISMATCHED_PAREN);
771
13
        break;
772
773
30.9k
    case doPlus:
774
        //  Normal '+'  compiles to
775
        //     1.   stuff to be repeated  (already built)
776
        //     2.   jmp-sav 1
777
        //     3.   ...
778
        //
779
        //  Or, if the item to be repeated can match a zero length string,
780
        //     1.   STO_INP_LOC  data-loc
781
        //     2.      body of stuff to be repeated
782
        //     3.   JMP_SAV_X    2
783
        //     4.   ...
784
785
        //
786
        //  Or, if the item to be repeated is simple
787
        //     1.   Item to be repeated.
788
        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
789
        //     3.   LOOP_C       stack location
790
30.9k
        {
791
30.9k
            int32_t  topLoc = blockTopLoc(false);        // location of item #1
792
30.9k
            int32_t  frameLoc;
793
794
            // Check for simple constructs, which may get special optimized code.
795
30.9k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796
30.0k
                int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc));
797
798
30.0k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
799
                    // Emit optimized code for [char set]+
800
499
                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
801
499
                    frameLoc = allocateStackData(1);
802
499
                    appendOp(URX_LOOP_C, frameLoc);
803
499
                    break;
804
499
                }
805
806
29.5k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
807
25.7k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
808
19.2k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
809
                    // Emit Optimized code for .+ operations.
810
10.9k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811
10.9k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
812
                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813
6.42k
                        loopOpI |= 1;
814
6.42k
                    }
815
10.9k
                    if (fModeFlags & UREGEX_UNIX_LINES) {
816
6.67k
                        loopOpI |= 2;
817
6.67k
                    }
818
10.9k
                    appendOp(loopOpI);
819
10.9k
                    frameLoc = allocateStackData(1);
820
10.9k
                    appendOp(URX_LOOP_C, frameLoc);
821
10.9k
                    break;
822
10.9k
                }
823
824
29.5k
            }
825
826
            // General case.
827
828
            // Check for minimum match length of zero, which requires
829
            //    extra loop-breaking code.
830
19.4k
            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831
                // Zero length match is possible.
832
                // Emit the code sequence that can handle it.
833
2.81k
                insertOp(topLoc);
834
2.81k
                frameLoc = allocateStackData(1);
835
836
2.81k
                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837
2.81k
                fRXPat->fCompiledPat->setElementAt(op, topLoc);
838
839
2.81k
                appendOp(URX_JMP_SAV_X, topLoc+1);
840
16.6k
            } else {
841
                // Simpler code when the repeated body must match something non-empty
842
16.6k
                appendOp(URX_JMP_SAV, topLoc);
843
16.6k
            }
844
19.4k
        }
845
0
        break;
846
847
2.88k
    case doNGPlus:
848
        //  Non-greedy '+?'  compiles to
849
        //     1.   stuff to be repeated  (already built)
850
        //     2.   state-save  1
851
        //     3.   ...
852
2.88k
        {
853
2.88k
            int32_t topLoc      = blockTopLoc(false);
854
2.88k
            appendOp(URX_STATE_SAVE, topLoc);
855
2.88k
        }
856
2.88k
        break;
857
858
859
23.2k
    case doOpt:
860
        // Normal (greedy) ? quantifier.
861
        //  Compiles to
862
        //     1. state save 3
863
        //     2.    body of optional block
864
        //     3. ...
865
        // Insert the state save into the compiled pattern, and we're done.
866
23.2k
        {
867
23.2k
            int32_t   saveStateLoc = blockTopLoc(true);
868
23.2k
            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869
23.2k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870
23.2k
        }
871
23.2k
        break;
872
873
3.62k
    case doNGOpt:
874
        // Non-greedy ?? quantifier
875
        //   compiles to
876
        //    1.  jmp   4
877
        //    2.     body of optional block
878
        //    3   jmp   5
879
        //    4.  state save 2
880
        //    5    ...
881
        //  This code is less than ideal, with two jmps instead of one, because we can only
882
        //  insert one instruction at the top of the block being iterated.
883
3.62k
        {
884
3.62k
            int32_t  jmp1_loc = blockTopLoc(true);
885
3.62k
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
886
887
3.62k
            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
888
3.62k
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889
890
3.62k
            appendOp(URX_JMP, jmp2_loc+2);
891
892
3.62k
            appendOp(URX_STATE_SAVE, jmp1_loc+1);
893
3.62k
        }
894
3.62k
        break;
895
896
897
249k
    case doStar:
898
        // Normal (greedy) * quantifier.
899
        // Compiles to
900
        //       1.   STATE_SAVE   4
901
        //       2.      body of stuff being iterated over
902
        //       3.   JMP_SAV      2
903
        //       4.   ...
904
        //
905
        // Or, if the body is a simple [Set],
906
        //       1.   LOOP_SR_I    set number
907
        //       2.   LOOP_C       stack location
908
        //       ...
909
        //
910
        // Or if this is a .*
911
        //       1.   LOOP_DOT_I    (. matches all mode flag)
912
        //       2.   LOOP_C        stack location
913
        //
914
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
915
        //       1.   STATE_SAVE   5
916
        //       2.   STO_INP_LOC  data-loc
917
        //       3.      body of stuff
918
        //       4.   JMP_SAV_X    2
919
        //       5.   ...
920
249k
        {
921
            // location of item #1, the STATE_SAVE
922
249k
            int32_t   topLoc = blockTopLoc(false);
923
249k
            int32_t   dataLoc = -1;
924
925
            // Check for simple *, where the construct being repeated
926
            //   compiled to single opcode, and might be optimizable.
927
249k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928
181k
                int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc));
929
930
181k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
931
                    // Emit optimized code for a [char set]*
932
4.18k
                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
933
4.18k
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934
4.18k
                    dataLoc = allocateStackData(1);
935
4.18k
                    appendOp(URX_LOOP_C, dataLoc);
936
4.18k
                    break;
937
4.18k
                }
938
939
177k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
940
171k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
941
170k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
942
                    // Emit Optimized code for .* operations.
943
8.76k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944
8.76k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
945
                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946
900
                        loopOpI |= 1;
947
900
                    }
948
8.76k
                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949
1.47k
                        loopOpI |= 2;
950
1.47k
                    }
951
8.76k
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952
8.76k
                    dataLoc = allocateStackData(1);
953
8.76k
                    appendOp(URX_LOOP_C, dataLoc);
954
8.76k
                    break;
955
8.76k
                }
956
177k
            }
957
958
            // Emit general case code for this *
959
            // The optimizations did not apply.
960
961
236k
            int32_t   saveStateLoc = blockTopLoc(true);
962
236k
            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
963
964
            // Check for minimum match length of zero, which requires
965
            //    extra loop-breaking code.
966
236k
            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967
107k
                insertOp(saveStateLoc);
968
107k
                dataLoc = allocateStackData(1);
969
970
107k
                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971
107k
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972
107k
                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973
107k
            }
974
975
            // Locate the position in the compiled pattern where the match will continue
976
            //   after completing the *.   (4 or 5 in the comment above)
977
236k
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978
979
            // Put together the save state op and store it into the compiled code.
980
236k
            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981
236k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982
983
            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984
236k
            appendOp(jmpOp);
985
236k
        }
986
0
        break;
987
988
1.01k
    case doNGStar:
989
        // Non-greedy *? quantifier
990
        // compiles to
991
        //     1.   JMP    3
992
        //     2.      body of stuff being iterated over
993
        //     3.   STATE_SAVE  2
994
        //     4    ...
995
1.01k
        {
996
1.01k
            int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
997
1.01k
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
998
1.01k
            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
999
1.01k
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000
1.01k
            appendOp(URX_STATE_SAVE, jmpLoc+1);
1001
1.01k
        }
1002
1.01k
        break;
1003
1004
1005
105k
    case doIntervalInit:
1006
        // The '{' opening an interval quantifier was just scanned.
1007
        // Init the counter variables that will accumulate the values as the digits
1008
        //    are scanned.
1009
105k
        fIntervalLow = 0;
1010
105k
        fIntervalUpper = -1;
1011
105k
        break;
1012
1013
111k
    case doIntevalLowerDigit:
1014
        // Scanned a digit from the lower value of an {lower,upper} interval
1015
111k
        {
1016
111k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1017
111k
            U_ASSERT(digitValue >= 0);
1018
111k
            int64_t val = static_cast<int64_t>(fIntervalLow) * 10 + digitValue;
1019
111k
            if (val > INT32_MAX) {
1020
4
                error(U_REGEX_NUMBER_TOO_BIG);
1021
111k
            } else {
1022
111k
                fIntervalLow = static_cast<int32_t>(val);
1023
111k
            }
1024
111k
        }
1025
111k
        break;
1026
1027
86.2k
    case doIntervalUpperDigit:
1028
        // Scanned a digit from the upper value of an {lower,upper} interval
1029
86.2k
        {
1030
86.2k
            if (fIntervalUpper < 0) {
1031
82.5k
                fIntervalUpper = 0;
1032
82.5k
            }
1033
86.2k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1034
86.2k
            U_ASSERT(digitValue >= 0);
1035
86.2k
            int64_t val = static_cast<int64_t>(fIntervalUpper) * 10 + digitValue;
1036
86.2k
            if (val > INT32_MAX) {
1037
16
                error(U_REGEX_NUMBER_TOO_BIG);
1038
86.2k
            } else {
1039
86.2k
                fIntervalUpper = static_cast<int32_t>(val);
1040
86.2k
            }
1041
86.2k
        }
1042
86.2k
        break;
1043
1044
21.8k
    case doIntervalSame:
1045
        // Scanned a single value interval like {27}.  Upper = Lower.
1046
21.8k
        fIntervalUpper = fIntervalLow;
1047
21.8k
        break;
1048
1049
99.4k
    case doInterval:
1050
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1051
99.4k
        if (compileInlineInterval() == false) {
1052
4.76k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053
4.76k
        }
1054
99.4k
        break;
1055
1056
3.35k
    case doPossessiveInterval:
1057
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1058
3.35k
        {
1059
            // Remember the loc for the top of the block being looped over.
1060
            //   (Can not reserve a slot in the compiled pattern at this time, because
1061
            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1062
            //    once per block.)
1063
3.35k
            int32_t topLoc = blockTopLoc(false);
1064
1065
            // Produce normal looping code.
1066
3.35k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067
1068
            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069
            //  just as if the loop was inclosed in atomic parentheses.
1070
1071
            // First the STO_SP before the start of the loop
1072
3.35k
            insertOp(topLoc);
1073
1074
3.35k
            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1075
3.35k
            int32_t  op     = buildOp(URX_STO_SP, varLoc);
1076
3.35k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077
1078
3.35k
            int32_t loopOp = static_cast<int32_t>(fRXPat->fCompiledPat->popi());
1079
3.35k
            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1080
3.35k
            loopOp++;     // point LoopOp after the just-inserted STO_SP
1081
3.35k
            fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082
1083
            // Then the LD_SP after the end of the loop
1084
3.35k
            appendOp(URX_LD_SP, varLoc);
1085
3.35k
        }
1086
1087
3.35k
        break;
1088
1089
2.49k
    case doNGInterval:
1090
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1091
2.49k
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092
2.49k
        break;
1093
1094
134
    case doIntervalError:
1095
134
        error(U_REGEX_BAD_INTERVAL);
1096
134
        break;
1097
1098
45.0M
    case doLiteralChar:
1099
        // We've just scanned a "normal" character from the pattern,
1100
45.0M
        literalChar(fC.fChar);
1101
45.0M
        break;
1102
1103
1104
19.2k
    case doEscapedLiteralChar:
1105
        // We've just scanned an backslashed escaped character with  no
1106
        //   special meaning.  It represents itself.
1107
19.2k
        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108
0
            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1109
0
            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1110
0
               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111
0
             }
1112
19.2k
        literalChar(fC.fChar);
1113
19.2k
        break;
1114
1115
1116
567k
    case doDotAny:
1117
        // scanned a ".",  match any single character.
1118
567k
        {
1119
567k
            fixLiterals(false);
1120
567k
            if (fModeFlags & UREGEX_DOTALL) {
1121
15.8k
                appendOp(URX_DOTANY_ALL, 0);
1122
552k
            } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123
7.81k
                appendOp(URX_DOTANY_UNIX, 0);
1124
544k
            } else {
1125
544k
                appendOp(URX_DOTANY, 0);
1126
544k
            }
1127
567k
        }
1128
567k
        break;
1129
1130
10.8k
    case doCaret:
1131
10.8k
        {
1132
10.8k
            fixLiterals(false);
1133
10.8k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134
6.54k
                appendOp(URX_CARET, 0);
1135
6.54k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136
1.75k
                appendOp(URX_CARET_M, 0);
1137
2.58k
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138
538
                appendOp(URX_CARET, 0);   // Only testing true start of input.
1139
2.04k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140
2.04k
                appendOp(URX_CARET_M_UNIX, 0);
1141
2.04k
            }
1142
10.8k
        }
1143
10.8k
        break;
1144
1145
113k
    case doDollar:
1146
113k
        {
1147
113k
            fixLiterals(false);
1148
113k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149
72.5k
                appendOp(URX_DOLLAR, 0);
1150
72.5k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151
1.89k
                appendOp(URX_DOLLAR_M, 0);
1152
39.2k
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153
36.0k
                appendOp(URX_DOLLAR_D, 0);
1154
36.0k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155
3.21k
                appendOp(URX_DOLLAR_MD, 0);
1156
3.21k
            }
1157
113k
        }
1158
113k
        break;
1159
1160
435
    case doBackslashA:
1161
435
        fixLiterals(false);
1162
435
        appendOp(URX_CARET, 0);
1163
435
        break;
1164
1165
9.08k
    case doBackslashB:
1166
9.08k
        {
1167
            #if  UCONFIG_NO_BREAK_ITERATION==1
1168
            if (fModeFlags & UREGEX_UWORD) {
1169
                error(U_UNSUPPORTED_ERROR);
1170
            }
1171
            #endif
1172
9.08k
            fixLiterals(false);
1173
9.08k
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174
9.08k
            appendOp(op, 1);
1175
9.08k
        }
1176
9.08k
        break;
1177
1178
1.44k
    case doBackslashb:
1179
1.44k
        {
1180
            #if  UCONFIG_NO_BREAK_ITERATION==1
1181
            if (fModeFlags & UREGEX_UWORD) {
1182
                error(U_UNSUPPORTED_ERROR);
1183
            }
1184
            #endif
1185
1.44k
            fixLiterals(false);
1186
1.44k
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187
1.44k
            appendOp(op, 0);
1188
1.44k
        }
1189
1.44k
        break;
1190
1191
3.65k
    case doBackslashD:
1192
3.65k
        fixLiterals(false);
1193
3.65k
        appendOp(URX_BACKSLASH_D, 1);
1194
3.65k
        break;
1195
1196
1.15k
    case doBackslashd:
1197
1.15k
        fixLiterals(false);
1198
1.15k
        appendOp(URX_BACKSLASH_D, 0);
1199
1.15k
        break;
1200
1201
767
    case doBackslashG:
1202
767
        fixLiterals(false);
1203
767
        appendOp(URX_BACKSLASH_G, 0);
1204
767
        break;
1205
1206
3.51k
    case doBackslashH:
1207
3.51k
        fixLiterals(false);
1208
3.51k
        appendOp(URX_BACKSLASH_H, 1);
1209
3.51k
        break;
1210
1211
1.48k
    case doBackslashh:
1212
1.48k
        fixLiterals(false);
1213
1.48k
        appendOp(URX_BACKSLASH_H, 0);
1214
1.48k
        break;
1215
1216
4.46k
    case doBackslashR:
1217
4.46k
        fixLiterals(false);
1218
4.46k
        appendOp(URX_BACKSLASH_R, 0);
1219
4.46k
        break;
1220
1221
24.4k
    case doBackslashS:
1222
24.4k
        fixLiterals(false);
1223
24.4k
        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224
24.4k
        break;
1225
1226
2.75k
    case doBackslashs:
1227
2.75k
        fixLiterals(false);
1228
2.75k
        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229
2.75k
        break;
1230
1231
1.80k
    case doBackslashV:
1232
1.80k
        fixLiterals(false);
1233
1.80k
        appendOp(URX_BACKSLASH_V, 1);
1234
1.80k
        break;
1235
1236
2.65k
    case doBackslashv:
1237
2.65k
        fixLiterals(false);
1238
2.65k
        appendOp(URX_BACKSLASH_V, 0);
1239
2.65k
        break;
1240
1241
4.00k
    case doBackslashW:
1242
4.00k
        fixLiterals(false);
1243
4.00k
        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244
4.00k
        break;
1245
1246
4.87k
    case doBackslashw:
1247
4.87k
        fixLiterals(false);
1248
4.87k
        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249
4.87k
        break;
1250
1251
2.73k
    case doBackslashX:
1252
        #if  UCONFIG_NO_BREAK_ITERATION==1
1253
        // Grapheme Cluster Boundary requires ICU break iteration.
1254
        error(U_UNSUPPORTED_ERROR);
1255
        #endif
1256
2.73k
        fixLiterals(false);
1257
2.73k
        appendOp(URX_BACKSLASH_X, 0);
1258
2.73k
        break;
1259
1260
945
    case doBackslashZ:
1261
945
        fixLiterals(false);
1262
945
        appendOp(URX_DOLLAR, 0);
1263
945
        break;
1264
1265
4.05k
    case doBackslashz:
1266
4.05k
        fixLiterals(false);
1267
4.05k
        appendOp(URX_BACKSLASH_Z, 0);
1268
4.05k
        break;
1269
1270
58
    case doEscapeError:
1271
58
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272
58
        break;
1273
1274
0
    case doExit:
1275
0
        fixLiterals(false);
1276
0
        returnVal = false;
1277
0
        break;
1278
1279
1.22k
    case doProperty:
1280
1.22k
        {
1281
1.22k
            fixLiterals(false);
1282
1.22k
            UnicodeSet *theSet = scanProp();
1283
1.22k
            compileSet(theSet);
1284
1.22k
        }
1285
1.22k
        break;
1286
1287
1.71k
    case doNamedChar:
1288
1.71k
        {
1289
1.71k
            UChar32 c = scanNamedChar();
1290
1.71k
            literalChar(c);
1291
1.71k
        }
1292
1.71k
        break;
1293
1294
1295
11.2k
    case doBackRef:
1296
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
1297
        //                 the regular expression, because the number of digits to be consumed
1298
        //                 depends on the number of capture groups that have been defined.  So
1299
        //                 we have to do it here instead.
1300
11.2k
        {
1301
11.2k
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1302
11.2k
            int32_t  groupNum = 0;
1303
11.2k
            UChar32  c        = fC.fChar;
1304
1305
12.3k
            for (;;) {
1306
                // Loop once per digit, for max allowed number of digits in a back reference.
1307
12.3k
                int32_t digit = u_charDigitValue(c);
1308
12.3k
                groupNum = groupNum * 10 + digit;
1309
12.3k
                if (groupNum >= numCaptureGroups) {
1310
4.53k
                    break;
1311
4.53k
                }
1312
7.80k
                c = peekCharLL();
1313
7.80k
                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
1314
6.70k
                    break;
1315
6.70k
                }
1316
1.09k
                nextCharLL();
1317
1.09k
            }
1318
1319
            // Scan of the back reference in the source regexp is complete.  Now generate
1320
            //  the compiled code for it.
1321
            // Because capture groups can be forward-referenced by back-references,
1322
            //  we fill the operand with the capture group number.  At the end
1323
            //  of compilation, it will be changed to the variable's location.
1324
11.2k
            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1325
                                     //    and shouldn't enter this code path at all.
1326
11.2k
            fixLiterals(false);
1327
11.2k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328
8.26k
                appendOp(URX_BACKREF_I, groupNum);
1329
8.26k
            } else {
1330
2.97k
                appendOp(URX_BACKREF, groupNum);
1331
2.97k
            }
1332
11.2k
        }
1333
11.2k
        break;
1334
1335
1.38k
    case doBeginNamedBackRef:
1336
1.38k
        U_ASSERT(fCaptureName == nullptr);
1337
1.38k
        fCaptureName = new UnicodeString;
1338
1.38k
        if (fCaptureName == nullptr) {
1339
0
            error(U_MEMORY_ALLOCATION_ERROR);
1340
0
        }
1341
1.38k
        break;
1342
            
1343
4.43k
    case doContinueNamedBackRef:
1344
4.43k
        fCaptureName->append(fC.fChar);
1345
4.43k
        break;
1346
1347
1.36k
    case doCompleteNamedBackRef:
1348
1.36k
        {
1349
1.36k
        int32_t groupNumber =
1350
1.36k
            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351
1.36k
        if (groupNumber == 0) {
1352
            // Group name has not been defined.
1353
            //   Could be a forward reference. If we choose to support them at some
1354
            //   future time, extra mechanism will be required at this point.
1355
45
            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356
1.31k
        } else {
1357
            // Given the number, handle identically to a \n numbered back reference.
1358
            // See comments above, under doBackRef
1359
1.31k
            fixLiterals(false);
1360
1.31k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361
514
                appendOp(URX_BACKREF_I, groupNumber);
1362
803
            } else {
1363
803
                appendOp(URX_BACKREF, groupNumber);
1364
803
            }
1365
1.31k
        }
1366
1.36k
        delete fCaptureName;
1367
1.36k
        fCaptureName = nullptr;
1368
1.36k
        break;
1369
249k
        }
1370
       
1371
21.1k
    case doPossessivePlus:
1372
        // Possessive ++ quantifier.
1373
        // Compiles to
1374
        //       1.   STO_SP
1375
        //       2.      body of stuff being iterated over
1376
        //       3.   STATE_SAVE 5
1377
        //       4.   JMP        2
1378
        //       5.   LD_SP
1379
        //       6.   ...
1380
        //
1381
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1382
        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1383
        //
1384
21.1k
        {
1385
            // Emit the STO_SP
1386
21.1k
            int32_t   topLoc = blockTopLoc(true);
1387
21.1k
            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1388
21.1k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1389
21.1k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390
1391
            // Emit the STATE_SAVE
1392
21.1k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393
1394
            // Emit the JMP
1395
21.1k
            appendOp(URX_JMP, topLoc+1);
1396
1397
            // Emit the LD_SP
1398
21.1k
            appendOp(URX_LD_SP, stoLoc);
1399
21.1k
        }
1400
21.1k
        break;
1401
1402
36.8k
    case doPossessiveStar:
1403
        // Possessive *+ quantifier.
1404
        // Compiles to
1405
        //       1.   STO_SP       loc
1406
        //       2.   STATE_SAVE   5
1407
        //       3.      body of stuff being iterated over
1408
        //       4.   JMP          2
1409
        //       5.   LD_SP        loc
1410
        //       6    ...
1411
        // TODO:  do something to cut back the state stack each time through the loop.
1412
36.8k
        {
1413
            // Reserve two slots at the top of the block.
1414
36.8k
            int32_t   topLoc = blockTopLoc(true);
1415
36.8k
            insertOp(topLoc);
1416
1417
            // emit   STO_SP     loc
1418
36.8k
            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1419
36.8k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1420
36.8k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421
1422
            // Emit the SAVE_STATE   5
1423
36.8k
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424
36.8k
            op = buildOp(URX_STATE_SAVE, L7);
1425
36.8k
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426
1427
            // Append the JMP operation.
1428
36.8k
            appendOp(URX_JMP, topLoc+1);
1429
1430
            // Emit the LD_SP       loc
1431
36.8k
            appendOp(URX_LD_SP, stoLoc);
1432
36.8k
        }
1433
36.8k
        break;
1434
1435
31.1k
    case doPossessiveOpt:
1436
        // Possessive  ?+ quantifier.
1437
        //  Compiles to
1438
        //     1. STO_SP      loc
1439
        //     2. SAVE_STATE  5
1440
        //     3.    body of optional block
1441
        //     4. LD_SP       loc
1442
        //     5. ...
1443
        //
1444
31.1k
        {
1445
            // Reserve two slots at the top of the block.
1446
31.1k
            int32_t   topLoc = blockTopLoc(true);
1447
31.1k
            insertOp(topLoc);
1448
1449
            // Emit the STO_SP
1450
31.1k
            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1451
31.1k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1452
31.1k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453
1454
            // Emit the SAVE_STATE
1455
31.1k
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1456
31.1k
            op = buildOp(URX_STATE_SAVE, continueLoc);
1457
31.1k
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458
1459
            // Emit the LD_SP
1460
31.1k
            appendOp(URX_LD_SP, stoLoc);
1461
31.1k
        }
1462
31.1k
        break;
1463
1464
1465
14.5k
    case doBeginMatchMode:
1466
14.5k
        fNewModeFlags = fModeFlags;
1467
14.5k
        fSetModeFlag  = true;
1468
14.5k
        break;
1469
1470
17.2k
    case doMatchMode:   //  (?i)    and similar
1471
17.2k
        {
1472
17.2k
            int32_t  bit = 0;
1473
17.2k
            switch (fC.fChar) {
1474
4.67k
            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1475
1.25k
            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1476
1.65k
            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1477
629
            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1478
254
            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1479
2.95k
            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1480
4.90k
            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1481
968
            case 0x2d: /* '-' */   fSetModeFlag = false;          break;
1482
0
            default:
1483
0
                UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
1484
                                        // by the scanner.
1485
17.2k
            }
1486
17.2k
            if (fSetModeFlag) {
1487
15.8k
                fNewModeFlags |= bit;
1488
15.8k
            } else {
1489
1.48k
                fNewModeFlags &= ~bit;
1490
1.48k
            }
1491
17.2k
        }
1492
0
        break;
1493
1494
9.24k
    case doSetMatchMode:
1495
        // Emit code to match any pending literals, using the not-yet changed match mode.
1496
9.24k
        fixLiterals();
1497
1498
        // We've got a (?i) or similar.  The match mode is being changed, but
1499
        //   the change is not scoped to a parenthesized block.
1500
9.24k
        U_ASSERT(fNewModeFlags < 0);
1501
9.24k
        fModeFlags = fNewModeFlags;
1502
1503
9.24k
        break;
1504
1505
1506
5.15k
    case doMatchModeParen:
1507
        // We've got a (?i: or similar.  Begin a parenthesized block, save old
1508
        //   mode flags so they can be restored at the close of the block.
1509
        //
1510
        //   Compile to a
1511
        //      - NOP, which later may be replaced by a save-state if the
1512
        //         parenthesized group gets a * quantifier, followed by
1513
        //      - NOP, which may later be replaced by a save-state if there
1514
        //             is an '|' alternation within the parens.
1515
5.15k
        {
1516
5.15k
            fixLiterals(false);
1517
5.15k
            appendOp(URX_NOP, 0);
1518
5.15k
            appendOp(URX_NOP, 0);
1519
1520
            // On the Parentheses stack, start a new frame and add the positions
1521
            //   of the two NOPs (a normal non-capturing () frame, except for the
1522
            //   saving of the original mode flags.)
1523
5.15k
            fParenStack.push(fModeFlags, *fStatus);
1524
5.15k
            fParenStack.push(flags, *fStatus);                            // Frame Marker
1525
5.15k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1526
5.15k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1527
1528
            // Set the current mode flags to the new values.
1529
5.15k
            U_ASSERT(fNewModeFlags < 0);
1530
5.15k
            fModeFlags = fNewModeFlags;
1531
5.15k
        }
1532
5.15k
        break;
1533
1534
114
    case doBadModeFlag:
1535
114
        error(U_REGEX_INVALID_FLAG);
1536
114
        break;
1537
1538
38.4k
    case doSuppressComments:
1539
        // We have just scanned a '(?'.  We now need to prevent the character scanner from
1540
        // treating a '#' as a to-the-end-of-line comment.
1541
        //   (This Perl compatibility just gets uglier and uglier to do...)
1542
38.4k
        fEOLComments = false;
1543
38.4k
        break;
1544
1545
1546
1.11k
    case doSetAddAmp:
1547
1.11k
        {
1548
1.11k
          UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1549
1.11k
          set->add(chAmp);
1550
1.11k
        }
1551
1.11k
        break;
1552
1553
1.25k
    case doSetAddDash:
1554
1.25k
        {
1555
1.25k
          UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1556
1.25k
          set->add(chDash);
1557
1.25k
        }
1558
1.25k
        break;
1559
1560
720
     case doSetBackslash_s:
1561
720
        {
1562
720
         UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1563
720
         set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564
720
         break;
1565
17.2k
        }
1566
1567
1.72k
     case doSetBackslash_S:
1568
1.72k
        {
1569
1.72k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1570
1.72k
            UnicodeSet SSet;
1571
1.72k
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572
1.72k
            set->addAll(SSet);
1573
1.72k
            break;
1574
17.2k
        }
1575
1576
977
    case doSetBackslash_d:
1577
977
        {
1578
977
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1579
            // TODO - make a static set, ticket 6058.
1580
977
            addCategory(set, U_GC_ND_MASK, *fStatus);
1581
977
            break;
1582
17.2k
        }
1583
1584
6.97k
    case doSetBackslash_D:
1585
6.97k
        {
1586
6.97k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1587
6.97k
            UnicodeSet digits;
1588
            // TODO - make a static set, ticket 6058.
1589
6.97k
            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1590
6.97k
            digits.complement();
1591
6.97k
            set->addAll(digits);
1592
6.97k
            break;
1593
17.2k
        }
1594
1595
863
    case doSetBackslash_h:
1596
863
        {
1597
863
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1598
863
            UnicodeSet h;
1599
863
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1600
863
            h.add(static_cast<UChar32>(9)); // Tab
1601
863
            set->addAll(h);
1602
863
            break;
1603
17.2k
        }
1604
1605
2.01k
    case doSetBackslash_H:
1606
2.01k
        {
1607
2.01k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1608
2.01k
            UnicodeSet h;
1609
2.01k
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1610
2.01k
            h.add(static_cast<UChar32>(9)); // Tab
1611
2.01k
            h.complement();
1612
2.01k
            set->addAll(h);
1613
2.01k
            break;
1614
17.2k
        }
1615
1616
2.54k
    case doSetBackslash_v:
1617
2.54k
        {
1618
2.54k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1619
2.54k
            set->add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
1620
2.54k
            set->add(static_cast<UChar32>(0x85));
1621
2.54k
            set->add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
1622
2.54k
            break;
1623
17.2k
        }
1624
1625
1.80k
    case doSetBackslash_V:
1626
1.80k
        {
1627
1.80k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1628
1.80k
            UnicodeSet v;
1629
1.80k
            v.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
1630
1.80k
            v.add(static_cast<UChar32>(0x85));
1631
1.80k
            v.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
1632
1.80k
            v.complement();
1633
1.80k
            set->addAll(v);
1634
1.80k
            break;
1635
17.2k
        }
1636
1637
3.05k
    case doSetBackslash_w:
1638
3.05k
        {
1639
3.05k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1640
3.05k
            set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641
3.05k
            break;
1642
17.2k
        }
1643
1644
17.6k
    case doSetBackslash_W:
1645
17.6k
        {
1646
17.6k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1647
17.6k
            UnicodeSet SSet;
1648
17.6k
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649
17.6k
            set->addAll(SSet);
1650
17.6k
            break;
1651
17.2k
        }
1652
1653
394k
    case doSetBegin:
1654
394k
        {
1655
394k
            fixLiterals(false);
1656
394k
            LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657
394k
            fSetStack.push(lpSet.orphan(), *fStatus);
1658
394k
            fSetOpStack.push(setStart, *fStatus);
1659
394k
            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660
350k
                fSetOpStack.push(setCaseClose, *fStatus);
1661
350k
            }
1662
394k
            break;
1663
17.2k
        }
1664
1665
2.19k
    case doSetBeginDifference1:
1666
        //  We have scanned something like [[abc]-[
1667
        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668
        //  Push a Difference operator, which will cause the new set to be subtracted from what
1669
        //    went before once it is created.
1670
2.19k
        setPushOp(setDifference1);
1671
2.19k
        fSetOpStack.push(setStart, *fStatus);
1672
2.19k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673
989
            fSetOpStack.push(setCaseClose, *fStatus);
1674
989
        }
1675
2.19k
        break;
1676
1677
1.08k
    case doSetBeginIntersection1:
1678
        //  We have scanned something like  [[abc]&[
1679
        //   Need both the '&' operator and the open '[' operator.
1680
1.08k
        setPushOp(setIntersection1);
1681
1.08k
        fSetOpStack.push(setStart, *fStatus);
1682
1.08k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683
706
            fSetOpStack.push(setCaseClose, *fStatus);
1684
706
        }
1685
1.08k
        break;
1686
1687
86.9k
    case doSetBeginUnion:
1688
        //  We have scanned something like  [[abc][
1689
        //     Need to handle the union operation explicitly [[abc] | [
1690
86.9k
        setPushOp(setUnion);
1691
86.9k
        fSetOpStack.push(setStart, *fStatus);
1692
86.9k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693
54.9k
            fSetOpStack.push(setCaseClose, *fStatus);
1694
54.9k
        }
1695
86.9k
        break;
1696
1697
534
    case doSetDifference2:
1698
        // We have scanned something like [abc--
1699
        //   Consider this to unambiguously be a set difference operator.
1700
534
        setPushOp(setDifference2);
1701
534
        break;
1702
1703
469k
    case doSetEnd:
1704
        // Have encountered the ']' that closes a set.
1705
        //    Force the evaluation of any pending operations within this set,
1706
        //    leave the completed set on the top of the set stack.
1707
469k
        setEval(setEnd);
1708
469k
        U_ASSERT(fSetOpStack.peeki()==setStart);
1709
469k
        fSetOpStack.popi();
1710
469k
        break;
1711
1712
389k
    case doSetFinish:
1713
389k
        {
1714
        // Finished a complete set expression, including all nested sets.
1715
        //   The close bracket has already triggered clearing out pending set operators,
1716
        //    the operator stack should be empty and the operand stack should have just
1717
        //    one entry, the result set.
1718
389k
        U_ASSERT(fSetOpStack.empty());
1719
389k
        UnicodeSet* theSet = static_cast<UnicodeSet*>(fSetStack.pop());
1720
389k
        U_ASSERT(fSetStack.empty());
1721
389k
        compileSet(theSet);
1722
389k
        break;
1723
17.2k
        }
1724
1725
5.28k
    case doSetIntersection2:
1726
        // Have scanned something like [abc&&
1727
5.28k
        setPushOp(setIntersection2);
1728
5.28k
        break;
1729
1730
12.0M
    case doSetLiteral:
1731
        // Union the just-scanned literal character into the set being built.
1732
        //    This operation is the highest precedence set operation, so we can always do
1733
        //    it immediately, without waiting to see what follows.  It is necessary to perform
1734
        //    any pending '-' or '&' operation first, because these have the same precedence
1735
        //    as union-ing in a literal'
1736
12.0M
        {
1737
12.0M
            setEval(setUnion);
1738
12.0M
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1739
12.0M
            s->add(fC.fChar);
1740
12.0M
            fLastSetLiteral = fC.fChar;
1741
12.0M
            break;
1742
17.2k
        }
1743
1744
36.6k
    case doSetLiteralEscaped:
1745
        // A back-slash escaped literal character was encountered.
1746
        // Processing is the same as with setLiteral, above, with the addition of
1747
        //  the optional check for errors on escaped ASCII letters.
1748
36.6k
        {
1749
36.6k
            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750
0
                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1751
0
                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1752
0
                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753
0
            }
1754
36.6k
            setEval(setUnion);
1755
36.6k
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1756
36.6k
            s->add(fC.fChar);
1757
36.6k
            fLastSetLiteral = fC.fChar;
1758
36.6k
            break;
1759
17.2k
        }
1760
1761
2.67k
        case doSetNamedChar:
1762
        // Scanning a \N{UNICODE CHARACTER NAME}
1763
        //  Aside from the source of the character, the processing is identical to doSetLiteral,
1764
        //    above.
1765
2.67k
        {
1766
2.67k
            UChar32  c = scanNamedChar();
1767
2.67k
            setEval(setUnion);
1768
2.67k
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1769
2.67k
            s->add(c);
1770
2.67k
            fLastSetLiteral = c;
1771
2.67k
            break;
1772
17.2k
        }
1773
1774
2.34k
    case doSetNamedRange:
1775
        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1776
        // The left character is already in the set, and is saved in fLastSetLiteral.
1777
        // The right side needs to be picked up, the scan is at the 'N'.
1778
        // Lower Limit > Upper limit being an error matches both Java
1779
        //        and ICU UnicodeSet behavior.
1780
2.34k
        {
1781
2.34k
            UChar32  c = scanNamedChar();
1782
2.34k
            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1783
5
                error(U_REGEX_INVALID_RANGE);
1784
5
            }
1785
2.34k
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1786
2.34k
            s->add(fLastSetLiteral, c);
1787
2.34k
            fLastSetLiteral = c;
1788
2.34k
            break;
1789
17.2k
        }
1790
1791
1792
11.9k
    case  doSetNegate:
1793
        // Scanned a '^' at the start of a set.
1794
        // Push the negation operator onto the set op stack.
1795
        // A twist for case-insensitive matching:
1796
        //   the case closure operation must happen _before_ negation.
1797
        //   But the case closure operation will already be on the stack if it's required.
1798
        //   This requires checking for case closure, and swapping the stack order
1799
        //    if it is present.
1800
11.9k
        {
1801
11.9k
            int32_t  tosOp = fSetOpStack.peeki();
1802
11.9k
            if (tosOp == setCaseClose) {
1803
7.16k
                fSetOpStack.popi();
1804
7.16k
                fSetOpStack.push(setNegation, *fStatus);
1805
7.16k
                fSetOpStack.push(setCaseClose, *fStatus);
1806
7.16k
            } else {
1807
4.81k
                fSetOpStack.push(setNegation, *fStatus);
1808
4.81k
            }
1809
11.9k
        }
1810
11.9k
        break;
1811
1812
1.92k
    case doSetNoCloseError:
1813
1.92k
        error(U_REGEX_MISSING_CLOSE_BRACKET);
1814
1.92k
        break;
1815
1816
3
    case doSetOpError:
1817
3
        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1818
3
        break;
1819
1820
109k
    case doSetPosixProp:
1821
109k
        {
1822
109k
            UnicodeSet *s = scanPosixProp();
1823
109k
            if (s != nullptr) {
1824
78.3k
                UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek());
1825
78.3k
                tos->addAll(*s);
1826
78.3k
                delete s;
1827
78.3k
            }  // else error.  scanProp() reported the error status already.
1828
109k
        }
1829
109k
        break;
1830
1831
1.87k
    case doSetProp:
1832
        //  Scanned a \p \P within [brackets].
1833
1.87k
        {
1834
1.87k
            UnicodeSet *s = scanProp();
1835
1.87k
            if (s != nullptr) {
1836
1.86k
                UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek());
1837
1.86k
                tos->addAll(*s);
1838
1.86k
                delete s;
1839
1.86k
            }  // else error.  scanProp() reported the error status already.
1840
1.87k
        }
1841
1.87k
        break;
1842
1843
1844
5.58k
    case doSetRange:
1845
        // We have scanned literal-literal.  Add the range to the set.
1846
        // The left character is already in the set, and is saved in fLastSetLiteral.
1847
        // The right side is the current character.
1848
        // Lower Limit > Upper limit being an error matches both Java
1849
        //        and ICU UnicodeSet behavior.
1850
5.58k
        {
1851
1852
5.58k
        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1853
80
            error(U_REGEX_INVALID_RANGE);
1854
80
        }
1855
5.58k
        UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1856
5.58k
        s->add(fLastSetLiteral, fC.fChar);
1857
5.58k
        break;
1858
17.2k
        }
1859
1860
0
    default:
1861
0
        UPRV_UNREACHABLE_EXIT;
1862
175M
    }
1863
1864
175M
    if (U_FAILURE(*fStatus)) {
1865
7.80k
        returnVal = false;
1866
7.80k
    }
1867
1868
175M
    return returnVal;
1869
175M
}
1870
1871
1872
1873
//------------------------------------------------------------------------------
1874
//
1875
//   literalChar           We've encountered a literal character from the pattern,
1876
//                             or an escape sequence that reduces to a character.
1877
//                         Add it to the string containing all literal chars/strings from
1878
//                             the pattern.
1879
//
1880
//------------------------------------------------------------------------------
1881
45.4M
void RegexCompile::literalChar(UChar32 c)  {
1882
45.4M
    fLiteralChars.append(c);
1883
45.4M
}
1884
1885
1886
//------------------------------------------------------------------------------
1887
//
1888
//    fixLiterals           When compiling something that can follow a literal
1889
//                          string in a pattern, emit the code to match the
1890
//                          accumulated literal string.
1891
//
1892
//                          Optionally, split the last char of the string off into
1893
//                          a single "ONE_CHAR" operation, so that quantifiers can
1894
//                          apply to that char alone.  Example:   abc*
1895
//                          The * must apply to the 'c' only.
1896
//
1897
//------------------------------------------------------------------------------
1898
24.2M
void    RegexCompile::fixLiterals(UBool split) {
1899
1900
    // If no literal characters have been scanned but not yet had code generated
1901
    //   for them, nothing needs to be done.
1902
24.2M
    if (fLiteralChars.length() == 0) {
1903
22.1M
        return;
1904
22.1M
    }
1905
1906
2.11M
    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907
2.11M
    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908
1909
    // Split:  We need to  ensure that the last item in the compiled pattern
1910
    //     refers only to the last literal scanned in the pattern, so that
1911
    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1912
    //     Split before case folding for case insensitive matches.
1913
1914
2.11M
    if (split) {
1915
339k
        fLiteralChars.truncate(indexOfLastCodePoint);
1916
339k
        fixLiterals(false);   // Recursive call, emit code to match the first part of the string.
1917
                              //  Note that the truncated literal string may be empty, in which case
1918
                              //  nothing will be emitted.
1919
1920
339k
        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1921
339k
        fixLiterals(false);          // Second recursive call, code for the final code point.
1922
339k
        return;
1923
339k
    }
1924
1925
    // If we are doing case-insensitive matching, case fold the string.  This may expand
1926
    //   the string, e.g. the German sharp-s turns into "ss"
1927
1.77M
    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928
1.38M
        fLiteralChars.foldCase();
1929
1.38M
        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930
1.38M
        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931
1.38M
    }
1932
1933
1.77M
    if (indexOfLastCodePoint == 0) {
1934
        // Single character, emit a URX_ONECHAR op to match it.
1935
596k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936
386k
                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937
27.9k
            appendOp(URX_ONECHAR_I, lastCodePoint);
1938
569k
        } else {
1939
569k
            appendOp(URX_ONECHAR, lastCodePoint);
1940
569k
        }
1941
1.17M
    } else {
1942
        // Two or more chars, emit a URX_STRING to match them.
1943
1.17M
        if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944
0
            error(U_REGEX_PATTERN_TOO_BIG);
1945
0
        }
1946
1.17M
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947
998k
            appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948
998k
        } else {
1949
            // TODO here:  add optimization to split case sensitive strings of length two
1950
            //             into two single char ops, for efficiency.
1951
179k
            appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952
179k
        }
1953
1.17M
        appendOp(URX_STRING_LEN, fLiteralChars.length());
1954
1955
        // Add this string into the accumulated strings of the compiled pattern.
1956
1.17M
        fRXPat->fLiteralText.append(fLiteralChars);
1957
1.17M
    }
1958
1959
1.77M
    fLiteralChars.remove();
1960
1.77M
}
1961
1962
1963
99.9M
int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964
99.9M
    if (U_FAILURE(*fStatus)) {
1965
160k
        return 0;
1966
160k
    }
1967
99.7M
    if (type < 0 || type > 255) {
1968
0
        UPRV_UNREACHABLE_EXIT;
1969
0
    }
1970
99.7M
    if (val > 0x00ffffff) {
1971
0
        UPRV_UNREACHABLE_EXIT;
1972
0
    }
1973
99.7M
    if (val < 0) {
1974
0
        if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975
0
            UPRV_UNREACHABLE_EXIT;
1976
0
        }
1977
0
        if (URX_TYPE(val) != 0xff) {
1978
0
            UPRV_UNREACHABLE_EXIT;
1979
0
        }
1980
0
        type = URX_RESERVED_OP_N;
1981
0
    }
1982
99.7M
    return (type << 24) | val;
1983
99.7M
}
1984
1985
1986
//------------------------------------------------------------------------------
1987
//
1988
//   appendOp()             Append a new instruction onto the compiled pattern
1989
//                          Includes error checking, limiting the size of the
1990
//                          pattern to lengths that can be represented in the
1991
//                          24 bit operand field of an instruction.
1992
//
1993
//------------------------------------------------------------------------------
1994
49.4M
void RegexCompile::appendOp(int32_t op) {
1995
49.4M
    if (U_FAILURE(*fStatus)) {
1996
2.37k
        return;
1997
2.37k
    }
1998
49.4M
    fRXPat->fCompiledPat->addElement(op, *fStatus);
1999
49.4M
    if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000
0
        error(U_REGEX_PATTERN_TOO_BIG);
2001
0
    }
2002
49.4M
}
2003
2004
47.7M
void RegexCompile::appendOp(int32_t type, int32_t val) {
2005
47.7M
    appendOp(buildOp(type, val));
2006
47.7M
}
2007
2008
2009
//------------------------------------------------------------------------------
2010
//
2011
//   insertOp()             Insert a slot for a new opcode into the already
2012
//                          compiled pattern code.
2013
//
2014
//                          Fill the slot with a NOP.  Our caller will replace it
2015
//                          with what they really wanted.
2016
//
2017
//------------------------------------------------------------------------------
2018
292k
void   RegexCompile::insertOp(int32_t where) {
2019
292k
    UVector64 *code = fRXPat->fCompiledPat;
2020
292k
    U_ASSERT(where>0 && where < code->size());
2021
2022
292k
    int32_t  nop = buildOp(URX_NOP, 0);
2023
292k
    code->insertElementAt(nop, where, *fStatus);
2024
2025
    // Walk through the pattern, looking for any ops with targets that
2026
    //  were moved down by the insert.  Fix them.
2027
292k
    int32_t loc;
2028
5.65G
    for (loc=0; loc<code->size(); loc++) {
2029
5.65G
        int32_t op = static_cast<int32_t>(code->elementAti(loc));
2030
5.65G
        int32_t opType = URX_TYPE(op);
2031
5.65G
        int32_t opValue = URX_VAL(op);
2032
5.65G
        if ((opType == URX_JMP         ||
2033
4.96G
            opType == URX_JMPX         ||
2034
4.96G
            opType == URX_STATE_SAVE   ||
2035
3.05G
            opType == URX_CTR_LOOP     ||
2036
3.05G
            opType == URX_CTR_LOOP_NG  ||
2037
3.05G
            opType == URX_JMP_SAV      ||
2038
2.98G
            opType == URX_JMP_SAV_X    ||
2039
2.84G
            opType == URX_RELOC_OPRND)    && opValue > where) {
2040
            // Target location for this opcode is after the insertion point and
2041
            //   needs to be incremented to adjust for the insertion.
2042
5.37M
            opValue++;
2043
5.37M
            op = buildOp(opType, opValue);
2044
5.37M
            code->setElementAt(op, loc);
2045
5.37M
        }
2046
5.65G
    }
2047
2048
    // Now fix up the parentheses stack.  All positive values in it are locations in
2049
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2050
577M
    for (loc=0; loc<fParenStack.size(); loc++) {
2051
577M
        int32_t x = fParenStack.elementAti(loc);
2052
577M
        U_ASSERT(x < code->size());
2053
577M
        if (x>where) {
2054
0
            x++;
2055
0
            fParenStack.setElementAt(x, loc);
2056
0
        }
2057
577M
    }
2058
2059
292k
    if (fMatchCloseParen > where) {
2060
18.5k
        fMatchCloseParen++;
2061
18.5k
    }
2062
292k
    if (fMatchOpenParen > where) {
2063
0
        fMatchOpenParen++;
2064
0
    }
2065
292k
}
2066
2067
2068
//------------------------------------------------------------------------------
2069
//
2070
//   allocateData()        Allocate storage in the matcher's static data area.
2071
//                         Return the index for the newly allocated data.
2072
//                         The storage won't actually exist until we are running a match
2073
//                         operation, but the storage indexes are inserted into various
2074
//                         opcodes while compiling the pattern.
2075
//
2076
//------------------------------------------------------------------------------
2077
114k
int32_t RegexCompile::allocateData(int32_t size) {
2078
114k
    if (U_FAILURE(*fStatus)) {
2079
29
        return 0;
2080
29
    }
2081
114k
    if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082
0
        error(U_REGEX_INTERNAL_ERROR);
2083
0
        return 0;
2084
0
    }
2085
114k
    int32_t dataIndex = fRXPat->fDataSize;
2086
114k
    fRXPat->fDataSize += size;
2087
114k
    if (fRXPat->fDataSize >= 0x00fffff0) {
2088
0
        error(U_REGEX_INTERNAL_ERROR);
2089
0
    }
2090
114k
    return dataIndex;
2091
114k
}
2092
2093
2094
//------------------------------------------------------------------------------
2095
//
2096
//   allocateStackData()   Allocate space in the back-tracking stack frame.
2097
//                         Return the index for the newly allocated data.
2098
//                         The frame indexes are inserted into various
2099
//                         opcodes while compiling the pattern, meaning that frame
2100
//                         size must be restricted to the size that will fit
2101
//                         as an operand (24 bits).
2102
//
2103
//------------------------------------------------------------------------------
2104
572k
int32_t RegexCompile::allocateStackData(int32_t size) {
2105
572k
    if (U_FAILURE(*fStatus)) {
2106
0
        return 0;
2107
0
    }
2108
572k
    if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109
0
        error(U_REGEX_INTERNAL_ERROR);
2110
0
        return 0;
2111
0
    }
2112
572k
    int32_t dataIndex = fRXPat->fFrameSize;
2113
572k
    fRXPat->fFrameSize += size;
2114
572k
    if (fRXPat->fFrameSize >= 0x00fffff0) {
2115
0
        error(U_REGEX_PATTERN_TOO_BIG);
2116
0
    }
2117
572k
    return dataIndex;
2118
572k
}
2119
2120
2121
//------------------------------------------------------------------------------
2122
//
2123
//   blockTopLoc()          Find or create a location in the compiled pattern
2124
//                          at the start of the operation or block that has
2125
//                          just been compiled.  Needed when a quantifier (* or
2126
//                          whatever) appears, and we need to add an operation
2127
//                          at the start of the thing being quantified.
2128
//
2129
//                          (Parenthesized Blocks) have a slot with a NOP that
2130
//                          is reserved for this purpose.  .* or similar don't
2131
//                          and a slot needs to be added.
2132
//
2133
//       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
2134
//                                         at the returned location.
2135
//                                 false - just return the address,
2136
//                                         do not reserve a location there.
2137
//
2138
//------------------------------------------------------------------------------
2139
745k
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2140
745k
    int32_t   theLoc;
2141
745k
    fixLiterals(true);  // Emit code for any pending literals.
2142
                        //   If last item was a string, emit separate op for the its last char.
2143
745k
    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144
22.2k
    {
2145
        // The item just processed is a parenthesized block.
2146
22.2k
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2147
22.2k
        U_ASSERT(theLoc > 0);
2148
22.2k
        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2149
22.2k
    }
2150
723k
    else {
2151
        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153
        // We need to make space now.
2154
723k
        theLoc = fRXPat->fCompiledPat->size()-1;
2155
723k
        int32_t opAtTheLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(theLoc));
2156
723k
        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2157
            // Strings take two opcode, we want the position of the first one.
2158
            // We can have a string at this point if a single character case-folded to two.
2159
123k
            theLoc--;
2160
123k
        }
2161
723k
        if (reserveLoc) {
2162
352k
            int32_t  nop = buildOp(URX_NOP, 0);
2163
352k
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164
352k
        }
2165
723k
    }
2166
745k
    return theLoc;
2167
745k
}
2168
2169
2170
2171
//------------------------------------------------------------------------------
2172
//
2173
//    handleCloseParen      When compiling a close paren, we need to go back
2174
//                          and fix up any JMP or SAVE operations within the
2175
//                          parenthesized block that need to target the end
2176
//                          of the block.  The locations of these are kept on
2177
//                          the paretheses stack.
2178
//
2179
//                          This function is called both when encountering a
2180
//                          real ) and at the end of the pattern.
2181
//
2182
//------------------------------------------------------------------------------
2183
448k
void  RegexCompile::handleCloseParen() {
2184
448k
    int32_t   patIdx;
2185
448k
    int32_t   patOp;
2186
448k
    if (fParenStack.size() <= 0) {
2187
0
        error(U_REGEX_MISMATCHED_PAREN);
2188
0
        return;
2189
0
    }
2190
2191
    // Emit code for any pending literals.
2192
448k
    fixLiterals(false);
2193
2194
    // Fixup any operations within the just-closed parenthesized group
2195
    //    that need to reference the end of the (block).
2196
    //    (The first one popped from the stack is an unused slot for
2197
    //     alternation (OR) state save, but applying the fixup to it does no harm.)
2198
18.4M
    for (;;) {
2199
18.4M
        patIdx = fParenStack.popi();
2200
18.4M
        if (patIdx < 0) {
2201
            // value < 0 flags the start of the frame on the paren stack.
2202
448k
            break;
2203
448k
        }
2204
17.9M
        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2205
17.9M
        patOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(patIdx));
2206
17.9M
        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2207
17.9M
        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2208
17.9M
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209
17.9M
        fMatchOpenParen     = patIdx;
2210
17.9M
    }
2211
2212
    //  At the close of any parenthesized block, restore the match mode flags  to
2213
    //  the value they had at the open paren.  Saved value is
2214
    //  at the top of the paren stack.
2215
448k
    fModeFlags = fParenStack.popi();
2216
448k
    U_ASSERT(fModeFlags < 0);
2217
2218
    // DO any additional fixups, depending on the specific kind of
2219
    // parentesized grouping this is
2220
2221
448k
    switch (patIdx) {
2222
14.0k
    case plain:
2223
18.3k
    case flags:
2224
        // No additional fixups required.
2225
        //   (Grouping-only parentheses)
2226
18.3k
        break;
2227
409k
    case capturing:
2228
        // Capturing Parentheses.
2229
        //   Insert a End Capture op into the pattern.
2230
        //   The frame offset of the variables for this cg is obtained from the
2231
        //       start capture op and put it into the end-capture op.
2232
409k
        {
2233
409k
            int32_t captureOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1));
2234
409k
            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2235
2236
409k
            int32_t   frameVarLocation = URX_VAL(captureOp);
2237
409k
            appendOp(URX_END_CAPTURE, frameVarLocation);
2238
409k
        }
2239
409k
        break;
2240
822
    case atomic:
2241
        // Atomic Parenthesis.
2242
        //   Insert a LD_SP operation to restore the state stack to the position
2243
        //   it was when the atomic parens were entered.
2244
822
        {
2245
822
            int32_t stoOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1));
2246
822
            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2247
822
            int32_t   stoLoc = URX_VAL(stoOp);
2248
822
            appendOp(URX_LD_SP, stoLoc);
2249
822
        }
2250
822
        break;
2251
2252
1.74k
    case lookAhead:
2253
1.74k
        {
2254
1.74k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5));
2255
1.74k
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2256
1.74k
            int32_t dataLoc  = URX_VAL(startOp);
2257
1.74k
            appendOp(URX_LA_END, dataLoc);
2258
1.74k
        }
2259
1.74k
        break;
2260
2261
6.93k
    case negLookAhead:
2262
6.93k
        {
2263
            // See comment at doOpenLookAheadNeg
2264
6.93k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 1));
2265
6.93k
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2266
6.93k
            int32_t dataLoc  = URX_VAL(startOp);
2267
6.93k
            appendOp(URX_LA_END, dataLoc);
2268
6.93k
            appendOp(URX_BACKTRACK, 0);
2269
6.93k
            appendOp(URX_LA_END, dataLoc);
2270
2271
            // Patch the URX_SAVE near the top of the block.
2272
            // The destination of the SAVE is the final LA_END that was just added.
2273
6.93k
            int32_t saveOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen));
2274
6.93k
            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2275
6.93k
            int32_t dest     = fRXPat->fCompiledPat->size()-1;
2276
6.93k
            saveOp           = buildOp(URX_STATE_SAVE, dest);
2277
6.93k
            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278
6.93k
        }
2279
6.93k
        break;
2280
2281
6.86k
    case lookBehind:
2282
6.86k
        {
2283
            // See comment at doOpenLookBehind.
2284
2285
            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286
6.86k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 4));
2287
6.86k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2288
6.86k
            int32_t dataLoc  = URX_VAL(startOp);
2289
6.86k
            appendOp(URX_LB_END, dataLoc);
2290
6.86k
            appendOp(URX_LA_END, dataLoc);
2291
2292
            // Determine the min and max bounds for the length of the
2293
            //  string that the pattern can match.
2294
            //  An unbounded upper limit is an error.
2295
6.86k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2296
6.86k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2297
6.86k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2298
6.86k
            if (URX_TYPE(maxML) != 0) {
2299
168
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2300
168
                break;
2301
168
            }
2302
6.70k
            if (maxML == INT32_MAX) {
2303
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2304
0
                break;
2305
0
            }
2306
6.70k
            if (minML == INT32_MAX) {
2307
                // This condition happens when no match is possible, such as with a
2308
                // [set] expression containing no elements.
2309
                // In principle, the generated code to evaluate the expression could be deleted,
2310
                // but it's probably not worth the complication.
2311
392
                minML = 0;
2312
392
            }
2313
6.70k
            U_ASSERT(minML <= maxML);
2314
2315
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2316
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2317
6.70k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2318
6.70k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2319
2320
6.70k
        }
2321
0
        break;
2322
2323
2324
2325
4.33k
    case lookBehindN:
2326
4.33k
        {
2327
            // See comment at doOpenLookBehindNeg.
2328
2329
            // Append the URX_LBN_END to the compiled pattern.
2330
4.33k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5));
2331
4.33k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2332
4.33k
            int32_t dataLoc  = URX_VAL(startOp);
2333
4.33k
            appendOp(URX_LBN_END, dataLoc);
2334
2335
            // Determine the min and max bounds for the length of the
2336
            //  string that the pattern can match.
2337
            //  An unbounded upper limit is an error.
2338
4.33k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2339
4.33k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2340
4.33k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2341
4.33k
            if (URX_TYPE(maxML) != 0) {
2342
141
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2343
141
                break;
2344
141
            }
2345
4.19k
            if (maxML == INT32_MAX) {
2346
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2347
0
                break;
2348
0
            }
2349
4.19k
            if (minML == INT32_MAX) {
2350
                // This condition happens when no match is possible, such as with a
2351
                // [set] expression containing no elements.
2352
                // In principle, the generated code to evaluate the expression could be deleted,
2353
                // but it's probably not worth the complication.
2354
241
                minML = 0;
2355
241
            }
2356
2357
4.19k
            U_ASSERT(minML <= maxML);
2358
2359
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2360
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2361
4.19k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2362
4.19k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2363
2364
            // Insert the pattern location to continue at after a successful match
2365
            //  as the last operand of the URX_LBN_CONT
2366
4.19k
            int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367
4.19k
            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2368
4.19k
        }
2369
0
        break;
2370
2371
2372
2373
0
    default:
2374
0
        UPRV_UNREACHABLE_EXIT;
2375
448k
    }
2376
2377
    // remember the next location in the compiled pattern.
2378
    // The compilation of Quantifiers will look at this to see whether its looping
2379
    //   over a parenthesized block or a single item
2380
448k
    fMatchCloseParen = fRXPat->fCompiledPat->size();
2381
448k
}
2382
2383
2384
2385
//------------------------------------------------------------------------------
2386
//
2387
//   compileSet       Compile the pattern operations for a reference to a
2388
//                    UnicodeSet.
2389
//
2390
//------------------------------------------------------------------------------
2391
void        RegexCompile::compileSet(UnicodeSet *theSet)
2392
390k
{
2393
390k
    if (theSet == nullptr) {
2394
213
        return;
2395
213
    }
2396
    //  Remove any strings from the set.
2397
    //  There shouldn't be any, but just in case.
2398
    //     (Case Closure can add them; if we had a simple case closure available that
2399
    //      ignored strings, that would be better.)
2400
390k
    theSet->removeAllStrings();
2401
390k
    int32_t  setSize = theSet->size();
2402
2403
390k
    switch (setSize) {
2404
6.34k
    case 0:
2405
6.34k
        {
2406
            // Set of no elements.   Always fails to match.
2407
6.34k
            appendOp(URX_BACKTRACK, 0);
2408
6.34k
            delete theSet;
2409
6.34k
        }
2410
6.34k
        break;
2411
2412
7.69k
    case 1:
2413
7.69k
        {
2414
            // The set contains only a single code point.  Put it into
2415
            //   the compiled pattern as a single char operation rather
2416
            //   than a set, and discard the set itself.
2417
7.69k
            literalChar(theSet->charAt(0));
2418
7.69k
            delete theSet;
2419
7.69k
        }
2420
7.69k
        break;
2421
2422
376k
    default:
2423
376k
        {
2424
            //  The set contains two or more chars.  (the normal case)
2425
            //  Put it into the compiled pattern as a set.
2426
376k
            theSet->freeze();
2427
376k
            int32_t setNumber = fRXPat->fSets->size();
2428
376k
            fRXPat->fSets->addElement(theSet, *fStatus);
2429
376k
            if (U_SUCCESS(*fStatus)) {
2430
376k
                appendOp(URX_SETREF, setNumber);
2431
376k
            } else {
2432
5
                delete theSet;
2433
5
            }
2434
376k
        }
2435
390k
    }
2436
390k
}
2437
2438
2439
//------------------------------------------------------------------------------
2440
//
2441
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
2442
//                      Except for the specific opcodes used, the code is the same
2443
//                      for all three types (greedy, non-greedy, possessive) of
2444
//                      intervals.  The opcodes are supplied as parameters.
2445
//                      (There are two sets of opcodes - greedy & possessive use the
2446
//                      same ones, while non-greedy has it's own.)
2447
//
2448
//                      The code for interval loops has this form:
2449
//                         0  CTR_INIT   counter loc (in stack frame)
2450
//                         1             5  patt address of CTR_LOOP at bottom of block
2451
//                         2             min count
2452
//                         3             max count   (-1 for unbounded)
2453
//                         4  ...        block to be iterated over
2454
//                         5  CTR_LOOP
2455
//
2456
//                       In
2457
//------------------------------------------------------------------------------
2458
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2459
10.6k
{
2460
    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461
    //   four slots in the compiled code.  Reserve them.
2462
10.6k
    int32_t   topOfBlock = blockTopLoc(true);
2463
10.6k
    insertOp(topOfBlock);
2464
10.6k
    insertOp(topOfBlock);
2465
10.6k
    insertOp(topOfBlock);
2466
2467
    // The operands for the CTR_INIT opcode include the index in the matcher data
2468
    //   of the counter.  Allocate it now. There are two data items
2469
    //        counterLoc   -->  Loop counter
2470
    //               +1    -->  Input index (for breaking non-progressing loops)
2471
    //                          (Only present if unbounded upper limit on loop)
2472
10.6k
    int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2473
10.6k
    int32_t   counterLoc = allocateStackData(dataSize);
2474
2475
10.6k
    int32_t   op = buildOp(InitOp, counterLoc);
2476
10.6k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477
2478
    // The second operand of CTR_INIT is the location following the end of the loop.
2479
    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480
    //   compilation of something later on causes the code to grow and the target
2481
    //   position to move.
2482
10.6k
    int32_t loopEnd = fRXPat->fCompiledPat->size();
2483
10.6k
    op = buildOp(URX_RELOC_OPRND, loopEnd);
2484
10.6k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485
2486
    // Followed by the min and max counts.
2487
10.6k
    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488
10.6k
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489
2490
    // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2491
    //   Goes at end of the block being looped over, so just append to the code so far.
2492
10.6k
    appendOp(LoopOp, topOfBlock);
2493
2494
10.6k
    if ((fIntervalLow & 0xff000000) != 0 ||
2495
10.5k
        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496
51
            error(U_REGEX_NUMBER_TOO_BIG);
2497
51
        }
2498
2499
10.6k
    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500
24
        error(U_REGEX_MAX_LT_MIN);
2501
24
    }
2502
10.6k
}
2503
2504
2505
2506
99.4k
UBool RegexCompile::compileInlineInterval() {
2507
99.4k
    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508
        // Too big to inline.  Fail, which will cause looping code to be generated.
2509
        //   (Upper < Lower picks up unbounded upper and errors, both.)
2510
4.06k
        return false;
2511
4.06k
    }
2512
2513
95.3k
    int32_t   topOfBlock = blockTopLoc(false);
2514
95.3k
    if (fIntervalUpper == 0) {
2515
        // Pathological case.  Attempt no matches, as if the block doesn't exist.
2516
        // Discard the generated code for the block.
2517
        // If the block included parens, discard the info pertaining to them as well.
2518
1.69k
        fRXPat->fCompiledPat->setSize(topOfBlock);
2519
1.69k
        if (fMatchOpenParen >= topOfBlock) {
2520
171
            fMatchOpenParen = -1;
2521
171
        }
2522
1.69k
        if (fMatchCloseParen >= topOfBlock) {
2523
173
            fMatchCloseParen = -1;
2524
173
        }
2525
1.69k
        return true;
2526
1.69k
    }
2527
2528
93.6k
    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529
        // The thing being repeated is not a single op, but some
2530
        //   more complex block.  Do it as a loop, not inlines.
2531
        //   Note that things "repeated" a max of once are handled as inline, because
2532
        //     the one copy of the code already generated is just fine.
2533
699
        return false;
2534
699
    }
2535
2536
    // Pick up the opcode that is to be repeated
2537
    //
2538
92.9k
    int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topOfBlock));
2539
2540
    // Compute the pattern location where the inline sequence
2541
    //   will end, and set up the state save op that will be needed.
2542
    //
2543
92.9k
    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544
92.9k
                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545
92.9k
    int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546
92.9k
    if (fIntervalLow == 0) {
2547
79.3k
        insertOp(topOfBlock);
2548
79.3k
        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549
79.3k
    }
2550
2551
2552
2553
    //  Loop, emitting the op for the thing being repeated each time.
2554
    //    Loop starts at 1 because one instance of the op already exists in the pattern,
2555
    //    it was put there when it was originally encountered.
2556
92.9k
    int32_t i;
2557
815k
    for (i=1; i<fIntervalUpper; i++ ) {
2558
722k
        if (i >= fIntervalLow) {
2559
638k
            appendOp(saveOp);
2560
638k
        }
2561
722k
        appendOp(op);
2562
722k
    }
2563
92.9k
    return true;
2564
93.6k
}
2565
2566
2567
2568
//------------------------------------------------------------------------------
2569
//
2570
//   caseInsensitiveStart  given a single code point from a pattern string, determine the 
2571
//                         set of characters that could potentially begin a case-insensitive 
2572
//                         match of a string beginning with that character, using full Unicode
2573
//                         case insensitive matching.
2574
//
2575
//          This is used in optimizing find().
2576
//
2577
//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578
//          misses cases like this:
2579
//             A string from the pattern begins with 'ss' (although all we know
2580
//                 in this context is that it begins with 's')
2581
//             The pattern could match a string beginning with a German sharp-s
2582
//
2583
//           To the ordinary case closure for a character c, we add all other
2584
//           characters cx where the case closure of cx includes a string form that begins
2585
//           with the original character c.
2586
//
2587
//           This function could be made smarter. The full pattern string is available
2588
//           and it would be possible to verify that the extra characters being added
2589
//           to the starting set fully match, rather than having just a first-char of the
2590
//           folded form match.
2591
//
2592
//------------------------------------------------------------------------------
2593
396k
void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594
2595
// Machine Generated below.
2596
// It may need updating with new versions of Unicode.
2597
// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598
// The update tool is here:
2599
// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600
2601
// Machine Generated Data. Do not hand edit.
2602
396k
    static const UChar32 RECaseFixCodePoints[] = {
2603
396k
        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 
2604
396k
        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 
2605
396k
        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 
2606
396k
        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 
2607
396k
        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608
2609
396k
    static const int16_t RECaseFixStringOffsets[] = {
2610
396k
        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 
2611
396k
        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 
2612
396k
        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 
2613
396k
        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 
2614
396k
        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615
2616
396k
    static const int16_t RECaseFixCounts[] = {
2617
396k
        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 
2618
396k
        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 
2619
396k
        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2620
396k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2621
396k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622
2623
396k
    static const char16_t RECaseFixData[] = {
2624
396k
        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 
2625
396k
        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 
2626
396k
        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 
2627
396k
        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 
2628
396k
        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 
2629
396k
        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 
2630
396k
        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 
2631
396k
        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 
2632
396k
        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 
2633
396k
        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 
2634
396k
        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635
2636
// End of machine generated data.
2637
2638
396k
    if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2639
        // This function should never be called with an invalid input character.
2640
0
        UPRV_UNREACHABLE_EXIT;
2641
396k
    } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2642
11.0k
        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2643
11.0k
        starterChars->set(caseFoldedC, caseFoldedC);
2644
2645
11.0k
        int32_t i;
2646
191k
        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647
            // Simple linear search through the sorted list of interesting code points.
2648
180k
        }
2649
2650
11.0k
        if (RECaseFixCodePoints[i] == c) {
2651
8.98k
            int32_t dataIndex = RECaseFixStringOffsets[i];
2652
8.98k
            int32_t numCharsToAdd = RECaseFixCounts[i];
2653
8.98k
            UChar32 cpToAdd = 0;
2654
50.5k
            for (int32_t j=0; j<numCharsToAdd; j++) {
2655
41.5k
                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2656
41.5k
                starterChars->add(cpToAdd);
2657
41.5k
            }
2658
8.98k
        }
2659
2660
11.0k
        starterChars->closeOver(USET_CASE_INSENSITIVE);
2661
11.0k
        starterChars->removeAllStrings();
2662
385k
    } else {
2663
        // Not a cased character. Just return it alone.
2664
385k
        starterChars->set(c, c);
2665
385k
    }
2666
396k
}
2667
2668
2669
// Increment with overflow check.
2670
// val and delta will both be positive.
2671
2672
5.85M
static int32_t safeIncrement(int32_t val, int32_t delta) {
2673
5.85M
    if (INT32_MAX - val > delta) {
2674
5.68M
        return val + delta;
2675
5.68M
    } else {
2676
177k
        return INT32_MAX;
2677
177k
    }
2678
5.85M
}
2679
2680
2681
//------------------------------------------------------------------------------
2682
//
2683
//   matchStartType    Determine how a match can start.
2684
//                     Used to optimize find() operations.
2685
//
2686
//                     Operation is very similar to minMatchLength().  Walk the compiled
2687
//                     pattern, keeping an on-going minimum-match-length.  For any
2688
//                     op where the min match coming in is zero, add that ops possible
2689
//                     starting matches to the possible starts for the overall pattern.
2690
//
2691
//------------------------------------------------------------------------------
2692
13.1k
void   RegexCompile::matchStartType() {
2693
13.1k
    if (U_FAILURE(*fStatus)) {
2694
86
        return;
2695
86
    }
2696
2697
2698
13.0k
    int32_t    loc;                    // Location in the pattern of the current op being processed.
2699
13.0k
    int32_t    op;                     // The op being processed
2700
13.0k
    int32_t    opType;                 // The opcode type of the op
2701
13.0k
    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2702
13.0k
    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2703
2704
13.0k
    UBool      atStart = true;         // True if no part of the pattern yet encountered
2705
                                       //   could have advanced the position in a match.
2706
                                       //   (Maximum match length so far == 0)
2707
2708
    // forwardedLength is a vector holding minimum-match-length values that
2709
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710
    //   It must be one longer than the pattern being checked because some  ops
2711
    //   will jmp to a end-of-block+1 location from within a block, and we must
2712
    //   count those when checking the block.
2713
13.0k
    int32_t end = fRXPat->fCompiledPat->size();
2714
13.0k
    UVector32  forwardedLength(end+1, *fStatus);
2715
13.0k
    forwardedLength.setSize(end+1);
2716
28.5M
    for (loc=3; loc<end; loc++) {
2717
28.5M
        forwardedLength.setElementAt(INT32_MAX, loc);
2718
28.5M
    }
2719
2720
24.8M
    for (loc = 3; loc<end; loc++) {
2721
24.8M
        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
2722
24.8M
        opType = URX_TYPE(op);
2723
2724
        // The loop is advancing linearly through the pattern.
2725
        // If the op we are now at was the destination of a branch in the pattern,
2726
        // and that path has a shorter minimum length than the current accumulated value,
2727
        // replace the current accumulated value.
2728
24.8M
        if (forwardedLength.elementAti(loc) < currentLen) {
2729
731k
            currentLen = forwardedLength.elementAti(loc);
2730
731k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2731
731k
        }
2732
2733
24.8M
        switch (opType) {
2734
            // Ops that don't change the total length matched
2735
0
        case URX_RESERVED_OP:
2736
13.0k
        case URX_END:
2737
13.0k
        case URX_FAIL:
2738
13.0k
        case URX_STRING_LEN:
2739
13.0k
        case URX_NOP:
2740
57.8k
        case URX_START_CAPTURE:
2741
102k
        case URX_END_CAPTURE:
2742
104k
        case URX_BACKSLASH_B:
2743
111k
        case URX_BACKSLASH_BU:
2744
112k
        case URX_BACKSLASH_G:
2745
114k
        case URX_BACKSLASH_Z:
2746
182k
        case URX_DOLLAR:
2747
184k
        case URX_DOLLAR_M:
2748
219k
        case URX_DOLLAR_D:
2749
222k
        case URX_DOLLAR_MD:
2750
222k
        case URX_RELOC_OPRND:
2751
328k
        case URX_STO_INP_LOC:
2752
329k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2753
332k
        case URX_BACKREF_I:
2754
2755
421k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2756
511k
        case URX_LD_SP:
2757
511k
            break;
2758
2759
4.39k
        case URX_CARET:
2760
4.39k
            if (atStart) {
2761
697
                fRXPat->fStartType = START_START;
2762
697
            }
2763
4.39k
            break;
2764
2765
1.30k
        case URX_CARET_M:
2766
2.48k
        case URX_CARET_M_UNIX:
2767
2.48k
            if (atStart) {
2768
537
                fRXPat->fStartType = START_LINE;
2769
537
            }
2770
2.48k
            break;
2771
2772
793k
        case URX_ONECHAR:
2773
793k
            if (currentLen == 0) {
2774
                // This character could appear at the start of a match.
2775
                //   Add it to the set of possible starting characters.
2776
76.5k
                fRXPat->fInitialChars->add(URX_VAL(op));
2777
76.5k
                numInitialStrings += 2;
2778
76.5k
            }
2779
793k
            currentLen = safeIncrement(currentLen, 1);
2780
793k
            atStart = false;
2781
793k
            break;
2782
2783
2784
76.4k
        case URX_SETREF:
2785
76.4k
            if (currentLen == 0) {
2786
3.01k
                int32_t  sn = URX_VAL(op);
2787
3.01k
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2788
3.01k
                const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn));
2789
3.01k
                fRXPat->fInitialChars->addAll(*s);
2790
3.01k
                numInitialStrings += 2;
2791
3.01k
            }
2792
76.4k
            currentLen = safeIncrement(currentLen, 1);
2793
76.4k
            atStart = false;
2794
76.4k
            break;
2795
2796
4.29k
        case URX_LOOP_SR_I:
2797
            // [Set]*, like a SETREF, above, in what it can match,
2798
            //  but may not match at all, so currentLen is not incremented.
2799
4.29k
            if (currentLen == 0) {
2800
406
                int32_t  sn = URX_VAL(op);
2801
406
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2802
406
                const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn));
2803
406
                fRXPat->fInitialChars->addAll(*s);
2804
406
                numInitialStrings += 2;
2805
406
            }
2806
4.29k
            atStart = false;
2807
4.29k
            break;
2808
2809
12.4k
        case URX_LOOP_DOT_I:
2810
12.4k
            if (currentLen == 0) {
2811
                // .* at the start of a pattern.
2812
                //    Any character can begin the match.
2813
2.47k
                fRXPat->fInitialChars->clear();
2814
2.47k
                fRXPat->fInitialChars->complement();
2815
2.47k
                numInitialStrings += 2;
2816
2.47k
            }
2817
12.4k
            atStart = false;
2818
12.4k
            break;
2819
2820
2821
5.54k
        case URX_STATIC_SETREF:
2822
5.54k
            if (currentLen == 0) {
2823
974
                int32_t  sn = URX_VAL(op);
2824
974
                U_ASSERT(sn>0 && sn<URX_LAST_SET);
2825
974
                const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826
974
                fRXPat->fInitialChars->addAll(s);
2827
974
                numInitialStrings += 2;
2828
974
            }
2829
5.54k
            currentLen = safeIncrement(currentLen, 1);
2830
5.54k
            atStart = false;
2831
5.54k
            break;
2832
2833
2834
2835
26.3k
        case URX_STAT_SETREF_N:
2836
26.3k
            if (currentLen == 0) {
2837
446
                int32_t  sn = URX_VAL(op);
2838
446
                UnicodeSet sc;
2839
446
                sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840
446
                fRXPat->fInitialChars->addAll(sc);
2841
446
                numInitialStrings += 2;
2842
446
            }
2843
26.3k
            currentLen = safeIncrement(currentLen, 1);
2844
26.3k
            atStart = false;
2845
26.3k
            break;
2846
2847
2848
2849
4.09k
        case URX_BACKSLASH_D:
2850
            // Digit Char
2851
4.09k
             if (currentLen == 0) {
2852
1.15k
                 UnicodeSet s;
2853
1.15k
                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2854
1.15k
                 if (URX_VAL(op) != 0) {
2855
730
                     s.complement();
2856
730
                 }
2857
1.15k
                 fRXPat->fInitialChars->addAll(s);
2858
1.15k
                 numInitialStrings += 2;
2859
1.15k
            }
2860
4.09k
            currentLen = safeIncrement(currentLen, 1);
2861
4.09k
            atStart = false;
2862
4.09k
            break;
2863
2864
2865
4.74k
        case URX_BACKSLASH_H:
2866
            // Horiz white space
2867
4.74k
            if (currentLen == 0) {
2868
1.04k
                UnicodeSet s;
2869
1.04k
                s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2870
1.04k
                s.add(static_cast<UChar32>(9)); // Tab
2871
1.04k
                if (URX_VAL(op) != 0) {
2872
601
                    s.complement();
2873
601
                }
2874
1.04k
                fRXPat->fInitialChars->addAll(s);
2875
1.04k
                numInitialStrings += 2;
2876
1.04k
            }
2877
4.74k
            currentLen = safeIncrement(currentLen, 1);
2878
4.74k
            atStart = false;
2879
4.74k
            break;
2880
2881
2882
4.42k
        case URX_BACKSLASH_R:       // Any line ending sequence
2883
7.60k
        case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2884
7.60k
            if (currentLen == 0) {
2885
653
                UnicodeSet s;
2886
653
                s.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
2887
653
                s.add(static_cast<UChar32>(0x85));
2888
653
                s.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
2889
653
                if (URX_VAL(op) != 0) {
2890
                     // Complement option applies to URX_BACKSLASH_V only.
2891
358
                     s.complement();
2892
358
                }
2893
653
                fRXPat->fInitialChars->addAll(s);
2894
653
                numInitialStrings += 2;
2895
653
            }
2896
7.60k
            currentLen = safeIncrement(currentLen, 1);
2897
7.60k
            atStart = false;
2898
7.60k
            break;
2899
2900
2901
2902
47.7k
        case URX_ONECHAR_I:
2903
            // Case Insensitive Single Character.
2904
47.7k
            if (currentLen == 0) {
2905
3.45k
                UChar32  c = URX_VAL(op);
2906
3.45k
                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2907
3.45k
                    UnicodeSet starters(c, c);
2908
3.45k
                    starters.closeOver(USET_CASE_INSENSITIVE);
2909
                    // findCaseInsensitiveStarters(c, &starters);
2910
                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911
                    //   The expanded folding can't match the pattern.
2912
3.45k
                    fRXPat->fInitialChars->addAll(starters);
2913
3.45k
                } else {
2914
                    // Char has no case variants.  Just add it as-is to the
2915
                    //   set of possible starting chars.
2916
0
                    fRXPat->fInitialChars->add(c);
2917
0
                }
2918
3.45k
                numInitialStrings += 2;
2919
3.45k
            }
2920
47.7k
            currentLen = safeIncrement(currentLen, 1);
2921
47.7k
            atStart = false;
2922
47.7k
            break;
2923
2924
2925
2.63k
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
2926
10.4k
        case URX_DOTANY_ALL:    // . matches one or two.
2927
533k
        case URX_DOTANY:
2928
539k
        case URX_DOTANY_UNIX:
2929
539k
            if (currentLen == 0) {
2930
                // These constructs are all bad news when they appear at the start
2931
                //   of a match.  Any character can begin the match.
2932
3.34k
                fRXPat->fInitialChars->clear();
2933
3.34k
                fRXPat->fInitialChars->complement();
2934
3.34k
                numInitialStrings += 2;
2935
3.34k
            }
2936
539k
            currentLen = safeIncrement(currentLen, 1);
2937
539k
            atStart = false;
2938
539k
            break;
2939
2940
2941
0
        case URX_JMPX:
2942
0
            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2943
0
            U_FALLTHROUGH;
2944
10.4M
        case URX_JMP:
2945
10.4M
            {
2946
10.4M
                int32_t  jmpDest = URX_VAL(op);
2947
10.4M
                if (jmpDest < loc) {
2948
                    // Loop of some kind.  Can safely ignore, the worst that will happen
2949
                    //  is that we understate the true minimum length
2950
57.5k
                    currentLen = forwardedLength.elementAti(loc+1);
2951
2952
10.3M
                } else {
2953
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2954
10.3M
                    U_ASSERT(jmpDest <= end+1);
2955
10.3M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956
8.19k
                        forwardedLength.setElementAt(currentLen, jmpDest);
2957
8.19k
                    }
2958
10.3M
                }
2959
10.4M
            }
2960
10.4M
            atStart = false;
2961
10.4M
            break;
2962
2963
132k
        case URX_JMP_SAV:
2964
239k
        case URX_JMP_SAV_X:
2965
            // Combo of state save to the next loc, + jmp backwards.
2966
            //   Net effect on min. length computation is nothing.
2967
239k
            atStart = false;
2968
239k
            break;
2969
2970
3.77k
        case URX_BACKTRACK:
2971
            // Fails are kind of like a branch, except that the min length was
2972
            //   propagated already, by the state save.
2973
3.77k
            currentLen = forwardedLength.elementAti(loc+1);
2974
3.77k
            atStart = false;
2975
3.77k
            break;
2976
2977
2978
11.0M
        case URX_STATE_SAVE:
2979
11.0M
            {
2980
                // State Save, for forward jumps, propagate the current minimum.
2981
                //             of the state save.
2982
11.0M
                int32_t  jmpDest = URX_VAL(op);
2983
11.0M
                if (jmpDest > loc) {
2984
11.0M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985
10.7M
                        forwardedLength.setElementAt(currentLen, jmpDest);
2986
10.7M
                    }
2987
11.0M
                }
2988
11.0M
            }
2989
11.0M
            atStart = false;
2990
11.0M
            break;
2991
2992
2993
2994
2995
120k
        case URX_STRING:
2996
120k
            {
2997
120k
                loc++;
2998
120k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
2999
120k
                int32_t stringLen   = URX_VAL(stringLenOp);
3000
120k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3001
120k
                U_ASSERT(stringLenOp >= 2);
3002
120k
                if (currentLen == 0) {
3003
                    // Add the starting character of this string to the set of possible starting
3004
                    //   characters for this pattern.
3005
19.8k
                    int32_t stringStartIdx = URX_VAL(op);
3006
19.8k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007
19.8k
                    fRXPat->fInitialChars->add(c);
3008
3009
                    // Remember this string.  After the entire pattern has been checked,
3010
                    //  if nothing else is identified that can start a match, we'll use it.
3011
19.8k
                    numInitialStrings++;
3012
19.8k
                    fRXPat->fInitialStringIdx = stringStartIdx;
3013
19.8k
                    fRXPat->fInitialStringLen = stringLen;
3014
19.8k
                }
3015
3016
120k
                currentLen = safeIncrement(currentLen, stringLen);
3017
120k
                atStart = false;
3018
120k
            }
3019
120k
            break;
3020
3021
890k
        case URX_STRING_I:
3022
890k
            {
3023
                // Case-insensitive string.  Unlike exact-match strings, we won't
3024
                //   attempt a string search for possible match positions.  But we
3025
                //   do update the set of possible starting characters.
3026
890k
                loc++;
3027
890k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3028
890k
                int32_t stringLen   = URX_VAL(stringLenOp);
3029
890k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3030
890k
                U_ASSERT(stringLenOp >= 2);
3031
890k
                if (currentLen == 0) {
3032
                    // Add the starting character of this string to the set of possible starting
3033
                    //   characters for this pattern.
3034
396k
                    int32_t stringStartIdx = URX_VAL(op);
3035
396k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036
396k
                    UnicodeSet s;
3037
396k
                    findCaseInsensitiveStarters(c, &s);
3038
396k
                    fRXPat->fInitialChars->addAll(s);
3039
396k
                    numInitialStrings += 2;  // Matching on an initial string not possible.
3040
396k
                }
3041
890k
                currentLen = safeIncrement(currentLen, stringLen);
3042
890k
                atStart = false;
3043
890k
            }
3044
890k
            break;
3045
3046
2.72k
        case URX_CTR_INIT:
3047
4.34k
        case URX_CTR_INIT_NG:
3048
4.34k
            {
3049
                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3050
                //   so location must be updated accordingly.
3051
                // Loop Init Ops.
3052
                //   If the min loop count == 0
3053
                //      move loc forwards to the end of the loop, skipping over the body.
3054
                //   If the min count is > 0,
3055
                //      continue normal processing of the body of the loop.
3056
4.34k
                int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1));
3057
4.34k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3058
4.34k
                int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2));
3059
4.34k
                if (minLoopCount == 0) {
3060
                    // Min Loop Count of 0, treat like a forward branch and
3061
                    //   move the current minimum length up to the target
3062
                    //   (end of loop) location.
3063
2.41k
                    U_ASSERT(loopEndLoc <= end+1);
3064
2.41k
                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065
1.61k
                        forwardedLength.setElementAt(currentLen, loopEndLoc);
3066
1.61k
                    }
3067
2.41k
                }
3068
4.34k
                loc+=3;  // Skips over operands of CTR_INIT
3069
4.34k
            }
3070
4.34k
            atStart = false;
3071
4.34k
            break;
3072
3073
3074
2.72k
        case URX_CTR_LOOP:
3075
4.34k
        case URX_CTR_LOOP_NG:
3076
            // Loop ops.
3077
            //  The jump is conditional, backwards only.
3078
4.34k
            atStart = false;
3079
4.34k
            break;
3080
3081
16.7k
        case URX_LOOP_C:
3082
            // More loop ops.  These state-save to themselves.
3083
            //   don't change the minimum match
3084
16.7k
            atStart = false;
3085
16.7k
            break;
3086
3087
3088
1.87k
        case URX_LA_START:
3089
3.60k
        case URX_LB_START:
3090
3.60k
            {
3091
                // Look-around.  Scan forward until the matching look-ahead end,
3092
                //   without processing the look-around block.  This is overly pessimistic.
3093
3094
                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3095
                //   lookahead contains two LA_END instructions, so count goes up by two
3096
                //   for each LA_START.
3097
3.60k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3098
2.69M
                for (;;) {
3099
2.69M
                    loc++;
3100
2.69M
                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3101
2.69M
                    if (URX_TYPE(op) == URX_LA_START) {
3102
2.96k
                        depth+=2;
3103
2.96k
                    }
3104
2.69M
                    if (URX_TYPE(op) == URX_LB_START) {
3105
1.19k
                        depth++;
3106
1.19k
                    }
3107
2.69M
                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3108
12.6k
                        depth--;
3109
12.6k
                        if (depth == 0) {
3110
3.60k
                            break;
3111
3.60k
                        }
3112
12.6k
                    }
3113
2.69M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3114
                        // Need this because neg lookahead blocks will FAIL to outside
3115
                        //   of the block.
3116
1.27M
                        int32_t  jmpDest = URX_VAL(op);
3117
1.27M
                        if (jmpDest > loc) {
3118
1.27M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119
1.17M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3120
1.17M
                            }
3121
1.27M
                        }
3122
1.27M
                    }
3123
2.69M
                    U_ASSERT(loc <= end);
3124
2.69M
                }
3125
3.60k
            }
3126
3.60k
            break;
3127
3128
0
        case URX_LA_END:
3129
0
        case URX_LB_CONT:
3130
0
        case URX_LB_END:
3131
0
        case URX_LBN_CONT:
3132
0
        case URX_LBN_END:
3133
0
            UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
3134
                                    //  consumed by the scan in URX_LA_START and LB_START
3135
0
        default:
3136
0
            UPRV_UNREACHABLE_EXIT;
3137
24.8M
            }
3138
3139
24.8M
        }
3140
3141
3142
    // We have finished walking through the ops.  Check whether some forward jump
3143
    //   propagated a shorter length to location end+1.
3144
13.0k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3145
10.9k
        currentLen = forwardedLength.elementAti(end+1);
3146
10.9k
    }
3147
3148
3149
13.0k
    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150
3151
3152
    // Sort out what we should check for when looking for candidate match start positions.
3153
    // In order of preference,
3154
    //     1.   Start of input text buffer.
3155
    //     2.   A literal string.
3156
    //     3.   Start of line in multi-line mode.
3157
    //     4.   A single literal character.
3158
    //     5.   A character from a set of characters.
3159
    //
3160
13.0k
    if (fRXPat->fStartType == START_START) {
3161
        // Match only at the start of an input text string.
3162
        //    start type is already set.  We're done.
3163
12.9k
    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164
        // Match beginning only with a literal string.
3165
452
        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166
452
        U_ASSERT(fRXPat->fInitialChars->contains(c));
3167
452
        fRXPat->fStartType   = START_STRING;
3168
452
        fRXPat->fInitialChar = c;
3169
12.5k
    } else if (fRXPat->fStartType == START_LINE) {
3170
        // Match at start of line in Multi-Line mode.
3171
        // Nothing to do here; everything is already set.
3172
12.3k
    } else if (fRXPat->fMinMatchLen == 0) {
3173
        // Zero length match possible.  We could start anywhere.
3174
2.00k
        fRXPat->fStartType = START_NO_INFO;
3175
10.3k
    } else if (fRXPat->fInitialChars->size() == 1) {
3176
        // All matches begin with the same char.
3177
2.17k
        fRXPat->fStartType   = START_CHAR;
3178
2.17k
        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179
2.17k
        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3180
8.18k
    } else if (fRXPat->fInitialChars->contains(static_cast<UChar32>(0), static_cast<UChar32>(0x10ffff)) == false &&
3181
4.57k
        fRXPat->fMinMatchLen > 0) {
3182
        // Matches start with a set of character smaller than the set of all chars.
3183
4.57k
        fRXPat->fStartType = START_SET;
3184
4.57k
    } else {
3185
        // Matches can start with anything
3186
3.61k
        fRXPat->fStartType = START_NO_INFO;
3187
3.61k
    }
3188
13.0k
}
3189
3190
3191
3192
//------------------------------------------------------------------------------
3193
//
3194
//   minMatchLength    Calculate the length of the shortest string that could
3195
//                     match the specified pattern.
3196
//                     Length is in 16 bit code units, not code points.
3197
//
3198
//                     The calculated length may not be exact.  The returned
3199
//                     value may be shorter than the actual minimum; it must
3200
//                     never be longer.
3201
//
3202
//                     start and end are the range of p-code operations to be
3203
//                     examined.  The endpoints are included in the range.
3204
//
3205
//------------------------------------------------------------------------------
3206
280k
int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3207
280k
    if (U_FAILURE(*fStatus)) {
3208
86
        return 0;
3209
86
    }
3210
3211
280k
    U_ASSERT(start <= end);
3212
280k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3213
3214
3215
280k
    int32_t    loc;
3216
280k
    int32_t    op;
3217
280k
    int32_t    opType;
3218
280k
    int32_t    currentLen = 0;
3219
3220
3221
    // forwardedLength is a vector holding minimum-match-length values that
3222
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3223
    //   It must be one longer than the pattern being checked because some  ops
3224
    //   will jmp to a end-of-block+1 location from within a block, and we must
3225
    //   count those when checking the block.
3226
280k
    UVector32  forwardedLength(end+2, *fStatus);
3227
280k
    forwardedLength.setSize(end+2);
3228
39.5M
    for (loc=start; loc<=end+1; loc++) {
3229
39.2M
        forwardedLength.setElementAt(INT32_MAX, loc);
3230
39.2M
    }
3231
3232
30.6M
    for (loc = start; loc<=end; loc++) {
3233
30.3M
        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3234
30.3M
        opType = URX_TYPE(op);
3235
3236
        // The loop is advancing linearly through the pattern.
3237
        // If the op we are now at was the destination of a branch in the pattern,
3238
        // and that path has a shorter minimum length than the current accumulated value,
3239
        // replace the current accumulated value.
3240
        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3241
                                                               //   no-match-possible cases.
3242
30.3M
        if (forwardedLength.elementAti(loc) < currentLen) {
3243
813k
            currentLen = forwardedLength.elementAti(loc);
3244
813k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3245
813k
        }
3246
3247
30.3M
        switch (opType) {
3248
            // Ops that don't change the total length matched
3249
0
        case URX_RESERVED_OP:
3250
13.0k
        case URX_END:
3251
13.0k
        case URX_STRING_LEN:
3252
289k
        case URX_NOP:
3253
347k
        case URX_START_CAPTURE:
3254
405k
        case URX_END_CAPTURE:
3255
408k
        case URX_BACKSLASH_B:
3256
416k
        case URX_BACKSLASH_BU:
3257
417k
        case URX_BACKSLASH_G:
3258
420k
        case URX_BACKSLASH_Z:
3259
428k
        case URX_CARET:
3260
557k
        case URX_DOLLAR:
3261
560k
        case URX_DOLLAR_M:
3262
631k
        case URX_DOLLAR_D:
3263
636k
        case URX_DOLLAR_MD:
3264
636k
        case URX_RELOC_OPRND:
3265
744k
        case URX_STO_INP_LOC:
3266
746k
        case URX_CARET_M:
3267
748k
        case URX_CARET_M_UNIX:
3268
749k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3269
752k
        case URX_BACKREF_I:
3270
3271
844k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3272
935k
        case URX_LD_SP:
3273
3274
1.06M
        case URX_JMP_SAV:
3275
1.17M
        case URX_JMP_SAV_X:
3276
1.17M
            break;
3277
3278
3279
            // Ops that match a minimum of one character (one or two 16 bit code units.)
3280
            //
3281
1.13M
        case URX_ONECHAR:
3282
1.14M
        case URX_STATIC_SETREF:
3283
1.17M
        case URX_STAT_SETREF_N:
3284
1.25M
        case URX_SETREF:
3285
1.25M
        case URX_BACKSLASH_D:
3286
1.26M
        case URX_BACKSLASH_H:
3287
1.27M
        case URX_BACKSLASH_R:
3288
1.27M
        case URX_BACKSLASH_V:
3289
1.33M
        case URX_ONECHAR_I:
3290
1.33M
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3291
1.34M
        case URX_DOTANY_ALL:    // . matches one or two.
3292
1.87M
        case URX_DOTANY:
3293
1.88M
        case URX_DOTANY_UNIX:
3294
1.88M
            currentLen = safeIncrement(currentLen, 1);
3295
1.88M
            break;
3296
3297
3298
0
        case URX_JMPX:
3299
0
            loc++;              // URX_JMPX has an extra operand, ignored here,
3300
                                //   otherwise processed identically to URX_JMP.
3301
0
            U_FALLTHROUGH;
3302
12.5M
        case URX_JMP:
3303
12.5M
            {
3304
12.5M
                int32_t  jmpDest = URX_VAL(op);
3305
12.5M
                if (jmpDest < loc) {
3306
                    // Loop of some kind.  Can safely ignore, the worst that will happen
3307
                    //  is that we understate the true minimum length
3308
57.9k
                    currentLen = forwardedLength.elementAti(loc+1);
3309
12.5M
                } else {
3310
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3311
12.5M
                    U_ASSERT(jmpDest <= end+1);
3312
12.5M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
3313
10.3k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3314
10.3k
                    }
3315
12.5M
                }
3316
12.5M
            }
3317
12.5M
            break;
3318
3319
4.78k
        case URX_BACKTRACK:
3320
4.78k
            {
3321
                // Back-tracks are kind of like a branch, except that the min length was
3322
                //   propagated already, by the state save.
3323
4.78k
                currentLen = forwardedLength.elementAti(loc+1);
3324
4.78k
            }
3325
4.78k
            break;
3326
3327
3328
13.4M
        case URX_STATE_SAVE:
3329
13.4M
            {
3330
                // State Save, for forward jumps, propagate the current minimum.
3331
                //             of the state save.
3332
13.4M
                int32_t  jmpDest = URX_VAL(op);
3333
13.4M
                if (jmpDest > loc) {
3334
13.4M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
3335
12.9M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3336
12.9M
                    }
3337
13.4M
                }
3338
13.4M
            }
3339
13.4M
            break;
3340
3341
3342
137k
        case URX_STRING:
3343
137k
            {
3344
137k
                loc++;
3345
137k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3346
137k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3347
137k
            }
3348
137k
            break;
3349
3350
3351
993k
        case URX_STRING_I:
3352
993k
            {
3353
993k
                loc++;
3354
                // TODO: with full case folding, matching input text may be shorter than
3355
                //       the string we have here.  More smarts could put some bounds on it.
3356
                //       Assume a min length of one for now.  A min length of zero causes
3357
                //        optimization failures for a pattern like "string"+
3358
                // currentLen += URX_VAL(stringLenOp);
3359
993k
                currentLen = safeIncrement(currentLen, 1);
3360
993k
            }
3361
993k
            break;
3362
3363
6.99k
        case URX_CTR_INIT:
3364
9.56k
        case URX_CTR_INIT_NG:
3365
9.56k
            {
3366
                // Loop Init Ops.
3367
                //   If the min loop count == 0
3368
                //      move loc forwards to the end of the loop, skipping over the body.
3369
                //   If the min count is > 0,
3370
                //      continue normal processing of the body of the loop.
3371
9.56k
                int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1));
3372
9.56k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3373
9.56k
                int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2));
3374
9.56k
                if (minLoopCount == 0) {
3375
5.43k
                    loc = loopEndLoc;
3376
5.43k
                } else {
3377
4.12k
                    loc+=3;  // Skips over operands of CTR_INIT
3378
4.12k
                }
3379
9.56k
            }
3380
9.56k
            break;
3381
3382
3383
3.31k
        case URX_CTR_LOOP:
3384
4.12k
        case URX_CTR_LOOP_NG:
3385
            // Loop ops.
3386
            //  The jump is conditional, backwards only.
3387
4.12k
            break;
3388
3389
4.28k
        case URX_LOOP_SR_I:
3390
16.8k
        case URX_LOOP_DOT_I:
3391
33.7k
        case URX_LOOP_C:
3392
            // More loop ops.  These state-save to themselves.
3393
            //   don't change the minimum match - could match nothing at all.
3394
33.7k
            break;
3395
3396
3397
3.53k
        case URX_LA_START:
3398
11.4k
        case URX_LB_START:
3399
11.4k
            {
3400
                // Look-around.  Scan forward until the matching look-ahead end,
3401
                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3402
                //   it assumes that the look-ahead match might be zero-length.
3403
                //   TODO:  Positive lookahead could recursively do the block, then continue
3404
                //          with the longer of the block or the value coming in.  Ticket 6060
3405
11.4k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3406
7.15M
                for (;;) {
3407
7.15M
                    loc++;
3408
7.15M
                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3409
7.15M
                    if (URX_TYPE(op) == URX_LA_START) {
3410
                        // The boilerplate for look-ahead includes two LA_END instructions,
3411
                        //    Depth will be decremented by each one when it is seen.
3412
5.44k
                        depth += 2;
3413
5.44k
                    }
3414
7.15M
                    if (URX_TYPE(op) == URX_LB_START) {
3415
5.40k
                        depth++;
3416
5.40k
                    }
3417
7.15M
                    if (URX_TYPE(op) == URX_LA_END) {
3418
27.9k
                        depth--;
3419
27.9k
                        if (depth == 0) {
3420
9.33k
                            break;
3421
9.33k
                        }
3422
27.9k
                    }
3423
7.15M
                    if (URX_TYPE(op)==URX_LBN_END) {
3424
3.34k
                        depth--;
3425
3.34k
                        if (depth == 0) {
3426
2.08k
                            break;
3427
2.08k
                        }
3428
3.34k
                    }
3429
7.14M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3430
                        // Need this because neg lookahead blocks will FAIL to outside
3431
                        //   of the block.
3432
3.42M
                        int32_t  jmpDest = URX_VAL(op);
3433
3.42M
                        if (jmpDest > loc) {
3434
3.42M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3435
3.31M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3436
3.31M
                            }
3437
3.42M
                        }
3438
3.42M
                    }
3439
7.14M
                    U_ASSERT(loc <= end);
3440
7.14M
                }
3441
11.4k
            }
3442
11.4k
            break;
3443
3444
6.86k
        case URX_LA_END:
3445
6.86k
        case URX_LB_CONT:
3446
13.7k
        case URX_LB_END:
3447
13.7k
        case URX_LBN_CONT:
3448
18.0k
        case URX_LBN_END:
3449
            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3450
            //   range being sized, which happens when measuring size of look-behind blocks.
3451
18.0k
            break;
3452
3453
0
        default:
3454
0
            UPRV_UNREACHABLE_EXIT;
3455
30.3M
            }
3456
3457
30.3M
        }
3458
3459
    // We have finished walking through the ops.  Check whether some forward jump
3460
    //   propagated a shorter length to location end+1.
3461
280k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3462
410
        currentLen = forwardedLength.elementAti(end+1);
3463
410
        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3464
410
    }
3465
3466
280k
    return currentLen;
3467
280k
}
3468
3469
//------------------------------------------------------------------------------
3470
//
3471
//   maxMatchLength    Calculate the length of the longest string that could
3472
//                     match the specified pattern.
3473
//                     Length is in 16 bit code units, not code points.
3474
//
3475
//                     The calculated length may not be exact.  The returned
3476
//                     value may be longer than the actual maximum; it must
3477
//                     never be shorter.
3478
//
3479
//                     start, end: the range of the pattern to check.
3480
//                     end is inclusive.
3481
//
3482
//------------------------------------------------------------------------------
3483
15.8k
int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3484
15.8k
    if (U_FAILURE(*fStatus)) {
3485
0
        return 0;
3486
0
    }
3487
15.8k
    U_ASSERT(start <= end);
3488
15.8k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3489
3490
15.8k
    int32_t    loc;
3491
15.8k
    int32_t    op;
3492
15.8k
    int32_t    opType;
3493
15.8k
    int32_t    currentLen = 0;
3494
15.8k
    UVector32  forwardedLength(end+1, *fStatus);
3495
15.8k
    forwardedLength.setSize(end+1);
3496
3497
8.38M
    for (loc=start; loc<=end; loc++) {
3498
8.37M
        forwardedLength.setElementAt(0, loc);
3499
8.37M
    }
3500
3501
6.28M
    for (loc = start; loc<=end; loc++) {
3502
6.27M
        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3503
6.27M
        opType = URX_TYPE(op);
3504
3505
        // The loop is advancing linearly through the pattern.
3506
        // If the op we are now at was the destination of a branch in the pattern,
3507
        // and that path has a longer maximum length than the current accumulated value,
3508
        // replace the current accumulated value.
3509
6.27M
        if (forwardedLength.elementAti(loc) > currentLen) {
3510
1.60M
            currentLen = forwardedLength.elementAti(loc);
3511
1.60M
        }
3512
3513
6.27M
        switch (opType) {
3514
            // Ops that don't change the total length matched
3515
0
        case URX_RESERVED_OP:
3516
0
        case URX_END:
3517
0
        case URX_STRING_LEN:
3518
33.9k
        case URX_NOP:
3519
38.5k
        case URX_START_CAPTURE:
3520
41.6k
        case URX_END_CAPTURE:
3521
42.2k
        case URX_BACKSLASH_B:
3522
42.8k
        case URX_BACKSLASH_BU:
3523
43.0k
        case URX_BACKSLASH_G:
3524
43.5k
        case URX_BACKSLASH_Z:
3525
44.6k
        case URX_CARET:
3526
45.2k
        case URX_DOLLAR:
3527
45.7k
        case URX_DOLLAR_M:
3528
45.9k
        case URX_DOLLAR_D:
3529
46.4k
        case URX_DOLLAR_MD:
3530
46.4k
        case URX_RELOC_OPRND:
3531
46.9k
        case URX_STO_INP_LOC:
3532
47.4k
        case URX_CARET_M:
3533
47.8k
        case URX_CARET_M_UNIX:
3534
3535
49.7k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3536
51.5k
        case URX_LD_SP:
3537
3538
58.3k
        case URX_LB_END:
3539
58.3k
        case URX_LB_CONT:
3540
58.3k
        case URX_LBN_CONT:
3541
62.5k
        case URX_LBN_END:
3542
62.5k
            break;
3543
3544
3545
            // Ops that increase that cause an unbounded increase in the length
3546
            //   of a matched string, or that increase it a hard to characterize way.
3547
            //   Call the max length unbounded, and stop further checking.
3548
375
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3549
570
        case URX_BACKREF_I:
3550
651
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3551
651
            currentLen = INT32_MAX;
3552
651
            break;
3553
3554
3555
            // Ops that match a max of one character (possibly two 16 bit code units.)
3556
            //
3557
1.84k
        case URX_STATIC_SETREF:
3558
2.49k
        case URX_STAT_SETREF_N:
3559
3.61k
        case URX_SETREF:
3560
3.97k
        case URX_BACKSLASH_D:
3561
4.46k
        case URX_BACKSLASH_H:
3562
4.79k
        case URX_BACKSLASH_R:
3563
5.81k
        case URX_BACKSLASH_V:
3564
7.13k
        case URX_ONECHAR_I:
3565
7.63k
        case URX_DOTANY_ALL:
3566
18.2k
        case URX_DOTANY:
3567
18.6k
        case URX_DOTANY_UNIX:
3568
18.6k
            currentLen = safeIncrement(currentLen, 2);
3569
18.6k
            break;
3570
3571
            // Single literal character.  Increase current max length by one or two,
3572
            //       depending on whether the char is in the supplementary range.
3573
246k
        case URX_ONECHAR:
3574
246k
            currentLen = safeIncrement(currentLen, 1);
3575
246k
            if (URX_VAL(op) > 0x10000) {
3576
1.07k
                currentLen = safeIncrement(currentLen, 1);
3577
1.07k
            }
3578
246k
            break;
3579
3580
            // Jumps.
3581
            //
3582
2.83M
        case URX_JMP:
3583
2.83M
        case URX_JMPX:
3584
2.83M
        case URX_JMP_SAV:
3585
2.83M
        case URX_JMP_SAV_X:
3586
2.83M
            {
3587
2.83M
                int32_t  jmpDest = URX_VAL(op);
3588
2.83M
                if (jmpDest < loc) {
3589
                    // Loop of some kind.  Max match length is unbounded.
3590
804
                    currentLen = INT32_MAX;
3591
2.83M
                } else {
3592
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3593
2.83M
                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
3594
2.77k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3595
2.77k
                    }
3596
2.83M
                    currentLen = 0;
3597
2.83M
                }
3598
2.83M
            }
3599
2.83M
            break;
3600
3601
4.30k
        case URX_BACKTRACK:
3602
            // back-tracks are kind of like a branch, except that the max length was
3603
            //   propagated already, by the state save.
3604
4.30k
            currentLen = forwardedLength.elementAti(loc+1);
3605
4.30k
            break;
3606
3607
3608
3.01M
        case URX_STATE_SAVE:
3609
3.01M
            {
3610
                // State Save, for forward jumps, propagate the current minimum.
3611
                //               of the state save.
3612
                //             For backwards jumps, they create a loop, maximum
3613
                //               match length is unbounded.
3614
3.01M
                int32_t  jmpDest = URX_VAL(op);
3615
3.01M
                if (jmpDest > loc) {
3616
3.01M
                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
3617
1.76M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3618
1.76M
                    }
3619
3.01M
                } else {
3620
217
                    currentLen = INT32_MAX;
3621
217
                }
3622
3.01M
            }
3623
3.01M
            break;
3624
3625
3626
3627
3628
29.5k
        case URX_STRING:
3629
29.5k
            {
3630
29.5k
                loc++;
3631
29.5k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3632
29.5k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3633
29.5k
                break;
3634
2.83M
            }
3635
3636
29.9k
        case URX_STRING_I:
3637
            // TODO:  This code assumes that any user string that matches will be no longer
3638
            //        than our compiled string, with case insensitive matching.
3639
            //        Our compiled string has been case-folded already.
3640
            //
3641
            //        Any matching user string will have no more code points than our
3642
            //        compiled (folded) string.  Folding may add code points, but
3643
            //        not remove them.
3644
            //
3645
            //        There is a potential problem if a supplemental code point
3646
            //        case-folds to a BMP code point.  In this case our compiled string
3647
            //        could be shorter (in code units) than a matching user string.
3648
            //
3649
            //        At this time (Unicode 6.1) there are no such characters, and this case
3650
            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3651
            //        any problematic characters are added to Unicode.
3652
            //
3653
            //        If this happens, we can make a set of the BMP chars that the
3654
            //        troublesome supplementals fold to, scan our string, and bump the
3655
            //        currentLen one extra for each that is found.
3656
            //
3657
29.9k
            {
3658
29.9k
                loc++;
3659
29.9k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3660
29.9k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3661
29.9k
            }
3662
29.9k
            break;
3663
3664
4.00k
        case URX_CTR_INIT:
3665
4.81k
        case URX_CTR_INIT_NG:
3666
            // For Loops, recursively call this function on the pattern for the loop body,
3667
            //   then multiply the result by the maximum loop count.
3668
4.81k
            {
3669
4.81k
                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3670
4.81k
                if (loopEndLoc == loc+4) {
3671
                    // Loop has an empty body. No affect on max match length.
3672
                    // Continue processing with code after the loop end.
3673
0
                    loc = loopEndLoc;
3674
0
                    break;
3675
0
                }
3676
3677
4.81k
                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3678
4.81k
                if (maxLoopCount == -1) {
3679
                    // Unbounded Loop. No upper bound on match length.
3680
125
                    currentLen = INT32_MAX;
3681
125
                    break;
3682
125
                }
3683
3684
4.68k
                U_ASSERT(loopEndLoc >= loc+4);
3685
4.68k
                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3686
4.68k
                int64_t updatedLen = static_cast<int64_t>(currentLen) + blockLen * maxLoopCount;
3687
4.68k
                if (updatedLen >= INT32_MAX) {
3688
122
                    currentLen = INT32_MAX;
3689
122
                    break;
3690
122
                }
3691
4.56k
                currentLen = static_cast<int32_t>(updatedLen);
3692
4.56k
                loc = loopEndLoc;
3693
4.56k
                break;
3694
4.68k
            }
3695
3696
0
        case URX_CTR_LOOP:
3697
0
        case URX_CTR_LOOP_NG:
3698
            // These opcodes will be skipped over by code for URX_CTR_INIT.
3699
            // We shouldn't encounter them here.
3700
0
            UPRV_UNREACHABLE_EXIT;
3701
3702
220
        case URX_LOOP_SR_I:
3703
329
        case URX_LOOP_DOT_I:
3704
329
        case URX_LOOP_C:
3705
            // For anything to do with loops, make the match length unbounded.
3706
329
            currentLen = INT32_MAX;
3707
329
            break;
3708
3709
3710
3711
3.38k
        case URX_LA_START:
3712
16.9k
        case URX_LA_END:
3713
            // Look-ahead.  Just ignore, treat the look-ahead block as if
3714
            // it were normal pattern.  Gives a too-long match length,
3715
            //  but good enough for now.
3716
16.9k
            break;
3717
3718
            // End of look-ahead ops should always be consumed by the processing at
3719
            //  the URX_LA_START op.
3720
            // UPRV_UNREACHABLE_EXIT;
3721
3722
6.09k
        case URX_LB_START:
3723
6.09k
            {
3724
                // Look-behind.  Scan forward until the matching look-around end,
3725
                //   without processing the look-behind block.
3726
6.09k
                int32_t dataLoc = URX_VAL(op);
3727
954k
                for (loc = loc + 1; loc <= end; ++loc) {
3728
954k
                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3729
954k
                    int32_t opType = URX_TYPE(op);
3730
954k
                    if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3731
6.09k
                        break;
3732
6.09k
                    }
3733
954k
                }
3734
6.09k
                U_ASSERT(loc <= end);
3735
6.09k
            }
3736
6.09k
            break;
3737
3738
0
        default:
3739
0
            UPRV_UNREACHABLE_EXIT;
3740
6.27M
        }
3741
3742
3743
6.27M
        if (currentLen == INT32_MAX) {
3744
            //  The maximum length is unbounded.
3745
            //  Stop further processing of the pattern.
3746
2.25k
            break;
3747
2.25k
        }
3748
3749
6.27M
    }
3750
15.8k
    return currentLen;
3751
3752
15.8k
}
3753
3754
3755
//------------------------------------------------------------------------------
3756
//
3757
//   stripNOPs    Remove any NOP operations from the compiled pattern code.
3758
//                Extra NOPs are inserted for some constructs during the initial
3759
//                code generation to provide locations that may be patched later.
3760
//                Many end up unneeded, and are removed by this function.
3761
//
3762
//                In order to minimize the number of passes through the pattern,
3763
//                back-reference fixup is also performed here (adjusting
3764
//                back-reference operands to point to the correct frame offsets).
3765
//
3766
//------------------------------------------------------------------------------
3767
13.1k
void RegexCompile::stripNOPs() {
3768
3769
13.1k
    if (U_FAILURE(*fStatus)) {
3770
0
        return;
3771
0
    }
3772
3773
13.1k
    int32_t    end = fRXPat->fCompiledPat->size();
3774
13.1k
    UVector32  deltas(end, *fStatus);
3775
3776
    // Make a first pass over the code, computing the amount that things
3777
    //   will be offset at each location in the original code.
3778
13.1k
    int32_t   loc;
3779
13.1k
    int32_t   d = 0;
3780
28.8M
    for (loc=0; loc<end; loc++) {
3781
28.8M
        deltas.addElement(d, *fStatus);
3782
28.8M
        int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3783
28.8M
        if (URX_TYPE(op) == URX_NOP) {
3784
122k
            d++;
3785
122k
        }
3786
28.8M
    }
3787
3788
13.1k
    UnicodeString caseStringBuffer;
3789
3790
    // Make a second pass over the code, removing the NOPs by moving following
3791
    //  code up, and patching operands that refer to code locations that
3792
    //  are being moved.  The array of offsets from the first step is used
3793
    //  to compute the new operand values.
3794
13.1k
    int32_t src;
3795
13.1k
    int32_t dst = 0;
3796
28.8M
    for (src=0; src<end; src++) {
3797
28.8M
        int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(src));
3798
28.8M
        int32_t opType = URX_TYPE(op);
3799
28.8M
        switch (opType) {
3800
122k
        case URX_NOP:
3801
122k
            break;
3802
3803
12.4M
        case URX_STATE_SAVE:
3804
24.0M
        case URX_JMP:
3805
24.0M
        case URX_CTR_LOOP:
3806
24.0M
        case URX_CTR_LOOP_NG:
3807
24.0M
        case URX_RELOC_OPRND:
3808
24.0M
        case URX_JMPX:
3809
24.2M
        case URX_JMP_SAV:
3810
24.3M
        case URX_JMP_SAV_X:
3811
            // These are instructions with operands that refer to code locations.
3812
24.3M
            {
3813
24.3M
                int32_t  operandAddress = URX_VAL(op);
3814
24.3M
                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3815
24.3M
                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3816
24.3M
                op = buildOp(opType, fixedOperandAddress);
3817
24.3M
                fRXPat->fCompiledPat->setElementAt(op, dst);
3818
24.3M
                dst++;
3819
24.3M
                break;
3820
24.2M
            }
3821
3822
1.64k
        case URX_BACKREF:
3823
6.21k
        case URX_BACKREF_I:
3824
6.21k
            {
3825
6.21k
                int32_t where = URX_VAL(op);
3826
6.21k
                if (where > fRXPat->fGroupMap->size()) {
3827
1.01k
                    error(U_REGEX_INVALID_BACK_REF);
3828
1.01k
                    break;
3829
1.01k
                }
3830
5.20k
                where = fRXPat->fGroupMap->elementAti(where-1);
3831
5.20k
                op    = buildOp(opType, where);
3832
5.20k
                fRXPat->fCompiledPat->setElementAt(op, dst);
3833
5.20k
                dst++;
3834
3835
5.20k
                fRXPat->fNeedsAltInput = true;
3836
5.20k
                break;
3837
6.21k
            }
3838
14.8k
        case URX_RESERVED_OP:
3839
15.5k
        case URX_RESERVED_OP_N:
3840
24.2k
        case URX_BACKTRACK:
3841
37.3k
        case URX_END:
3842
1.00M
        case URX_ONECHAR:
3843
1.14M
        case URX_STRING:
3844
2.17M
        case URX_STRING_LEN:
3845
2.22M
        case URX_START_CAPTURE:
3846
2.26M
        case URX_END_CAPTURE:
3847
2.27M
        case URX_STATIC_SETREF:
3848
2.30M
        case URX_STAT_SETREF_N:
3849
2.37M
        case URX_SETREF:
3850
2.90M
        case URX_DOTANY:
3851
2.91M
        case URX_FAIL:
3852
2.92M
        case URX_BACKSLASH_B:
3853
2.92M
        case URX_BACKSLASH_BU:
3854
2.92M
        case URX_BACKSLASH_G:
3855
2.93M
        case URX_BACKSLASH_X:
3856
2.93M
        case URX_BACKSLASH_Z:
3857
2.94M
        case URX_DOTANY_ALL:
3858
2.94M
        case URX_BACKSLASH_D:
3859
2.94M
        case URX_CARET:
3860
3.01M
        case URX_DOLLAR:
3861
3.02M
        case URX_CTR_INIT:
3862
3.02M
        case URX_CTR_INIT_NG:
3863
3.02M
        case URX_DOTANY_UNIX:
3864
3.11M
        case URX_STO_SP:
3865
3.20M
        case URX_LD_SP:
3866
3.31M
        case URX_STO_INP_LOC:
3867
3.31M
        case URX_LA_START:
3868
3.32M
        case URX_LA_END:
3869
3.37M
        case URX_ONECHAR_I:
3870
4.28M
        case URX_STRING_I:
3871
4.28M
        case URX_DOLLAR_M:
3872
4.28M
        case URX_CARET_M:
3873
4.28M
        case URX_CARET_M_UNIX:
3874
4.28M
        case URX_LB_START:
3875
4.29M
        case URX_LB_CONT:
3876
4.29M
        case URX_LB_END:
3877
4.29M
        case URX_LBN_CONT:
3878
4.29M
        case URX_LBN_END:
3879
4.29M
        case URX_LOOP_SR_I:
3880
4.31M
        case URX_LOOP_DOT_I:
3881
4.32M
        case URX_LOOP_C:
3882
4.36M
        case URX_DOLLAR_D:
3883
4.36M
        case URX_DOLLAR_MD:
3884
4.37M
        case URX_BACKSLASH_H:
3885
4.37M
        case URX_BACKSLASH_R:
3886
4.38M
        case URX_BACKSLASH_V:
3887
            // These instructions are unaltered by the relocation.
3888
4.38M
            fRXPat->fCompiledPat->setElementAt(op, dst);
3889
4.38M
            dst++;
3890
4.38M
            break;
3891
3892
0
        default:
3893
            // Some op is unaccounted for.
3894
0
            UPRV_UNREACHABLE_EXIT;
3895
28.8M
        }
3896
28.8M
    }
3897
3898
13.1k
    fRXPat->fCompiledPat->setSize(dst);
3899
13.1k
}
3900
3901
3902
3903
3904
//------------------------------------------------------------------------------
3905
//
3906
//  Error         Report a rule parse error.
3907
//                Only report it if no previous error has been recorded.
3908
//
3909
//------------------------------------------------------------------------------
3910
10.0k
void RegexCompile::error(UErrorCode e) {
3911
10.0k
    if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3912
7.70k
        *fStatus = e;
3913
        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3914
        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3915
        // int64_t. If the values of the latter are out of range for the former,
3916
        // set them to the appropriate "field not supported" values.
3917
7.70k
        if (fLineNum > 0x7FFFFFFF) {
3918
0
            fParseErr->line   = 0;
3919
0
            fParseErr->offset = -1;
3920
7.70k
        } else if (fCharNum > 0x7FFFFFFF) {
3921
0
            fParseErr->line = static_cast<int32_t>(fLineNum);
3922
0
            fParseErr->offset = -1;
3923
7.70k
        } else {
3924
7.70k
            fParseErr->line = static_cast<int32_t>(fLineNum);
3925
7.70k
            fParseErr->offset = static_cast<int32_t>(fCharNum);
3926
7.70k
        }
3927
3928
7.70k
        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3929
3930
        // Fill in the context.
3931
        //   Note: extractBetween() pins supplied indices to the string bounds.
3932
7.70k
        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3933
7.70k
        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3934
7.70k
        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3935
7.70k
        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3936
7.70k
    }
3937
10.0k
}
3938
3939
3940
//
3941
//  Assorted Unicode character constants.
3942
//     Numeric because there is no portable way to enter them as literals.
3943
//     (Think EBCDIC).
3944
//
3945
static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
3946
static const char16_t   chLF        = 0x0a;      // Line Feed
3947
static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
3948
static const char16_t   chDigit0    = 0x30;      // '0'
3949
static const char16_t   chDigit7    = 0x37;      // '9'
3950
static const char16_t   chColon     = 0x3A;      // ':'
3951
static const char16_t   chE         = 0x45;      // 'E'
3952
static const char16_t   chQ         = 0x51;      // 'Q'
3953
//static const char16_t   chN         = 0x4E;      // 'N'
3954
static const char16_t   chP         = 0x50;      // 'P'
3955
static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
3956
//static const char16_t   chLBracket  = 0x5b;      // '['
3957
static const char16_t   chRBracket  = 0x5d;      // ']'
3958
static const char16_t   chUp        = 0x5e;      // '^'
3959
static const char16_t   chLowerP    = 0x70;
3960
static const char16_t   chLBrace    = 0x7b;      // '{'
3961
static const char16_t   chRBrace    = 0x7d;      // '}'
3962
static const char16_t   chNEL       = 0x85;      //    NEL newline variant
3963
static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
3964
3965
3966
//------------------------------------------------------------------------------
3967
//
3968
//  nextCharLL    Low Level Next Char from the regex pattern.
3969
//                Get a char from the string, keep track of input position
3970
//                     for error reporting.
3971
//
3972
//------------------------------------------------------------------------------
3973
115M
UChar32  RegexCompile::nextCharLL() {
3974
115M
    UChar32       ch;
3975
3976
115M
    if (fPeekChar != -1) {
3977
620k
        ch = fPeekChar;
3978
620k
        fPeekChar = -1;
3979
620k
        return ch;
3980
620k
    }
3981
3982
    // assume we're already in the right place
3983
115M
    ch = UTEXT_NEXT32(fRXPat->fPattern);
3984
115M
    if (ch == U_SENTINEL) {
3985
24.3k
        return ch;
3986
24.3k
    }
3987
3988
115M
    if (ch == chCR ||
3989
115M
        ch == chNEL ||
3990
115M
        ch == chLS   ||
3991
115M
        (ch == chLF && fLastChar != chCR)) {
3992
        // Character is starting a new line.  Bump up the line number, and
3993
        //  reset the column to 0.
3994
193k
        fLineNum++;
3995
193k
        fCharNum=0;
3996
193k
    }
3997
115M
    else {
3998
        // Character is not starting a new line.  Except in the case of a
3999
        //   LF following a CR, increment the column position.
4000
115M
        if (ch != chLF) {
4001
114M
            fCharNum++;
4002
114M
        }
4003
115M
    }
4004
115M
    fLastChar = ch;
4005
115M
    return ch;
4006
115M
}
4007
4008
//------------------------------------------------------------------------------
4009
//
4010
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4011
//                 character without actually getting it.
4012
//
4013
//------------------------------------------------------------------------------
4014
1.25M
UChar32  RegexCompile::peekCharLL() {
4015
1.25M
    if (fPeekChar == -1) {
4016
623k
        fPeekChar = nextCharLL();
4017
623k
    }
4018
1.25M
    return fPeekChar;
4019
1.25M
}
4020
4021
4022
//------------------------------------------------------------------------------
4023
//
4024
//   nextChar     for pattern scanning.  At this level, we handle stripping
4025
//                out comments and processing some backslash character escapes.
4026
//                The rest of the pattern grammar is handled at the next level up.
4027
//
4028
//------------------------------------------------------------------------------
4029
112M
void RegexCompile::nextChar(RegexPatternChar &c) {
4030
112M
  tailRecursion:
4031
112M
    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4032
112M
    c.fChar    = nextCharLL();
4033
112M
    c.fQuoted  = false;
4034
4035
112M
    if (fQuoteMode) {
4036
4.48M
        c.fQuoted = true;
4037
4.48M
        if ((c.fChar == chBackSlash && peekCharLL() == chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4038
4.48M
            c.fChar == static_cast<UChar32>(-1)) {
4039
1.67k
            fQuoteMode = false;  //  Exit quote mode,
4040
1.67k
            nextCharLL();        // discard the E
4041
            // nextChar(c);      // recurse to get the real next char
4042
1.67k
            goto tailRecursion;  // Note: fuzz testing produced testcases that
4043
                                 //       resulted in stack overflow here.
4044
1.67k
        }
4045
4.48M
    }
4046
107M
    else if (fInBackslashQuote) {
4047
        // The current character immediately follows a '\'
4048
        // Don't check for any further escapes, just return it as-is.
4049
        // Don't set c.fQuoted, because that would prevent the state machine from
4050
        //    dispatching on the character.
4051
309k
        fInBackslashQuote = false;
4052
309k
    }
4053
107M
    else
4054
107M
    {
4055
        // We are not in a \Q quoted region \E of the source.
4056
        //
4057
107M
        if (fModeFlags & UREGEX_COMMENTS) {
4058
            //
4059
            // We are in free-spacing and comments mode.
4060
            //  Scan through any white space and comments, until we
4061
            //  reach a significant character or the end of input.
4062
2.55M
            for (;;) {
4063
2.55M
                if (c.fChar == static_cast<UChar32>(-1)) {
4064
1.35k
                    break;     // End of Input
4065
1.35k
                }
4066
2.55M
                if  (c.fChar == chPound && fEOLComments) {
4067
                    // Start of a comment.  Consume the rest of it, until EOF or a new line
4068
2.62M
                    for (;;) {
4069
2.62M
                        c.fChar = nextCharLL();
4070
2.62M
                        if (c.fChar == static_cast<UChar32>(-1) || // EOF
4071
2.62M
                            c.fChar == chCR        ||
4072
2.61M
                            c.fChar == chLF        ||
4073
2.61M
                            c.fChar == chNEL       ||
4074
2.61M
                            c.fChar == chLS)       {
4075
6.17k
                            break;
4076
6.17k
                        }
4077
2.62M
                    }
4078
6.17k
                }
4079
                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4080
2.55M
                if (PatternProps::isWhiteSpace(c.fChar) == false) {
4081
2.51M
                    break;
4082
2.51M
                }
4083
35.3k
                c.fChar = nextCharLL();
4084
35.3k
            }
4085
2.51M
        }
4086
4087
        //
4088
        //  check for backslash escaped characters.
4089
        //
4090
107M
        if (c.fChar == chBackSlash) {
4091
569k
            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4092
569k
            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4093
                //
4094
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
4095
                //   Includes \uxxxx, \n, \r, many others.
4096
                //   Return the single equivalent character.
4097
                //
4098
251k
                nextCharLL();                 // get & discard the peeked char.
4099
251k
                c.fQuoted = true;
4100
4101
251k
                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4102
251k
                    int32_t endIndex = static_cast<int32_t>(pos);
4103
251k
                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, static_cast<int32_t>(fPatternLength), const_cast<char16_t*>(fRXPat->fPattern->chunkContents));
4104
4105
251k
                    if (endIndex == pos) {
4106
213
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4107
213
                    }
4108
251k
                    fCharNum += endIndex - pos;
4109
251k
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4110
251k
                } else {
4111
0
                    int32_t offset = 0;
4112
0
                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4113
4114
0
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4115
0
                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4116
4117
0
                    if (offset == 0) {
4118
0
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4119
0
                    } else if (context.lastOffset == offset) {
4120
0
                        UTEXT_PREVIOUS32(fRXPat->fPattern);
4121
0
                    } else if (context.lastOffset != offset-1) {
4122
0
                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4123
0
                    }
4124
0
                    fCharNum += offset;
4125
0
                }
4126
251k
            }
4127
317k
            else if (peekCharLL() == chDigit0) {
4128
                //  Octal Escape, using Java Regexp Conventions
4129
                //    which are \0 followed by 1-3 octal digits.
4130
                //    Different from ICU Unescape handling of Octal, which does not
4131
                //    require the leading 0.
4132
                //  Java also has the convention of only consuming 2 octal digits if
4133
                //    the three digit number would be > 0xff
4134
                //
4135
6.16k
                c.fChar = 0;
4136
6.16k
                nextCharLL();    // Consume the initial 0.
4137
6.16k
                int index;
4138
14.1k
                for (index=0; index<3; index++) {
4139
12.8k
                    int32_t ch = peekCharLL();
4140
12.8k
                    if (ch<chDigit0 || ch>chDigit7) {
4141
4.88k
                        if (index==0) {
4142
                           // \0 is not followed by any octal digits.
4143
1.07k
                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4144
1.07k
                        }
4145
4.88k
                        break;
4146
4.88k
                    }
4147
7.99k
                    c.fChar <<= 3;
4148
7.99k
                    c.fChar += ch&7;
4149
7.99k
                    if (c.fChar <= 255) {
4150
6.90k
                        nextCharLL();
4151
6.90k
                    } else {
4152
                        // The last digit made the number too big.  Forget we saw it.
4153
1.09k
                        c.fChar >>= 3;
4154
1.09k
                    }
4155
7.99k
                }
4156
6.16k
                c.fQuoted = true;
4157
6.16k
            }
4158
311k
            else if (peekCharLL() == chQ) {
4159
                //  "\Q"  enter quote mode, which will continue until "\E"
4160
1.93k
                fQuoteMode = true;
4161
1.93k
                nextCharLL();        // discard the 'Q'.
4162
                // nextChar(c);      // recurse to get the real next char.
4163
1.93k
                goto tailRecursion;  // Note: fuzz testing produced test cases that
4164
                //                            resulted in stack overflow here.
4165
1.93k
            }
4166
309k
            else
4167
309k
            {
4168
                // We are in a '\' escape that will be handled by the state table scanner.
4169
                // Just return the backslash, but remember that the following char is to
4170
                //  be taken literally.
4171
309k
                fInBackslashQuote = true;
4172
309k
            }
4173
569k
        }
4174
107M
    }
4175
4176
    // re-enable # to end-of-line comments, in case they were disabled.
4177
    // They are disabled by the parser upon seeing '(?', but this lasts for
4178
    //  the fetching of the next character only.
4179
112M
    fEOLComments = true;
4180
4181
    // putc(c.fChar, stdout);
4182
112M
}
4183
4184
4185
4186
//------------------------------------------------------------------------------
4187
//
4188
//  scanNamedChar
4189
//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4190
//
4191
//             The scan position will be at the 'N'.  On return
4192
//             the scan position should be just after the '}'
4193
//
4194
//             Return the UChar32
4195
//
4196
//------------------------------------------------------------------------------
4197
6.73k
UChar32  RegexCompile::scanNamedChar() {
4198
6.73k
    if (U_FAILURE(*fStatus)) {
4199
0
        return 0;
4200
0
    }
4201
4202
6.73k
    nextChar(fC);
4203
6.73k
    if (fC.fChar != chLBrace) {
4204
33
        error(U_REGEX_PROPERTY_SYNTAX);
4205
33
        return 0;
4206
33
    }
4207
4208
6.70k
    UnicodeString  charName;
4209
24.8k
    for (;;) {
4210
24.8k
        nextChar(fC);
4211
24.8k
        if (fC.fChar == chRBrace) {
4212
6.64k
            break;
4213
6.64k
        }
4214
18.1k
        if (fC.fChar == -1) {
4215
57
            error(U_REGEX_PROPERTY_SYNTAX);
4216
57
            return 0;
4217
57
        }
4218
18.1k
        charName.append(fC.fChar);
4219
18.1k
    }
4220
4221
6.64k
    char name[100];
4222
6.64k
    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4223
6.64k
         static_cast<uint32_t>(charName.length()) >= sizeof(name)) {
4224
        // All Unicode character names have only invariant characters.
4225
        // The API to get a character, given a name, accepts only char *, forcing us to convert,
4226
        //   which requires this error check
4227
7
        error(U_REGEX_PROPERTY_SYNTAX);
4228
7
        return 0;
4229
7
    }
4230
6.64k
    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4231
4232
6.64k
    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4233
6.64k
    if (U_FAILURE(*fStatus)) {
4234
73
        error(U_REGEX_PROPERTY_SYNTAX);
4235
73
    }
4236
4237
6.64k
    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4238
6.64k
    return theChar;
4239
6.64k
}
4240
4241
//------------------------------------------------------------------------------
4242
//
4243
//  scanProp   Construct a UnicodeSet from the text at the current scan
4244
//             position, which will be of the form \p{whaterver}
4245
//
4246
//             The scan position will be at the 'p' or 'P'.  On return
4247
//             the scan position should be just after the '}'
4248
//
4249
//             Return a UnicodeSet, constructed from the \P pattern,
4250
//             or nullptr if the pattern is invalid.
4251
//
4252
//------------------------------------------------------------------------------
4253
3.09k
UnicodeSet *RegexCompile::scanProp() {
4254
3.09k
    UnicodeSet    *uset = nullptr;
4255
4256
3.09k
    if (U_FAILURE(*fStatus)) {
4257
0
        return nullptr;
4258
0
    }
4259
3.09k
    (void)chLowerP;   // Suppress compiler unused variable warning.
4260
3.09k
    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4261
3.09k
    UBool negated = (fC.fChar == chP);
4262
4263
3.09k
    UnicodeString propertyName;
4264
3.09k
    nextChar(fC);
4265
3.09k
    if (fC.fChar != chLBrace) {
4266
40
        error(U_REGEX_PROPERTY_SYNTAX);
4267
40
        return nullptr;
4268
40
    }
4269
1.35M
    for (;;) {
4270
1.35M
        nextChar(fC);
4271
1.35M
        if (fC.fChar == chRBrace) {
4272
2.99k
            break;
4273
2.99k
        }
4274
1.35M
        if (fC.fChar == -1) {
4275
            // Hit the end of the input string without finding the closing '}'
4276
62
            error(U_REGEX_PROPERTY_SYNTAX);
4277
62
            return nullptr;
4278
62
        }
4279
1.35M
        propertyName.append(fC.fChar);
4280
1.35M
    }
4281
2.99k
    uset = createSetForProperty(propertyName, negated);
4282
2.99k
    nextChar(fC);    // Move input scan to position following the closing '}'
4283
2.99k
    return uset;
4284
3.05k
}
4285
4286
//------------------------------------------------------------------------------
4287
//
4288
//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4289
//             position, which is expected be of the form [:property expression:]
4290
//
4291
//             The scan position will be at the opening ':'.  On return
4292
//             the scan position must be on the closing ']'
4293
//
4294
//             Return a UnicodeSet constructed from the pattern,
4295
//             or nullptr if this is not a valid POSIX-style set expression.
4296
//             If not a property expression, restore the initial scan position
4297
//                (to the opening ':')
4298
//
4299
//               Note:  the opening '[:' is not sufficient to guarantee that
4300
//                      this is a [:property:] expression.
4301
//                      [:'+=,] is a perfectly good ordinary set expression that
4302
//                              happens to include ':' as one of its characters.
4303
//
4304
//------------------------------------------------------------------------------
4305
109k
UnicodeSet *RegexCompile::scanPosixProp() {
4306
109k
    UnicodeSet    *uset = nullptr;
4307
4308
109k
    if (U_FAILURE(*fStatus)) {
4309
0
        return nullptr;
4310
0
    }
4311
4312
109k
    U_ASSERT(fC.fChar == chColon);
4313
4314
    // Save the scanner state.
4315
    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4316
109k
    int64_t     savedScanIndex        = fScanIndex;
4317
109k
    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4318
109k
    UBool       savedQuoteMode        = fQuoteMode;
4319
109k
    UBool       savedInBackslashQuote = fInBackslashQuote;
4320
109k
    UBool       savedEOLComments      = fEOLComments;
4321
109k
    int64_t     savedLineNum          = fLineNum;
4322
109k
    int64_t     savedCharNum          = fCharNum;
4323
109k
    UChar32     savedLastChar         = fLastChar;
4324
109k
    UChar32     savedPeekChar         = fPeekChar;
4325
109k
    RegexPatternChar savedfC          = fC;
4326
4327
    // Scan for a closing ].   A little tricky because there are some perverse
4328
    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4329
    //   ending on the second closing ].
4330
4331
109k
    UnicodeString propName;
4332
109k
    UBool         negated  = false;
4333
4334
    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4335
109k
    nextChar(fC);
4336
109k
    if (fC.fChar == chUp) {
4337
2.42k
       negated = true;
4338
2.42k
       nextChar(fC);
4339
2.42k
    }
4340
4341
    // Scan for the closing ":]", collecting the property name along the way.
4342
109k
    UBool  sawPropSetTerminator = false;
4343
28.7M
    for (;;) {
4344
28.7M
        propName.append(fC.fChar);
4345
28.7M
        nextChar(fC);
4346
28.7M
        if (fC.fQuoted || fC.fChar == -1) {
4347
            // Escaped characters or end of input - either says this isn't a [:Property:]
4348
9.02k
            break;
4349
9.02k
        }
4350
28.7M
        if (fC.fChar == chColon) {
4351
100k
            nextChar(fC);
4352
100k
            if (fC.fChar == chRBracket) {
4353
80.3k
                sawPropSetTerminator = true;
4354
80.3k
            }
4355
100k
            break;
4356
100k
        }
4357
28.7M
    }
4358
4359
109k
    if (sawPropSetTerminator) {
4360
80.3k
        uset = createSetForProperty(propName, negated);
4361
80.3k
    }
4362
29.2k
    else
4363
29.2k
    {
4364
        // No closing ":]".
4365
        //  Restore the original scan position.
4366
        //  The main scanner will retry the input as a normal set expression,
4367
        //    not a [:Property:] expression.
4368
29.2k
        fScanIndex        = savedScanIndex;
4369
29.2k
        fQuoteMode        = savedQuoteMode;
4370
29.2k
        fInBackslashQuote = savedInBackslashQuote;
4371
29.2k
        fEOLComments      = savedEOLComments;
4372
29.2k
        fLineNum          = savedLineNum;
4373
29.2k
        fCharNum          = savedCharNum;
4374
29.2k
        fLastChar         = savedLastChar;
4375
29.2k
        fPeekChar         = savedPeekChar;
4376
29.2k
        fC                = savedfC;
4377
29.2k
        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4378
29.2k
    }
4379
109k
    return uset;
4380
109k
}
4381
4382
0
static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4383
0
    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4384
0
    addCategory(set, U_GC_CF_MASK, ec);
4385
0
}
4386
4387
//
4388
//  Create a Unicode Set from a Unicode Property expression.
4389
//     This is common code underlying both \p{...} and [:...:] expressions.
4390
//     Includes trying the Java "properties" that aren't supported as
4391
//     normal ICU UnicodeSet properties
4392
//
4393
83.3k
UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4394
4395
83.3k
    if (U_FAILURE(*fStatus)) {
4396
1
        return nullptr;
4397
1
    }
4398
83.3k
    LocalPointer<UnicodeSet> set;
4399
83.3k
    UErrorCode status = U_ZERO_ERROR;
4400
4401
83.3k
    do {      // non-loop, exists to allow breaks from the block.
4402
        //
4403
        //  First try the property as we received it
4404
        //
4405
83.3k
        UnicodeString   setExpr;
4406
83.3k
        uint32_t        usetFlags = 0;
4407
83.3k
        setExpr.append(u"[\\p{", -1);
4408
83.3k
        setExpr.append(propName);
4409
83.3k
        setExpr.append(u"}]", -1);
4410
83.3k
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4411
36.9k
            usetFlags |= USET_CASE_INSENSITIVE;
4412
36.9k
        }
4413
83.3k
        set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status);
4414
83.3k
        if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4415
67.2k
            break;
4416
67.2k
        }
4417
4418
        //
4419
        //  The incoming property wasn't directly recognized by ICU.
4420
4421
        //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4422
        //     Java accepts 'word' with mixed case.
4423
        //     Java accepts 'all' only in all lower case.
4424
4425
16.0k
        status = U_ZERO_ERROR;
4426
16.0k
        if (propName.caseCompare(u"word", -1, 0) == 0) {
4427
839
            set.adoptInsteadAndCheckErrorCode(
4428
839
                RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4429
839
            break;
4430
839
        }
4431
15.1k
        if (propName.compare(u"all", -1) == 0) {
4432
364
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4433
364
            break;
4434
364
        }
4435
4436
4437
        //    Do Java InBlock expressions
4438
        //
4439
14.8k
        UnicodeString mPropName = propName;
4440
14.8k
        if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4441
5.53k
            status = U_ZERO_ERROR;
4442
5.53k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4443
5.53k
            if (U_FAILURE(status)) {
4444
0
                break;
4445
0
            }
4446
5.53k
            UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4447
5.53k
            set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4448
5.53k
            break;
4449
5.53k
        }
4450
4451
        //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4452
        //  as "BooleanPropertyValue". The property value can be either a
4453
        //  a General Category or a Script Name.
4454
4455
9.29k
        if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4456
7.23k
            mPropName.remove(0, 2);      // Strip the "Is"
4457
7.23k
            if (mPropName.indexOf(u'=') >= 0) {
4458
                // Reject any "Is..." property expression containing an '=', that is,
4459
                // any non-binary property expression.
4460
21
                status = U_REGEX_PROPERTY_SYNTAX;
4461
21
                break;
4462
21
            }
4463
4464
7.21k
            if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4465
0
                mPropName.setTo(u"unassigned", -1);
4466
0
                negated = !negated;
4467
7.21k
            } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4468
0
                mPropName.setTo(u"Titlecase_Letter", -1);
4469
0
            }
4470
4471
7.21k
            mPropName.insert(0, u"[\\p{", -1);
4472
7.21k
            mPropName.append(u"}]", -1);
4473
7.21k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4474
4475
7.21k
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4476
2.29k
                set->closeOver(USET_CASE_INSENSITIVE);
4477
2.29k
            }
4478
7.21k
            break;
4479
4480
7.23k
        }
4481
4482
2.06k
        if (propName.startsWith(u"java", -1)) {
4483
0
            status = U_ZERO_ERROR;
4484
0
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4485
0
            if (U_FAILURE(status)) {
4486
0
                break;
4487
0
            }
4488
            //
4489
            //  Try the various Java specific properties.
4490
            //   These all begin with "java"
4491
            //
4492
0
            if (propName.compare(u"javaDefined", -1) == 0) {
4493
0
                addCategory(set.getAlias(), U_GC_CN_MASK, status);
4494
0
                set->complement();
4495
0
            }
4496
0
            else if (propName.compare(u"javaDigit", -1) == 0) {
4497
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4498
0
            }
4499
0
            else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4500
0
                addIdentifierIgnorable(set.getAlias(), status);
4501
0
            }
4502
0
            else if (propName.compare(u"javaISOControl", -1) == 0) {
4503
0
                set->add(0, 0x1F).add(0x7F, 0x9F);
4504
0
            }
4505
0
            else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4506
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4507
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4508
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4509
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4510
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4511
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4512
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4513
0
                addIdentifierIgnorable(set.getAlias(), status);
4514
0
            }
4515
0
            else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4516
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4517
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4518
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4519
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4520
0
            }
4521
0
            else if (propName.compare(u"javaLetter", -1) == 0) {
4522
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4523
0
            }
4524
0
            else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4525
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4526
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4527
0
            }
4528
0
            else if (propName.compare(u"javaLowerCase", -1) == 0) {
4529
0
                addCategory(set.getAlias(), U_GC_LL_MASK, status);
4530
0
            }
4531
0
            else if (propName.compare(u"javaMirrored", -1) == 0) {
4532
0
                set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4533
0
            }
4534
0
            else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4535
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4536
0
            }
4537
0
            else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4538
0
                set->add(0x10000, UnicodeSet::MAX_VALUE);
4539
0
            }
4540
0
            else if (propName.compare(u"javaTitleCase", -1) == 0) {
4541
0
                addCategory(set.getAlias(), U_GC_LT_MASK, status);
4542
0
            }
4543
0
            else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4544
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4545
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4546
0
            }
4547
0
            else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4548
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4549
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4550
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4551
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4552
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4553
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4554
0
                addIdentifierIgnorable(set.getAlias(), status);
4555
0
            }
4556
0
            else if (propName.compare(u"javaUpperCase", -1) == 0) {
4557
0
                addCategory(set.getAlias(), U_GC_LU_MASK, status);
4558
0
            }
4559
0
            else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4560
0
                set->add(0, UnicodeSet::MAX_VALUE);
4561
0
            }
4562
0
            else if (propName.compare(u"javaWhitespace", -1) == 0) {
4563
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4564
0
                set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4565
0
                set->add(9, 0x0d).add(0x1c, 0x1f);
4566
0
            } else {
4567
0
                status = U_REGEX_PROPERTY_SYNTAX;
4568
0
            }
4569
4570
0
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4571
0
                set->closeOver(USET_CASE_INSENSITIVE);
4572
0
            }
4573
0
            break;
4574
0
        }
4575
4576
        // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4577
        // extensions matched it.
4578
2.06k
        status = U_REGEX_PROPERTY_SYNTAX;
4579
2.06k
    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4580
4581
83.3k
    if (U_SUCCESS(status)) {
4582
        // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4583
        // deal with properties of strings and character classes with strings, we ignore them.
4584
        // Just in case something downstream might stumble over the strings,
4585
        // we remove them from the set.
4586
        // Note that when we support strings, the complement of a property (as with \P)
4587
        // should be implemented as .complement().removeAllStrings() (code point complement).
4588
81.1k
        set->removeAllStrings();
4589
81.1k
        U_ASSERT(set.isValid());
4590
81.1k
        if (negated) {
4591
2.60k
            set->complement();
4592
2.60k
        }
4593
81.1k
        return set.orphan();
4594
81.1k
    } else {
4595
2.12k
        if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4596
46
            status = U_REGEX_PROPERTY_SYNTAX;
4597
46
        }
4598
2.12k
        error(status);
4599
2.12k
        return nullptr;
4600
2.12k
    }
4601
83.3k
}
4602
4603
4604
//
4605
//  SetEval   Part of the evaluation of [set expressions].
4606
//            Perform any pending (stacked) operations with precedence
4607
//            equal or greater to that of the next operator encountered
4608
//            in the expression.
4609
//
4610
12.6M
void RegexCompile::setEval(int32_t nextOp) {
4611
12.6M
    UnicodeSet *rightOperand = nullptr;
4612
12.6M
    UnicodeSet *leftOperand  = nullptr;
4613
13.1M
    for (;;) {
4614
13.1M
        U_ASSERT(fSetOpStack.empty()==false);
4615
13.1M
        int32_t pendingSetOperation = fSetOpStack.peeki();
4616
13.1M
        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4617
12.6M
            break;
4618
12.6M
        }
4619
497k
        fSetOpStack.popi();
4620
497k
        U_ASSERT(fSetStack.empty() == false);
4621
497k
        rightOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4622
        // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4623
        // (see comments there).
4624
        // We also do not yet support string literals in character classes,
4625
        // so there should not be any strings.
4626
        // Note that when we support strings, the complement of a set (as with ^ or \P)
4627
        // should be implemented as .complement().removeAllStrings() (code point complement).
4628
497k
        U_ASSERT(!rightOperand->hasStrings());
4629
497k
        switch (pendingSetOperation) {
4630
11.3k
            case setNegation:
4631
11.3k
                rightOperand->complement();
4632
11.3k
                break;
4633
401k
            case setCaseClose:
4634
                // TODO: need a simple close function.  Ticket 6065
4635
401k
                rightOperand->closeOver(USET_CASE_INSENSITIVE);
4636
401k
                rightOperand->removeAllStrings();
4637
401k
                break;
4638
2.09k
            case setDifference1:
4639
2.59k
            case setDifference2:
4640
2.59k
                fSetStack.pop();
4641
2.59k
                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4642
2.59k
                leftOperand->removeAll(*rightOperand);
4643
2.59k
                delete rightOperand;
4644
2.59k
                break;
4645
958
            case setIntersection1:
4646
5.87k
            case setIntersection2:
4647
5.87k
                fSetStack.pop();
4648
5.87k
                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4649
5.87k
                leftOperand->retainAll(*rightOperand);
4650
5.87k
                delete rightOperand;
4651
5.87k
                break;
4652
76.6k
            case setUnion:
4653
76.6k
                fSetStack.pop();
4654
76.6k
                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4655
76.6k
                leftOperand->addAll(*rightOperand);
4656
76.6k
                delete rightOperand;
4657
76.6k
                break;
4658
0
            default:
4659
0
                UPRV_UNREACHABLE_EXIT;
4660
497k
            }
4661
497k
        }
4662
12.6M
    }
4663
4664
95.9k
void RegexCompile::setPushOp(int32_t op) {
4665
95.9k
    setEval(op);
4666
95.9k
    fSetOpStack.push(op, *fStatus);
4667
95.9k
    LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4668
95.9k
    fSetStack.push(lpSet.orphan(), *fStatus);
4669
95.9k
}
4670
4671
U_NAMESPACE_END
4672
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4673