Coverage Report

Created: 2026-06-23 06:26

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/regexcmp.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
//
4
//  file:  regexcmp.cpp
5
//
6
//  Copyright (C) 2002-2016 International Business Machines Corporation and others.
7
//  All Rights Reserved.
8
//
9
//  This file contains the ICU regular expression compiler, which is responsible
10
//  for processing a regular expression pattern into the compiled form that
11
//  is used by the match finding engine.
12
//
13
14
#include "unicode/utypes.h"
15
16
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
17
18
#include "unicode/ustring.h"
19
#include "unicode/unistr.h"
20
#include "unicode/uniset.h"
21
#include "unicode/uchar.h"
22
#include "unicode/uchriter.h"
23
#include "unicode/parsepos.h"
24
#include "unicode/parseerr.h"
25
#include "unicode/regex.h"
26
#include "unicode/utf.h"
27
#include "unicode/utf16.h"
28
#include "patternprops.h"
29
#include "putilimp.h"
30
#include "cmemory.h"
31
#include "cstr.h"
32
#include "cstring.h"
33
#include "uvectr32.h"
34
#include "uvectr64.h"
35
#include "uassert.h"
36
#include "uinvchar.h"
37
38
#include "regeximp.h"
39
#include "regexcst.h"   // Contains state table for the regex pattern parser.
40
                        //   generated by a Perl script.
41
#include "regexcmp.h"
42
#include "regexst.h"
43
#include "regextxt.h"
44
45
46
47
U_NAMESPACE_BEGIN
48
49
50
//------------------------------------------------------------------------------
51
//
52
//  Constructor.
53
//
54
//------------------------------------------------------------------------------
55
RegexCompile::RegexCompile(RegexPattern *rxp, UErrorCode &status) :
56
20.4k
   fParenStack(status), fSetStack(uprv_deleteUObject, nullptr, status), fSetOpStack(status)
57
20.4k
{
58
    // Lazy init of all shared global sets (needed for init()'s empty text)
59
20.4k
    RegexStaticSets::initGlobals(&status);
60
61
20.4k
    fStatus           = &status;
62
63
20.4k
    fRXPat            = rxp;
64
20.4k
    fScanIndex        = 0;
65
20.4k
    fLastChar         = -1;
66
20.4k
    fPeekChar         = -1;
67
20.4k
    fLineNum          = 1;
68
20.4k
    fCharNum          = 0;
69
20.4k
    fQuoteMode        = false;
70
20.4k
    fInBackslashQuote = false;
71
20.4k
    fModeFlags        = fRXPat->fFlags | 0x80000000;
72
20.4k
    fEOLComments      = true;
73
74
20.4k
    fMatchOpenParen   = -1;
75
20.4k
    fMatchCloseParen  = -1;
76
20.4k
    fCaptureName      = nullptr;
77
20.4k
    fLastSetLiteral   = U_SENTINEL;
78
79
20.4k
    if (U_SUCCESS(status) && U_FAILURE(rxp->fDeferredStatus)) {
80
0
        status = rxp->fDeferredStatus;
81
0
    }
82
20.4k
}
83
84
static const char16_t   chAmp       = 0x26;      // '&'
85
static const char16_t   chDash      = 0x2d;      // '-'
86
87
88
//------------------------------------------------------------------------------
89
//
90
//  Destructor
91
//
92
//------------------------------------------------------------------------------
93
20.4k
RegexCompile::~RegexCompile() {
94
20.4k
    delete fCaptureName;         // Normally will be nullptr, but can exist if pattern
95
                                 //   compilation stops with a syntax error.
96
20.4k
}
97
98
1.51k
static inline void addCategory(UnicodeSet *set, int32_t value, UErrorCode& ec) {
99
1.51k
    set->addAll(UnicodeSet().applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, value, ec));
100
1.51k
}
101
102
//------------------------------------------------------------------------------
103
//
104
//  Compile regex pattern.   The state machine for rexexp pattern parsing is here.
105
//                           The state tables are hand-written in the file regexcst.txt,
106
//                           and converted to the form used here by a perl
107
//                           script regexcst.pl
108
//
109
//------------------------------------------------------------------------------
110
void    RegexCompile::compile(
111
                         const UnicodeString &pat,   // Source pat to be compiled.
112
                         UParseError &pp,            // Error position info
113
                         UErrorCode &e)              // Error Code
114
9.64k
{
115
9.64k
    fRXPat->fPatternString = new UnicodeString(pat);
116
9.64k
    UText patternText = UTEXT_INITIALIZER;
117
9.64k
    utext_openConstUnicodeString(&patternText, fRXPat->fPatternString, &e);
118
119
9.64k
    if (U_SUCCESS(e)) {
120
9.64k
        compile(&patternText, pp, e);
121
9.64k
        utext_close(&patternText);
122
9.64k
    }
123
9.64k
}
124
125
//
126
//   compile, UText mode
127
//     All the work is actually done here.
128
//
129
void    RegexCompile::compile(
130
                         UText *pat,                 // Source pat to be compiled.
131
                         UParseError &pp,            // Error position info
132
                         UErrorCode &e)              // Error Code
133
20.4k
{
134
20.4k
    fStatus             = &e;
135
20.4k
    fParseErr           = &pp;
136
20.4k
    fStackPtr           = 0;
137
20.4k
    fStack[fStackPtr]   = 0;
138
139
20.4k
    if (U_FAILURE(*fStatus)) {
140
0
        return;
141
0
    }
142
143
    // There should be no pattern stuff in the RegexPattern object.  They can not be reused.
144
20.4k
    U_ASSERT(fRXPat->fPattern == nullptr || utext_nativeLength(fRXPat->fPattern) == 0);
145
146
    // Prepare the RegexPattern object to receive the compiled pattern.
147
20.4k
    fRXPat->fPattern        = utext_clone(fRXPat->fPattern, pat, false, true, fStatus);
148
20.4k
    if (U_FAILURE(*fStatus)) {
149
0
        return;
150
0
    }
151
152
    // Initialize the pattern scanning state machine
153
20.4k
    fPatternLength = utext_nativeLength(pat);
154
20.4k
    uint16_t                state = 1;
155
20.4k
    const RegexTableEl      *tableEl;
156
157
    // UREGEX_LITERAL force entire pattern to be treated as a literal string.
158
20.4k
    if (fModeFlags & UREGEX_LITERAL) {
159
0
        fQuoteMode = true;
160
0
    }
161
162
20.4k
    nextChar(fC);                        // Fetch the first char from the pattern string.
163
164
    //
165
    // Main loop for the regex pattern parsing state machine.
166
    //   Runs once per state transition.
167
    //   Each time through optionally performs, depending on the state table,
168
    //      - an advance to the next pattern char
169
    //      - an action to be performed.
170
    //      - pushing or popping a state to/from the local state return stack.
171
    //   file regexcst.txt is the source for the state table.  The logic behind
172
    //     recongizing the pattern syntax is there, not here.
173
    //
174
167M
    for (;;) {
175
        //  Bail out if anything has gone wrong.
176
        //  Regex pattern parsing stops on the first error encountered.
177
167M
        if (U_FAILURE(*fStatus)) {
178
256
            break;
179
256
        }
180
181
167M
        U_ASSERT(state != 0);
182
183
        // Find the state table element that matches the input char from the pattern, or the
184
        //    class of the input character.  Start with the first table row for this
185
        //    state, then linearly scan forward until we find a row that matches the
186
        //    character.  The last row for each state always matches all characters, so
187
        //    the search will stop there, if not before.
188
        //
189
167M
        tableEl = &gRuleParseStateTable[state];
190
167M
        REGEX_SCAN_DEBUG_PRINTF(("char, line, col = (\'%c\', %d, %d)    state=%s ",
191
167M
            fC.fChar, fLineNum, fCharNum, RegexStateNames[state]));
192
193
765M
        for (;;) {    // loop through table rows belonging to this state, looking for one
194
                      //   that matches the current input char.
195
765M
            REGEX_SCAN_DEBUG_PRINTF(("."));
196
765M
            if (tableEl->fCharClass < 127 && fC.fQuoted == false &&   tableEl->fCharClass == fC.fChar) {
197
                // Table row specified an individual character, not a set, and
198
                //   the input character is not quoted, and
199
                //   the input character matched it.
200
25.4M
                break;
201
25.4M
            }
202
739M
            if (tableEl->fCharClass == 255) {
203
                // Table row specified default, match anything character class.
204
100M
                break;
205
100M
            }
206
639M
            if (tableEl->fCharClass == 254 && fC.fQuoted)  {
207
                // Table row specified "quoted" and the char was quoted.
208
2.08M
                break;
209
2.08M
            }
210
637M
            if (tableEl->fCharClass == 253 && fC.fChar == static_cast<UChar32>(-1)) {
211
                // Table row specified eof and we hit eof on the input.
212
16.7k
                break;
213
16.7k
            }
214
215
637M
            if (tableEl->fCharClass >= 128 && tableEl->fCharClass < 240 &&   // Table specs a char class &&
216
62.4M
                fC.fQuoted == false &&                                       //   char is not escaped &&
217
62.4M
                fC.fChar != static_cast<UChar32>(-1)) {                      //   char is not EOF
218
62.4M
                U_ASSERT(tableEl->fCharClass <= 137);
219
62.4M
                if (RegexStaticSets::gStaticSets->fRuleSets[tableEl->fCharClass-128].contains(fC.fChar)) {
220
                    // Table row specified a character class, or set of characters,
221
                    //   and the current char matches it.
222
39.9M
                    break;
223
39.9M
                }
224
62.4M
            }
225
226
            // No match on this row, advance to the next  row for this state,
227
597M
            tableEl++;
228
597M
        }
229
167M
        REGEX_SCAN_DEBUG_PRINTF(("\n"));
230
231
        //
232
        // We've found the row of the state table that matches the current input
233
        //   character from the rules string.
234
        // Perform any action specified  by this row in the state table.
235
167M
        if (doParseActions(tableEl->fAction) == false) {
236
            // Break out of the state machine loop if the
237
            //   the action signalled some kind of error, or
238
            //   the action was to exit, occurs on normal end-of-rules-input.
239
20.1k
            break;
240
20.1k
        }
241
242
167M
        if (tableEl->fPushState != 0) {
243
980k
            fStackPtr++;
244
980k
            if (fStackPtr >= kStackSize) {
245
12
                error(U_REGEX_INTERNAL_ERROR);
246
12
                REGEX_SCAN_DEBUG_PRINTF(("RegexCompile::parse() - state stack overflow.\n"));
247
12
                fStackPtr--;
248
12
            }
249
980k
            fStack[fStackPtr] = tableEl->fPushState;
250
980k
        }
251
252
        //
253
        //  NextChar.  This is where characters are actually fetched from the pattern.
254
        //             Happens under control of the 'n' tag in the state table.
255
        //
256
167M
        if (tableEl->fNextChar) {
257
80.8M
            nextChar(fC);
258
80.8M
        }
259
260
        // Get the next state from the table entry, or from the
261
        //   state stack if the next state was specified as "pop".
262
167M
        if (tableEl->fNextState != 255) {
263
166M
            state = tableEl->fNextState;
264
166M
        } else {
265
955k
            state = fStack[fStackPtr];
266
955k
            fStackPtr--;
267
955k
            if (fStackPtr < 0) {
268
                // state stack underflow
269
                // This will occur if the user pattern has mis-matched parentheses,
270
                //   with extra close parens.
271
                //
272
0
                fStackPtr++;
273
0
                error(U_REGEX_MISMATCHED_PAREN);
274
0
            }
275
955k
        }
276
277
167M
    }
278
279
20.4k
    if (U_FAILURE(*fStatus)) {
280
        // Bail out if the pattern had errors.
281
8.01k
        return;
282
8.01k
    }
283
284
    //
285
    // The pattern has now been read and processed, and the compiled code generated.
286
    //
287
288
    //
289
    // The pattern's fFrameSize so far has accumulated the requirements for
290
    //   storage for capture parentheses, counters, etc. that are encountered
291
    //   in the pattern.  Add space for the two variables that are always
292
    //   present in the saved state:  the input string position (int64_t) and
293
    //   the position in the compiled pattern.
294
    //
295
12.4k
    allocateStackData(RESTACKFRAME_HDRCOUNT);
296
297
    //
298
    // Optimization pass 1: NOPs, back-references, and case-folding
299
    //
300
12.4k
    stripNOPs();
301
302
    //
303
    // Get bounds for the minimum and maximum length of a string that this
304
    //   pattern can match.  Used to avoid looking for matches in strings that
305
    //   are too short.
306
    //
307
12.4k
    fRXPat->fMinMatchLen = minMatchLength(3, fRXPat->fCompiledPat->size()-1);
308
309
    //
310
    // Optimization pass 2: match start type
311
    //
312
12.4k
    matchStartType();
313
314
    //
315
    // Set up fast latin-1 range sets
316
    //
317
12.4k
    int32_t numSets = fRXPat->fSets->size();
318
12.4k
    fRXPat->fSets8 = new Regex8BitSet[numSets];
319
    // Null pointer check.
320
12.4k
    if (fRXPat->fSets8 == nullptr) {
321
0
        e = *fStatus = U_MEMORY_ALLOCATION_ERROR;
322
0
        return;
323
0
    }
324
12.4k
    int32_t i;
325
225k
    for (i=0; i<numSets; i++) {
326
213k
        UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(i));
327
213k
        fRXPat->fSets8[i].init(s);
328
213k
    }
329
330
12.4k
}
331
332
333
334
335
336
//------------------------------------------------------------------------------
337
//
338
//  doParseAction        Do some action during regex pattern parsing.
339
//                       Called by the parse state machine.
340
//
341
//                       Generation of the match engine PCode happens here, or
342
//                       in functions called from the parse actions defined here.
343
//
344
//
345
//------------------------------------------------------------------------------
346
UBool RegexCompile::doParseActions(int32_t action)
347
167M
{
348
167M
    UBool   returnVal = true;
349
350
167M
    switch (static_cast<Regex_PatternParseAction>(action)) {
351
352
20.3k
    case doPatStart:
353
        // Start of pattern compiles to:
354
        //0   SAVE   2        Fall back to position of FAIL
355
        //1   jmp    3
356
        //2   FAIL            Stop if we ever reach here.
357
        //3   NOP             Dummy, so start of pattern looks the same as
358
        //                    the start of an ( grouping.
359
        //4   NOP             Resreved, will be replaced by a save if there are
360
        //                    OR | operators at the top level
361
20.3k
        appendOp(URX_STATE_SAVE, 2);
362
20.3k
        appendOp(URX_JMP,  3);
363
20.3k
        appendOp(URX_FAIL, 0);
364
365
        // Standard open nonCapture paren action emits the two NOPs and
366
        //   sets up the paren stack frame.
367
20.3k
        doParseActions(doOpenNonCaptureParen);
368
20.3k
        break;
369
370
14.7k
    case doPatFinish:
371
        // We've scanned to the end of the pattern
372
        //  The end of pattern compiles to:
373
        //        URX_END
374
        //    which will stop the runtime match engine.
375
        //  Encountering end of pattern also behaves like a close paren,
376
        //   and forces fixups of the State Save at the beginning of the compiled pattern
377
        //   and of any OR operations at the top level.
378
        //
379
14.7k
        handleCloseParen();
380
14.7k
        if (fParenStack.size() > 0) {
381
            // Missing close paren in pattern.
382
2.30k
            error(U_REGEX_MISMATCHED_PAREN);
383
2.30k
        }
384
385
        // add the END operation to the compiled pattern.
386
14.7k
        appendOp(URX_END, 0);
387
388
        // Terminate the pattern compilation state machine.
389
14.7k
        returnVal = false;
390
14.7k
        break;
391
392
393
394
21.5M
    case doOrOperator:
395
        // Scanning a '|', as in (A|B)
396
21.5M
        {
397
            // Generate code for any pending literals preceding the '|'
398
21.5M
            fixLiterals(false);
399
400
            // Insert a SAVE operation at the start of the pattern section preceding
401
            //   this OR at this level.  This SAVE will branch the match forward
402
            //   to the right hand side of the OR in the event that the left hand
403
            //   side fails to match and backtracks.  Locate the position for the
404
            //   save from the location on the top of the parentheses stack.
405
21.5M
            int32_t savePosition = fParenStack.popi();
406
21.5M
            int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(savePosition));
407
21.5M
            U_ASSERT(URX_TYPE(op) == URX_NOP);  // original contents of reserved location
408
21.5M
            op = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+1);
409
21.5M
            fRXPat->fCompiledPat->setElementAt(op, savePosition);
410
411
            // Append an JMP operation into the compiled pattern.  The operand for
412
            //  the JMP will eventually be the location following the ')' for the
413
            //  group.  This will be patched in later, when the ')' is encountered.
414
21.5M
            appendOp(URX_JMP, 0);
415
416
            // Push the position of the newly added JMP op onto the parentheses stack.
417
            // This registers if for fixup when this block's close paren is encountered.
418
21.5M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
419
420
            // Append a NOP to the compiled pattern.  This is the slot reserved
421
            //   for a SAVE in the event that there is yet another '|' following
422
            //   this one.
423
21.5M
            appendOp(URX_NOP, 0);
424
21.5M
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);
425
21.5M
        }
426
21.5M
        break;
427
428
429
708
    case doBeginNamedCapture:
430
        // Scanning (?<letter.
431
        //   The first letter of the name will come through again under doConinueNamedCapture.
432
708
        fCaptureName = new UnicodeString();
433
708
        if (fCaptureName == nullptr) {
434
0
            error(U_MEMORY_ALLOCATION_ERROR);
435
0
        }
436
708
        break;
437
438
2.96k
    case  doContinueNamedCapture:
439
2.96k
        fCaptureName->append(fC.fChar);
440
2.96k
        break;
441
442
39
    case doBadNamedCapture:
443
39
        error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
444
39
        break;
445
        
446
429k
    case doOpenCaptureParen:
447
        // Open Capturing Paren, possibly named.
448
        //   Compile to a
449
        //      - NOP, which later may be replaced by a save-state if the
450
        //         parenthesized group gets a * quantifier, followed by
451
        //      - START_CAPTURE  n    where n is stack frame offset to the capture group variables.
452
        //      - NOP, which may later be replaced by a save-state if there
453
        //             is an '|' alternation within the parens.
454
        //
455
        //    Each capture group gets three slots in the save stack frame:
456
        //         0: Capture Group start position (in input string being matched.)
457
        //         1: Capture Group end position.
458
        //         2: Start of Match-in-progress.
459
        //    The first two locations are for a completed capture group, and are
460
        //     referred to by back references and the like.
461
        //    The third location stores the capture start position when an START_CAPTURE is
462
        //      encountered.  This will be promoted to a completed capture when (and if) the corresponding
463
        //      END_CAPTURE is encountered.
464
429k
        {
465
429k
            fixLiterals();
466
429k
            appendOp(URX_NOP, 0);
467
429k
            int32_t  varsLoc = allocateStackData(3);    // Reserve three slots in match stack frame.
468
429k
            appendOp(URX_START_CAPTURE, varsLoc);
469
429k
            appendOp(URX_NOP, 0);
470
471
            // On the Parentheses stack, start a new frame and add the positions
472
            //   of the two NOPs.  Depending on what follows in the pattern, the
473
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
474
            //   address of the end of the parenthesized group.
475
429k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
476
429k
            fParenStack.push(capturing, *fStatus);                        // Frame type.
477
429k
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first  NOP location
478
429k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
479
480
            // Save the mapping from group number to stack frame variable position.
481
429k
            fRXPat->fGroupMap->addElement(varsLoc, *fStatus);
482
483
            // If this is a named capture group, add the name->group number mapping.
484
429k
            if (fCaptureName != nullptr) {
485
691
                if (!fRXPat->initNamedCaptureMap()) {
486
0
                    if (U_SUCCESS(*fStatus)) {
487
0
                        error(fRXPat->fDeferredStatus);
488
0
                    }
489
0
                    break;
490
0
                }
491
691
                int32_t groupNumber = fRXPat->fGroupMap->size();
492
691
                int32_t previousMapping = uhash_puti(fRXPat->fNamedCaptureMap, fCaptureName, groupNumber, fStatus);
493
691
                fCaptureName = nullptr;    // hash table takes ownership of the name (key) string.
494
691
                if (previousMapping > 0 && U_SUCCESS(*fStatus)) {
495
37
                    error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
496
37
                }
497
691
            }
498
429k
        }
499
429k
        break;
500
501
429k
    case doOpenNonCaptureParen:
502
        // Open non-caputuring (grouping only) Paren.
503
        //   Compile to a
504
        //      - NOP, which later may be replaced by a save-state if the
505
        //         parenthesized group gets a * quantifier, followed by
506
        //      - NOP, which may later be replaced by a save-state if there
507
        //             is an '|' alternation within the parens.
508
21.3k
        {
509
21.3k
            fixLiterals();
510
21.3k
            appendOp(URX_NOP, 0);
511
21.3k
            appendOp(URX_NOP, 0);
512
513
            // On the Parentheses stack, start a new frame and add the positions
514
            //   of the two NOPs.
515
21.3k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
516
21.3k
            fParenStack.push(plain,      *fStatus);                       // Begin a new frame.
517
21.3k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
518
21.3k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP loc
519
21.3k
        }
520
21.3k
         break;
521
522
523
1.19k
    case doOpenAtomicParen:
524
        // Open Atomic Paren.  (?>
525
        //   Compile to a
526
        //      - NOP, which later may be replaced if the parenthesized group
527
        //         has a quantifier, followed by
528
        //      - STO_SP  save state stack position, so it can be restored at the ")"
529
        //      - NOP, which may later be replaced by a save-state if there
530
        //             is an '|' alternation within the parens.
531
1.19k
        {
532
1.19k
            fixLiterals();
533
1.19k
            appendOp(URX_NOP, 0);
534
1.19k
            int32_t  varLoc = allocateData(1);    // Reserve a data location for saving the state stack ptr.
535
1.19k
            appendOp(URX_STO_SP, varLoc);
536
1.19k
            appendOp(URX_NOP, 0);
537
538
            // On the Parentheses stack, start a new frame and add the positions
539
            //   of the two NOPs.  Depending on what follows in the pattern, the
540
            //   NOPs may be changed to SAVE_STATE or JMP ops, with a target
541
            //   address of the end of the parenthesized group.
542
1.19k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
543
1.19k
            fParenStack.push(atomic, *fStatus);                           // Frame type.
544
1.19k
            fParenStack.push(fRXPat->fCompiledPat->size()-3, *fStatus);   // The first NOP
545
1.19k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
546
1.19k
        }
547
1.19k
        break;
548
549
550
1.28k
    case doOpenLookAhead:
551
        // Positive Look-ahead   (?=  stuff  )
552
        //
553
        //   Note:   Addition of transparent input regions, with the need to
554
        //           restore the original regions when failing out of a lookahead
555
        //           block, complicated this sequence.  Some combined opcodes
556
        //           might make sense - or might not, lookahead aren't that common.
557
        //
558
        //      Caution:  min match length optimization knows about this
559
        //               sequence; don't change without making updates there too.
560
        //
561
        // Compiles to
562
        //    1    LA_START     dataLoc     Saves SP, Input Pos, Active input region.
563
        //    2.   STATE_SAVE   4            on failure of lookahead, goto 4
564
        //    3    JMP          6           continue ...
565
        //
566
        //    4.   LA_END                   Look Ahead failed.  Restore regions.
567
        //    5.   BACKTRACK                and back track again.
568
        //
569
        //    6.   NOP              reserved for use by quantifiers on the block.
570
        //                          Look-ahead can't have quantifiers, but paren stack
571
        //                             compile time conventions require the slot anyhow.
572
        //    7.   NOP              may be replaced if there is are '|' ops in the block.
573
        //    8.     code for parenthesized stuff.
574
        //    9.   LA_END
575
        //
576
        //  Four data slots are reserved, for saving state on entry to the look-around
577
        //    0:   stack pointer on entry.
578
        //    1:   input position on entry.
579
        //    2:   fActiveStart, the active bounds start on entry.
580
        //    3:   fActiveLimit, the active bounds limit on entry.
581
1.28k
        {
582
1.28k
            fixLiterals();
583
1.28k
            int32_t dataLoc = allocateData(4);
584
1.28k
            appendOp(URX_LA_START, dataLoc);
585
1.28k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+ 2);
586
1.28k
            appendOp(URX_JMP, fRXPat->fCompiledPat->size()+ 3);
587
1.28k
            appendOp(URX_LA_END, dataLoc);
588
1.28k
            appendOp(URX_BACKTRACK, 0);
589
1.28k
            appendOp(URX_NOP, 0);
590
1.28k
            appendOp(URX_NOP, 0);
591
592
            // On the Parentheses stack, start a new frame and add the positions
593
            //   of the NOPs.
594
1.28k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
595
1.28k
            fParenStack.push(lookAhead, *fStatus);                        // Frame type.
596
1.28k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first  NOP location
597
1.28k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
598
1.28k
        }
599
1.28k
        break;
600
601
4.87k
    case doOpenLookAheadNeg:
602
        // Negated Lookahead.   (?! stuff )
603
        // Compiles to
604
        //    1.    LA_START    dataloc
605
        //    2.    SAVE_STATE  7         // Fail within look-ahead block restores to this state,
606
        //                                //   which continues with the match.
607
        //    3.    NOP                   // Std. Open Paren sequence, for possible '|'
608
        //    4.       code for parenthesized stuff.
609
        //    5.    LA_END                // Cut back stack, remove saved state from step 2.
610
        //    6.    BACKTRACK             // code in block succeeded, so neg. lookahead fails.
611
        //    7.    END_LA                // Restore match region, in case look-ahead was using
612
        //                                        an alternate (transparent) region.
613
        //  Four data slots are reserved, for saving state on entry to the look-around
614
        //    0:   stack pointer on entry.
615
        //    1:   input position on entry.
616
        //    2:   fActiveStart, the active bounds start on entry.
617
        //    3:   fActiveLimit, the active bounds limit on entry.
618
4.87k
        {
619
4.87k
            fixLiterals();
620
4.87k
            int32_t dataLoc = allocateData(4);
621
4.87k
            appendOp(URX_LA_START, dataLoc);
622
4.87k
            appendOp(URX_STATE_SAVE, 0);    // dest address will be patched later.
623
4.87k
            appendOp(URX_NOP, 0);
624
625
            // On the Parentheses stack, start a new frame and add the positions
626
            //   of the StateSave and NOP.
627
4.87k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
628
4.87k
            fParenStack.push(negLookAhead, *fStatus);                    // Frame type
629
4.87k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The STATE_SAVE location
630
4.87k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP location
631
632
            // Instructions #5 - #7 will be added when the ')' is encountered.
633
4.87k
        }
634
4.87k
        break;
635
636
3.68k
    case doOpenLookBehind:
637
3.68k
        {
638
            //   Compile a (?<= look-behind open paren.
639
            //
640
            //          Compiles to
641
            //              0       URX_LB_START     dataLoc
642
            //              1       URX_LB_CONT      dataLoc
643
            //              2                        MinMatchLen
644
            //              3                        MaxMatchLen
645
            //              4       URX_NOP          Standard '(' boilerplate.
646
            //              5       URX_NOP          Reserved slot for use with '|' ops within (block).
647
            //              6         <code for LookBehind expression>
648
            //              7       URX_LB_END       dataLoc    # Check match len, restore input  len
649
            //              8       URX_LA_END       dataLoc    # Restore stack, input pos
650
            //
651
            //          Allocate a block of matcher data, to contain (when running a match)
652
            //              0:    Stack ptr on entry
653
            //              1:    Input Index on entry
654
            //              2:    fActiveStart, the active bounds start on entry.
655
            //              3:    fActiveLimit, the active bounds limit on entry.
656
            //              4:    Start index of match current match attempt.
657
            //          The first four items must match the layout of data for LA_START / LA_END
658
659
            // Generate match code for any pending literals.
660
3.68k
            fixLiterals();
661
662
            // Allocate data space
663
3.68k
            int32_t dataLoc = allocateData(5);
664
665
            // Emit URX_LB_START
666
3.68k
            appendOp(URX_LB_START, dataLoc);
667
668
            // Emit URX_LB_CONT
669
3.68k
            appendOp(URX_LB_CONT, dataLoc);
670
3.68k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
671
3.68k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
672
673
            // Emit the NOPs
674
3.68k
            appendOp(URX_NOP, 0);
675
3.68k
            appendOp(URX_NOP, 0);
676
677
            // On the Parentheses stack, start a new frame and add the positions
678
            //   of the URX_LB_CONT and the NOP.
679
3.68k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
680
3.68k
            fParenStack.push(lookBehind, *fStatus);                       // Frame type
681
3.68k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
682
3.68k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
683
684
            // The final two instructions will be added when the ')' is encountered.
685
3.68k
        }
686
687
3.68k
        break;
688
689
5.18k
    case doOpenLookBehindNeg:
690
5.18k
        {
691
            //   Compile a (?<! negated look-behind open paren.
692
            //
693
            //          Compiles to
694
            //              0       URX_LB_START     dataLoc    # Save entry stack, input len
695
            //              1       URX_LBN_CONT     dataLoc    # Iterate possible match positions
696
            //              2                        MinMatchLen
697
            //              3                        MaxMatchLen
698
            //              4                        continueLoc (9)
699
            //              5       URX_NOP          Standard '(' boilerplate.
700
            //              6       URX_NOP          Reserved slot for use with '|' ops within (block).
701
            //              7         <code for LookBehind expression>
702
            //              8       URX_LBN_END      dataLoc    # Check match len, cause a FAIL
703
            //              9       ...
704
            //
705
            //          Allocate a block of matcher data, to contain (when running a match)
706
            //              0:    Stack ptr on entry
707
            //              1:    Input Index on entry
708
            //              2:    fActiveStart, the active bounds start on entry.
709
            //              3:    fActiveLimit, the active bounds limit on entry.
710
            //              4:    Start index of match current match attempt.
711
            //          The first four items must match the layout of data for LA_START / LA_END
712
713
            // Generate match code for any pending literals.
714
5.18k
            fixLiterals();
715
716
            // Allocate data space
717
5.18k
            int32_t dataLoc = allocateData(5);
718
719
            // Emit URX_LB_START
720
5.18k
            appendOp(URX_LB_START, dataLoc);
721
722
            // Emit URX_LBN_CONT
723
5.18k
            appendOp(URX_LBN_CONT, dataLoc);
724
5.18k
            appendOp(URX_RESERVED_OP, 0);    // MinMatchLength.  To be filled later.
725
5.18k
            appendOp(URX_RESERVED_OP, 0);    // MaxMatchLength.  To be filled later.
726
5.18k
            appendOp(URX_RESERVED_OP, 0);    // Continue Loc.    To be filled later.
727
728
            // Emit the NOPs
729
5.18k
            appendOp(URX_NOP, 0);
730
5.18k
            appendOp(URX_NOP, 0);
731
732
            // On the Parentheses stack, start a new frame and add the positions
733
            //   of the URX_LB_CONT and the NOP.
734
5.18k
            fParenStack.push(fModeFlags, *fStatus);                       // Match mode state
735
5.18k
            fParenStack.push(lookBehindN, *fStatus);                      // Frame type
736
5.18k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP location
737
5.18k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The 2nd   NOP location
738
739
            // The final two instructions will be added when the ')' is encountered.
740
5.18k
        }
741
5.18k
        break;
742
743
2
    case doConditionalExpr:
744
        // Conditionals such as (?(1)a:b)
745
4
    case doPerlInline:
746
        // Perl inline-conditionals.  (?{perl code}a|b) We're not perl, no way to do them.
747
4
        error(U_REGEX_UNIMPLEMENTED);
748
4
        break;
749
750
751
442k
    case doCloseParen:
752
442k
        handleCloseParen();
753
442k
        if (fParenStack.size() <= 0) {
754
            //  Extra close paren, or missing open paren.
755
24
            error(U_REGEX_MISMATCHED_PAREN);
756
24
        }
757
442k
        break;
758
759
86.8M
    case doNOP:
760
86.8M
        break;
761
762
763
51
    case doBadOpenParenType:
764
103
    case doRuleError:
765
103
        error(U_REGEX_RULE_SYNTAX);
766
103
        break;
767
768
769
13
    case doMismatchedParenErr:
770
13
        error(U_REGEX_MISMATCHED_PAREN);
771
13
        break;
772
773
31.2k
    case doPlus:
774
        //  Normal '+'  compiles to
775
        //     1.   stuff to be repeated  (already built)
776
        //     2.   jmp-sav 1
777
        //     3.   ...
778
        //
779
        //  Or, if the item to be repeated can match a zero length string,
780
        //     1.   STO_INP_LOC  data-loc
781
        //     2.      body of stuff to be repeated
782
        //     3.   JMP_SAV_X    2
783
        //     4.   ...
784
785
        //
786
        //  Or, if the item to be repeated is simple
787
        //     1.   Item to be repeated.
788
        //     2.   LOOP_SR_I    set number  (assuming repeated item is a set ref)
789
        //     3.   LOOP_C       stack location
790
31.2k
        {
791
31.2k
            int32_t  topLoc = blockTopLoc(false);        // location of item #1
792
31.2k
            int32_t  frameLoc;
793
794
            // Check for simple constructs, which may get special optimized code.
795
31.2k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
796
27.4k
                int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc));
797
798
27.4k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
799
                    // Emit optimized code for [char set]+
800
711
                    appendOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
801
711
                    frameLoc = allocateStackData(1);
802
711
                    appendOp(URX_LOOP_C, frameLoc);
803
711
                    break;
804
711
                }
805
806
26.6k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
807
23.5k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
808
22.7k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
809
                    // Emit Optimized code for .+ operations.
810
4.71k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
811
4.71k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
812
                        // URX_LOOP_DOT_I operand is a flag indicating ". matches any" mode.
813
844
                        loopOpI |= 1;
814
844
                    }
815
4.71k
                    if (fModeFlags & UREGEX_UNIX_LINES) {
816
1.07k
                        loopOpI |= 2;
817
1.07k
                    }
818
4.71k
                    appendOp(loopOpI);
819
4.71k
                    frameLoc = allocateStackData(1);
820
4.71k
                    appendOp(URX_LOOP_C, frameLoc);
821
4.71k
                    break;
822
4.71k
                }
823
824
26.6k
            }
825
826
            // General case.
827
828
            // Check for minimum match length of zero, which requires
829
            //    extra loop-breaking code.
830
25.8k
            if (minMatchLength(topLoc, fRXPat->fCompiledPat->size()-1) == 0) {
831
                // Zero length match is possible.
832
                // Emit the code sequence that can handle it.
833
2.31k
                insertOp(topLoc);
834
2.31k
                frameLoc = allocateStackData(1);
835
836
2.31k
                int32_t op = buildOp(URX_STO_INP_LOC, frameLoc);
837
2.31k
                fRXPat->fCompiledPat->setElementAt(op, topLoc);
838
839
2.31k
                appendOp(URX_JMP_SAV_X, topLoc+1);
840
23.5k
            } else {
841
                // Simpler code when the repeated body must match something non-empty
842
23.5k
                appendOp(URX_JMP_SAV, topLoc);
843
23.5k
            }
844
25.8k
        }
845
0
        break;
846
847
2.32k
    case doNGPlus:
848
        //  Non-greedy '+?'  compiles to
849
        //     1.   stuff to be repeated  (already built)
850
        //     2.   state-save  1
851
        //     3.   ...
852
2.32k
        {
853
2.32k
            int32_t topLoc      = blockTopLoc(false);
854
2.32k
            appendOp(URX_STATE_SAVE, topLoc);
855
2.32k
        }
856
2.32k
        break;
857
858
859
19.5k
    case doOpt:
860
        // Normal (greedy) ? quantifier.
861
        //  Compiles to
862
        //     1. state save 3
863
        //     2.    body of optional block
864
        //     3. ...
865
        // Insert the state save into the compiled pattern, and we're done.
866
19.5k
        {
867
19.5k
            int32_t   saveStateLoc = blockTopLoc(true);
868
19.5k
            int32_t   saveStateOp  = buildOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size());
869
19.5k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
870
19.5k
        }
871
19.5k
        break;
872
873
2.37k
    case doNGOpt:
874
        // Non-greedy ?? quantifier
875
        //   compiles to
876
        //    1.  jmp   4
877
        //    2.     body of optional block
878
        //    3   jmp   5
879
        //    4.  state save 2
880
        //    5    ...
881
        //  This code is less than ideal, with two jmps instead of one, because we can only
882
        //  insert one instruction at the top of the block being iterated.
883
2.37k
        {
884
2.37k
            int32_t  jmp1_loc = blockTopLoc(true);
885
2.37k
            int32_t  jmp2_loc = fRXPat->fCompiledPat->size();
886
887
2.37k
            int32_t  jmp1_op  = buildOp(URX_JMP, jmp2_loc+1);
888
2.37k
            fRXPat->fCompiledPat->setElementAt(jmp1_op, jmp1_loc);
889
890
2.37k
            appendOp(URX_JMP, jmp2_loc+2);
891
892
2.37k
            appendOp(URX_STATE_SAVE, jmp1_loc+1);
893
2.37k
        }
894
2.37k
        break;
895
896
897
261k
    case doStar:
898
        // Normal (greedy) * quantifier.
899
        // Compiles to
900
        //       1.   STATE_SAVE   4
901
        //       2.      body of stuff being iterated over
902
        //       3.   JMP_SAV      2
903
        //       4.   ...
904
        //
905
        // Or, if the body is a simple [Set],
906
        //       1.   LOOP_SR_I    set number
907
        //       2.   LOOP_C       stack location
908
        //       ...
909
        //
910
        // Or if this is a .*
911
        //       1.   LOOP_DOT_I    (. matches all mode flag)
912
        //       2.   LOOP_C        stack location
913
        //
914
        // Or, if the body can match a zero-length string, to inhibit infinite loops,
915
        //       1.   STATE_SAVE   5
916
        //       2.   STO_INP_LOC  data-loc
917
        //       3.      body of stuff
918
        //       4.   JMP_SAV_X    2
919
        //       5.   ...
920
261k
        {
921
            // location of item #1, the STATE_SAVE
922
261k
            int32_t   topLoc = blockTopLoc(false);
923
261k
            int32_t   dataLoc = -1;
924
925
            // Check for simple *, where the construct being repeated
926
            //   compiled to single opcode, and might be optimizable.
927
261k
            if (topLoc == fRXPat->fCompiledPat->size() - 1) {
928
155k
                int32_t repeatedOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topLoc));
929
930
155k
                if (URX_TYPE(repeatedOp) == URX_SETREF) {
931
                    // Emit optimized code for a [char set]*
932
3.15k
                    int32_t loopOpI = buildOp(URX_LOOP_SR_I, URX_VAL(repeatedOp));
933
3.15k
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
934
3.15k
                    dataLoc = allocateStackData(1);
935
3.15k
                    appendOp(URX_LOOP_C, dataLoc);
936
3.15k
                    break;
937
3.15k
                }
938
939
152k
                if (URX_TYPE(repeatedOp) == URX_DOTANY ||
940
146k
                    URX_TYPE(repeatedOp) == URX_DOTANY_ALL ||
941
145k
                    URX_TYPE(repeatedOp) == URX_DOTANY_UNIX) {
942
                    // Emit Optimized code for .* operations.
943
7.56k
                    int32_t loopOpI = buildOp(URX_LOOP_DOT_I, 0);
944
7.56k
                    if (URX_TYPE(repeatedOp) == URX_DOTANY_ALL) {
945
                        // URX_LOOP_DOT_I operand is a flag indicating . matches any mode.
946
1.04k
                        loopOpI |= 1;
947
1.04k
                    }
948
7.56k
                    if ((fModeFlags & UREGEX_UNIX_LINES) != 0) {
949
1.10k
                        loopOpI |= 2;
950
1.10k
                    }
951
7.56k
                    fRXPat->fCompiledPat->setElementAt(loopOpI, topLoc);
952
7.56k
                    dataLoc = allocateStackData(1);
953
7.56k
                    appendOp(URX_LOOP_C, dataLoc);
954
7.56k
                    break;
955
7.56k
                }
956
152k
            }
957
958
            // Emit general case code for this *
959
            // The optimizations did not apply.
960
961
250k
            int32_t   saveStateLoc = blockTopLoc(true);
962
250k
            int32_t   jmpOp        = buildOp(URX_JMP_SAV, saveStateLoc+1);
963
964
            // Check for minimum match length of zero, which requires
965
            //    extra loop-breaking code.
966
250k
            if (minMatchLength(saveStateLoc, fRXPat->fCompiledPat->size()-1) == 0) {
967
94.8k
                insertOp(saveStateLoc);
968
94.8k
                dataLoc = allocateStackData(1);
969
970
94.8k
                int32_t op = buildOp(URX_STO_INP_LOC, dataLoc);
971
94.8k
                fRXPat->fCompiledPat->setElementAt(op, saveStateLoc+1);
972
94.8k
                jmpOp      = buildOp(URX_JMP_SAV_X, saveStateLoc+2);
973
94.8k
            }
974
975
            // Locate the position in the compiled pattern where the match will continue
976
            //   after completing the *.   (4 or 5 in the comment above)
977
250k
            int32_t continueLoc = fRXPat->fCompiledPat->size()+1;
978
979
            // Put together the save state op and store it into the compiled code.
980
250k
            int32_t saveStateOp = buildOp(URX_STATE_SAVE, continueLoc);
981
250k
            fRXPat->fCompiledPat->setElementAt(saveStateOp, saveStateLoc);
982
983
            // Append the URX_JMP_SAV or URX_JMPX operation to the compiled pattern.
984
250k
            appendOp(jmpOp);
985
250k
        }
986
0
        break;
987
988
667
    case doNGStar:
989
        // Non-greedy *? quantifier
990
        // compiles to
991
        //     1.   JMP    3
992
        //     2.      body of stuff being iterated over
993
        //     3.   STATE_SAVE  2
994
        //     4    ...
995
667
        {
996
667
            int32_t     jmpLoc  = blockTopLoc(true);                   // loc  1.
997
667
            int32_t     saveLoc = fRXPat->fCompiledPat->size();        // loc  3.
998
667
            int32_t     jmpOp   = buildOp(URX_JMP, saveLoc);
999
667
            fRXPat->fCompiledPat->setElementAt(jmpOp, jmpLoc);
1000
667
            appendOp(URX_STATE_SAVE, jmpLoc+1);
1001
667
        }
1002
667
        break;
1003
1004
1005
116k
    case doIntervalInit:
1006
        // The '{' opening an interval quantifier was just scanned.
1007
        // Init the counter variables that will accumulate the values as the digits
1008
        //    are scanned.
1009
116k
        fIntervalLow = 0;
1010
116k
        fIntervalUpper = -1;
1011
116k
        break;
1012
1013
122k
    case doIntevalLowerDigit:
1014
        // Scanned a digit from the lower value of an {lower,upper} interval
1015
122k
        {
1016
122k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1017
122k
            U_ASSERT(digitValue >= 0);
1018
122k
            int64_t val = static_cast<int64_t>(fIntervalLow) * 10 + digitValue;
1019
122k
            if (val > INT32_MAX) {
1020
6
                error(U_REGEX_NUMBER_TOO_BIG);
1021
122k
            } else {
1022
122k
                fIntervalLow = static_cast<int32_t>(val);
1023
122k
            }
1024
122k
        }
1025
122k
        break;
1026
1027
99.4k
    case doIntervalUpperDigit:
1028
        // Scanned a digit from the upper value of an {lower,upper} interval
1029
99.4k
        {
1030
99.4k
            if (fIntervalUpper < 0) {
1031
93.7k
                fIntervalUpper = 0;
1032
93.7k
            }
1033
99.4k
            int32_t digitValue = u_charDigitValue(fC.fChar);
1034
99.4k
            U_ASSERT(digitValue >= 0);
1035
99.4k
            int64_t val = static_cast<int64_t>(fIntervalUpper) * 10 + digitValue;
1036
99.4k
            if (val > INT32_MAX) {
1037
15
                error(U_REGEX_NUMBER_TOO_BIG);
1038
99.4k
            } else {
1039
99.4k
                fIntervalUpper = static_cast<int32_t>(val);
1040
99.4k
            }
1041
99.4k
        }
1042
99.4k
        break;
1043
1044
21.2k
    case doIntervalSame:
1045
        // Scanned a single value interval like {27}.  Upper = Lower.
1046
21.2k
        fIntervalUpper = fIntervalLow;
1047
21.2k
        break;
1048
1049
109k
    case doInterval:
1050
        // Finished scanning a normal {lower,upper} interval.  Generate the code for it.
1051
109k
        if (compileInlineInterval() == false) {
1052
5.97k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1053
5.97k
        }
1054
109k
        break;
1055
1056
2.86k
    case doPossessiveInterval:
1057
        // Finished scanning a Possessive {lower,upper}+ interval.  Generate the code for it.
1058
2.86k
        {
1059
            // Remember the loc for the top of the block being looped over.
1060
            //   (Can not reserve a slot in the compiled pattern at this time, because
1061
            //    compileInterval needs to reserve also, and blockTopLoc can only reserve
1062
            //    once per block.)
1063
2.86k
            int32_t topLoc = blockTopLoc(false);
1064
1065
            // Produce normal looping code.
1066
2.86k
            compileInterval(URX_CTR_INIT, URX_CTR_LOOP);
1067
1068
            // Surround the just-emitted normal looping code with a STO_SP ... LD_SP
1069
            //  just as if the loop was inclosed in atomic parentheses.
1070
1071
            // First the STO_SP before the start of the loop
1072
2.86k
            insertOp(topLoc);
1073
1074
2.86k
            int32_t  varLoc = allocateData(1);   // Reserve a data location for saving the
1075
2.86k
            int32_t  op     = buildOp(URX_STO_SP, varLoc);
1076
2.86k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1077
1078
2.86k
            int32_t loopOp = static_cast<int32_t>(fRXPat->fCompiledPat->popi());
1079
2.86k
            U_ASSERT(URX_TYPE(loopOp) == URX_CTR_LOOP && URX_VAL(loopOp) == topLoc);
1080
2.86k
            loopOp++;     // point LoopOp after the just-inserted STO_SP
1081
2.86k
            fRXPat->fCompiledPat->push(loopOp, *fStatus);
1082
1083
            // Then the LD_SP after the end of the loop
1084
2.86k
            appendOp(URX_LD_SP, varLoc);
1085
2.86k
        }
1086
1087
2.86k
        break;
1088
1089
3.53k
    case doNGInterval:
1090
        // Finished scanning a non-greedy {lower,upper}? interval.  Generate the code for it.
1091
3.53k
        compileInterval(URX_CTR_INIT_NG, URX_CTR_LOOP_NG);
1092
3.53k
        break;
1093
1094
145
    case doIntervalError:
1095
145
        error(U_REGEX_BAD_INTERVAL);
1096
145
        break;
1097
1098
41.6M
    case doLiteralChar:
1099
        // We've just scanned a "normal" character from the pattern,
1100
41.6M
        literalChar(fC.fChar);
1101
41.6M
        break;
1102
1103
1104
17.2k
    case doEscapedLiteralChar:
1105
        // We've just scanned an backslashed escaped character with  no
1106
        //   special meaning.  It represents itself.
1107
17.2k
        if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1108
0
            ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1109
0
            (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1110
0
               error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1111
0
             }
1112
17.2k
        literalChar(fC.fChar);
1113
17.2k
        break;
1114
1115
1116
485k
    case doDotAny:
1117
        // scanned a ".",  match any single character.
1118
485k
        {
1119
485k
            fixLiterals(false);
1120
485k
            if (fModeFlags & UREGEX_DOTALL) {
1121
6.37k
                appendOp(URX_DOTANY_ALL, 0);
1122
479k
            } else if (fModeFlags & UREGEX_UNIX_LINES) {
1123
4.98k
                appendOp(URX_DOTANY_UNIX, 0);
1124
474k
            } else {
1125
474k
                appendOp(URX_DOTANY, 0);
1126
474k
            }
1127
485k
        }
1128
485k
        break;
1129
1130
7.68k
    case doCaret:
1131
7.68k
        {
1132
7.68k
            fixLiterals(false);
1133
7.68k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1134
4.34k
                appendOp(URX_CARET, 0);
1135
4.34k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1136
1.39k
                appendOp(URX_CARET_M, 0);
1137
1.95k
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1138
476
                appendOp(URX_CARET, 0);   // Only testing true start of input.
1139
1.47k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1140
1.47k
                appendOp(URX_CARET_M_UNIX, 0);
1141
1.47k
            }
1142
7.68k
        }
1143
7.68k
        break;
1144
1145
108k
    case doDollar:
1146
108k
        {
1147
108k
            fixLiterals(false);
1148
108k
            if (       (fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1149
75.7k
                appendOp(URX_DOLLAR, 0);
1150
75.7k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) == 0) {
1151
2.43k
                appendOp(URX_DOLLAR_M, 0);
1152
29.9k
            } else if ((fModeFlags & UREGEX_MULTILINE) == 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1153
27.0k
                appendOp(URX_DOLLAR_D, 0);
1154
27.0k
            } else if ((fModeFlags & UREGEX_MULTILINE) != 0 && (fModeFlags & UREGEX_UNIX_LINES) != 0) {
1155
2.92k
                appendOp(URX_DOLLAR_MD, 0);
1156
2.92k
            }
1157
108k
        }
1158
108k
        break;
1159
1160
441
    case doBackslashA:
1161
441
        fixLiterals(false);
1162
441
        appendOp(URX_CARET, 0);
1163
441
        break;
1164
1165
8.44k
    case doBackslashB:
1166
8.44k
        {
1167
            #if  UCONFIG_NO_BREAK_ITERATION==1
1168
            if (fModeFlags & UREGEX_UWORD) {
1169
                error(U_UNSUPPORTED_ERROR);
1170
            }
1171
            #endif
1172
8.44k
            fixLiterals(false);
1173
8.44k
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1174
8.44k
            appendOp(op, 1);
1175
8.44k
        }
1176
8.44k
        break;
1177
1178
1.36k
    case doBackslashb:
1179
1.36k
        {
1180
            #if  UCONFIG_NO_BREAK_ITERATION==1
1181
            if (fModeFlags & UREGEX_UWORD) {
1182
                error(U_UNSUPPORTED_ERROR);
1183
            }
1184
            #endif
1185
1.36k
            fixLiterals(false);
1186
1.36k
            int32_t op = (fModeFlags & UREGEX_UWORD)? URX_BACKSLASH_BU : URX_BACKSLASH_B;
1187
1.36k
            appendOp(op, 0);
1188
1.36k
        }
1189
1.36k
        break;
1190
1191
4.29k
    case doBackslashD:
1192
4.29k
        fixLiterals(false);
1193
4.29k
        appendOp(URX_BACKSLASH_D, 1);
1194
4.29k
        break;
1195
1196
1.14k
    case doBackslashd:
1197
1.14k
        fixLiterals(false);
1198
1.14k
        appendOp(URX_BACKSLASH_D, 0);
1199
1.14k
        break;
1200
1201
982
    case doBackslashG:
1202
982
        fixLiterals(false);
1203
982
        appendOp(URX_BACKSLASH_G, 0);
1204
982
        break;
1205
1206
2.60k
    case doBackslashH:
1207
2.60k
        fixLiterals(false);
1208
2.60k
        appendOp(URX_BACKSLASH_H, 1);
1209
2.60k
        break;
1210
1211
1.35k
    case doBackslashh:
1212
1.35k
        fixLiterals(false);
1213
1.35k
        appendOp(URX_BACKSLASH_H, 0);
1214
1.35k
        break;
1215
1216
4.04k
    case doBackslashR:
1217
4.04k
        fixLiterals(false);
1218
4.04k
        appendOp(URX_BACKSLASH_R, 0);
1219
4.04k
        break;
1220
1221
20.3k
    case doBackslashS:
1222
20.3k
        fixLiterals(false);
1223
20.3k
        appendOp(URX_STAT_SETREF_N, URX_ISSPACE_SET);
1224
20.3k
        break;
1225
1226
2.72k
    case doBackslashs:
1227
2.72k
        fixLiterals(false);
1228
2.72k
        appendOp(URX_STATIC_SETREF, URX_ISSPACE_SET);
1229
2.72k
        break;
1230
1231
1.76k
    case doBackslashV:
1232
1.76k
        fixLiterals(false);
1233
1.76k
        appendOp(URX_BACKSLASH_V, 1);
1234
1.76k
        break;
1235
1236
1.37k
    case doBackslashv:
1237
1.37k
        fixLiterals(false);
1238
1.37k
        appendOp(URX_BACKSLASH_V, 0);
1239
1.37k
        break;
1240
1241
5.09k
    case doBackslashW:
1242
5.09k
        fixLiterals(false);
1243
5.09k
        appendOp(URX_STAT_SETREF_N, URX_ISWORD_SET);
1244
5.09k
        break;
1245
1246
4.60k
    case doBackslashw:
1247
4.60k
        fixLiterals(false);
1248
4.60k
        appendOp(URX_STATIC_SETREF, URX_ISWORD_SET);
1249
4.60k
        break;
1250
1251
2.15k
    case doBackslashX:
1252
        #if  UCONFIG_NO_BREAK_ITERATION==1
1253
        // Grapheme Cluster Boundary requires ICU break iteration.
1254
        error(U_UNSUPPORTED_ERROR);
1255
        #endif
1256
2.15k
        fixLiterals(false);
1257
2.15k
        appendOp(URX_BACKSLASH_X, 0);
1258
2.15k
        break;
1259
1260
443
    case doBackslashZ:
1261
443
        fixLiterals(false);
1262
443
        appendOp(URX_DOLLAR, 0);
1263
443
        break;
1264
1265
3.12k
    case doBackslashz:
1266
3.12k
        fixLiterals(false);
1267
3.12k
        appendOp(URX_BACKSLASH_Z, 0);
1268
3.12k
        break;
1269
1270
56
    case doEscapeError:
1271
56
        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1272
56
        break;
1273
1274
0
    case doExit:
1275
0
        fixLiterals(false);
1276
0
        returnVal = false;
1277
0
        break;
1278
1279
1.27k
    case doProperty:
1280
1.27k
        {
1281
1.27k
            fixLiterals(false);
1282
1.27k
            UnicodeSet *theSet = scanProp();
1283
1.27k
            compileSet(theSet);
1284
1.27k
        }
1285
1.27k
        break;
1286
1287
1.96k
    case doNamedChar:
1288
1.96k
        {
1289
1.96k
            UChar32 c = scanNamedChar();
1290
1.96k
            literalChar(c);
1291
1.96k
        }
1292
1.96k
        break;
1293
1294
1295
11.8k
    case doBackRef:
1296
        // BackReference.  Somewhat unusual in that the front-end can not completely parse
1297
        //                 the regular expression, because the number of digits to be consumed
1298
        //                 depends on the number of capture groups that have been defined.  So
1299
        //                 we have to do it here instead.
1300
11.8k
        {
1301
11.8k
            int32_t  numCaptureGroups = fRXPat->fGroupMap->size();
1302
11.8k
            int32_t  groupNum = 0;
1303
11.8k
            UChar32  c        = fC.fChar;
1304
1305
13.0k
            for (;;) {
1306
                // Loop once per digit, for max allowed number of digits in a back reference.
1307
13.0k
                int32_t digit = u_charDigitValue(c);
1308
13.0k
                groupNum = groupNum * 10 + digit;
1309
13.0k
                if (groupNum >= numCaptureGroups) {
1310
5.54k
                    break;
1311
5.54k
                }
1312
7.51k
                c = peekCharLL();
1313
7.51k
                if (RegexStaticSets::gStaticSets->fRuleDigitsAlias->contains(c) == false) {
1314
6.25k
                    break;
1315
6.25k
                }
1316
1.26k
                nextCharLL();
1317
1.26k
            }
1318
1319
            // Scan of the back reference in the source regexp is complete.  Now generate
1320
            //  the compiled code for it.
1321
            // Because capture groups can be forward-referenced by back-references,
1322
            //  we fill the operand with the capture group number.  At the end
1323
            //  of compilation, it will be changed to the variable's location.
1324
11.8k
            U_ASSERT(groupNum > 0);  // Shouldn't happen.  '\0' begins an octal escape sequence,
1325
                                     //    and shouldn't enter this code path at all.
1326
11.8k
            fixLiterals(false);
1327
11.8k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1328
7.52k
                appendOp(URX_BACKREF_I, groupNum);
1329
7.52k
            } else {
1330
4.27k
                appendOp(URX_BACKREF, groupNum);
1331
4.27k
            }
1332
11.8k
        }
1333
11.8k
        break;
1334
1335
1.21k
    case doBeginNamedBackRef:
1336
1.21k
        U_ASSERT(fCaptureName == nullptr);
1337
1.21k
        fCaptureName = new UnicodeString;
1338
1.21k
        if (fCaptureName == nullptr) {
1339
0
            error(U_MEMORY_ALLOCATION_ERROR);
1340
0
        }
1341
1.21k
        break;
1342
            
1343
3.96k
    case doContinueNamedBackRef:
1344
3.96k
        fCaptureName->append(fC.fChar);
1345
3.96k
        break;
1346
1347
1.19k
    case doCompleteNamedBackRef:
1348
1.19k
        {
1349
1.19k
        int32_t groupNumber =
1350
1.19k
            fRXPat->fNamedCaptureMap ? uhash_geti(fRXPat->fNamedCaptureMap, fCaptureName) : 0;
1351
1.19k
        if (groupNumber == 0) {
1352
            // Group name has not been defined.
1353
            //   Could be a forward reference. If we choose to support them at some
1354
            //   future time, extra mechanism will be required at this point.
1355
32
            error(U_REGEX_INVALID_CAPTURE_GROUP_NAME);
1356
1.16k
        } else {
1357
            // Given the number, handle identically to a \n numbered back reference.
1358
            // See comments above, under doBackRef
1359
1.16k
            fixLiterals(false);
1360
1.16k
            if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1361
280
                appendOp(URX_BACKREF_I, groupNumber);
1362
884
            } else {
1363
884
                appendOp(URX_BACKREF, groupNumber);
1364
884
            }
1365
1.16k
        }
1366
1.19k
        delete fCaptureName;
1367
1.19k
        fCaptureName = nullptr;
1368
1.19k
        break;
1369
261k
        }
1370
       
1371
39.3k
    case doPossessivePlus:
1372
        // Possessive ++ quantifier.
1373
        // Compiles to
1374
        //       1.   STO_SP
1375
        //       2.      body of stuff being iterated over
1376
        //       3.   STATE_SAVE 5
1377
        //       4.   JMP        2
1378
        //       5.   LD_SP
1379
        //       6.   ...
1380
        //
1381
        //  Note:  TODO:  This is pretty inefficient.  A mass of saved state is built up
1382
        //                then unconditionally discarded.  Perhaps introduce a new opcode.  Ticket 6056
1383
        //
1384
39.3k
        {
1385
            // Emit the STO_SP
1386
39.3k
            int32_t   topLoc = blockTopLoc(true);
1387
39.3k
            int32_t   stoLoc = allocateData(1);  // Reserve the data location for storing save stack ptr.
1388
39.3k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1389
39.3k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1390
1391
            // Emit the STATE_SAVE
1392
39.3k
            appendOp(URX_STATE_SAVE, fRXPat->fCompiledPat->size()+2);
1393
1394
            // Emit the JMP
1395
39.3k
            appendOp(URX_JMP, topLoc+1);
1396
1397
            // Emit the LD_SP
1398
39.3k
            appendOp(URX_LD_SP, stoLoc);
1399
39.3k
        }
1400
39.3k
        break;
1401
1402
26.1k
    case doPossessiveStar:
1403
        // Possessive *+ quantifier.
1404
        // Compiles to
1405
        //       1.   STO_SP       loc
1406
        //       2.   STATE_SAVE   5
1407
        //       3.      body of stuff being iterated over
1408
        //       4.   JMP          2
1409
        //       5.   LD_SP        loc
1410
        //       6    ...
1411
        // TODO:  do something to cut back the state stack each time through the loop.
1412
26.1k
        {
1413
            // Reserve two slots at the top of the block.
1414
26.1k
            int32_t   topLoc = blockTopLoc(true);
1415
26.1k
            insertOp(topLoc);
1416
1417
            // emit   STO_SP     loc
1418
26.1k
            int32_t   stoLoc = allocateData(1);    // Reserve the data location for storing save stack ptr.
1419
26.1k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1420
26.1k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1421
1422
            // Emit the SAVE_STATE   5
1423
26.1k
            int32_t L7 = fRXPat->fCompiledPat->size()+1;
1424
26.1k
            op = buildOp(URX_STATE_SAVE, L7);
1425
26.1k
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1426
1427
            // Append the JMP operation.
1428
26.1k
            appendOp(URX_JMP, topLoc+1);
1429
1430
            // Emit the LD_SP       loc
1431
26.1k
            appendOp(URX_LD_SP, stoLoc);
1432
26.1k
        }
1433
26.1k
        break;
1434
1435
30.5k
    case doPossessiveOpt:
1436
        // Possessive  ?+ quantifier.
1437
        //  Compiles to
1438
        //     1. STO_SP      loc
1439
        //     2. SAVE_STATE  5
1440
        //     3.    body of optional block
1441
        //     4. LD_SP       loc
1442
        //     5. ...
1443
        //
1444
30.5k
        {
1445
            // Reserve two slots at the top of the block.
1446
30.5k
            int32_t   topLoc = blockTopLoc(true);
1447
30.5k
            insertOp(topLoc);
1448
1449
            // Emit the STO_SP
1450
30.5k
            int32_t   stoLoc = allocateData(1);   // Reserve the data location for storing save stack ptr.
1451
30.5k
            int32_t   op     = buildOp(URX_STO_SP, stoLoc);
1452
30.5k
            fRXPat->fCompiledPat->setElementAt(op, topLoc);
1453
1454
            // Emit the SAVE_STATE
1455
30.5k
            int32_t   continueLoc = fRXPat->fCompiledPat->size()+1;
1456
30.5k
            op = buildOp(URX_STATE_SAVE, continueLoc);
1457
30.5k
            fRXPat->fCompiledPat->setElementAt(op, topLoc+1);
1458
1459
            // Emit the LD_SP
1460
30.5k
            appendOp(URX_LD_SP, stoLoc);
1461
30.5k
        }
1462
30.5k
        break;
1463
1464
1465
14.3k
    case doBeginMatchMode:
1466
14.3k
        fNewModeFlags = fModeFlags;
1467
14.3k
        fSetModeFlag  = true;
1468
14.3k
        break;
1469
1470
16.8k
    case doMatchMode:   //  (?i)    and similar
1471
16.8k
        {
1472
16.8k
            int32_t  bit = 0;
1473
16.8k
            switch (fC.fChar) {
1474
4.72k
            case 0x69: /* 'i' */   bit = UREGEX_CASE_INSENSITIVE; break;
1475
990
            case 0x64: /* 'd' */   bit = UREGEX_UNIX_LINES;       break;
1476
1.37k
            case 0x6d: /* 'm' */   bit = UREGEX_MULTILINE;        break;
1477
609
            case 0x73: /* 's' */   bit = UREGEX_DOTALL;           break;
1478
368
            case 0x75: /* 'u' */   bit = 0; /* Unicode casing */  break;
1479
3.07k
            case 0x77: /* 'w' */   bit = UREGEX_UWORD;            break;
1480
4.77k
            case 0x78: /* 'x' */   bit = UREGEX_COMMENTS;         break;
1481
983
            case 0x2d: /* '-' */   fSetModeFlag = false;          break;
1482
0
            default:
1483
0
                UPRV_UNREACHABLE_EXIT;  // Should never happen.  Other chars are filtered out
1484
                                        // by the scanner.
1485
16.8k
            }
1486
16.8k
            if (fSetModeFlag) {
1487
15.3k
                fNewModeFlags |= bit;
1488
15.3k
            } else {
1489
1.51k
                fNewModeFlags &= ~bit;
1490
1.51k
            }
1491
16.8k
        }
1492
0
        break;
1493
1494
9.08k
    case doSetMatchMode:
1495
        // Emit code to match any pending literals, using the not-yet changed match mode.
1496
9.08k
        fixLiterals();
1497
1498
        // We've got a (?i) or similar.  The match mode is being changed, but
1499
        //   the change is not scoped to a parenthesized block.
1500
9.08k
        U_ASSERT(fNewModeFlags < 0);
1501
9.08k
        fModeFlags = fNewModeFlags;
1502
1503
9.08k
        break;
1504
1505
1506
5.15k
    case doMatchModeParen:
1507
        // We've got a (?i: or similar.  Begin a parenthesized block, save old
1508
        //   mode flags so they can be restored at the close of the block.
1509
        //
1510
        //   Compile to a
1511
        //      - NOP, which later may be replaced by a save-state if the
1512
        //         parenthesized group gets a * quantifier, followed by
1513
        //      - NOP, which may later be replaced by a save-state if there
1514
        //             is an '|' alternation within the parens.
1515
5.15k
        {
1516
5.15k
            fixLiterals(false);
1517
5.15k
            appendOp(URX_NOP, 0);
1518
5.15k
            appendOp(URX_NOP, 0);
1519
1520
            // On the Parentheses stack, start a new frame and add the positions
1521
            //   of the two NOPs (a normal non-capturing () frame, except for the
1522
            //   saving of the original mode flags.)
1523
5.15k
            fParenStack.push(fModeFlags, *fStatus);
1524
5.15k
            fParenStack.push(flags, *fStatus);                            // Frame Marker
1525
5.15k
            fParenStack.push(fRXPat->fCompiledPat->size()-2, *fStatus);   // The first NOP
1526
5.15k
            fParenStack.push(fRXPat->fCompiledPat->size()-1, *fStatus);   // The second NOP
1527
1528
            // Set the current mode flags to the new values.
1529
5.15k
            U_ASSERT(fNewModeFlags < 0);
1530
5.15k
            fModeFlags = fNewModeFlags;
1531
5.15k
        }
1532
5.15k
        break;
1533
1534
113
    case doBadModeFlag:
1535
113
        error(U_REGEX_INVALID_FLAG);
1536
113
        break;
1537
1538
32.9k
    case doSuppressComments:
1539
        // We have just scanned a '(?'.  We now need to prevent the character scanner from
1540
        // treating a '#' as a to-the-end-of-line comment.
1541
        //   (This Perl compatibility just gets uglier and uglier to do...)
1542
32.9k
        fEOLComments = false;
1543
32.9k
        break;
1544
1545
1546
1.40k
    case doSetAddAmp:
1547
1.40k
        {
1548
1.40k
          UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1549
1.40k
          set->add(chAmp);
1550
1.40k
        }
1551
1.40k
        break;
1552
1553
1.72k
    case doSetAddDash:
1554
1.72k
        {
1555
1.72k
          UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1556
1.72k
          set->add(chDash);
1557
1.72k
        }
1558
1.72k
        break;
1559
1560
734
     case doSetBackslash_s:
1561
734
        {
1562
734
         UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1563
734
         set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]);
1564
734
         break;
1565
16.8k
        }
1566
1567
1.76k
     case doSetBackslash_S:
1568
1.76k
        {
1569
1.76k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1570
1.76k
            UnicodeSet SSet;
1571
1.76k
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISSPACE_SET]).complement();
1572
1.76k
            set->addAll(SSet);
1573
1.76k
            break;
1574
16.8k
        }
1575
1576
1.51k
    case doSetBackslash_d:
1577
1.51k
        {
1578
1.51k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1579
            // TODO - make a static set, ticket 6058.
1580
1.51k
            addCategory(set, U_GC_ND_MASK, *fStatus);
1581
1.51k
            break;
1582
16.8k
        }
1583
1584
8.94k
    case doSetBackslash_D:
1585
8.94k
        {
1586
8.94k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1587
8.94k
            UnicodeSet digits;
1588
            // TODO - make a static set, ticket 6058.
1589
8.94k
            digits.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
1590
8.94k
            digits.complement();
1591
8.94k
            set->addAll(digits);
1592
8.94k
            break;
1593
16.8k
        }
1594
1595
1.13k
    case doSetBackslash_h:
1596
1.13k
        {
1597
1.13k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1598
1.13k
            UnicodeSet h;
1599
1.13k
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1600
1.13k
            h.add(static_cast<UChar32>(9)); // Tab
1601
1.13k
            set->addAll(h);
1602
1.13k
            break;
1603
16.8k
        }
1604
1605
2.20k
    case doSetBackslash_H:
1606
2.20k
        {
1607
2.20k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1608
2.20k
            UnicodeSet h;
1609
2.20k
            h.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
1610
2.20k
            h.add(static_cast<UChar32>(9)); // Tab
1611
2.20k
            h.complement();
1612
2.20k
            set->addAll(h);
1613
2.20k
            break;
1614
16.8k
        }
1615
1616
1.92k
    case doSetBackslash_v:
1617
1.92k
        {
1618
1.92k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1619
1.92k
            set->add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
1620
1.92k
            set->add(static_cast<UChar32>(0x85));
1621
1.92k
            set->add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
1622
1.92k
            break;
1623
16.8k
        }
1624
1625
1.62k
    case doSetBackslash_V:
1626
1.62k
        {
1627
1.62k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1628
1.62k
            UnicodeSet v;
1629
1.62k
            v.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
1630
1.62k
            v.add(static_cast<UChar32>(0x85));
1631
1.62k
            v.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
1632
1.62k
            v.complement();
1633
1.62k
            set->addAll(v);
1634
1.62k
            break;
1635
16.8k
        }
1636
1637
2.10k
    case doSetBackslash_w:
1638
2.10k
        {
1639
2.10k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1640
2.10k
            set->addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]);
1641
2.10k
            break;
1642
16.8k
        }
1643
1644
15.1k
    case doSetBackslash_W:
1645
15.1k
        {
1646
15.1k
            UnicodeSet* set = static_cast<UnicodeSet*>(fSetStack.peek());
1647
15.1k
            UnicodeSet SSet;
1648
15.1k
            SSet.addAll(RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET]).complement();
1649
15.1k
            set->addAll(SSet);
1650
15.1k
            break;
1651
16.8k
        }
1652
1653
435k
    case doSetBegin:
1654
435k
        {
1655
435k
            fixLiterals(false);
1656
435k
            LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
1657
435k
            fSetStack.push(lpSet.orphan(), *fStatus);
1658
435k
            fSetOpStack.push(setStart, *fStatus);
1659
435k
            if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1660
387k
                fSetOpStack.push(setCaseClose, *fStatus);
1661
387k
            }
1662
435k
            break;
1663
16.8k
        }
1664
1665
1.69k
    case doSetBeginDifference1:
1666
        //  We have scanned something like [[abc]-[
1667
        //  Set up a new UnicodeSet for the set beginning with the just-scanned '['
1668
        //  Push a Difference operator, which will cause the new set to be subtracted from what
1669
        //    went before once it is created.
1670
1.69k
        setPushOp(setDifference1);
1671
1.69k
        fSetOpStack.push(setStart, *fStatus);
1672
1.69k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1673
957
            fSetOpStack.push(setCaseClose, *fStatus);
1674
957
        }
1675
1.69k
        break;
1676
1677
1.20k
    case doSetBeginIntersection1:
1678
        //  We have scanned something like  [[abc]&[
1679
        //   Need both the '&' operator and the open '[' operator.
1680
1.20k
        setPushOp(setIntersection1);
1681
1.20k
        fSetOpStack.push(setStart, *fStatus);
1682
1.20k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1683
808
            fSetOpStack.push(setCaseClose, *fStatus);
1684
808
        }
1685
1.20k
        break;
1686
1687
88.7k
    case doSetBeginUnion:
1688
        //  We have scanned something like  [[abc][
1689
        //     Need to handle the union operation explicitly [[abc] | [
1690
88.7k
        setPushOp(setUnion);
1691
88.7k
        fSetOpStack.push(setStart, *fStatus);
1692
88.7k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) != 0) {
1693
61.2k
            fSetOpStack.push(setCaseClose, *fStatus);
1694
61.2k
        }
1695
88.7k
        break;
1696
1697
453
    case doSetDifference2:
1698
        // We have scanned something like [abc--
1699
        //   Consider this to unambiguously be a set difference operator.
1700
453
        setPushOp(setDifference2);
1701
453
        break;
1702
1703
512k
    case doSetEnd:
1704
        // Have encountered the ']' that closes a set.
1705
        //    Force the evaluation of any pending operations within this set,
1706
        //    leave the completed set on the top of the set stack.
1707
512k
        setEval(setEnd);
1708
512k
        U_ASSERT(fSetOpStack.peeki()==setStart);
1709
512k
        fSetOpStack.popi();
1710
512k
        break;
1711
1712
431k
    case doSetFinish:
1713
431k
        {
1714
        // Finished a complete set expression, including all nested sets.
1715
        //   The close bracket has already triggered clearing out pending set operators,
1716
        //    the operator stack should be empty and the operand stack should have just
1717
        //    one entry, the result set.
1718
431k
        U_ASSERT(fSetOpStack.empty());
1719
431k
        UnicodeSet* theSet = static_cast<UnicodeSet*>(fSetStack.pop());
1720
431k
        U_ASSERT(fSetStack.empty());
1721
431k
        compileSet(theSet);
1722
431k
        break;
1723
16.8k
        }
1724
1725
7.37k
    case doSetIntersection2:
1726
        // Have scanned something like [abc&&
1727
7.37k
        setPushOp(setIntersection2);
1728
7.37k
        break;
1729
1730
13.5M
    case doSetLiteral:
1731
        // Union the just-scanned literal character into the set being built.
1732
        //    This operation is the highest precedence set operation, so we can always do
1733
        //    it immediately, without waiting to see what follows.  It is necessary to perform
1734
        //    any pending '-' or '&' operation first, because these have the same precedence
1735
        //    as union-ing in a literal'
1736
13.5M
        {
1737
13.5M
            setEval(setUnion);
1738
13.5M
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1739
13.5M
            s->add(fC.fChar);
1740
13.5M
            fLastSetLiteral = fC.fChar;
1741
13.5M
            break;
1742
16.8k
        }
1743
1744
31.6k
    case doSetLiteralEscaped:
1745
        // A back-slash escaped literal character was encountered.
1746
        // Processing is the same as with setLiteral, above, with the addition of
1747
        //  the optional check for errors on escaped ASCII letters.
1748
31.6k
        {
1749
31.6k
            if ((fModeFlags & UREGEX_ERROR_ON_UNKNOWN_ESCAPES) != 0 &&
1750
0
                ((fC.fChar >= 0x41 && fC.fChar<= 0x5A) ||     // in [A-Z]
1751
0
                 (fC.fChar >= 0x61 && fC.fChar <= 0x7a))) {   // in [a-z]
1752
0
                error(U_REGEX_BAD_ESCAPE_SEQUENCE);
1753
0
            }
1754
31.6k
            setEval(setUnion);
1755
31.6k
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1756
31.6k
            s->add(fC.fChar);
1757
31.6k
            fLastSetLiteral = fC.fChar;
1758
31.6k
            break;
1759
16.8k
        }
1760
1761
2.91k
        case doSetNamedChar:
1762
        // Scanning a \N{UNICODE CHARACTER NAME}
1763
        //  Aside from the source of the character, the processing is identical to doSetLiteral,
1764
        //    above.
1765
2.91k
        {
1766
2.91k
            UChar32  c = scanNamedChar();
1767
2.91k
            setEval(setUnion);
1768
2.91k
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1769
2.91k
            s->add(c);
1770
2.91k
            fLastSetLiteral = c;
1771
2.91k
            break;
1772
16.8k
        }
1773
1774
1.94k
    case doSetNamedRange:
1775
        // We have scanned literal-\N{CHAR NAME}.  Add the range to the set.
1776
        // The left character is already in the set, and is saved in fLastSetLiteral.
1777
        // The right side needs to be picked up, the scan is at the 'N'.
1778
        // Lower Limit > Upper limit being an error matches both Java
1779
        //        and ICU UnicodeSet behavior.
1780
1.94k
        {
1781
1.94k
            UChar32  c = scanNamedChar();
1782
1.94k
            if (U_SUCCESS(*fStatus) && (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > c)) {
1783
7
                error(U_REGEX_INVALID_RANGE);
1784
7
            }
1785
1.94k
            UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1786
1.94k
            s->add(fLastSetLiteral, c);
1787
1.94k
            fLastSetLiteral = c;
1788
1.94k
            break;
1789
16.8k
        }
1790
1791
1792
12.1k
    case  doSetNegate:
1793
        // Scanned a '^' at the start of a set.
1794
        // Push the negation operator onto the set op stack.
1795
        // A twist for case-insensitive matching:
1796
        //   the case closure operation must happen _before_ negation.
1797
        //   But the case closure operation will already be on the stack if it's required.
1798
        //   This requires checking for case closure, and swapping the stack order
1799
        //    if it is present.
1800
12.1k
        {
1801
12.1k
            int32_t  tosOp = fSetOpStack.peeki();
1802
12.1k
            if (tosOp == setCaseClose) {
1803
5.29k
                fSetOpStack.popi();
1804
5.29k
                fSetOpStack.push(setNegation, *fStatus);
1805
5.29k
                fSetOpStack.push(setCaseClose, *fStatus);
1806
6.85k
            } else {
1807
6.85k
                fSetOpStack.push(setNegation, *fStatus);
1808
6.85k
            }
1809
12.1k
        }
1810
12.1k
        break;
1811
1812
1.94k
    case doSetNoCloseError:
1813
1.94k
        error(U_REGEX_MISSING_CLOSE_BRACKET);
1814
1.94k
        break;
1815
1816
3
    case doSetOpError:
1817
3
        error(U_REGEX_RULE_SYNTAX);   //  -- or && at the end of a set.  Illegal.
1818
3
        break;
1819
1820
106k
    case doSetPosixProp:
1821
106k
        {
1822
106k
            UnicodeSet *s = scanPosixProp();
1823
106k
            if (s != nullptr) {
1824
75.4k
                UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek());
1825
75.4k
                tos->addAll(*s);
1826
75.4k
                delete s;
1827
75.4k
            }  // else error.  scanProp() reported the error status already.
1828
106k
        }
1829
106k
        break;
1830
1831
1.89k
    case doSetProp:
1832
        //  Scanned a \p \P within [brackets].
1833
1.89k
        {
1834
1.89k
            UnicodeSet *s = scanProp();
1835
1.89k
            if (s != nullptr) {
1836
1.87k
                UnicodeSet* tos = static_cast<UnicodeSet*>(fSetStack.peek());
1837
1.87k
                tos->addAll(*s);
1838
1.87k
                delete s;
1839
1.87k
            }  // else error.  scanProp() reported the error status already.
1840
1.89k
        }
1841
1.89k
        break;
1842
1843
1844
5.24k
    case doSetRange:
1845
        // We have scanned literal-literal.  Add the range to the set.
1846
        // The left character is already in the set, and is saved in fLastSetLiteral.
1847
        // The right side is the current character.
1848
        // Lower Limit > Upper limit being an error matches both Java
1849
        //        and ICU UnicodeSet behavior.
1850
5.24k
        {
1851
1852
5.24k
        if (fLastSetLiteral == U_SENTINEL || fLastSetLiteral > fC.fChar) {
1853
75
            error(U_REGEX_INVALID_RANGE);
1854
75
        }
1855
5.24k
        UnicodeSet* s = static_cast<UnicodeSet*>(fSetStack.peek());
1856
5.24k
        s->add(fLastSetLiteral, fC.fChar);
1857
5.24k
        break;
1858
16.8k
        }
1859
1860
0
    default:
1861
0
        UPRV_UNREACHABLE_EXIT;
1862
167M
    }
1863
1864
167M
    if (U_FAILURE(*fStatus)) {
1865
7.76k
        returnVal = false;
1866
7.76k
    }
1867
1868
167M
    return returnVal;
1869
167M
}
1870
1871
1872
1873
//------------------------------------------------------------------------------
1874
//
1875
//   literalChar           We've encountered a literal character from the pattern,
1876
//                             or an escape sequence that reduces to a character.
1877
//                         Add it to the string containing all literal chars/strings from
1878
//                             the pattern.
1879
//
1880
//------------------------------------------------------------------------------
1881
42.0M
void RegexCompile::literalChar(UChar32 c)  {
1882
42.0M
    fLiteralChars.append(c);
1883
42.0M
}
1884
1885
1886
//------------------------------------------------------------------------------
1887
//
1888
//    fixLiterals           When compiling something that can follow a literal
1889
//                          string in a pattern, emit the code to match the
1890
//                          accumulated literal string.
1891
//
1892
//                          Optionally, split the last char of the string off into
1893
//                          a single "ONE_CHAR" operation, so that quantifiers can
1894
//                          apply to that char alone.  Example:   abc*
1895
//                          The * must apply to the 'c' only.
1896
//
1897
//------------------------------------------------------------------------------
1898
25.1M
void    RegexCompile::fixLiterals(UBool split) {
1899
1900
    // If no literal characters have been scanned but not yet had code generated
1901
    //   for them, nothing needs to be done.
1902
25.1M
    if (fLiteralChars.length() == 0) {
1903
22.9M
        return;
1904
22.9M
    }
1905
1906
2.12M
    int32_t indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1907
2.12M
    UChar32 lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1908
1909
    // Split:  We need to  ensure that the last item in the compiled pattern
1910
    //     refers only to the last literal scanned in the pattern, so that
1911
    //     quantifiers (*, +, etc.) affect only it, and not a longer string.
1912
    //     Split before case folding for case insensitive matches.
1913
1914
2.12M
    if (split) {
1915
370k
        fLiteralChars.truncate(indexOfLastCodePoint);
1916
370k
        fixLiterals(false);   // Recursive call, emit code to match the first part of the string.
1917
                              //  Note that the truncated literal string may be empty, in which case
1918
                              //  nothing will be emitted.
1919
1920
370k
        literalChar(lastCodePoint);  // Re-add the last code point as if it were a new literal.
1921
370k
        fixLiterals(false);          // Second recursive call, code for the final code point.
1922
370k
        return;
1923
370k
    }
1924
1925
    // If we are doing case-insensitive matching, case fold the string.  This may expand
1926
    //   the string, e.g. the German sharp-s turns into "ss"
1927
1.75M
    if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1928
1.36M
        fLiteralChars.foldCase();
1929
1.36M
        indexOfLastCodePoint = fLiteralChars.moveIndex32(fLiteralChars.length(), -1);
1930
1.36M
        lastCodePoint = fLiteralChars.char32At(indexOfLastCodePoint);
1931
1.36M
    }
1932
1933
1.75M
    if (indexOfLastCodePoint == 0) {
1934
        // Single character, emit a URX_ONECHAR op to match it.
1935
522k
        if ((fModeFlags & UREGEX_CASE_INSENSITIVE) &&
1936
285k
                 u_hasBinaryProperty(lastCodePoint, UCHAR_CASE_SENSITIVE)) {
1937
26.2k
            appendOp(URX_ONECHAR_I, lastCodePoint);
1938
496k
        } else {
1939
496k
            appendOp(URX_ONECHAR, lastCodePoint);
1940
496k
        }
1941
1.23M
    } else {
1942
        // Two or more chars, emit a URX_STRING to match them.
1943
1.23M
        if (fLiteralChars.length() > 0x00ffffff || fRXPat->fLiteralText.length() > 0x00ffffff) {
1944
0
            error(U_REGEX_PATTERN_TOO_BIG);
1945
0
        }
1946
1.23M
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
1947
1.07M
            appendOp(URX_STRING_I, fRXPat->fLiteralText.length());
1948
1.07M
        } else {
1949
            // TODO here:  add optimization to split case sensitive strings of length two
1950
            //             into two single char ops, for efficiency.
1951
157k
            appendOp(URX_STRING, fRXPat->fLiteralText.length());
1952
157k
        }
1953
1.23M
        appendOp(URX_STRING_LEN, fLiteralChars.length());
1954
1955
        // Add this string into the accumulated strings of the compiled pattern.
1956
1.23M
        fRXPat->fLiteralText.append(fLiteralChars);
1957
1.23M
    }
1958
1959
1.75M
    fLiteralChars.remove();
1960
1.75M
}
1961
1962
1963
108M
int32_t RegexCompile::buildOp(int32_t type, int32_t val) {
1964
108M
    if (U_FAILURE(*fStatus)) {
1965
325k
        return 0;
1966
325k
    }
1967
108M
    if (type < 0 || type > 255) {
1968
0
        UPRV_UNREACHABLE_EXIT;
1969
0
    }
1970
108M
    if (val > 0x00ffffff) {
1971
0
        UPRV_UNREACHABLE_EXIT;
1972
0
    }
1973
108M
    if (val < 0) {
1974
0
        if (!(type == URX_RESERVED_OP_N || type == URX_RESERVED_OP)) {
1975
0
            UPRV_UNREACHABLE_EXIT;
1976
0
        }
1977
0
        if (URX_TYPE(val) != 0xff) {
1978
0
            UPRV_UNREACHABLE_EXIT;
1979
0
        }
1980
0
        type = URX_RESERVED_OP_N;
1981
0
    }
1982
108M
    return (type << 24) | val;
1983
108M
}
1984
1985
1986
//------------------------------------------------------------------------------
1987
//
1988
//   appendOp()             Append a new instruction onto the compiled pattern
1989
//                          Includes error checking, limiting the size of the
1990
//                          pattern to lengths that can be represented in the
1991
//                          24 bit operand field of an instruction.
1992
//
1993
//------------------------------------------------------------------------------
1994
51.1M
void RegexCompile::appendOp(int32_t op) {
1995
51.1M
    if (U_FAILURE(*fStatus)) {
1996
2.37k
        return;
1997
2.37k
    }
1998
51.1M
    fRXPat->fCompiledPat->addElement(op, *fStatus);
1999
51.1M
    if ((fRXPat->fCompiledPat->size() > 0x00fffff0) && U_SUCCESS(*fStatus)) {
2000
0
        error(U_REGEX_PATTERN_TOO_BIG);
2001
0
    }
2002
51.1M
}
2003
2004
49.3M
void RegexCompile::appendOp(int32_t type, int32_t val) {
2005
49.3M
    appendOp(buildOp(type, val));
2006
49.3M
}
2007
2008
2009
//------------------------------------------------------------------------------
2010
//
2011
//   insertOp()             Insert a slot for a new opcode into the already
2012
//                          compiled pattern code.
2013
//
2014
//                          Fill the slot with a NOP.  Our caller will replace it
2015
//                          with what they really wanted.
2016
//
2017
//------------------------------------------------------------------------------
2018
283k
void   RegexCompile::insertOp(int32_t where) {
2019
283k
    UVector64 *code = fRXPat->fCompiledPat;
2020
283k
    U_ASSERT(where>0 && where < code->size());
2021
2022
283k
    int32_t  nop = buildOp(URX_NOP, 0);
2023
283k
    code->insertElementAt(nop, where, *fStatus);
2024
2025
    // Walk through the pattern, looking for any ops with targets that
2026
    //  were moved down by the insert.  Fix them.
2027
283k
    int32_t loc;
2028
5.78G
    for (loc=0; loc<code->size(); loc++) {
2029
5.78G
        int32_t op = static_cast<int32_t>(code->elementAti(loc));
2030
5.78G
        int32_t opType = URX_TYPE(op);
2031
5.78G
        int32_t opValue = URX_VAL(op);
2032
5.78G
        if ((opType == URX_JMP         ||
2033
5.08G
            opType == URX_JMPX         ||
2034
5.08G
            opType == URX_STATE_SAVE   ||
2035
3.04G
            opType == URX_CTR_LOOP     ||
2036
3.04G
            opType == URX_CTR_LOOP_NG  ||
2037
3.04G
            opType == URX_JMP_SAV      ||
2038
2.98G
            opType == URX_JMP_SAV_X    ||
2039
2.91G
            opType == URX_RELOC_OPRND)    && opValue > where) {
2040
            // Target location for this opcode is after the insertion point and
2041
            //   needs to be incremented to adjust for the insertion.
2042
9.87M
            opValue++;
2043
9.87M
            op = buildOp(opType, opValue);
2044
9.87M
            code->setElementAt(op, loc);
2045
9.87M
        }
2046
5.78G
    }
2047
2048
    // Now fix up the parentheses stack.  All positive values in it are locations in
2049
    //  the compiled pattern.   (Negative values are frame boundaries, and don't need fixing.)
2050
600M
    for (loc=0; loc<fParenStack.size(); loc++) {
2051
599M
        int32_t x = fParenStack.elementAti(loc);
2052
599M
        U_ASSERT(x < code->size());
2053
599M
        if (x>where) {
2054
0
            x++;
2055
0
            fParenStack.setElementAt(x, loc);
2056
0
        }
2057
599M
    }
2058
2059
283k
    if (fMatchCloseParen > where) {
2060
16.1k
        fMatchCloseParen++;
2061
16.1k
    }
2062
283k
    if (fMatchOpenParen > where) {
2063
0
        fMatchOpenParen++;
2064
0
    }
2065
283k
}
2066
2067
2068
//------------------------------------------------------------------------------
2069
//
2070
//   allocateData()        Allocate storage in the matcher's static data area.
2071
//                         Return the index for the newly allocated data.
2072
//                         The storage won't actually exist until we are running a match
2073
//                         operation, but the storage indexes are inserted into various
2074
//                         opcodes while compiling the pattern.
2075
//
2076
//------------------------------------------------------------------------------
2077
115k
int32_t RegexCompile::allocateData(int32_t size) {
2078
115k
    if (U_FAILURE(*fStatus)) {
2079
20
        return 0;
2080
20
    }
2081
115k
    if (size <= 0 || size > 0x100 || fRXPat->fDataSize < 0) {
2082
0
        error(U_REGEX_INTERNAL_ERROR);
2083
0
        return 0;
2084
0
    }
2085
115k
    int32_t dataIndex = fRXPat->fDataSize;
2086
115k
    fRXPat->fDataSize += size;
2087
115k
    if (fRXPat->fDataSize >= 0x00fffff0) {
2088
0
        error(U_REGEX_INTERNAL_ERROR);
2089
0
    }
2090
115k
    return dataIndex;
2091
115k
}
2092
2093
2094
//------------------------------------------------------------------------------
2095
//
2096
//   allocateStackData()   Allocate space in the back-tracking stack frame.
2097
//                         Return the index for the newly allocated data.
2098
//                         The frame indexes are inserted into various
2099
//                         opcodes while compiling the pattern, meaning that frame
2100
//                         size must be restricted to the size that will fit
2101
//                         as an operand (24 bits).
2102
//
2103
//------------------------------------------------------------------------------
2104
568k
int32_t RegexCompile::allocateStackData(int32_t size) {
2105
568k
    if (U_FAILURE(*fStatus)) {
2106
0
        return 0;
2107
0
    }
2108
568k
    if (size <= 0 || size > 0x100 || fRXPat->fFrameSize < 0) {
2109
0
        error(U_REGEX_INTERNAL_ERROR);
2110
0
        return 0;
2111
0
    }
2112
568k
    int32_t dataIndex = fRXPat->fFrameSize;
2113
568k
    fRXPat->fFrameSize += size;
2114
568k
    if (fRXPat->fFrameSize >= 0x00fffff0) {
2115
0
        error(U_REGEX_PATTERN_TOO_BIG);
2116
0
    }
2117
568k
    return dataIndex;
2118
568k
}
2119
2120
2121
//------------------------------------------------------------------------------
2122
//
2123
//   blockTopLoc()          Find or create a location in the compiled pattern
2124
//                          at the start of the operation or block that has
2125
//                          just been compiled.  Needed when a quantifier (* or
2126
//                          whatever) appears, and we need to add an operation
2127
//                          at the start of the thing being quantified.
2128
//
2129
//                          (Parenthesized Blocks) have a slot with a NOP that
2130
//                          is reserved for this purpose.  .* or similar don't
2131
//                          and a slot needs to be added.
2132
//
2133
//       parameter reserveLoc   :  true -  ensure that there is space to add an opcode
2134
//                                         at the returned location.
2135
//                                 false - just return the address,
2136
//                                         do not reserve a location there.
2137
//
2138
//------------------------------------------------------------------------------
2139
784k
int32_t   RegexCompile::blockTopLoc(UBool reserveLoc) {
2140
784k
    int32_t   theLoc;
2141
784k
    fixLiterals(true);  // Emit code for any pending literals.
2142
                        //   If last item was a string, emit separate op for the its last char.
2143
784k
    if (fRXPat->fCompiledPat->size() == fMatchCloseParen)
2144
19.7k
    {
2145
        // The item just processed is a parenthesized block.
2146
19.7k
        theLoc = fMatchOpenParen;   // A slot is already reserved for us.
2147
19.7k
        U_ASSERT(theLoc > 0);
2148
19.7k
        U_ASSERT(URX_TYPE(((uint32_t)fRXPat->fCompiledPat->elementAti(theLoc))) == URX_NOP);
2149
19.7k
    }
2150
764k
    else {
2151
        // Item just compiled is a single thing, a ".", or a single char, a string or a set reference.
2152
        // No slot for STATE_SAVE was pre-reserved in the compiled code.
2153
        // We need to make space now.
2154
764k
        theLoc = fRXPat->fCompiledPat->size()-1;
2155
764k
        int32_t opAtTheLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(theLoc));
2156
764k
        if (URX_TYPE(opAtTheLoc) == URX_STRING_LEN) {
2157
            // Strings take two opcode, we want the position of the first one.
2158
            // We can have a string at this point if a single character case-folded to two.
2159
206k
            theLoc--;
2160
206k
        }
2161
764k
        if (reserveLoc) {
2162
372k
            int32_t  nop = buildOp(URX_NOP, 0);
2163
372k
            fRXPat->fCompiledPat->insertElementAt(nop, theLoc, *fStatus);
2164
372k
        }
2165
764k
    }
2166
784k
    return theLoc;
2167
784k
}
2168
2169
2170
2171
//------------------------------------------------------------------------------
2172
//
2173
//    handleCloseParen      When compiling a close paren, we need to go back
2174
//                          and fix up any JMP or SAVE operations within the
2175
//                          parenthesized block that need to target the end
2176
//                          of the block.  The locations of these are kept on
2177
//                          the paretheses stack.
2178
//
2179
//                          This function is called both when encountering a
2180
//                          real ) and at the end of the pattern.
2181
//
2182
//------------------------------------------------------------------------------
2183
457k
void  RegexCompile::handleCloseParen() {
2184
457k
    int32_t   patIdx;
2185
457k
    int32_t   patOp;
2186
457k
    if (fParenStack.size() <= 0) {
2187
0
        error(U_REGEX_MISMATCHED_PAREN);
2188
0
        return;
2189
0
    }
2190
2191
    // Emit code for any pending literals.
2192
457k
    fixLiterals(false);
2193
2194
    // Fixup any operations within the just-closed parenthesized group
2195
    //    that need to reference the end of the (block).
2196
    //    (The first one popped from the stack is an unused slot for
2197
    //     alternation (OR) state save, but applying the fixup to it does no harm.)
2198
20.3M
    for (;;) {
2199
20.3M
        patIdx = fParenStack.popi();
2200
20.3M
        if (patIdx < 0) {
2201
            // value < 0 flags the start of the frame on the paren stack.
2202
457k
            break;
2203
457k
        }
2204
19.9M
        U_ASSERT(patIdx>0 && patIdx <= fRXPat->fCompiledPat->size());
2205
19.9M
        patOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(patIdx));
2206
19.9M
        U_ASSERT(URX_VAL(patOp) == 0);          // Branch target for JMP should not be set.
2207
19.9M
        patOp |= fRXPat->fCompiledPat->size();  // Set it now.
2208
19.9M
        fRXPat->fCompiledPat->setElementAt(patOp, patIdx);
2209
19.9M
        fMatchOpenParen     = patIdx;
2210
19.9M
    }
2211
2212
    //  At the close of any parenthesized block, restore the match mode flags  to
2213
    //  the value they had at the open paren.  Saved value is
2214
    //  at the top of the paren stack.
2215
457k
    fModeFlags = fParenStack.popi();
2216
457k
    U_ASSERT(fModeFlags < 0);
2217
2218
    // DO any additional fixups, depending on the specific kind of
2219
    // parentesized grouping this is
2220
2221
457k
    switch (patIdx) {
2222
13.2k
    case plain:
2223
17.5k
    case flags:
2224
        // No additional fixups required.
2225
        //   (Grouping-only parentheses)
2226
17.5k
        break;
2227
424k
    case capturing:
2228
        // Capturing Parentheses.
2229
        //   Insert a End Capture op into the pattern.
2230
        //   The frame offset of the variables for this cg is obtained from the
2231
        //       start capture op and put it into the end-capture op.
2232
424k
        {
2233
424k
            int32_t captureOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1));
2234
424k
            U_ASSERT(URX_TYPE(captureOp) == URX_START_CAPTURE);
2235
2236
424k
            int32_t   frameVarLocation = URX_VAL(captureOp);
2237
424k
            appendOp(URX_END_CAPTURE, frameVarLocation);
2238
424k
        }
2239
424k
        break;
2240
851
    case atomic:
2241
        // Atomic Parenthesis.
2242
        //   Insert a LD_SP operation to restore the state stack to the position
2243
        //   it was when the atomic parens were entered.
2244
851
        {
2245
851
            int32_t stoOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen + 1));
2246
851
            U_ASSERT(URX_TYPE(stoOp) == URX_STO_SP);
2247
851
            int32_t   stoLoc = URX_VAL(stoOp);
2248
851
            appendOp(URX_LD_SP, stoLoc);
2249
851
        }
2250
851
        break;
2251
2252
1.15k
    case lookAhead:
2253
1.15k
        {
2254
1.15k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5));
2255
1.15k
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2256
1.15k
            int32_t dataLoc  = URX_VAL(startOp);
2257
1.15k
            appendOp(URX_LA_END, dataLoc);
2258
1.15k
        }
2259
1.15k
        break;
2260
2261
4.71k
    case negLookAhead:
2262
4.71k
        {
2263
            // See comment at doOpenLookAheadNeg
2264
4.71k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 1));
2265
4.71k
            U_ASSERT(URX_TYPE(startOp) == URX_LA_START);
2266
4.71k
            int32_t dataLoc  = URX_VAL(startOp);
2267
4.71k
            appendOp(URX_LA_END, dataLoc);
2268
4.71k
            appendOp(URX_BACKTRACK, 0);
2269
4.71k
            appendOp(URX_LA_END, dataLoc);
2270
2271
            // Patch the URX_SAVE near the top of the block.
2272
            // The destination of the SAVE is the final LA_END that was just added.
2273
4.71k
            int32_t saveOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen));
2274
4.71k
            U_ASSERT(URX_TYPE(saveOp) == URX_STATE_SAVE);
2275
4.71k
            int32_t dest     = fRXPat->fCompiledPat->size()-1;
2276
4.71k
            saveOp           = buildOp(URX_STATE_SAVE, dest);
2277
4.71k
            fRXPat->fCompiledPat->setElementAt(saveOp, fMatchOpenParen);
2278
4.71k
        }
2279
4.71k
        break;
2280
2281
3.54k
    case lookBehind:
2282
3.54k
        {
2283
            // See comment at doOpenLookBehind.
2284
2285
            // Append the URX_LB_END and URX_LA_END to the compiled pattern.
2286
3.54k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 4));
2287
3.54k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2288
3.54k
            int32_t dataLoc  = URX_VAL(startOp);
2289
3.54k
            appendOp(URX_LB_END, dataLoc);
2290
3.54k
            appendOp(URX_LA_END, dataLoc);
2291
2292
            // Determine the min and max bounds for the length of the
2293
            //  string that the pattern can match.
2294
            //  An unbounded upper limit is an error.
2295
3.54k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2296
3.54k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2297
3.54k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2298
3.54k
            if (URX_TYPE(maxML) != 0) {
2299
144
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2300
144
                break;
2301
144
            }
2302
3.40k
            if (maxML == INT32_MAX) {
2303
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2304
0
                break;
2305
0
            }
2306
3.40k
            if (minML == INT32_MAX) {
2307
                // This condition happens when no match is possible, such as with a
2308
                // [set] expression containing no elements.
2309
                // In principle, the generated code to evaluate the expression could be deleted,
2310
                // but it's probably not worth the complication.
2311
581
                minML = 0;
2312
581
            }
2313
3.40k
            U_ASSERT(minML <= maxML);
2314
2315
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2316
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2317
3.40k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-2);
2318
3.40k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-1);
2319
2320
3.40k
        }
2321
0
        break;
2322
2323
2324
2325
4.74k
    case lookBehindN:
2326
4.74k
        {
2327
            // See comment at doOpenLookBehindNeg.
2328
2329
            // Append the URX_LBN_END to the compiled pattern.
2330
4.74k
            int32_t startOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(fMatchOpenParen - 5));
2331
4.74k
            U_ASSERT(URX_TYPE(startOp) == URX_LB_START);
2332
4.74k
            int32_t dataLoc  = URX_VAL(startOp);
2333
4.74k
            appendOp(URX_LBN_END, dataLoc);
2334
2335
            // Determine the min and max bounds for the length of the
2336
            //  string that the pattern can match.
2337
            //  An unbounded upper limit is an error.
2338
4.74k
            int32_t patEnd   = fRXPat->fCompiledPat->size() - 1;
2339
4.74k
            int32_t minML    = minMatchLength(fMatchOpenParen, patEnd);
2340
4.74k
            int32_t maxML    = maxMatchLength(fMatchOpenParen, patEnd);
2341
4.74k
            if (URX_TYPE(maxML) != 0) {
2342
147
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2343
147
                break;
2344
147
            }
2345
4.60k
            if (maxML == INT32_MAX) {
2346
0
                error(U_REGEX_LOOK_BEHIND_LIMIT);
2347
0
                break;
2348
0
            }
2349
4.60k
            if (minML == INT32_MAX) {
2350
                // This condition happens when no match is possible, such as with a
2351
                // [set] expression containing no elements.
2352
                // In principle, the generated code to evaluate the expression could be deleted,
2353
                // but it's probably not worth the complication.
2354
231
                minML = 0;
2355
231
            }
2356
2357
4.60k
            U_ASSERT(minML <= maxML);
2358
2359
            // Insert the min and max match len bounds into the URX_LB_CONT op that
2360
            //  appears at the top of the look-behind block, at location fMatchOpenParen+1
2361
4.60k
            fRXPat->fCompiledPat->setElementAt(minML,  fMatchOpenParen-3);
2362
4.60k
            fRXPat->fCompiledPat->setElementAt(maxML,  fMatchOpenParen-2);
2363
2364
            // Insert the pattern location to continue at after a successful match
2365
            //  as the last operand of the URX_LBN_CONT
2366
4.60k
            int32_t op = buildOp(URX_RELOC_OPRND, fRXPat->fCompiledPat->size());
2367
4.60k
            fRXPat->fCompiledPat->setElementAt(op,  fMatchOpenParen-1);
2368
4.60k
        }
2369
0
        break;
2370
2371
2372
2373
0
    default:
2374
0
        UPRV_UNREACHABLE_EXIT;
2375
457k
    }
2376
2377
    // remember the next location in the compiled pattern.
2378
    // The compilation of Quantifiers will look at this to see whether its looping
2379
    //   over a parenthesized block or a single item
2380
457k
    fMatchCloseParen = fRXPat->fCompiledPat->size();
2381
457k
}
2382
2383
2384
2385
//------------------------------------------------------------------------------
2386
//
2387
//   compileSet       Compile the pattern operations for a reference to a
2388
//                    UnicodeSet.
2389
//
2390
//------------------------------------------------------------------------------
2391
void        RegexCompile::compileSet(UnicodeSet *theSet)
2392
432k
{
2393
432k
    if (theSet == nullptr) {
2394
199
        return;
2395
199
    }
2396
    //  Remove any strings from the set.
2397
    //  There shouldn't be any, but just in case.
2398
    //     (Case Closure can add them; if we had a simple case closure available that
2399
    //      ignored strings, that would be better.)
2400
432k
    theSet->removeAllStrings();
2401
432k
    int32_t  setSize = theSet->size();
2402
2403
432k
    switch (setSize) {
2404
8.43k
    case 0:
2405
8.43k
        {
2406
            // Set of no elements.   Always fails to match.
2407
8.43k
            appendOp(URX_BACKTRACK, 0);
2408
8.43k
            delete theSet;
2409
8.43k
        }
2410
8.43k
        break;
2411
2412
7.54k
    case 1:
2413
7.54k
        {
2414
            // The set contains only a single code point.  Put it into
2415
            //   the compiled pattern as a single char operation rather
2416
            //   than a set, and discard the set itself.
2417
7.54k
            literalChar(theSet->charAt(0));
2418
7.54k
            delete theSet;
2419
7.54k
        }
2420
7.54k
        break;
2421
2422
416k
    default:
2423
416k
        {
2424
            //  The set contains two or more chars.  (the normal case)
2425
            //  Put it into the compiled pattern as a set.
2426
416k
            theSet->freeze();
2427
416k
            int32_t setNumber = fRXPat->fSets->size();
2428
416k
            fRXPat->fSets->addElement(theSet, *fStatus);
2429
416k
            if (U_SUCCESS(*fStatus)) {
2430
416k
                appendOp(URX_SETREF, setNumber);
2431
416k
            } else {
2432
3
                delete theSet;
2433
3
            }
2434
416k
        }
2435
432k
    }
2436
432k
}
2437
2438
2439
//------------------------------------------------------------------------------
2440
//
2441
//   compileInterval    Generate the code for a {min, max} style interval quantifier.
2442
//                      Except for the specific opcodes used, the code is the same
2443
//                      for all three types (greedy, non-greedy, possessive) of
2444
//                      intervals.  The opcodes are supplied as parameters.
2445
//                      (There are two sets of opcodes - greedy & possessive use the
2446
//                      same ones, while non-greedy has it's own.)
2447
//
2448
//                      The code for interval loops has this form:
2449
//                         0  CTR_INIT   counter loc (in stack frame)
2450
//                         1             5  patt address of CTR_LOOP at bottom of block
2451
//                         2             min count
2452
//                         3             max count   (-1 for unbounded)
2453
//                         4  ...        block to be iterated over
2454
//                         5  CTR_LOOP
2455
//
2456
//                       In
2457
//------------------------------------------------------------------------------
2458
void        RegexCompile::compileInterval(int32_t InitOp,  int32_t LoopOp)
2459
12.3k
{
2460
    // The CTR_INIT op at the top of the block with the {n,m} quantifier takes
2461
    //   four slots in the compiled code.  Reserve them.
2462
12.3k
    int32_t   topOfBlock = blockTopLoc(true);
2463
12.3k
    insertOp(topOfBlock);
2464
12.3k
    insertOp(topOfBlock);
2465
12.3k
    insertOp(topOfBlock);
2466
2467
    // The operands for the CTR_INIT opcode include the index in the matcher data
2468
    //   of the counter.  Allocate it now. There are two data items
2469
    //        counterLoc   -->  Loop counter
2470
    //               +1    -->  Input index (for breaking non-progressing loops)
2471
    //                          (Only present if unbounded upper limit on loop)
2472
12.3k
    int32_t   dataSize = fIntervalUpper < 0 ? 2 : 1;
2473
12.3k
    int32_t   counterLoc = allocateStackData(dataSize);
2474
2475
12.3k
    int32_t   op = buildOp(InitOp, counterLoc);
2476
12.3k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock);
2477
2478
    // The second operand of CTR_INIT is the location following the end of the loop.
2479
    //   Must put in as a URX_RELOC_OPRND so that the value will be adjusted if the
2480
    //   compilation of something later on causes the code to grow and the target
2481
    //   position to move.
2482
12.3k
    int32_t loopEnd = fRXPat->fCompiledPat->size();
2483
12.3k
    op = buildOp(URX_RELOC_OPRND, loopEnd);
2484
12.3k
    fRXPat->fCompiledPat->setElementAt(op, topOfBlock+1);
2485
2486
    // Followed by the min and max counts.
2487
12.3k
    fRXPat->fCompiledPat->setElementAt(fIntervalLow, topOfBlock+2);
2488
12.3k
    fRXPat->fCompiledPat->setElementAt(fIntervalUpper, topOfBlock+3);
2489
2490
    // Append the CTR_LOOP op.  The operand is the location of the CTR_INIT op.
2491
    //   Goes at end of the block being looped over, so just append to the code so far.
2492
12.3k
    appendOp(LoopOp, topOfBlock);
2493
2494
12.3k
    if ((fIntervalLow & 0xff000000) != 0 ||
2495
12.3k
        (fIntervalUpper > 0 && (fIntervalUpper & 0xff000000) != 0)) {
2496
40
            error(U_REGEX_NUMBER_TOO_BIG);
2497
40
        }
2498
2499
12.3k
    if (fIntervalLow > fIntervalUpper && fIntervalUpper != -1) {
2500
25
        error(U_REGEX_MAX_LT_MIN);
2501
25
    }
2502
12.3k
}
2503
2504
2505
2506
109k
UBool RegexCompile::compileInlineInterval() {
2507
109k
    if (fIntervalUpper > 10 || fIntervalUpper < fIntervalLow) {
2508
        // Too big to inline.  Fail, which will cause looping code to be generated.
2509
        //   (Upper < Lower picks up unbounded upper and errors, both.)
2510
5.23k
        return false;
2511
5.23k
    }
2512
2513
104k
    int32_t   topOfBlock = blockTopLoc(false);
2514
104k
    if (fIntervalUpper == 0) {
2515
        // Pathological case.  Attempt no matches, as if the block doesn't exist.
2516
        // Discard the generated code for the block.
2517
        // If the block included parens, discard the info pertaining to them as well.
2518
2.07k
        fRXPat->fCompiledPat->setSize(topOfBlock);
2519
2.07k
        if (fMatchOpenParen >= topOfBlock) {
2520
152
            fMatchOpenParen = -1;
2521
152
        }
2522
2.07k
        if (fMatchCloseParen >= topOfBlock) {
2523
153
            fMatchCloseParen = -1;
2524
153
        }
2525
2.07k
        return true;
2526
2.07k
    }
2527
2528
102k
    if (topOfBlock != fRXPat->fCompiledPat->size()-1 && fIntervalUpper != 1) {
2529
        // The thing being repeated is not a single op, but some
2530
        //   more complex block.  Do it as a loop, not inlines.
2531
        //   Note that things "repeated" a max of once are handled as inline, because
2532
        //     the one copy of the code already generated is just fine.
2533
735
        return false;
2534
735
    }
2535
2536
    // Pick up the opcode that is to be repeated
2537
    //
2538
101k
    int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(topOfBlock));
2539
2540
    // Compute the pattern location where the inline sequence
2541
    //   will end, and set up the state save op that will be needed.
2542
    //
2543
101k
    int32_t endOfSequenceLoc = fRXPat->fCompiledPat->size()-1
2544
101k
                                + fIntervalUpper + (fIntervalUpper-fIntervalLow);
2545
101k
    int32_t saveOp = buildOp(URX_STATE_SAVE, endOfSequenceLoc);
2546
101k
    if (fIntervalLow == 0) {
2547
89.7k
        insertOp(topOfBlock);
2548
89.7k
        fRXPat->fCompiledPat->setElementAt(saveOp, topOfBlock);
2549
89.7k
    }
2550
2551
2552
2553
    //  Loop, emitting the op for the thing being repeated each time.
2554
    //    Loop starts at 1 because one instance of the op already exists in the pattern,
2555
    //    it was put there when it was originally encountered.
2556
101k
    int32_t i;
2557
896k
    for (i=1; i<fIntervalUpper; i++ ) {
2558
794k
        if (i >= fIntervalLow) {
2559
721k
            appendOp(saveOp);
2560
721k
        }
2561
794k
        appendOp(op);
2562
794k
    }
2563
101k
    return true;
2564
102k
}
2565
2566
2567
2568
//------------------------------------------------------------------------------
2569
//
2570
//   caseInsensitiveStart  given a single code point from a pattern string, determine the 
2571
//                         set of characters that could potentially begin a case-insensitive 
2572
//                         match of a string beginning with that character, using full Unicode
2573
//                         case insensitive matching.
2574
//
2575
//          This is used in optimizing find().
2576
//
2577
//          closeOver(USET_CASE_INSENSITIVE) does most of what is needed, but
2578
//          misses cases like this:
2579
//             A string from the pattern begins with 'ss' (although all we know
2580
//                 in this context is that it begins with 's')
2581
//             The pattern could match a string beginning with a German sharp-s
2582
//
2583
//           To the ordinary case closure for a character c, we add all other
2584
//           characters cx where the case closure of cx includes a string form that begins
2585
//           with the original character c.
2586
//
2587
//           This function could be made smarter. The full pattern string is available
2588
//           and it would be possible to verify that the extra characters being added
2589
//           to the starting set fully match, rather than having just a first-char of the
2590
//           folded form match.
2591
//
2592
//------------------------------------------------------------------------------
2593
457k
void  RegexCompile::findCaseInsensitiveStarters(UChar32 c, UnicodeSet *starterChars) {
2594
2595
// Machine Generated below.
2596
// It may need updating with new versions of Unicode.
2597
// Intltest test RegexTest::TestCaseInsensitiveStarters will fail if an update is needed.
2598
// The update tool is here:
2599
// https://github.com/unicode-org/icu/tree/main/tools/unicode/c/genregexcasing
2600
2601
// Machine Generated Data. Do not hand edit.
2602
457k
    static const UChar32 RECaseFixCodePoints[] = {
2603
457k
        0x61, 0x66, 0x68, 0x69, 0x6a, 0x73, 0x74, 0x77, 0x79, 0x2bc, 
2604
457k
        0x3ac, 0x3ae, 0x3b1, 0x3b7, 0x3b9, 0x3c1, 0x3c5, 0x3c9, 0x3ce, 0x565, 
2605
457k
        0x574, 0x57e, 0x1f00, 0x1f01, 0x1f02, 0x1f03, 0x1f04, 0x1f05, 0x1f06, 0x1f07, 
2606
457k
        0x1f20, 0x1f21, 0x1f22, 0x1f23, 0x1f24, 0x1f25, 0x1f26, 0x1f27, 0x1f60, 0x1f61, 
2607
457k
        0x1f62, 0x1f63, 0x1f64, 0x1f65, 0x1f66, 0x1f67, 0x1f70, 0x1f74, 0x1f7c, 0x110000};
2608
2609
457k
    static const int16_t RECaseFixStringOffsets[] = {
2610
457k
        0x0, 0x1, 0x6, 0x7, 0x8, 0x9, 0xd, 0xe, 0xf, 0x10, 
2611
457k
        0x11, 0x12, 0x13, 0x17, 0x1b, 0x20, 0x21, 0x2a, 0x2e, 0x2f, 
2612
457k
        0x30, 0x34, 0x35, 0x37, 0x39, 0x3b, 0x3d, 0x3f, 0x41, 0x43, 
2613
457k
        0x45, 0x47, 0x49, 0x4b, 0x4d, 0x4f, 0x51, 0x53, 0x55, 0x57, 
2614
457k
        0x59, 0x5b, 0x5d, 0x5f, 0x61, 0x63, 0x65, 0x66, 0x67, 0};
2615
2616
457k
    static const int16_t RECaseFixCounts[] = {
2617
457k
        0x1, 0x5, 0x1, 0x1, 0x1, 0x4, 0x1, 0x1, 0x1, 0x1, 
2618
457k
        0x1, 0x1, 0x4, 0x4, 0x5, 0x1, 0x9, 0x4, 0x1, 0x1, 
2619
457k
        0x4, 0x1, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2620
457k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 
2621
457k
        0x2, 0x2, 0x2, 0x2, 0x2, 0x2, 0x1, 0x1, 0x1, 0};
2622
2623
457k
    static const char16_t RECaseFixData[] = {
2624
457k
        0x1e9a, 0xfb00, 0xfb01, 0xfb02, 0xfb03, 0xfb04, 0x1e96, 0x130, 0x1f0, 0xdf, 
2625
457k
        0x1e9e, 0xfb05, 0xfb06, 0x1e97, 0x1e98, 0x1e99, 0x149, 0x1fb4, 0x1fc4, 0x1fb3, 
2626
457k
        0x1fb6, 0x1fb7, 0x1fbc, 0x1fc3, 0x1fc6, 0x1fc7, 0x1fcc, 0x390, 0x1fd2, 0x1fd3, 
2627
457k
        0x1fd6, 0x1fd7, 0x1fe4, 0x3b0, 0x1f50, 0x1f52, 0x1f54, 0x1f56, 0x1fe2, 0x1fe3, 
2628
457k
        0x1fe6, 0x1fe7, 0x1ff3, 0x1ff6, 0x1ff7, 0x1ffc, 0x1ff4, 0x587, 0xfb13, 0xfb14, 
2629
457k
        0xfb15, 0xfb17, 0xfb16, 0x1f80, 0x1f88, 0x1f81, 0x1f89, 0x1f82, 0x1f8a, 0x1f83, 
2630
457k
        0x1f8b, 0x1f84, 0x1f8c, 0x1f85, 0x1f8d, 0x1f86, 0x1f8e, 0x1f87, 0x1f8f, 0x1f90, 
2631
457k
        0x1f98, 0x1f91, 0x1f99, 0x1f92, 0x1f9a, 0x1f93, 0x1f9b, 0x1f94, 0x1f9c, 0x1f95, 
2632
457k
        0x1f9d, 0x1f96, 0x1f9e, 0x1f97, 0x1f9f, 0x1fa0, 0x1fa8, 0x1fa1, 0x1fa9, 0x1fa2, 
2633
457k
        0x1faa, 0x1fa3, 0x1fab, 0x1fa4, 0x1fac, 0x1fa5, 0x1fad, 0x1fa6, 0x1fae, 0x1fa7, 
2634
457k
        0x1faf, 0x1fb2, 0x1fc2, 0x1ff2, 0};
2635
2636
// End of machine generated data.
2637
2638
457k
    if (c < UCHAR_MIN_VALUE || c > UCHAR_MAX_VALUE) {
2639
        // This function should never be called with an invalid input character.
2640
0
        UPRV_UNREACHABLE_EXIT;
2641
457k
    } else if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2642
20.6k
        UChar32 caseFoldedC  = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2643
20.6k
        starterChars->set(caseFoldedC, caseFoldedC);
2644
2645
20.6k
        int32_t i;
2646
357k
        for (i=0; RECaseFixCodePoints[i]<c ; i++) {
2647
            // Simple linear search through the sorted list of interesting code points.
2648
336k
        }
2649
2650
20.6k
        if (RECaseFixCodePoints[i] == c) {
2651
18.7k
            int32_t dataIndex = RECaseFixStringOffsets[i];
2652
18.7k
            int32_t numCharsToAdd = RECaseFixCounts[i];
2653
18.7k
            UChar32 cpToAdd = 0;
2654
140k
            for (int32_t j=0; j<numCharsToAdd; j++) {
2655
121k
                U16_NEXT_UNSAFE(RECaseFixData, dataIndex, cpToAdd);
2656
121k
                starterChars->add(cpToAdd);
2657
121k
            }
2658
18.7k
        }
2659
2660
20.6k
        starterChars->closeOver(USET_CASE_INSENSITIVE);
2661
20.6k
        starterChars->removeAllStrings();
2662
436k
    } else {
2663
        // Not a cased character. Just return it alone.
2664
436k
        starterChars->set(c, c);
2665
436k
    }
2666
457k
}
2667
2668
2669
// Increment with overflow check.
2670
// val and delta will both be positive.
2671
2672
6.04M
static int32_t safeIncrement(int32_t val, int32_t delta) {
2673
6.04M
    if (INT32_MAX - val > delta) {
2674
5.89M
        return val + delta;
2675
5.89M
    } else {
2676
148k
        return INT32_MAX;
2677
148k
    }
2678
6.04M
}
2679
2680
2681
//------------------------------------------------------------------------------
2682
//
2683
//   matchStartType    Determine how a match can start.
2684
//                     Used to optimize find() operations.
2685
//
2686
//                     Operation is very similar to minMatchLength().  Walk the compiled
2687
//                     pattern, keeping an on-going minimum-match-length.  For any
2688
//                     op where the min match coming in is zero, add that ops possible
2689
//                     starting matches to the possible starts for the overall pattern.
2690
//
2691
//------------------------------------------------------------------------------
2692
12.4k
void   RegexCompile::matchStartType() {
2693
12.4k
    if (U_FAILURE(*fStatus)) {
2694
96
        return;
2695
96
    }
2696
2697
2698
12.3k
    int32_t    loc;                    // Location in the pattern of the current op being processed.
2699
12.3k
    int32_t    op;                     // The op being processed
2700
12.3k
    int32_t    opType;                 // The opcode type of the op
2701
12.3k
    int32_t    currentLen = 0;         // Minimum length of a match to this point (loc) in the pattern
2702
12.3k
    int32_t    numInitialStrings = 0;  // Number of strings encountered that could match at start.
2703
2704
12.3k
    UBool      atStart = true;         // True if no part of the pattern yet encountered
2705
                                       //   could have advanced the position in a match.
2706
                                       //   (Maximum match length so far == 0)
2707
2708
    // forwardedLength is a vector holding minimum-match-length values that
2709
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
2710
    //   It must be one longer than the pattern being checked because some  ops
2711
    //   will jmp to a end-of-block+1 location from within a block, and we must
2712
    //   count those when checking the block.
2713
12.3k
    int32_t end = fRXPat->fCompiledPat->size();
2714
12.3k
    UVector32  forwardedLength(end+1, *fStatus);
2715
12.3k
    forwardedLength.setSize(end+1);
2716
30.5M
    for (loc=3; loc<end; loc++) {
2717
30.5M
        forwardedLength.setElementAt(INT32_MAX, loc);
2718
30.5M
    }
2719
2720
23.8M
    for (loc = 3; loc<end; loc++) {
2721
23.8M
        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
2722
23.8M
        opType = URX_TYPE(op);
2723
2724
        // The loop is advancing linearly through the pattern.
2725
        // If the op we are now at was the destination of a branch in the pattern,
2726
        // and that path has a shorter minimum length than the current accumulated value,
2727
        // replace the current accumulated value.
2728
23.8M
        if (forwardedLength.elementAti(loc) < currentLen) {
2729
802k
            currentLen = forwardedLength.elementAti(loc);
2730
802k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
2731
802k
        }
2732
2733
23.8M
        switch (opType) {
2734
            // Ops that don't change the total length matched
2735
0
        case URX_RESERVED_OP:
2736
12.3k
        case URX_END:
2737
12.3k
        case URX_FAIL:
2738
12.3k
        case URX_STRING_LEN:
2739
12.3k
        case URX_NOP:
2740
66.3k
        case URX_START_CAPTURE:
2741
120k
        case URX_END_CAPTURE:
2742
121k
        case URX_BACKSLASH_B:
2743
128k
        case URX_BACKSLASH_BU:
2744
129k
        case URX_BACKSLASH_G:
2745
131k
        case URX_BACKSLASH_Z:
2746
201k
        case URX_DOLLAR:
2747
203k
        case URX_DOLLAR_M:
2748
230k
        case URX_DOLLAR_D:
2749
232k
        case URX_DOLLAR_MD:
2750
232k
        case URX_RELOC_OPRND:
2751
326k
        case URX_STO_INP_LOC:
2752
327k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
2753
330k
        case URX_BACKREF_I:
2754
2755
426k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
2756
523k
        case URX_LD_SP:
2757
523k
            break;
2758
2759
3.35k
        case URX_CARET:
2760
3.35k
            if (atStart) {
2761
694
                fRXPat->fStartType = START_START;
2762
694
            }
2763
3.35k
            break;
2764
2765
864
        case URX_CARET_M:
2766
1.76k
        case URX_CARET_M_UNIX:
2767
1.76k
            if (atStart) {
2768
325
                fRXPat->fStartType = START_LINE;
2769
325
            }
2770
1.76k
            break;
2771
2772
776k
        case URX_ONECHAR:
2773
776k
            if (currentLen == 0) {
2774
                // This character could appear at the start of a match.
2775
                //   Add it to the set of possible starting characters.
2776
77.8k
                fRXPat->fInitialChars->add(URX_VAL(op));
2777
77.8k
                numInitialStrings += 2;
2778
77.8k
            }
2779
776k
            currentLen = safeIncrement(currentLen, 1);
2780
776k
            atStart = false;
2781
776k
            break;
2782
2783
2784
198k
        case URX_SETREF:
2785
198k
            if (currentLen == 0) {
2786
2.60k
                int32_t  sn = URX_VAL(op);
2787
2.60k
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2788
2.60k
                const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn));
2789
2.60k
                fRXPat->fInitialChars->addAll(*s);
2790
2.60k
                numInitialStrings += 2;
2791
2.60k
            }
2792
198k
            currentLen = safeIncrement(currentLen, 1);
2793
198k
            atStart = false;
2794
198k
            break;
2795
2796
2.98k
        case URX_LOOP_SR_I:
2797
            // [Set]*, like a SETREF, above, in what it can match,
2798
            //  but may not match at all, so currentLen is not incremented.
2799
2.98k
            if (currentLen == 0) {
2800
419
                int32_t  sn = URX_VAL(op);
2801
419
                U_ASSERT(sn > 0 && sn < fRXPat->fSets->size());
2802
419
                const UnicodeSet* s = static_cast<UnicodeSet*>(fRXPat->fSets->elementAt(sn));
2803
419
                fRXPat->fInitialChars->addAll(*s);
2804
419
                numInitialStrings += 2;
2805
419
            }
2806
2.98k
            atStart = false;
2807
2.98k
            break;
2808
2809
10.3k
        case URX_LOOP_DOT_I:
2810
10.3k
            if (currentLen == 0) {
2811
                // .* at the start of a pattern.
2812
                //    Any character can begin the match.
2813
2.71k
                fRXPat->fInitialChars->clear();
2814
2.71k
                fRXPat->fInitialChars->complement();
2815
2.71k
                numInitialStrings += 2;
2816
2.71k
            }
2817
10.3k
            atStart = false;
2818
10.3k
            break;
2819
2820
2821
5.26k
        case URX_STATIC_SETREF:
2822
5.26k
            if (currentLen == 0) {
2823
1.05k
                int32_t  sn = URX_VAL(op);
2824
1.05k
                U_ASSERT(sn>0 && sn<URX_LAST_SET);
2825
1.05k
                const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[sn];
2826
1.05k
                fRXPat->fInitialChars->addAll(s);
2827
1.05k
                numInitialStrings += 2;
2828
1.05k
            }
2829
5.26k
            currentLen = safeIncrement(currentLen, 1);
2830
5.26k
            atStart = false;
2831
5.26k
            break;
2832
2833
2834
2835
16.3k
        case URX_STAT_SETREF_N:
2836
16.3k
            if (currentLen == 0) {
2837
430
                int32_t  sn = URX_VAL(op);
2838
430
                UnicodeSet sc;
2839
430
                sc.addAll(RegexStaticSets::gStaticSets->fPropSets[sn]).complement();
2840
430
                fRXPat->fInitialChars->addAll(sc);
2841
430
                numInitialStrings += 2;
2842
430
            }
2843
16.3k
            currentLen = safeIncrement(currentLen, 1);
2844
16.3k
            atStart = false;
2845
16.3k
            break;
2846
2847
2848
2849
4.87k
        case URX_BACKSLASH_D:
2850
            // Digit Char
2851
4.87k
             if (currentLen == 0) {
2852
1.01k
                 UnicodeSet s;
2853
1.01k
                 s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ND_MASK, *fStatus);
2854
1.01k
                 if (URX_VAL(op) != 0) {
2855
710
                     s.complement();
2856
710
                 }
2857
1.01k
                 fRXPat->fInitialChars->addAll(s);
2858
1.01k
                 numInitialStrings += 2;
2859
1.01k
            }
2860
4.87k
            currentLen = safeIncrement(currentLen, 1);
2861
4.87k
            atStart = false;
2862
4.87k
            break;
2863
2864
2865
3.50k
        case URX_BACKSLASH_H:
2866
            // Horiz white space
2867
3.50k
            if (currentLen == 0) {
2868
863
                UnicodeSet s;
2869
863
                s.applyIntPropertyValue(UCHAR_GENERAL_CATEGORY_MASK, U_GC_ZS_MASK, *fStatus);
2870
863
                s.add(static_cast<UChar32>(9)); // Tab
2871
863
                if (URX_VAL(op) != 0) {
2872
404
                    s.complement();
2873
404
                }
2874
863
                fRXPat->fInitialChars->addAll(s);
2875
863
                numInitialStrings += 2;
2876
863
            }
2877
3.50k
            currentLen = safeIncrement(currentLen, 1);
2878
3.50k
            atStart = false;
2879
3.50k
            break;
2880
2881
2882
2.92k
        case URX_BACKSLASH_R:       // Any line ending sequence
2883
5.61k
        case URX_BACKSLASH_V:       // Any line ending code point, with optional negation
2884
5.61k
            if (currentLen == 0) {
2885
883
                UnicodeSet s;
2886
883
                s.add(static_cast<UChar32>(0x0a), static_cast<UChar32>(0x0d)); // add range
2887
883
                s.add(static_cast<UChar32>(0x85));
2888
883
                s.add(static_cast<UChar32>(0x2028), static_cast<UChar32>(0x2029));
2889
883
                if (URX_VAL(op) != 0) {
2890
                     // Complement option applies to URX_BACKSLASH_V only.
2891
355
                     s.complement();
2892
355
                }
2893
883
                fRXPat->fInitialChars->addAll(s);
2894
883
                numInitialStrings += 2;
2895
883
            }
2896
5.61k
            currentLen = safeIncrement(currentLen, 1);
2897
5.61k
            atStart = false;
2898
5.61k
            break;
2899
2900
2901
2902
42.2k
        case URX_ONECHAR_I:
2903
            // Case Insensitive Single Character.
2904
42.2k
            if (currentLen == 0) {
2905
3.36k
                UChar32  c = URX_VAL(op);
2906
3.36k
                if (u_hasBinaryProperty(c, UCHAR_CASE_SENSITIVE)) {
2907
3.36k
                    UnicodeSet starters(c, c);
2908
3.36k
                    starters.closeOver(USET_CASE_INSENSITIVE);
2909
                    // findCaseInsensitiveStarters(c, &starters);
2910
                    //   For ONECHAR_I, no need to worry about text chars that expand on folding into strings.
2911
                    //   The expanded folding can't match the pattern.
2912
3.36k
                    fRXPat->fInitialChars->addAll(starters);
2913
3.36k
                } else {
2914
                    // Char has no case variants.  Just add it as-is to the
2915
                    //   set of possible starting chars.
2916
0
                    fRXPat->fInitialChars->add(c);
2917
0
                }
2918
3.36k
                numInitialStrings += 2;
2919
3.36k
            }
2920
42.2k
            currentLen = safeIncrement(currentLen, 1);
2921
42.2k
            atStart = false;
2922
42.2k
            break;
2923
2924
2925
2.03k
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
2926
5.34k
        case URX_DOTANY_ALL:    // . matches one or two.
2927
460k
        case URX_DOTANY:
2928
464k
        case URX_DOTANY_UNIX:
2929
464k
            if (currentLen == 0) {
2930
                // These constructs are all bad news when they appear at the start
2931
                //   of a match.  Any character can begin the match.
2932
3.13k
                fRXPat->fInitialChars->clear();
2933
3.13k
                fRXPat->fInitialChars->complement();
2934
3.13k
                numInitialStrings += 2;
2935
3.13k
            }
2936
464k
            currentLen = safeIncrement(currentLen, 1);
2937
464k
            atStart = false;
2938
464k
            break;
2939
2940
2941
0
        case URX_JMPX:
2942
0
            loc++;             // Except for extra operand on URX_JMPX, same as URX_JMP.
2943
0
            U_FALLTHROUGH;
2944
9.86M
        case URX_JMP:
2945
9.86M
            {
2946
9.86M
                int32_t  jmpDest = URX_VAL(op);
2947
9.86M
                if (jmpDest < loc) {
2948
                    // Loop of some kind.  Can safely ignore, the worst that will happen
2949
                    //  is that we understate the true minimum length
2950
65.0k
                    currentLen = forwardedLength.elementAti(loc+1);
2951
2952
9.79M
                } else {
2953
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
2954
9.79M
                    U_ASSERT(jmpDest <= end+1);
2955
9.79M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
2956
4.68k
                        forwardedLength.setElementAt(currentLen, jmpDest);
2957
4.68k
                    }
2958
9.79M
                }
2959
9.86M
            }
2960
9.86M
            atStart = false;
2961
9.86M
            break;
2962
2963
162k
        case URX_JMP_SAV:
2964
256k
        case URX_JMP_SAV_X:
2965
            // Combo of state save to the next loc, + jmp backwards.
2966
            //   Net effect on min. length computation is nothing.
2967
256k
            atStart = false;
2968
256k
            break;
2969
2970
5.33k
        case URX_BACKTRACK:
2971
            // Fails are kind of like a branch, except that the min length was
2972
            //   propagated already, by the state save.
2973
5.33k
            currentLen = forwardedLength.elementAti(loc+1);
2974
5.33k
            atStart = false;
2975
5.33k
            break;
2976
2977
2978
10.6M
        case URX_STATE_SAVE:
2979
10.6M
            {
2980
                // State Save, for forward jumps, propagate the current minimum.
2981
                //             of the state save.
2982
10.6M
                int32_t  jmpDest = URX_VAL(op);
2983
10.6M
                if (jmpDest > loc) {
2984
10.6M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
2985
10.1M
                        forwardedLength.setElementAt(currentLen, jmpDest);
2986
10.1M
                    }
2987
10.6M
                }
2988
10.6M
            }
2989
10.6M
            atStart = false;
2990
10.6M
            break;
2991
2992
2993
2994
2995
84.2k
        case URX_STRING:
2996
84.2k
            {
2997
84.2k
                loc++;
2998
84.2k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
2999
84.2k
                int32_t stringLen   = URX_VAL(stringLenOp);
3000
84.2k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3001
84.2k
                U_ASSERT(stringLenOp >= 2);
3002
84.2k
                if (currentLen == 0) {
3003
                    // Add the starting character of this string to the set of possible starting
3004
                    //   characters for this pattern.
3005
14.0k
                    int32_t stringStartIdx = URX_VAL(op);
3006
14.0k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3007
14.0k
                    fRXPat->fInitialChars->add(c);
3008
3009
                    // Remember this string.  After the entire pattern has been checked,
3010
                    //  if nothing else is identified that can start a match, we'll use it.
3011
14.0k
                    numInitialStrings++;
3012
14.0k
                    fRXPat->fInitialStringIdx = stringStartIdx;
3013
14.0k
                    fRXPat->fInitialStringLen = stringLen;
3014
14.0k
                }
3015
3016
84.2k
                currentLen = safeIncrement(currentLen, stringLen);
3017
84.2k
                atStart = false;
3018
84.2k
            }
3019
84.2k
            break;
3020
3021
974k
        case URX_STRING_I:
3022
974k
            {
3023
                // Case-insensitive string.  Unlike exact-match strings, we won't
3024
                //   attempt a string search for possible match positions.  But we
3025
                //   do update the set of possible starting characters.
3026
974k
                loc++;
3027
974k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3028
974k
                int32_t stringLen   = URX_VAL(stringLenOp);
3029
974k
                U_ASSERT(URX_TYPE(stringLenOp) == URX_STRING_LEN);
3030
974k
                U_ASSERT(stringLenOp >= 2);
3031
974k
                if (currentLen == 0) {
3032
                    // Add the starting character of this string to the set of possible starting
3033
                    //   characters for this pattern.
3034
457k
                    int32_t stringStartIdx = URX_VAL(op);
3035
457k
                    UChar32  c = fRXPat->fLiteralText.char32At(stringStartIdx);
3036
457k
                    UnicodeSet s;
3037
457k
                    findCaseInsensitiveStarters(c, &s);
3038
457k
                    fRXPat->fInitialChars->addAll(s);
3039
457k
                    numInitialStrings += 2;  // Matching on an initial string not possible.
3040
457k
                }
3041
974k
                currentLen = safeIncrement(currentLen, stringLen);
3042
974k
                atStart = false;
3043
974k
            }
3044
974k
            break;
3045
3046
2.71k
        case URX_CTR_INIT:
3047
3.85k
        case URX_CTR_INIT_NG:
3048
3.85k
            {
3049
                // Loop Init Ops.  These don't change the min length, but they are 4 word ops
3050
                //   so location must be updated accordingly.
3051
                // Loop Init Ops.
3052
                //   If the min loop count == 0
3053
                //      move loc forwards to the end of the loop, skipping over the body.
3054
                //   If the min count is > 0,
3055
                //      continue normal processing of the body of the loop.
3056
3.85k
                int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1));
3057
3.85k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3058
3.85k
                int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2));
3059
3.85k
                if (minLoopCount == 0) {
3060
                    // Min Loop Count of 0, treat like a forward branch and
3061
                    //   move the current minimum length up to the target
3062
                    //   (end of loop) location.
3063
2.06k
                    U_ASSERT(loopEndLoc <= end+1);
3064
2.06k
                    if (forwardedLength.elementAti(loopEndLoc) > currentLen) {
3065
1.59k
                        forwardedLength.setElementAt(currentLen, loopEndLoc);
3066
1.59k
                    }
3067
2.06k
                }
3068
3.85k
                loc+=3;  // Skips over operands of CTR_INIT
3069
3.85k
            }
3070
3.85k
            atStart = false;
3071
3.85k
            break;
3072
3073
3074
2.71k
        case URX_CTR_LOOP:
3075
3.85k
        case URX_CTR_LOOP_NG:
3076
            // Loop ops.
3077
            //  The jump is conditional, backwards only.
3078
3.85k
            atStart = false;
3079
3.85k
            break;
3080
3081
13.3k
        case URX_LOOP_C:
3082
            // More loop ops.  These state-save to themselves.
3083
            //   don't change the minimum match
3084
13.3k
            atStart = false;
3085
13.3k
            break;
3086
3087
3088
1.33k
        case URX_LA_START:
3089
3.09k
        case URX_LB_START:
3090
3.09k
            {
3091
                // Look-around.  Scan forward until the matching look-ahead end,
3092
                //   without processing the look-around block.  This is overly pessimistic.
3093
3094
                // Keep track of the nesting depth of look-around blocks.  Boilerplate code for
3095
                //   lookahead contains two LA_END instructions, so count goes up by two
3096
                //   for each LA_START.
3097
3.09k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3098
5.56M
                for (;;) {
3099
5.56M
                    loc++;
3100
5.56M
                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3101
5.56M
                    if (URX_TYPE(op) == URX_LA_START) {
3102
747
                        depth+=2;
3103
747
                    }
3104
5.56M
                    if (URX_TYPE(op) == URX_LB_START) {
3105
973
                        depth++;
3106
973
                    }
3107
5.56M
                    if (URX_TYPE(op) == URX_LA_END || URX_TYPE(op)==URX_LBN_END) {
3108
6.88k
                        depth--;
3109
6.88k
                        if (depth == 0) {
3110
3.09k
                            break;
3111
3.09k
                        }
3112
6.88k
                    }
3113
5.56M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3114
                        // Need this because neg lookahead blocks will FAIL to outside
3115
                        //   of the block.
3116
2.72M
                        int32_t  jmpDest = URX_VAL(op);
3117
2.72M
                        if (jmpDest > loc) {
3118
2.72M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3119
2.59M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3120
2.59M
                            }
3121
2.72M
                        }
3122
2.72M
                    }
3123
5.56M
                    U_ASSERT(loc <= end);
3124
5.56M
                }
3125
3.09k
            }
3126
3.09k
            break;
3127
3128
0
        case URX_LA_END:
3129
0
        case URX_LB_CONT:
3130
0
        case URX_LB_END:
3131
0
        case URX_LBN_CONT:
3132
0
        case URX_LBN_END:
3133
0
            UPRV_UNREACHABLE_EXIT;  // Shouldn't get here.  These ops should be
3134
                                    //  consumed by the scan in URX_LA_START and LB_START
3135
0
        default:
3136
0
            UPRV_UNREACHABLE_EXIT;
3137
23.8M
            }
3138
3139
23.8M
        }
3140
3141
3142
    // We have finished walking through the ops.  Check whether some forward jump
3143
    //   propagated a shorter length to location end+1.
3144
12.3k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3145
10.3k
        currentLen = forwardedLength.elementAti(end+1);
3146
10.3k
    }
3147
3148
3149
12.3k
    fRXPat->fInitialChars8->init(fRXPat->fInitialChars);
3150
3151
3152
    // Sort out what we should check for when looking for candidate match start positions.
3153
    // In order of preference,
3154
    //     1.   Start of input text buffer.
3155
    //     2.   A literal string.
3156
    //     3.   Start of line in multi-line mode.
3157
    //     4.   A single literal character.
3158
    //     5.   A character from a set of characters.
3159
    //
3160
12.3k
    if (fRXPat->fStartType == START_START) {
3161
        // Match only at the start of an input text string.
3162
        //    start type is already set.  We're done.
3163
12.2k
    } else if (numInitialStrings == 1 && fRXPat->fMinMatchLen > 0) {
3164
        // Match beginning only with a literal string.
3165
437
        UChar32  c = fRXPat->fLiteralText.char32At(fRXPat->fInitialStringIdx);
3166
437
        U_ASSERT(fRXPat->fInitialChars->contains(c));
3167
437
        fRXPat->fStartType   = START_STRING;
3168
437
        fRXPat->fInitialChar = c;
3169
11.8k
    } else if (fRXPat->fStartType == START_LINE) {
3170
        // Match at start of line in Multi-Line mode.
3171
        // Nothing to do here; everything is already set.
3172
11.7k
    } else if (fRXPat->fMinMatchLen == 0) {
3173
        // Zero length match possible.  We could start anywhere.
3174
1.91k
        fRXPat->fStartType = START_NO_INFO;
3175
9.78k
    } else if (fRXPat->fInitialChars->size() == 1) {
3176
        // All matches begin with the same char.
3177
2.14k
        fRXPat->fStartType   = START_CHAR;
3178
2.14k
        fRXPat->fInitialChar = fRXPat->fInitialChars->charAt(0);
3179
2.14k
        U_ASSERT(fRXPat->fInitialChar != (UChar32)-1);
3180
7.64k
    } else if (fRXPat->fInitialChars->contains(static_cast<UChar32>(0), static_cast<UChar32>(0x10ffff)) == false &&
3181
4.26k
        fRXPat->fMinMatchLen > 0) {
3182
        // Matches start with a set of character smaller than the set of all chars.
3183
4.26k
        fRXPat->fStartType = START_SET;
3184
4.26k
    } else {
3185
        // Matches can start with anything
3186
3.38k
        fRXPat->fStartType = START_NO_INFO;
3187
3.38k
    }
3188
12.3k
}
3189
3190
3191
3192
//------------------------------------------------------------------------------
3193
//
3194
//   minMatchLength    Calculate the length of the shortest string that could
3195
//                     match the specified pattern.
3196
//                     Length is in 16 bit code units, not code points.
3197
//
3198
//                     The calculated length may not be exact.  The returned
3199
//                     value may be shorter than the actual minimum; it must
3200
//                     never be longer.
3201
//
3202
//                     start and end are the range of p-code operations to be
3203
//                     examined.  The endpoints are included in the range.
3204
//
3205
//------------------------------------------------------------------------------
3206
297k
int32_t   RegexCompile::minMatchLength(int32_t start, int32_t end) {
3207
297k
    if (U_FAILURE(*fStatus)) {
3208
96
        return 0;
3209
96
    }
3210
3211
297k
    U_ASSERT(start <= end);
3212
297k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3213
3214
3215
297k
    int32_t    loc;
3216
297k
    int32_t    op;
3217
297k
    int32_t    opType;
3218
297k
    int32_t    currentLen = 0;
3219
3220
3221
    // forwardedLength is a vector holding minimum-match-length values that
3222
    //   are propagated forward in the pattern by JMP or STATE_SAVE operations.
3223
    //   It must be one longer than the pattern being checked because some  ops
3224
    //   will jmp to a end-of-block+1 location from within a block, and we must
3225
    //   count those when checking the block.
3226
297k
    UVector32  forwardedLength(end+2, *fStatus);
3227
297k
    forwardedLength.setSize(end+2);
3228
46.2M
    for (loc=start; loc<=end+1; loc++) {
3229
45.9M
        forwardedLength.setElementAt(INT32_MAX, loc);
3230
45.9M
    }
3231
3232
30.5M
    for (loc = start; loc<=end; loc++) {
3233
30.2M
        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3234
30.2M
        opType = URX_TYPE(op);
3235
3236
        // The loop is advancing linearly through the pattern.
3237
        // If the op we are now at was the destination of a branch in the pattern,
3238
        // and that path has a shorter minimum length than the current accumulated value,
3239
        // replace the current accumulated value.
3240
        // U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);  // MinLength == INT32_MAX for some
3241
                                                               //   no-match-possible cases.
3242
30.2M
        if (forwardedLength.elementAti(loc) < currentLen) {
3243
887k
            currentLen = forwardedLength.elementAti(loc);
3244
887k
            U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3245
887k
        }
3246
3247
30.2M
        switch (opType) {
3248
            // Ops that don't change the total length matched
3249
0
        case URX_RESERVED_OP:
3250
12.3k
        case URX_END:
3251
12.3k
        case URX_STRING_LEN:
3252
292k
        case URX_NOP:
3253
356k
        case URX_START_CAPTURE:
3254
421k
        case URX_END_CAPTURE:
3255
423k
        case URX_BACKSLASH_B:
3256
430k
        case URX_BACKSLASH_BU:
3257
431k
        case URX_BACKSLASH_G:
3258
434k
        case URX_BACKSLASH_Z:
3259
439k
        case URX_CARET:
3260
570k
        case URX_DOLLAR:
3261
573k
        case URX_DOLLAR_M:
3262
626k
        case URX_DOLLAR_D:
3263
631k
        case URX_DOLLAR_MD:
3264
631k
        case URX_RELOC_OPRND:
3265
726k
        case URX_STO_INP_LOC:
3266
727k
        case URX_CARET_M:
3267
729k
        case URX_CARET_M_UNIX:
3268
730k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3269
733k
        case URX_BACKREF_I:
3270
3271
832k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3272
930k
        case URX_LD_SP:
3273
3274
1.09M
        case URX_JMP_SAV:
3275
1.18M
        case URX_JMP_SAV_X:
3276
1.18M
            break;
3277
3278
3279
            // Ops that match a minimum of one character (one or two 16 bit code units.)
3280
            //
3281
1.10M
        case URX_ONECHAR:
3282
1.11M
        case URX_STATIC_SETREF:
3283
1.13M
        case URX_STAT_SETREF_N:
3284
1.33M
        case URX_SETREF:
3285
1.33M
        case URX_BACKSLASH_D:
3286
1.34M
        case URX_BACKSLASH_H:
3287
1.34M
        case URX_BACKSLASH_R:
3288
1.34M
        case URX_BACKSLASH_V:
3289
1.40M
        case URX_ONECHAR_I:
3290
1.40M
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3291
1.40M
        case URX_DOTANY_ALL:    // . matches one or two.
3292
1.87M
        case URX_DOTANY:
3293
1.87M
        case URX_DOTANY_UNIX:
3294
1.87M
            currentLen = safeIncrement(currentLen, 1);
3295
1.87M
            break;
3296
3297
3298
0
        case URX_JMPX:
3299
0
            loc++;              // URX_JMPX has an extra operand, ignored here,
3300
                                //   otherwise processed identically to URX_JMP.
3301
0
            U_FALLTHROUGH;
3302
12.4M
        case URX_JMP:
3303
12.4M
            {
3304
12.4M
                int32_t  jmpDest = URX_VAL(op);
3305
12.4M
                if (jmpDest < loc) {
3306
                    // Loop of some kind.  Can safely ignore, the worst that will happen
3307
                    //  is that we understate the true minimum length
3308
65.3k
                    currentLen = forwardedLength.elementAti(loc+1);
3309
12.3M
                } else {
3310
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3311
12.3M
                    U_ASSERT(jmpDest <= end+1);
3312
12.3M
                    if (forwardedLength.elementAti(jmpDest) > currentLen) {
3313
6.71k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3314
6.71k
                    }
3315
12.3M
                }
3316
12.4M
            }
3317
12.4M
            break;
3318
3319
6.70k
        case URX_BACKTRACK:
3320
6.70k
            {
3321
                // Back-tracks are kind of like a branch, except that the min length was
3322
                //   propagated already, by the state save.
3323
6.70k
                currentLen = forwardedLength.elementAti(loc+1);
3324
6.70k
            }
3325
6.70k
            break;
3326
3327
3328
13.4M
        case URX_STATE_SAVE:
3329
13.4M
            {
3330
                // State Save, for forward jumps, propagate the current minimum.
3331
                //             of the state save.
3332
13.4M
                int32_t  jmpDest = URX_VAL(op);
3333
13.4M
                if (jmpDest > loc) {
3334
13.4M
                    if (currentLen < forwardedLength.elementAti(jmpDest)) {
3335
12.7M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3336
12.7M
                    }
3337
13.4M
                }
3338
13.4M
            }
3339
13.4M
            break;
3340
3341
3342
101k
        case URX_STRING:
3343
101k
            {
3344
101k
                loc++;
3345
101k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3346
101k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3347
101k
            }
3348
101k
            break;
3349
3350
3351
1.11M
        case URX_STRING_I:
3352
1.11M
            {
3353
1.11M
                loc++;
3354
                // TODO: with full case folding, matching input text may be shorter than
3355
                //       the string we have here.  More smarts could put some bounds on it.
3356
                //       Assume a min length of one for now.  A min length of zero causes
3357
                //        optimization failures for a pattern like "string"+
3358
                // currentLen += URX_VAL(stringLenOp);
3359
1.11M
                currentLen = safeIncrement(currentLen, 1);
3360
1.11M
            }
3361
1.11M
            break;
3362
3363
7.77k
        case URX_CTR_INIT:
3364
11.2k
        case URX_CTR_INIT_NG:
3365
11.2k
            {
3366
                // Loop Init Ops.
3367
                //   If the min loop count == 0
3368
                //      move loc forwards to the end of the loop, skipping over the body.
3369
                //   If the min count is > 0,
3370
                //      continue normal processing of the body of the loop.
3371
11.2k
                int32_t loopEndLoc = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 1));
3372
11.2k
                        loopEndLoc   = URX_VAL(loopEndLoc);
3373
11.2k
                int32_t minLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc + 2));
3374
11.2k
                if (minLoopCount == 0) {
3375
6.76k
                    loc = loopEndLoc;
3376
6.76k
                } else {
3377
4.47k
                    loc+=3;  // Skips over operands of CTR_INIT
3378
4.47k
                }
3379
11.2k
            }
3380
11.2k
            break;
3381
3382
3383
3.66k
        case URX_CTR_LOOP:
3384
4.47k
        case URX_CTR_LOOP_NG:
3385
            // Loop ops.
3386
            //  The jump is conditional, backwards only.
3387
4.47k
            break;
3388
3389
2.99k
        case URX_LOOP_SR_I:
3390
13.6k
        case URX_LOOP_DOT_I:
3391
27.3k
        case URX_LOOP_C:
3392
            // More loop ops.  These state-save to themselves.
3393
            //   don't change the minimum match - could match nothing at all.
3394
27.3k
            break;
3395
3396
3397
2.17k
        case URX_LA_START:
3398
6.67k
        case URX_LB_START:
3399
6.67k
            {
3400
                // Look-around.  Scan forward until the matching look-ahead end,
3401
                //   without processing the look-around block.  This is overly pessimistic for look-ahead,
3402
                //   it assumes that the look-ahead match might be zero-length.
3403
                //   TODO:  Positive lookahead could recursively do the block, then continue
3404
                //          with the longer of the block or the value coming in.  Ticket 6060
3405
6.67k
                int32_t  depth = (opType == URX_LA_START? 2: 1);
3406
12.9M
                for (;;) {
3407
12.9M
                    loc++;
3408
12.9M
                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3409
12.9M
                    if (URX_TYPE(op) == URX_LA_START) {
3410
                        // The boilerplate for look-ahead includes two LA_END instructions,
3411
                        //    Depth will be decremented by each one when it is seen.
3412
2.17k
                        depth += 2;
3413
2.17k
                    }
3414
12.9M
                    if (URX_TYPE(op) == URX_LB_START) {
3415
2.48k
                        depth++;
3416
2.48k
                    }
3417
12.9M
                    if (URX_TYPE(op) == URX_LA_END) {
3418
12.2k
                        depth--;
3419
12.2k
                        if (depth == 0) {
3420
4.37k
                            break;
3421
4.37k
                        }
3422
12.2k
                    }
3423
12.9M
                    if (URX_TYPE(op)==URX_LBN_END) {
3424
3.44k
                        depth--;
3425
3.44k
                        if (depth == 0) {
3426
2.29k
                            break;
3427
2.29k
                        }
3428
3.44k
                    }
3429
12.9M
                    if (URX_TYPE(op) == URX_STATE_SAVE) {
3430
                        // Need this because neg lookahead blocks will FAIL to outside
3431
                        //   of the block.
3432
6.33M
                        int32_t  jmpDest = URX_VAL(op);
3433
6.33M
                        if (jmpDest > loc) {
3434
6.33M
                            if (currentLen < forwardedLength.elementAti(jmpDest)) {
3435
6.19M
                                forwardedLength.setElementAt(currentLen, jmpDest);
3436
6.19M
                            }
3437
6.33M
                        }
3438
6.33M
                    }
3439
12.9M
                    U_ASSERT(loc <= end);
3440
12.9M
                }
3441
6.67k
            }
3442
6.67k
            break;
3443
3444
3.54k
        case URX_LA_END:
3445
3.54k
        case URX_LB_CONT:
3446
7.09k
        case URX_LB_END:
3447
7.09k
        case URX_LBN_CONT:
3448
11.8k
        case URX_LBN_END:
3449
            // Only come here if the matching URX_LA_START or URX_LB_START was not in the
3450
            //   range being sized, which happens when measuring size of look-behind blocks.
3451
11.8k
            break;
3452
3453
0
        default:
3454
0
            UPRV_UNREACHABLE_EXIT;
3455
30.2M
            }
3456
3457
30.2M
        }
3458
3459
    // We have finished walking through the ops.  Check whether some forward jump
3460
    //   propagated a shorter length to location end+1.
3461
297k
    if (forwardedLength.elementAti(end+1) < currentLen) {
3462
434
        currentLen = forwardedLength.elementAti(end+1);
3463
434
        U_ASSERT(currentLen>=0 && currentLen < INT32_MAX);
3464
434
    }
3465
3466
297k
    return currentLen;
3467
297k
}
3468
3469
//------------------------------------------------------------------------------
3470
//
3471
//   maxMatchLength    Calculate the length of the longest string that could
3472
//                     match the specified pattern.
3473
//                     Length is in 16 bit code units, not code points.
3474
//
3475
//                     The calculated length may not be exact.  The returned
3476
//                     value may be longer than the actual maximum; it must
3477
//                     never be shorter.
3478
//
3479
//                     start, end: the range of the pattern to check.
3480
//                     end is inclusive.
3481
//
3482
//------------------------------------------------------------------------------
3483
15.2k
int32_t   RegexCompile::maxMatchLength(int32_t start, int32_t end) {
3484
15.2k
    if (U_FAILURE(*fStatus)) {
3485
0
        return 0;
3486
0
    }
3487
15.2k
    U_ASSERT(start <= end);
3488
15.2k
    U_ASSERT(end < fRXPat->fCompiledPat->size());
3489
3490
15.2k
    int32_t    loc;
3491
15.2k
    int32_t    op;
3492
15.2k
    int32_t    opType;
3493
15.2k
    int32_t    currentLen = 0;
3494
15.2k
    UVector32  forwardedLength(end+1, *fStatus);
3495
15.2k
    forwardedLength.setSize(end+1);
3496
3497
13.8M
    for (loc=start; loc<=end; loc++) {
3498
13.8M
        forwardedLength.setElementAt(0, loc);
3499
13.8M
    }
3500
3501
10.7M
    for (loc = start; loc<=end; loc++) {
3502
10.7M
        op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3503
10.7M
        opType = URX_TYPE(op);
3504
3505
        // The loop is advancing linearly through the pattern.
3506
        // If the op we are now at was the destination of a branch in the pattern,
3507
        // and that path has a longer maximum length than the current accumulated value,
3508
        // replace the current accumulated value.
3509
10.7M
        if (forwardedLength.elementAti(loc) > currentLen) {
3510
2.99M
            currentLen = forwardedLength.elementAti(loc);
3511
2.99M
        }
3512
3513
10.7M
        switch (opType) {
3514
            // Ops that don't change the total length matched
3515
0
        case URX_RESERVED_OP:
3516
0
        case URX_END:
3517
0
        case URX_STRING_LEN:
3518
24.4k
        case URX_NOP:
3519
28.1k
        case URX_START_CAPTURE:
3520
30.4k
        case URX_END_CAPTURE:
3521
31.3k
        case URX_BACKSLASH_B:
3522
31.8k
        case URX_BACKSLASH_BU:
3523
32.1k
        case URX_BACKSLASH_G:
3524
32.5k
        case URX_BACKSLASH_Z:
3525
33.8k
        case URX_CARET:
3526
34.3k
        case URX_DOLLAR:
3527
35.0k
        case URX_DOLLAR_M:
3528
35.3k
        case URX_DOLLAR_D:
3529
35.9k
        case URX_DOLLAR_MD:
3530
35.9k
        case URX_RELOC_OPRND:
3531
36.4k
        case URX_STO_INP_LOC:
3532
36.8k
        case URX_CARET_M:
3533
37.3k
        case URX_CARET_M_UNIX:
3534
3535
39.1k
        case URX_STO_SP:          // Setup for atomic or possessive blocks.  Doesn't change what can match.
3536
40.7k
        case URX_LD_SP:
3537
3538
44.2k
        case URX_LB_END:
3539
44.2k
        case URX_LB_CONT:
3540
44.2k
        case URX_LBN_CONT:
3541
48.9k
        case URX_LBN_END:
3542
48.9k
            break;
3543
3544
3545
            // Ops that increase that cause an unbounded increase in the length
3546
            //   of a matched string, or that increase it a hard to characterize way.
3547
            //   Call the max length unbounded, and stop further checking.
3548
1.24k
        case URX_BACKREF:         // BackRef.  Must assume that it might be a zero length match
3549
1.44k
        case URX_BACKREF_I:
3550
1.52k
        case URX_BACKSLASH_X:   // Grapheme Cluster.  Minimum is 1, max unbounded.
3551
1.52k
            currentLen = INT32_MAX;
3552
1.52k
            break;
3553
3554
3555
            // Ops that match a max of one character (possibly two 16 bit code units.)
3556
            //
3557
1.61k
        case URX_STATIC_SETREF:
3558
2.44k
        case URX_STAT_SETREF_N:
3559
3.44k
        case URX_SETREF:
3560
3.74k
        case URX_BACKSLASH_D:
3561
4.19k
        case URX_BACKSLASH_H:
3562
4.71k
        case URX_BACKSLASH_R:
3563
5.04k
        case URX_BACKSLASH_V:
3564
6.52k
        case URX_ONECHAR_I:
3565
6.94k
        case URX_DOTANY_ALL:
3566
17.3k
        case URX_DOTANY:
3567
17.7k
        case URX_DOTANY_UNIX:
3568
17.7k
            currentLen = safeIncrement(currentLen, 2);
3569
17.7k
            break;
3570
3571
            // Single literal character.  Increase current max length by one or two,
3572
            //       depending on whether the char is in the supplementary range.
3573
263k
        case URX_ONECHAR:
3574
263k
            currentLen = safeIncrement(currentLen, 1);
3575
263k
            if (URX_VAL(op) > 0x10000) {
3576
14.8k
                currentLen = safeIncrement(currentLen, 1);
3577
14.8k
            }
3578
263k
            break;
3579
3580
            // Jumps.
3581
            //
3582
5.06M
        case URX_JMP:
3583
5.06M
        case URX_JMPX:
3584
5.06M
        case URX_JMP_SAV:
3585
5.06M
        case URX_JMP_SAV_X:
3586
5.06M
            {
3587
5.06M
                int32_t  jmpDest = URX_VAL(op);
3588
5.06M
                if (jmpDest < loc) {
3589
                    // Loop of some kind.  Max match length is unbounded.
3590
702
                    currentLen = INT32_MAX;
3591
5.06M
                } else {
3592
                    // Forward jump.  Propagate the current min length to the target loc of the jump.
3593
5.06M
                    if (forwardedLength.elementAti(jmpDest) < currentLen) {
3594
2.72k
                        forwardedLength.setElementAt(currentLen, jmpDest);
3595
2.72k
                    }
3596
5.06M
                    currentLen = 0;
3597
5.06M
                }
3598
5.06M
            }
3599
5.06M
            break;
3600
3601
2.67k
        case URX_BACKTRACK:
3602
            // back-tracks are kind of like a branch, except that the max length was
3603
            //   propagated already, by the state save.
3604
2.67k
            currentLen = forwardedLength.elementAti(loc+1);
3605
2.67k
            break;
3606
3607
3608
5.26M
        case URX_STATE_SAVE:
3609
5.26M
            {
3610
                // State Save, for forward jumps, propagate the current minimum.
3611
                //               of the state save.
3612
                //             For backwards jumps, they create a loop, maximum
3613
                //               match length is unbounded.
3614
5.26M
                int32_t  jmpDest = URX_VAL(op);
3615
5.26M
                if (jmpDest > loc) {
3616
5.26M
                    if (currentLen > forwardedLength.elementAti(jmpDest)) {
3617
3.18M
                        forwardedLength.setElementAt(currentLen, jmpDest);
3618
3.18M
                    }
3619
5.26M
                } else {
3620
218
                    currentLen = INT32_MAX;
3621
218
                }
3622
5.26M
            }
3623
5.26M
            break;
3624
3625
3626
3627
3628
44.8k
        case URX_STRING:
3629
44.8k
            {
3630
44.8k
                loc++;
3631
44.8k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3632
44.8k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3633
44.8k
                break;
3634
5.06M
            }
3635
3636
29.1k
        case URX_STRING_I:
3637
            // TODO:  This code assumes that any user string that matches will be no longer
3638
            //        than our compiled string, with case insensitive matching.
3639
            //        Our compiled string has been case-folded already.
3640
            //
3641
            //        Any matching user string will have no more code points than our
3642
            //        compiled (folded) string.  Folding may add code points, but
3643
            //        not remove them.
3644
            //
3645
            //        There is a potential problem if a supplemental code point
3646
            //        case-folds to a BMP code point.  In this case our compiled string
3647
            //        could be shorter (in code units) than a matching user string.
3648
            //
3649
            //        At this time (Unicode 6.1) there are no such characters, and this case
3650
            //        is not being handled.  A test, intltest regex/Bug9283, will fail if
3651
            //        any problematic characters are added to Unicode.
3652
            //
3653
            //        If this happens, we can make a set of the BMP chars that the
3654
            //        troublesome supplementals fold to, scan our string, and bump the
3655
            //        currentLen one extra for each that is found.
3656
            //
3657
29.1k
            {
3658
29.1k
                loc++;
3659
29.1k
                int32_t stringLenOp = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3660
29.1k
                currentLen = safeIncrement(currentLen, URX_VAL(stringLenOp));
3661
29.1k
            }
3662
29.1k
            break;
3663
3664
4.85k
        case URX_CTR_INIT:
3665
7.04k
        case URX_CTR_INIT_NG:
3666
            // For Loops, recursively call this function on the pattern for the loop body,
3667
            //   then multiply the result by the maximum loop count.
3668
7.04k
            {
3669
7.04k
                int32_t  loopEndLoc = URX_VAL(fRXPat->fCompiledPat->elementAti(loc+1));
3670
7.04k
                if (loopEndLoc == loc+4) {
3671
                    // Loop has an empty body. No affect on max match length.
3672
                    // Continue processing with code after the loop end.
3673
0
                    loc = loopEndLoc;
3674
0
                    break;
3675
0
                }
3676
3677
7.04k
                int32_t maxLoopCount = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc+3));
3678
7.04k
                if (maxLoopCount == -1) {
3679
                    // Unbounded Loop. No upper bound on match length.
3680
122
                    currentLen = INT32_MAX;
3681
122
                    break;
3682
122
                }
3683
3684
6.91k
                U_ASSERT(loopEndLoc >= loc+4);
3685
6.91k
                int64_t blockLen = maxMatchLength(loc+4, loopEndLoc-1);  // Recursive call.
3686
6.91k
                int64_t updatedLen = static_cast<int64_t>(currentLen) + blockLen * maxLoopCount;
3687
6.91k
                if (updatedLen >= INT32_MAX) {
3688
100
                    currentLen = INT32_MAX;
3689
100
                    break;
3690
100
                }
3691
6.81k
                currentLen = static_cast<int32_t>(updatedLen);
3692
6.81k
                loc = loopEndLoc;
3693
6.81k
                break;
3694
6.91k
            }
3695
3696
0
        case URX_CTR_LOOP:
3697
0
        case URX_CTR_LOOP_NG:
3698
            // These opcodes will be skipped over by code for URX_CTR_INIT.
3699
            // We shouldn't encounter them here.
3700
0
            UPRV_UNREACHABLE_EXIT;
3701
3702
196
        case URX_LOOP_SR_I:
3703
300
        case URX_LOOP_DOT_I:
3704
300
        case URX_LOOP_C:
3705
            // For anything to do with loops, make the match length unbounded.
3706
300
            currentLen = INT32_MAX;
3707
300
            break;
3708
3709
3710
3711
1.42k
        case URX_LA_START:
3712
7.73k
        case URX_LA_END:
3713
            // Look-ahead.  Just ignore, treat the look-ahead block as if
3714
            // it were normal pattern.  Gives a too-long match length,
3715
            //  but good enough for now.
3716
7.73k
            break;
3717
3718
            // End of look-ahead ops should always be consumed by the processing at
3719
            //  the URX_LA_START op.
3720
            // UPRV_UNREACHABLE_EXIT;
3721
3722
2.67k
        case URX_LB_START:
3723
2.67k
            {
3724
                // Look-behind.  Scan forward until the matching look-around end,
3725
                //   without processing the look-behind block.
3726
2.67k
                int32_t dataLoc = URX_VAL(op);
3727
1.16M
                for (loc = loc + 1; loc <= end; ++loc) {
3728
1.16M
                    op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3729
1.16M
                    int32_t opType = URX_TYPE(op);
3730
1.16M
                    if ((opType == URX_LA_END || opType == URX_LBN_END) && (URX_VAL(op) == dataLoc)) {
3731
2.67k
                        break;
3732
2.67k
                    }
3733
1.16M
                }
3734
2.67k
                U_ASSERT(loc <= end);
3735
2.67k
            }
3736
2.67k
            break;
3737
3738
0
        default:
3739
0
            UPRV_UNREACHABLE_EXIT;
3740
10.7M
        }
3741
3742
3743
10.7M
        if (currentLen == INT32_MAX) {
3744
            //  The maximum length is unbounded.
3745
            //  Stop further processing of the pattern.
3746
2.97k
            break;
3747
2.97k
        }
3748
3749
10.7M
    }
3750
15.2k
    return currentLen;
3751
3752
15.2k
}
3753
3754
3755
//------------------------------------------------------------------------------
3756
//
3757
//   stripNOPs    Remove any NOP operations from the compiled pattern code.
3758
//                Extra NOPs are inserted for some constructs during the initial
3759
//                code generation to provide locations that may be patched later.
3760
//                Many end up unneeded, and are removed by this function.
3761
//
3762
//                In order to minimize the number of passes through the pattern,
3763
//                back-reference fixup is also performed here (adjusting
3764
//                back-reference operands to point to the correct frame offsets).
3765
//
3766
//------------------------------------------------------------------------------
3767
12.4k
void RegexCompile::stripNOPs() {
3768
3769
12.4k
    if (U_FAILURE(*fStatus)) {
3770
0
        return;
3771
0
    }
3772
3773
12.4k
    int32_t    end = fRXPat->fCompiledPat->size();
3774
12.4k
    UVector32  deltas(end, *fStatus);
3775
3776
    // Make a first pass over the code, computing the amount that things
3777
    //   will be offset at each location in the original code.
3778
12.4k
    int32_t   loc;
3779
12.4k
    int32_t   d = 0;
3780
30.9M
    for (loc=0; loc<end; loc++) {
3781
30.9M
        deltas.addElement(d, *fStatus);
3782
30.9M
        int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(loc));
3783
30.9M
        if (URX_TYPE(op) == URX_NOP) {
3784
140k
            d++;
3785
140k
        }
3786
30.9M
    }
3787
3788
12.4k
    UnicodeString caseStringBuffer;
3789
3790
    // Make a second pass over the code, removing the NOPs by moving following
3791
    //  code up, and patching operands that refer to code locations that
3792
    //  are being moved.  The array of offsets from the first step is used
3793
    //  to compute the new operand values.
3794
12.4k
    int32_t src;
3795
12.4k
    int32_t dst = 0;
3796
30.9M
    for (src=0; src<end; src++) {
3797
30.9M
        int32_t op = static_cast<int32_t>(fRXPat->fCompiledPat->elementAti(src));
3798
30.9M
        int32_t opType = URX_TYPE(op);
3799
30.9M
        switch (opType) {
3800
140k
        case URX_NOP:
3801
140k
            break;
3802
3803
13.4M
        case URX_STATE_SAVE:
3804
26.0M
        case URX_JMP:
3805
26.0M
        case URX_CTR_LOOP:
3806
26.0M
        case URX_CTR_LOOP_NG:
3807
26.0M
        case URX_RELOC_OPRND:
3808
26.0M
        case URX_JMPX:
3809
26.2M
        case URX_JMP_SAV:
3810
26.3M
        case URX_JMP_SAV_X:
3811
            // These are instructions with operands that refer to code locations.
3812
26.3M
            {
3813
26.3M
                int32_t  operandAddress = URX_VAL(op);
3814
26.3M
                U_ASSERT(operandAddress>=0 && operandAddress<deltas.size());
3815
26.3M
                int32_t fixedOperandAddress = operandAddress - deltas.elementAti(operandAddress);
3816
26.3M
                op = buildOp(opType, fixedOperandAddress);
3817
26.3M
                fRXPat->fCompiledPat->setElementAt(op, dst);
3818
26.3M
                dst++;
3819
26.3M
                break;
3820
26.2M
            }
3821
3822
2.36k
        case URX_BACKREF:
3823
7.79k
        case URX_BACKREF_I:
3824
7.79k
            {
3825
7.79k
                int32_t where = URX_VAL(op);
3826
7.79k
                if (where > fRXPat->fGroupMap->size()) {
3827
1.83k
                    error(U_REGEX_INVALID_BACK_REF);
3828
1.83k
                    break;
3829
1.83k
                }
3830
5.96k
                where = fRXPat->fGroupMap->elementAti(where-1);
3831
5.96k
                op    = buildOp(opType, where);
3832
5.96k
                fRXPat->fCompiledPat->setElementAt(op, dst);
3833
5.96k
                dst++;
3834
3835
5.96k
                fRXPat->fNeedsAltInput = true;
3836
5.96k
                break;
3837
7.79k
            }
3838
16.1k
        case URX_RESERVED_OP:
3839
16.9k
        case URX_RESERVED_OP_N:
3840
24.3k
        case URX_BACKTRACK:
3841
36.8k
        case URX_END:
3842
991k
        case URX_ONECHAR:
3843
1.10M
        case URX_STRING:
3844
2.19M
        case URX_STRING_LEN:
3845
2.25M
        case URX_START_CAPTURE:
3846
2.30M
        case URX_END_CAPTURE:
3847
2.31M
        case URX_STATIC_SETREF:
3848
2.32M
        case URX_STAT_SETREF_N:
3849
2.52M
        case URX_SETREF:
3850
2.98M
        case URX_DOTANY:
3851
2.99M
        case URX_FAIL:
3852
3.00M
        case URX_BACKSLASH_B:
3853
3.00M
        case URX_BACKSLASH_BU:
3854
3.00M
        case URX_BACKSLASH_G:
3855
3.00M
        case URX_BACKSLASH_X:
3856
3.01M
        case URX_BACKSLASH_Z:
3857
3.01M
        case URX_DOTANY_ALL:
3858
3.02M
        case URX_BACKSLASH_D:
3859
3.02M
        case URX_CARET:
3860
3.09M
        case URX_DOLLAR:
3861
3.09M
        case URX_CTR_INIT:
3862
3.10M
        case URX_CTR_INIT_NG:
3863
3.10M
        case URX_DOTANY_UNIX:
3864
3.20M
        case URX_STO_SP:
3865
3.29M
        case URX_LD_SP:
3866
3.39M
        case URX_STO_INP_LOC:
3867
3.39M
        case URX_LA_START:
3868
3.39M
        case URX_LA_END:
3869
3.44M
        case URX_ONECHAR_I:
3870
4.42M
        case URX_STRING_I:
3871
4.42M
        case URX_DOLLAR_M:
3872
4.42M
        case URX_CARET_M:
3873
4.42M
        case URX_CARET_M_UNIX:
3874
4.43M
        case URX_LB_START:
3875
4.43M
        case URX_LB_CONT:
3876
4.43M
        case URX_LB_END:
3877
4.43M
        case URX_LBN_CONT:
3878
4.43M
        case URX_LBN_END:
3879
4.44M
        case URX_LOOP_SR_I:
3880
4.45M
        case URX_LOOP_DOT_I:
3881
4.46M
        case URX_LOOP_C:
3882
4.49M
        case URX_DOLLAR_D:
3883
4.49M
        case URX_DOLLAR_MD:
3884
4.49M
        case URX_BACKSLASH_H:
3885
4.50M
        case URX_BACKSLASH_R:
3886
4.50M
        case URX_BACKSLASH_V:
3887
            // These instructions are unaltered by the relocation.
3888
4.50M
            fRXPat->fCompiledPat->setElementAt(op, dst);
3889
4.50M
            dst++;
3890
4.50M
            break;
3891
3892
0
        default:
3893
            // Some op is unaccounted for.
3894
0
            UPRV_UNREACHABLE_EXIT;
3895
30.9M
        }
3896
30.9M
    }
3897
3898
12.4k
    fRXPat->fCompiledPat->setSize(dst);
3899
12.4k
}
3900
3901
3902
3903
3904
//------------------------------------------------------------------------------
3905
//
3906
//  Error         Report a rule parse error.
3907
//                Only report it if no previous error has been recorded.
3908
//
3909
//------------------------------------------------------------------------------
3910
10.0k
void RegexCompile::error(UErrorCode e) {
3911
10.0k
    if (U_SUCCESS(*fStatus) || e == U_MEMORY_ALLOCATION_ERROR) {
3912
7.59k
        *fStatus = e;
3913
        // Hmm. fParseErr (UParseError) line & offset fields are int32_t in public
3914
        // API (see common/unicode/parseerr.h), while fLineNum and fCharNum are
3915
        // int64_t. If the values of the latter are out of range for the former,
3916
        // set them to the appropriate "field not supported" values.
3917
7.59k
        if (fLineNum > 0x7FFFFFFF) {
3918
0
            fParseErr->line   = 0;
3919
0
            fParseErr->offset = -1;
3920
7.59k
        } else if (fCharNum > 0x7FFFFFFF) {
3921
0
            fParseErr->line = static_cast<int32_t>(fLineNum);
3922
0
            fParseErr->offset = -1;
3923
7.59k
        } else {
3924
7.59k
            fParseErr->line = static_cast<int32_t>(fLineNum);
3925
7.59k
            fParseErr->offset = static_cast<int32_t>(fCharNum);
3926
7.59k
        }
3927
3928
7.59k
        UErrorCode status = U_ZERO_ERROR; // throwaway status for extracting context
3929
3930
        // Fill in the context.
3931
        //   Note: extractBetween() pins supplied indices to the string bounds.
3932
7.59k
        uprv_memset(fParseErr->preContext,  0, sizeof(fParseErr->preContext));
3933
7.59k
        uprv_memset(fParseErr->postContext, 0, sizeof(fParseErr->postContext));
3934
7.59k
        utext_extract(fRXPat->fPattern, fScanIndex-U_PARSE_CONTEXT_LEN+1, fScanIndex, fParseErr->preContext, U_PARSE_CONTEXT_LEN, &status);
3935
7.59k
        utext_extract(fRXPat->fPattern, fScanIndex, fScanIndex+U_PARSE_CONTEXT_LEN-1, fParseErr->postContext, U_PARSE_CONTEXT_LEN, &status);
3936
7.59k
    }
3937
10.0k
}
3938
3939
3940
//
3941
//  Assorted Unicode character constants.
3942
//     Numeric because there is no portable way to enter them as literals.
3943
//     (Think EBCDIC).
3944
//
3945
static const char16_t   chCR        = 0x0d;      // New lines, for terminating comments.
3946
static const char16_t   chLF        = 0x0a;      // Line Feed
3947
static const char16_t   chPound     = 0x23;      // '#', introduces a comment.
3948
static const char16_t   chDigit0    = 0x30;      // '0'
3949
static const char16_t   chDigit7    = 0x37;      // '9'
3950
static const char16_t   chColon     = 0x3A;      // ':'
3951
static const char16_t   chE         = 0x45;      // 'E'
3952
static const char16_t   chQ         = 0x51;      // 'Q'
3953
//static const char16_t   chN         = 0x4E;      // 'N'
3954
static const char16_t   chP         = 0x50;      // 'P'
3955
static const char16_t   chBackSlash = 0x5c;      // '\'  introduces a char escape
3956
//static const char16_t   chLBracket  = 0x5b;      // '['
3957
static const char16_t   chRBracket  = 0x5d;      // ']'
3958
static const char16_t   chUp        = 0x5e;      // '^'
3959
static const char16_t   chLowerP    = 0x70;
3960
static const char16_t   chLBrace    = 0x7b;      // '{'
3961
static const char16_t   chRBrace    = 0x7d;      // '}'
3962
static const char16_t   chNEL       = 0x85;      //    NEL newline variant
3963
static const char16_t   chLS        = 0x2028;    //    Unicode Line Separator
3964
3965
3966
//------------------------------------------------------------------------------
3967
//
3968
//  nextCharLL    Low Level Next Char from the regex pattern.
3969
//                Get a char from the string, keep track of input position
3970
//                     for error reporting.
3971
//
3972
//------------------------------------------------------------------------------
3973
113M
UChar32  RegexCompile::nextCharLL() {
3974
113M
    UChar32       ch;
3975
3976
113M
    if (fPeekChar != -1) {
3977
631k
        ch = fPeekChar;
3978
631k
        fPeekChar = -1;
3979
631k
        return ch;
3980
631k
    }
3981
3982
    // assume we're already in the right place
3983
112M
    ch = UTEXT_NEXT32(fRXPat->fPattern);
3984
112M
    if (ch == U_SENTINEL) {
3985
23.7k
        return ch;
3986
23.7k
    }
3987
3988
112M
    if (ch == chCR ||
3989
112M
        ch == chNEL ||
3990
112M
        ch == chLS   ||
3991
112M
        (ch == chLF && fLastChar != chCR)) {
3992
        // Character is starting a new line.  Bump up the line number, and
3993
        //  reset the column to 0.
3994
184k
        fLineNum++;
3995
184k
        fCharNum=0;
3996
184k
    }
3997
112M
    else {
3998
        // Character is not starting a new line.  Except in the case of a
3999
        //   LF following a CR, increment the column position.
4000
112M
        if (ch != chLF) {
4001
112M
            fCharNum++;
4002
112M
        }
4003
112M
    }
4004
112M
    fLastChar = ch;
4005
112M
    return ch;
4006
112M
}
4007
4008
//------------------------------------------------------------------------------
4009
//
4010
//   peekCharLL    Low Level Character Scanning, sneak a peek at the next
4011
//                 character without actually getting it.
4012
//
4013
//------------------------------------------------------------------------------
4014
1.42M
UChar32  RegexCompile::peekCharLL() {
4015
1.42M
    if (fPeekChar == -1) {
4016
634k
        fPeekChar = nextCharLL();
4017
634k
    }
4018
1.42M
    return fPeekChar;
4019
1.42M
}
4020
4021
4022
//------------------------------------------------------------------------------
4023
//
4024
//   nextChar     for pattern scanning.  At this level, we handle stripping
4025
//                out comments and processing some backslash character escapes.
4026
//                The rest of the pattern grammar is handled at the next level up.
4027
//
4028
//------------------------------------------------------------------------------
4029
110M
void RegexCompile::nextChar(RegexPatternChar &c) {
4030
110M
  tailRecursion:
4031
110M
    fScanIndex = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4032
110M
    c.fChar    = nextCharLL();
4033
110M
    c.fQuoted  = false;
4034
4035
110M
    if (fQuoteMode) {
4036
2.02M
        c.fQuoted = true;
4037
2.02M
        if ((c.fChar == chBackSlash && peekCharLL() == chE && ((fModeFlags & UREGEX_LITERAL) == 0)) ||
4038
2.01M
            c.fChar == static_cast<UChar32>(-1)) {
4039
1.58k
            fQuoteMode = false;  //  Exit quote mode,
4040
1.58k
            nextCharLL();        // discard the E
4041
            // nextChar(c);      // recurse to get the real next char
4042
1.58k
            goto tailRecursion;  // Note: fuzz testing produced testcases that
4043
                                 //       resulted in stack overflow here.
4044
1.58k
        }
4045
2.02M
    }
4046
108M
    else if (fInBackslashQuote) {
4047
        // The current character immediately follows a '\'
4048
        // Don't check for any further escapes, just return it as-is.
4049
        // Don't set c.fQuoted, because that would prevent the state machine from
4050
        //    dispatching on the character.
4051
388k
        fInBackslashQuote = false;
4052
388k
    }
4053
107M
    else
4054
107M
    {
4055
        // We are not in a \Q quoted region \E of the source.
4056
        //
4057
107M
        if (fModeFlags & UREGEX_COMMENTS) {
4058
            //
4059
            // We are in free-spacing and comments mode.
4060
            //  Scan through any white space and comments, until we
4061
            //  reach a significant character or the end of input.
4062
1.32M
            for (;;) {
4063
1.32M
                if (c.fChar == static_cast<UChar32>(-1)) {
4064
1.46k
                    break;     // End of Input
4065
1.46k
                }
4066
1.32M
                if  (c.fChar == chPound && fEOLComments) {
4067
                    // Start of a comment.  Consume the rest of it, until EOF or a new line
4068
2.28M
                    for (;;) {
4069
2.28M
                        c.fChar = nextCharLL();
4070
2.28M
                        if (c.fChar == static_cast<UChar32>(-1) || // EOF
4071
2.28M
                            c.fChar == chCR        ||
4072
2.28M
                            c.fChar == chLF        ||
4073
2.28M
                            c.fChar == chNEL       ||
4074
2.28M
                            c.fChar == chLS)       {
4075
5.96k
                            break;
4076
5.96k
                        }
4077
2.28M
                    }
4078
5.96k
                }
4079
                // TODO:  check what Java & Perl do with non-ASCII white spaces.  Ticket 6061.
4080
1.32M
                if (PatternProps::isWhiteSpace(c.fChar) == false) {
4081
1.30M
                    break;
4082
1.30M
                }
4083
21.0k
                c.fChar = nextCharLL();
4084
21.0k
            }
4085
1.30M
        }
4086
4087
        //
4088
        //  check for backslash escaped characters.
4089
        //
4090
107M
        if (c.fChar == chBackSlash) {
4091
600k
            int64_t pos = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4092
600k
            if (RegexStaticSets::gStaticSets->fUnescapeCharSet.contains(peekCharLL())) {
4093
                //
4094
                // A '\' sequence that is handled by ICU's standard unescapeAt function.
4095
                //   Includes \uxxxx, \n, \r, many others.
4096
                //   Return the single equivalent character.
4097
                //
4098
204k
                nextCharLL();                 // get & discard the peeked char.
4099
204k
                c.fQuoted = true;
4100
4101
204k
                if (UTEXT_FULL_TEXT_IN_CHUNK(fRXPat->fPattern, fPatternLength)) {
4102
204k
                    int32_t endIndex = static_cast<int32_t>(pos);
4103
204k
                    c.fChar = u_unescapeAt(uregex_ucstr_unescape_charAt, &endIndex, static_cast<int32_t>(fPatternLength), const_cast<char16_t*>(fRXPat->fPattern->chunkContents));
4104
4105
204k
                    if (endIndex == pos) {
4106
198
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4107
198
                    }
4108
204k
                    fCharNum += endIndex - pos;
4109
204k
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, endIndex);
4110
204k
                } else {
4111
0
                    int32_t offset = 0;
4112
0
                    struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(fRXPat->fPattern);
4113
4114
0
                    UTEXT_SETNATIVEINDEX(fRXPat->fPattern, pos);
4115
0
                    c.fChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
4116
4117
0
                    if (offset == 0) {
4118
0
                        error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4119
0
                    } else if (context.lastOffset == offset) {
4120
0
                        UTEXT_PREVIOUS32(fRXPat->fPattern);
4121
0
                    } else if (context.lastOffset != offset-1) {
4122
0
                        utext_moveIndex32(fRXPat->fPattern, offset - context.lastOffset - 1);
4123
0
                    }
4124
0
                    fCharNum += offset;
4125
0
                }
4126
204k
            }
4127
396k
            else if (peekCharLL() == chDigit0) {
4128
                //  Octal Escape, using Java Regexp Conventions
4129
                //    which are \0 followed by 1-3 octal digits.
4130
                //    Different from ICU Unescape handling of Octal, which does not
4131
                //    require the leading 0.
4132
                //  Java also has the convention of only consuming 2 octal digits if
4133
                //    the three digit number would be > 0xff
4134
                //
4135
5.14k
                c.fChar = 0;
4136
5.14k
                nextCharLL();    // Consume the initial 0.
4137
5.14k
                int index;
4138
13.5k
                for (index=0; index<3; index++) {
4139
12.0k
                    int32_t ch = peekCharLL();
4140
12.0k
                    if (ch<chDigit0 || ch>chDigit7) {
4141
3.57k
                        if (index==0) {
4142
                           // \0 is not followed by any octal digits.
4143
358
                           error(U_REGEX_BAD_ESCAPE_SEQUENCE);
4144
358
                        }
4145
3.57k
                        break;
4146
3.57k
                    }
4147
8.42k
                    c.fChar <<= 3;
4148
8.42k
                    c.fChar += ch&7;
4149
8.42k
                    if (c.fChar <= 255) {
4150
7.20k
                        nextCharLL();
4151
7.20k
                    } else {
4152
                        // The last digit made the number too big.  Forget we saw it.
4153
1.22k
                        c.fChar >>= 3;
4154
1.22k
                    }
4155
8.42k
                }
4156
5.14k
                c.fQuoted = true;
4157
5.14k
            }
4158
391k
            else if (peekCharLL() == chQ) {
4159
                //  "\Q"  enter quote mode, which will continue until "\E"
4160
1.86k
                fQuoteMode = true;
4161
1.86k
                nextCharLL();        // discard the 'Q'.
4162
                // nextChar(c);      // recurse to get the real next char.
4163
1.86k
                goto tailRecursion;  // Note: fuzz testing produced test cases that
4164
                //                            resulted in stack overflow here.
4165
1.86k
            }
4166
389k
            else
4167
389k
            {
4168
                // We are in a '\' escape that will be handled by the state table scanner.
4169
                // Just return the backslash, but remember that the following char is to
4170
                //  be taken literally.
4171
389k
                fInBackslashQuote = true;
4172
389k
            }
4173
600k
        }
4174
107M
    }
4175
4176
    // re-enable # to end-of-line comments, in case they were disabled.
4177
    // They are disabled by the parser upon seeing '(?', but this lasts for
4178
    //  the fetching of the next character only.
4179
110M
    fEOLComments = true;
4180
4181
    // putc(c.fChar, stdout);
4182
110M
}
4183
4184
4185
4186
//------------------------------------------------------------------------------
4187
//
4188
//  scanNamedChar
4189
//            Get a UChar32 from a \N{UNICODE CHARACTER NAME} in the pattern.
4190
//
4191
//             The scan position will be at the 'N'.  On return
4192
//             the scan position should be just after the '}'
4193
//
4194
//             Return the UChar32
4195
//
4196
//------------------------------------------------------------------------------
4197
6.81k
UChar32  RegexCompile::scanNamedChar() {
4198
6.81k
    if (U_FAILURE(*fStatus)) {
4199
0
        return 0;
4200
0
    }
4201
4202
6.81k
    nextChar(fC);
4203
6.81k
    if (fC.fChar != chLBrace) {
4204
30
        error(U_REGEX_PROPERTY_SYNTAX);
4205
30
        return 0;
4206
30
    }
4207
4208
6.78k
    UnicodeString  charName;
4209
28.0k
    for (;;) {
4210
28.0k
        nextChar(fC);
4211
28.0k
        if (fC.fChar == chRBrace) {
4212
6.72k
            break;
4213
6.72k
        }
4214
21.2k
        if (fC.fChar == -1) {
4215
64
            error(U_REGEX_PROPERTY_SYNTAX);
4216
64
            return 0;
4217
64
        }
4218
21.2k
        charName.append(fC.fChar);
4219
21.2k
    }
4220
4221
6.72k
    char name[100];
4222
6.72k
    if (!uprv_isInvariantUString(charName.getBuffer(), charName.length()) ||
4223
6.71k
         static_cast<uint32_t>(charName.length()) >= sizeof(name)) {
4224
        // All Unicode character names have only invariant characters.
4225
        // The API to get a character, given a name, accepts only char *, forcing us to convert,
4226
        //   which requires this error check
4227
12
        error(U_REGEX_PROPERTY_SYNTAX);
4228
12
        return 0;
4229
12
    }
4230
6.71k
    charName.extract(0, charName.length(), name, sizeof(name), US_INV);
4231
4232
6.71k
    UChar32  theChar = u_charFromName(U_UNICODE_CHAR_NAME, name, fStatus);
4233
6.71k
    if (U_FAILURE(*fStatus)) {
4234
115
        error(U_REGEX_PROPERTY_SYNTAX);
4235
115
    }
4236
4237
6.71k
    nextChar(fC);      // Continue overall regex pattern processing with char after the '}'
4238
6.71k
    return theChar;
4239
6.72k
}
4240
4241
//------------------------------------------------------------------------------
4242
//
4243
//  scanProp   Construct a UnicodeSet from the text at the current scan
4244
//             position, which will be of the form \p{whaterver}
4245
//
4246
//             The scan position will be at the 'p' or 'P'.  On return
4247
//             the scan position should be just after the '}'
4248
//
4249
//             Return a UnicodeSet, constructed from the \P pattern,
4250
//             or nullptr if the pattern is invalid.
4251
//
4252
//------------------------------------------------------------------------------
4253
3.16k
UnicodeSet *RegexCompile::scanProp() {
4254
3.16k
    UnicodeSet    *uset = nullptr;
4255
4256
3.16k
    if (U_FAILURE(*fStatus)) {
4257
0
        return nullptr;
4258
0
    }
4259
3.16k
    (void)chLowerP;   // Suppress compiler unused variable warning.
4260
3.16k
    U_ASSERT(fC.fChar == chLowerP || fC.fChar == chP);
4261
3.16k
    UBool negated = (fC.fChar == chP);
4262
4263
3.16k
    UnicodeString propertyName;
4264
3.16k
    nextChar(fC);
4265
3.16k
    if (fC.fChar != chLBrace) {
4266
41
        error(U_REGEX_PROPERTY_SYNTAX);
4267
41
        return nullptr;
4268
41
    }
4269
1.37M
    for (;;) {
4270
1.37M
        nextChar(fC);
4271
1.37M
        if (fC.fChar == chRBrace) {
4272
3.05k
            break;
4273
3.05k
        }
4274
1.37M
        if (fC.fChar == -1) {
4275
            // Hit the end of the input string without finding the closing '}'
4276
69
            error(U_REGEX_PROPERTY_SYNTAX);
4277
69
            return nullptr;
4278
69
        }
4279
1.37M
        propertyName.append(fC.fChar);
4280
1.37M
    }
4281
3.05k
    uset = createSetForProperty(propertyName, negated);
4282
3.05k
    nextChar(fC);    // Move input scan to position following the closing '}'
4283
3.05k
    return uset;
4284
3.12k
}
4285
4286
//------------------------------------------------------------------------------
4287
//
4288
//  scanPosixProp   Construct a UnicodeSet from the text at the current scan
4289
//             position, which is expected be of the form [:property expression:]
4290
//
4291
//             The scan position will be at the opening ':'.  On return
4292
//             the scan position must be on the closing ']'
4293
//
4294
//             Return a UnicodeSet constructed from the pattern,
4295
//             or nullptr if this is not a valid POSIX-style set expression.
4296
//             If not a property expression, restore the initial scan position
4297
//                (to the opening ':')
4298
//
4299
//               Note:  the opening '[:' is not sufficient to guarantee that
4300
//                      this is a [:property:] expression.
4301
//                      [:'+=,] is a perfectly good ordinary set expression that
4302
//                              happens to include ':' as one of its characters.
4303
//
4304
//------------------------------------------------------------------------------
4305
106k
UnicodeSet *RegexCompile::scanPosixProp() {
4306
106k
    UnicodeSet    *uset = nullptr;
4307
4308
106k
    if (U_FAILURE(*fStatus)) {
4309
0
        return nullptr;
4310
0
    }
4311
4312
106k
    U_ASSERT(fC.fChar == chColon);
4313
4314
    // Save the scanner state.
4315
    // TODO:  move this into the scanner, with the state encapsulated in some way.  Ticket 6062
4316
106k
    int64_t     savedScanIndex        = fScanIndex;
4317
106k
    int64_t     savedNextIndex        = UTEXT_GETNATIVEINDEX(fRXPat->fPattern);
4318
106k
    UBool       savedQuoteMode        = fQuoteMode;
4319
106k
    UBool       savedInBackslashQuote = fInBackslashQuote;
4320
106k
    UBool       savedEOLComments      = fEOLComments;
4321
106k
    int64_t     savedLineNum          = fLineNum;
4322
106k
    int64_t     savedCharNum          = fCharNum;
4323
106k
    UChar32     savedLastChar         = fLastChar;
4324
106k
    UChar32     savedPeekChar         = fPeekChar;
4325
106k
    RegexPatternChar savedfC          = fC;
4326
4327
    // Scan for a closing ].   A little tricky because there are some perverse
4328
    //   edge cases possible.  "[:abc\Qdef:] \E]"  is a valid non-property expression,
4329
    //   ending on the second closing ].
4330
4331
106k
    UnicodeString propName;
4332
106k
    UBool         negated  = false;
4333
4334
    // Check for and consume the '^' in a negated POSIX property, e.g.  [:^Letter:]
4335
106k
    nextChar(fC);
4336
106k
    if (fC.fChar == chUp) {
4337
3.86k
       negated = true;
4338
3.86k
       nextChar(fC);
4339
3.86k
    }
4340
4341
    // Scan for the closing ":]", collecting the property name along the way.
4342
106k
    UBool  sawPropSetTerminator = false;
4343
27.6M
    for (;;) {
4344
27.6M
        propName.append(fC.fChar);
4345
27.6M
        nextChar(fC);
4346
27.6M
        if (fC.fQuoted || fC.fChar == -1) {
4347
            // Escaped characters or end of input - either says this isn't a [:Property:]
4348
8.79k
            break;
4349
8.79k
        }
4350
27.5M
        if (fC.fChar == chColon) {
4351
97.7k
            nextChar(fC);
4352
97.7k
            if (fC.fChar == chRBracket) {
4353
77.3k
                sawPropSetTerminator = true;
4354
77.3k
            }
4355
97.7k
            break;
4356
97.7k
        }
4357
27.5M
    }
4358
4359
106k
    if (sawPropSetTerminator) {
4360
77.3k
        uset = createSetForProperty(propName, negated);
4361
77.3k
    }
4362
29.1k
    else
4363
29.1k
    {
4364
        // No closing ":]".
4365
        //  Restore the original scan position.
4366
        //  The main scanner will retry the input as a normal set expression,
4367
        //    not a [:Property:] expression.
4368
29.1k
        fScanIndex        = savedScanIndex;
4369
29.1k
        fQuoteMode        = savedQuoteMode;
4370
29.1k
        fInBackslashQuote = savedInBackslashQuote;
4371
29.1k
        fEOLComments      = savedEOLComments;
4372
29.1k
        fLineNum          = savedLineNum;
4373
29.1k
        fCharNum          = savedCharNum;
4374
29.1k
        fLastChar         = savedLastChar;
4375
29.1k
        fPeekChar         = savedPeekChar;
4376
29.1k
        fC                = savedfC;
4377
29.1k
        UTEXT_SETNATIVEINDEX(fRXPat->fPattern, savedNextIndex);
4378
29.1k
    }
4379
106k
    return uset;
4380
106k
}
4381
4382
0
static inline void addIdentifierIgnorable(UnicodeSet *set, UErrorCode& ec) {
4383
0
    set->add(0, 8).add(0x0e, 0x1b).add(0x7f, 0x9f);
4384
0
    addCategory(set, U_GC_CF_MASK, ec);
4385
0
}
4386
4387
//
4388
//  Create a Unicode Set from a Unicode Property expression.
4389
//     This is common code underlying both \p{...} and [:...:] expressions.
4390
//     Includes trying the Java "properties" that aren't supported as
4391
//     normal ICU UnicodeSet properties
4392
//
4393
80.4k
UnicodeSet *RegexCompile::createSetForProperty(const UnicodeString &propName, UBool negated) {
4394
4395
80.4k
    if (U_FAILURE(*fStatus)) {
4396
1
        return nullptr;
4397
1
    }
4398
80.4k
    LocalPointer<UnicodeSet> set;
4399
80.4k
    UErrorCode status = U_ZERO_ERROR;
4400
4401
80.4k
    do {      // non-loop, exists to allow breaks from the block.
4402
        //
4403
        //  First try the property as we received it
4404
        //
4405
80.4k
        UnicodeString   setExpr;
4406
80.4k
        uint32_t        usetFlags = 0;
4407
80.4k
        setExpr.append(u"[\\p{", -1);
4408
80.4k
        setExpr.append(propName);
4409
80.4k
        setExpr.append(u"}]", -1);
4410
80.4k
        if (fModeFlags & UREGEX_CASE_INSENSITIVE) {
4411
36.4k
            usetFlags |= USET_CASE_INSENSITIVE;
4412
36.4k
        }
4413
80.4k
        set.adoptInsteadAndCheckErrorCode(new UnicodeSet(setExpr, usetFlags, nullptr, status), status);
4414
80.4k
        if (U_SUCCESS(status) || status == U_MEMORY_ALLOCATION_ERROR) {
4415
64.8k
            break;
4416
64.8k
        }
4417
4418
        //
4419
        //  The incoming property wasn't directly recognized by ICU.
4420
4421
        //  Check [:word:] and [:all:]. These are not recognized as a properties by ICU UnicodeSet.
4422
        //     Java accepts 'word' with mixed case.
4423
        //     Java accepts 'all' only in all lower case.
4424
4425
15.5k
        status = U_ZERO_ERROR;
4426
15.5k
        if (propName.caseCompare(u"word", -1, 0) == 0) {
4427
541
            set.adoptInsteadAndCheckErrorCode(
4428
541
                RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].cloneAsThawed(), status);
4429
541
            break;
4430
541
        }
4431
14.9k
        if (propName.compare(u"all", -1) == 0) {
4432
170
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(0, 0x10ffff), status);
4433
170
            break;
4434
170
        }
4435
4436
4437
        //    Do Java InBlock expressions
4438
        //
4439
14.8k
        UnicodeString mPropName = propName;
4440
14.8k
        if (mPropName.startsWith(u"In", 2) && mPropName.length() >= 3) {
4441
6.12k
            status = U_ZERO_ERROR;
4442
6.12k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4443
6.12k
            if (U_FAILURE(status)) {
4444
0
                break;
4445
0
            }
4446
6.12k
            UnicodeString blockName(mPropName, 2);  // Property with the leading "In" removed.
4447
6.12k
            set->applyPropertyAlias(UnicodeString(u"Block"), blockName, status);
4448
6.12k
            break;
4449
6.12k
        }
4450
4451
        //  Check for the Java form "IsBooleanPropertyValue", which we will recast
4452
        //  as "BooleanPropertyValue". The property value can be either a
4453
        //  a General Category or a Script Name.
4454
4455
8.69k
        if (propName.startsWith(u"Is", 2) && propName.length()>=3) {
4456
6.76k
            mPropName.remove(0, 2);      // Strip the "Is"
4457
6.76k
            if (mPropName.indexOf(u'=') >= 0) {
4458
                // Reject any "Is..." property expression containing an '=', that is,
4459
                // any non-binary property expression.
4460
22
                status = U_REGEX_PROPERTY_SYNTAX;
4461
22
                break;
4462
22
            }
4463
4464
6.74k
            if (mPropName.caseCompare(u"assigned", -1, 0) == 0) {
4465
0
                mPropName.setTo(u"unassigned", -1);
4466
0
                negated = !negated;
4467
6.74k
            } else if (mPropName.caseCompare(u"TitleCase", -1, 0) == 0) {
4468
0
                mPropName.setTo(u"Titlecase_Letter", -1);
4469
0
            }
4470
4471
6.74k
            mPropName.insert(0, u"[\\p{", -1);
4472
6.74k
            mPropName.append(u"}]", -1);
4473
6.74k
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(mPropName, *fStatus), status);
4474
4475
6.74k
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4476
2.33k
                set->closeOver(USET_CASE_INSENSITIVE);
4477
2.33k
            }
4478
6.74k
            break;
4479
4480
6.76k
        }
4481
4482
1.92k
        if (propName.startsWith(u"java", -1)) {
4483
0
            status = U_ZERO_ERROR;
4484
0
            set.adoptInsteadAndCheckErrorCode(new UnicodeSet(), status);
4485
0
            if (U_FAILURE(status)) {
4486
0
                break;
4487
0
            }
4488
            //
4489
            //  Try the various Java specific properties.
4490
            //   These all begin with "java"
4491
            //
4492
0
            if (propName.compare(u"javaDefined", -1) == 0) {
4493
0
                addCategory(set.getAlias(), U_GC_CN_MASK, status);
4494
0
                set->complement();
4495
0
            }
4496
0
            else if (propName.compare(u"javaDigit", -1) == 0) {
4497
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4498
0
            }
4499
0
            else if (propName.compare(u"javaIdentifierIgnorable", -1) == 0) {
4500
0
                addIdentifierIgnorable(set.getAlias(), status);
4501
0
            }
4502
0
            else if (propName.compare(u"javaISOControl", -1) == 0) {
4503
0
                set->add(0, 0x1F).add(0x7F, 0x9F);
4504
0
            }
4505
0
            else if (propName.compare(u"javaJavaIdentifierPart", -1) == 0) {
4506
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4507
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4508
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4509
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4510
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4511
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4512
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4513
0
                addIdentifierIgnorable(set.getAlias(), status);
4514
0
            }
4515
0
            else if (propName.compare(u"javaJavaIdentifierStart", -1) == 0) {
4516
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4517
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4518
0
                addCategory(set.getAlias(), U_GC_SC_MASK, status);
4519
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4520
0
            }
4521
0
            else if (propName.compare(u"javaLetter", -1) == 0) {
4522
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4523
0
            }
4524
0
            else if (propName.compare(u"javaLetterOrDigit", -1) == 0) {
4525
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4526
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4527
0
            }
4528
0
            else if (propName.compare(u"javaLowerCase", -1) == 0) {
4529
0
                addCategory(set.getAlias(), U_GC_LL_MASK, status);
4530
0
            }
4531
0
            else if (propName.compare(u"javaMirrored", -1) == 0) {
4532
0
                set->applyIntPropertyValue(UCHAR_BIDI_MIRRORED, 1, status);
4533
0
            }
4534
0
            else if (propName.compare(u"javaSpaceChar", -1) == 0) {
4535
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4536
0
            }
4537
0
            else if (propName.compare(u"javaSupplementaryCodePoint", -1) == 0) {
4538
0
                set->add(0x10000, UnicodeSet::MAX_VALUE);
4539
0
            }
4540
0
            else if (propName.compare(u"javaTitleCase", -1) == 0) {
4541
0
                addCategory(set.getAlias(), U_GC_LT_MASK, status);
4542
0
            }
4543
0
            else if (propName.compare(u"javaUnicodeIdentifierStart", -1) == 0) {
4544
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4545
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4546
0
            }
4547
0
            else if (propName.compare(u"javaUnicodeIdentifierPart", -1) == 0) {
4548
0
                addCategory(set.getAlias(), U_GC_L_MASK, status);
4549
0
                addCategory(set.getAlias(), U_GC_PC_MASK, status);
4550
0
                addCategory(set.getAlias(), U_GC_ND_MASK, status);
4551
0
                addCategory(set.getAlias(), U_GC_NL_MASK, status);
4552
0
                addCategory(set.getAlias(), U_GC_MC_MASK, status);
4553
0
                addCategory(set.getAlias(), U_GC_MN_MASK, status);
4554
0
                addIdentifierIgnorable(set.getAlias(), status);
4555
0
            }
4556
0
            else if (propName.compare(u"javaUpperCase", -1) == 0) {
4557
0
                addCategory(set.getAlias(), U_GC_LU_MASK, status);
4558
0
            }
4559
0
            else if (propName.compare(u"javaValidCodePoint", -1) == 0) {
4560
0
                set->add(0, UnicodeSet::MAX_VALUE);
4561
0
            }
4562
0
            else if (propName.compare(u"javaWhitespace", -1) == 0) {
4563
0
                addCategory(set.getAlias(), U_GC_Z_MASK, status);
4564
0
                set->removeAll(UnicodeSet().add(0xa0).add(0x2007).add(0x202f));
4565
0
                set->add(9, 0x0d).add(0x1c, 0x1f);
4566
0
            } else {
4567
0
                status = U_REGEX_PROPERTY_SYNTAX;
4568
0
            }
4569
4570
0
            if (U_SUCCESS(status) && !set->isEmpty() && (usetFlags & USET_CASE_INSENSITIVE)) {
4571
0
                set->closeOver(USET_CASE_INSENSITIVE);
4572
0
            }
4573
0
            break;
4574
0
        }
4575
4576
        // Unrecognized property. ICU didn't like it as it was, and none of the Java compatibility
4577
        // extensions matched it.
4578
1.92k
        status = U_REGEX_PROPERTY_SYNTAX;
4579
1.92k
    } while (false);   // End of do loop block. Code above breaks out of the block on success or hard failure.
4580
4581
80.4k
    if (U_SUCCESS(status)) {
4582
        // ICU 70 adds emoji properties of strings, but as long as Java does not say how to
4583
        // deal with properties of strings and character classes with strings, we ignore them.
4584
        // Just in case something downstream might stumble over the strings,
4585
        // we remove them from the set.
4586
        // Note that when we support strings, the complement of a property (as with \P)
4587
        // should be implemented as .complement().removeAllStrings() (code point complement).
4588
78.4k
        set->removeAllStrings();
4589
78.4k
        U_ASSERT(set.isValid());
4590
78.4k
        if (negated) {
4591
2.72k
            set->complement();
4592
2.72k
        }
4593
78.4k
        return set.orphan();
4594
78.4k
    } else {
4595
2.00k
        if (status == U_ILLEGAL_ARGUMENT_ERROR) {
4596
49
            status = U_REGEX_PROPERTY_SYNTAX;
4597
49
        }
4598
2.00k
        error(status);
4599
2.00k
        return nullptr;
4600
2.00k
    }
4601
80.4k
}
4602
4603
4604
//
4605
//  SetEval   Part of the evaluation of [set expressions].
4606
//            Perform any pending (stacked) operations with precedence
4607
//            equal or greater to that of the next operator encountered
4608
//            in the expression.
4609
//
4610
14.1M
void RegexCompile::setEval(int32_t nextOp) {
4611
14.1M
    UnicodeSet *rightOperand = nullptr;
4612
14.1M
    UnicodeSet *leftOperand  = nullptr;
4613
14.7M
    for (;;) {
4614
14.7M
        U_ASSERT(fSetOpStack.empty()==false);
4615
14.7M
        int32_t pendingSetOperation = fSetOpStack.peeki();
4616
14.7M
        if ((pendingSetOperation&0xffff0000) < (nextOp&0xffff0000)) {
4617
14.1M
            break;
4618
14.1M
        }
4619
545k
        fSetOpStack.popi();
4620
545k
        U_ASSERT(fSetStack.empty() == false);
4621
545k
        rightOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4622
        // ICU 70 adds emoji properties of strings, but createSetForProperty() removes all strings
4623
        // (see comments there).
4624
        // We also do not yet support string literals in character classes,
4625
        // so there should not be any strings.
4626
        // Note that when we support strings, the complement of a set (as with ^ or \P)
4627
        // should be implemented as .complement().removeAllStrings() (code point complement).
4628
545k
        U_ASSERT(!rightOperand->hasStrings());
4629
545k
        switch (pendingSetOperation) {
4630
11.6k
            case setNegation:
4631
11.6k
                rightOperand->complement();
4632
11.6k
                break;
4633
444k
            case setCaseClose:
4634
                // TODO: need a simple close function.  Ticket 6065
4635
444k
                rightOperand->closeOver(USET_CASE_INSENSITIVE);
4636
444k
                rightOperand->removeAllStrings();
4637
444k
                break;
4638
1.64k
            case setDifference1:
4639
2.06k
            case setDifference2:
4640
2.06k
                fSetStack.pop();
4641
2.06k
                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4642
2.06k
                leftOperand->removeAll(*rightOperand);
4643
2.06k
                delete rightOperand;
4644
2.06k
                break;
4645
1.12k
            case setIntersection1:
4646
8.06k
            case setIntersection2:
4647
8.06k
                fSetStack.pop();
4648
8.06k
                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4649
8.06k
                leftOperand->retainAll(*rightOperand);
4650
8.06k
                delete rightOperand;
4651
8.06k
                break;
4652
78.9k
            case setUnion:
4653
78.9k
                fSetStack.pop();
4654
78.9k
                leftOperand = static_cast<UnicodeSet*>(fSetStack.peek());
4655
78.9k
                leftOperand->addAll(*rightOperand);
4656
78.9k
                delete rightOperand;
4657
78.9k
                break;
4658
0
            default:
4659
0
                UPRV_UNREACHABLE_EXIT;
4660
545k
            }
4661
545k
        }
4662
14.1M
    }
4663
4664
99.4k
void RegexCompile::setPushOp(int32_t op) {
4665
99.4k
    setEval(op);
4666
99.4k
    fSetOpStack.push(op, *fStatus);
4667
99.4k
    LocalPointer<UnicodeSet> lpSet(new UnicodeSet(), *fStatus);
4668
99.4k
    fSetStack.push(lpSet.orphan(), *fStatus);
4669
99.4k
}
4670
4671
U_NAMESPACE_END
4672
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
4673