Coverage Report

Created: 2025-11-07 06:50

next uncovered line (L), next uncovered region (R), next uncovered branch (B)
/src/icu/icu4c/source/i18n/rematch.cpp
Line
Count
Source
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**************************************************************************
5
*   Copyright (C) 2002-2016 International Business Machines Corporation
6
*   and others. All rights reserved.
7
**************************************************************************
8
*/
9
//
10
//  file:  rematch.cpp
11
//
12
//         Contains the implementation of class RegexMatcher,
13
//         which is one of the main API classes for the ICU regular expression package.
14
//
15
16
#include "unicode/utypes.h"
17
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19
#include "unicode/regex.h"
20
#include "unicode/uniset.h"
21
#include "unicode/uchar.h"
22
#include "unicode/ustring.h"
23
#include "unicode/rbbi.h"
24
#include "unicode/utf.h"
25
#include "unicode/utf16.h"
26
#include "uassert.h"
27
#include "cmemory.h"
28
#include "cstr.h"
29
#include "uvector.h"
30
#include "uvectr32.h"
31
#include "uvectr64.h"
32
#include "regeximp.h"
33
#include "regexst.h"
34
#include "regextxt.h"
35
#include "ucase.h"
36
37
// #include <malloc.h>        // Needed for heapcheck testing
38
39
40
U_NAMESPACE_BEGIN
41
42
// Default limit for the size of the back track stack, to avoid system
43
//    failures causedby heap exhaustion.  Units are in 32 bit words, not bytes.
44
// This value puts ICU's limits higher than most other regexp implementations,
45
//    which use recursion rather than the heap, and take more storage per
46
//    backtrack point.
47
//
48
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50
// Time limit counter constant.
51
//   Time limits for expression evaluation are in terms of quanta of work by
52
//   the engine, each of which is 10,000 state saves.
53
//   This constant determines that state saves per tick number.
54
static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57
// Test for any of the Unicode line terminating characters.
58
649M
static inline UBool isLineTerminator(UChar32 c) {
59
649M
    if (c & ~(0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029)) {
60
521M
        return false;
61
521M
    }
62
127M
    return (c<=0x0d && c>=0x0a) || c==0x85 || c==0x2028 || c==0x2029;
63
649M
}
64
65
//-----------------------------------------------------------------------------
66
//
67
//   Constructor and Destructor
68
//
69
//-----------------------------------------------------------------------------
70
14.5k
RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
71
14.5k
    fDeferredStatus = U_ZERO_ERROR;
72
14.5k
    init(fDeferredStatus);
73
14.5k
    if (U_FAILURE(fDeferredStatus)) {
74
0
        return;
75
0
    }
76
14.5k
    if (pat==nullptr) {
77
0
        fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
78
0
        return;
79
0
    }
80
14.5k
    fPattern = pat;
81
14.5k
    init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
82
14.5k
}
83
84
85
86
RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
87
0
                           uint32_t flags, UErrorCode &status) {
88
0
    init(status);
89
0
    if (U_FAILURE(status)) {
90
0
        return;
91
0
    }
92
0
    UParseError    pe;
93
0
    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
94
0
    fPattern           = fPatternOwned;
95
96
0
    UText inputText = UTEXT_INITIALIZER;
97
0
    utext_openConstUnicodeString(&inputText, &input, &status);
98
0
    init2(&inputText, status);
99
0
    utext_close(&inputText);
100
101
0
    fInputUniStrMaybeMutable = true;
102
0
}
103
104
105
RegexMatcher::RegexMatcher(UText *regexp, UText *input,
106
0
                           uint32_t flags, UErrorCode &status) {
107
0
    init(status);
108
0
    if (U_FAILURE(status)) {
109
0
        return;
110
0
    }
111
0
    UParseError    pe;
112
0
    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
113
0
    if (U_FAILURE(status)) {
114
0
        return;
115
0
    }
116
117
0
    fPattern           = fPatternOwned;
118
0
    init2(input, status);
119
0
}
120
121
122
RegexMatcher::RegexMatcher(const UnicodeString &regexp,
123
0
                           uint32_t flags, UErrorCode &status) {
124
0
    init(status);
125
0
    if (U_FAILURE(status)) {
126
0
        return;
127
0
    }
128
0
    UParseError    pe;
129
0
    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
130
0
    if (U_FAILURE(status)) {
131
0
        return;
132
0
    }
133
0
    fPattern           = fPatternOwned;
134
0
    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
135
0
}
136
137
RegexMatcher::RegexMatcher(UText *regexp,
138
0
                           uint32_t flags, UErrorCode &status) {
139
0
    init(status);
140
0
    if (U_FAILURE(status)) {
141
0
        return;
142
0
    }
143
0
    UParseError    pe;
144
0
    fPatternOwned      = RegexPattern::compile(regexp, flags, pe, status);
145
0
        if (U_FAILURE(status)) {
146
0
        return;
147
0
    }
148
149
0
    fPattern           = fPatternOwned;
150
0
    init2(RegexStaticSets::gStaticSets->fEmptyText, status);
151
0
}
152
153
154
155
156
14.5k
RegexMatcher::~RegexMatcher() {
157
14.5k
    delete fStack;
158
14.5k
    if (fData != fSmallData) {
159
458
        uprv_free(fData);
160
458
        fData = nullptr;
161
458
    }
162
14.5k
    if (fPatternOwned) {
163
0
        delete fPatternOwned;
164
0
        fPatternOwned = nullptr;
165
0
        fPattern = nullptr;
166
0
    }
167
168
14.5k
    delete fInput;
169
14.5k
    if (fInputText) {
170
14.5k
        utext_close(fInputText);
171
14.5k
    }
172
14.5k
    if (fAltInputText) {
173
386
        utext_close(fAltInputText);
174
386
    }
175
176
14.5k
    #if UCONFIG_NO_BREAK_ITERATION==0
177
14.5k
    delete fWordBreakItr;
178
14.5k
    delete fGCBreakItr;
179
14.5k
    #endif
180
14.5k
}
181
182
//
183
//   init()   common initialization for use by all constructors.
184
//            Initialize all fields, get the object into a consistent state.
185
//            This must be done even when the initial status shows an error,
186
//            so that the object is initialized sufficiently well for the destructor
187
//            to run safely.
188
//
189
14.5k
void RegexMatcher::init(UErrorCode &status) {
190
14.5k
    fPattern           = nullptr;
191
14.5k
    fPatternOwned      = nullptr;
192
14.5k
    fFrameSize         = 0;
193
14.5k
    fRegionStart       = 0;
194
14.5k
    fRegionLimit       = 0;
195
14.5k
    fAnchorStart       = 0;
196
14.5k
    fAnchorLimit       = 0;
197
14.5k
    fLookStart         = 0;
198
14.5k
    fLookLimit         = 0;
199
14.5k
    fActiveStart       = 0;
200
14.5k
    fActiveLimit       = 0;
201
14.5k
    fTransparentBounds = false;
202
14.5k
    fAnchoringBounds   = true;
203
14.5k
    fMatch             = false;
204
14.5k
    fMatchStart        = 0;
205
14.5k
    fMatchEnd          = 0;
206
14.5k
    fLastMatchEnd      = -1;
207
14.5k
    fAppendPosition    = 0;
208
14.5k
    fHitEnd            = false;
209
14.5k
    fRequireEnd        = false;
210
14.5k
    fStack             = nullptr;
211
14.5k
    fFrame             = nullptr;
212
14.5k
    fTimeLimit         = 0;
213
14.5k
    fTime              = 0;
214
14.5k
    fTickCounter       = 0;
215
14.5k
    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
216
14.5k
    fCallbackFn        = nullptr;
217
14.5k
    fCallbackContext   = nullptr;
218
14.5k
    fFindProgressCallbackFn      = nullptr;
219
14.5k
    fFindProgressCallbackContext = nullptr;
220
14.5k
    fTraceDebug        = false;
221
14.5k
    fDeferredStatus    = status;
222
14.5k
    fData              = fSmallData;
223
14.5k
    fWordBreakItr      = nullptr;
224
14.5k
    fGCBreakItr        = nullptr;
225
226
14.5k
    fStack             = nullptr;
227
14.5k
    fInputText         = nullptr;
228
14.5k
    fAltInputText      = nullptr;
229
14.5k
    fInput             = nullptr;
230
14.5k
    fInputLength       = 0;
231
14.5k
    fInputUniStrMaybeMutable = false;
232
14.5k
}
233
234
//
235
//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
236
//            This handles the common setup to be done after the Pattern is available.
237
//
238
14.5k
void RegexMatcher::init2(UText *input, UErrorCode &status) {
239
14.5k
    if (U_FAILURE(status)) {
240
0
        fDeferredStatus = status;
241
0
        return;
242
0
    }
243
244
14.5k
    if (fPattern->fDataSize > UPRV_LENGTHOF(fSmallData)) {
245
458
        fData = static_cast<int64_t*>(uprv_malloc(fPattern->fDataSize * sizeof(int64_t)));
246
458
        if (fData == nullptr) {
247
0
            status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
248
0
            return;
249
0
        }
250
458
    }
251
252
14.5k
    fStack = new UVector64(status);
253
14.5k
    if (fStack == nullptr) {
254
0
        status = fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
255
0
        return;
256
0
    }
257
258
14.5k
    reset(input);
259
14.5k
    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
260
14.5k
    if (U_FAILURE(status)) {
261
0
        fDeferredStatus = status;
262
0
        return;
263
0
    }
264
14.5k
}
265
266
267
static const char16_t BACKSLASH  = 0x5c;
268
static const char16_t DOLLARSIGN = 0x24;
269
static const char16_t LEFTBRACKET = 0x7b;
270
static const char16_t RIGHTBRACKET = 0x7d;
271
272
//--------------------------------------------------------------------------------
273
//
274
//    appendReplacement
275
//
276
//--------------------------------------------------------------------------------
277
RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
278
                                              const UnicodeString &replacement,
279
0
                                              UErrorCode &status) {
280
0
    UText replacementText = UTEXT_INITIALIZER;
281
282
0
    utext_openConstUnicodeString(&replacementText, &replacement, &status);
283
0
    if (U_SUCCESS(status)) {
284
0
        UText resultText = UTEXT_INITIALIZER;
285
0
        utext_openUnicodeString(&resultText, &dest, &status);
286
287
0
        if (U_SUCCESS(status)) {
288
0
            appendReplacement(&resultText, &replacementText, status);
289
0
            utext_close(&resultText);
290
0
        }
291
0
        utext_close(&replacementText);
292
0
    }
293
294
0
    return *this;
295
0
}
296
297
//
298
//    appendReplacement, UText mode
299
//
300
RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
301
                                              UText *replacement,
302
0
                                              UErrorCode &status) {
303
0
    if (U_FAILURE(status)) {
304
0
        return *this;
305
0
    }
306
0
    if (U_FAILURE(fDeferredStatus)) {
307
0
        status = fDeferredStatus;
308
0
        return *this;
309
0
    }
310
0
    if (fMatch == false) {
311
0
        status = U_REGEX_INVALID_STATE;
312
0
        return *this;
313
0
    }
314
315
    // Copy input string from the end of previous match to start of current match
316
0
    int64_t  destLen = utext_nativeLength(dest);
317
0
    if (fMatchStart > fAppendPosition) {
318
0
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
319
0
            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
320
0
                                     static_cast<int32_t>(fMatchStart - fAppendPosition), &status);
321
0
        } else {
322
0
            int32_t len16;
323
0
            if (UTEXT_USES_U16(fInputText)) {
324
0
                len16 = static_cast<int32_t>(fMatchStart - fAppendPosition);
325
0
            } else {
326
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
327
0
                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, nullptr, 0, &lengthStatus);
328
0
            }
329
0
            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1)));
330
0
            if (inputChars == nullptr) {
331
0
                status = U_MEMORY_ALLOCATION_ERROR;
332
0
                return *this;
333
0
            }
334
0
            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
335
0
            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
336
0
            uprv_free(inputChars);
337
0
        }
338
0
    }
339
0
    fAppendPosition = fMatchEnd;
340
341
342
    // scan the replacement text, looking for substitutions ($n) and \escapes.
343
    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
344
    //         move entire ranges not containing substitutions.
345
0
    UTEXT_SETNATIVEINDEX(replacement, 0);
346
0
    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
347
0
        if (c == BACKSLASH) {
348
            // Backslash Escape.  Copy the following char out without further checks.
349
            //                    Note:  Surrogate pairs don't need any special handling
350
            //                           The second half wont be a '$' or a '\', and
351
            //                           will move to the dest normally on the next
352
            //                           loop iteration.
353
0
            c = UTEXT_CURRENT32(replacement);
354
0
            if (c == U_SENTINEL) {
355
0
                break;
356
0
            }
357
358
0
            if (c==0x55/*U*/ || c==0x75/*u*/) {
359
                // We have a \udddd or \Udddddddd escape sequence.
360
0
                int32_t offset = 0;
361
0
                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
362
0
                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
363
0
                if (escapedChar != static_cast<UChar32>(0xFFFFFFFF)) {
364
0
                    if (U_IS_BMP(escapedChar)) {
365
0
                        char16_t c16 = static_cast<char16_t>(escapedChar);
366
0
                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
367
0
                    } else {
368
0
                        char16_t surrogate[2];
369
0
                        surrogate[0] = U16_LEAD(escapedChar);
370
0
                        surrogate[1] = U16_TRAIL(escapedChar);
371
0
                        if (U_SUCCESS(status)) {
372
0
                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
373
0
                        }
374
0
                    }
375
                    // TODO:  Report errors for mal-formed \u escapes?
376
                    //        As this is, the original sequence is output, which may be OK.
377
0
                    if (context.lastOffset == offset) {
378
0
                        (void)UTEXT_PREVIOUS32(replacement);
379
0
                    } else if (context.lastOffset != offset-1) {
380
0
                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
381
0
                    }
382
0
                }
383
0
            } else {
384
0
                (void)UTEXT_NEXT32(replacement);
385
                // Plain backslash escape.  Just put out the escaped character.
386
0
                if (U_IS_BMP(c)) {
387
0
                    char16_t c16 = static_cast<char16_t>(c);
388
0
                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
389
0
                } else {
390
0
                    char16_t surrogate[2];
391
0
                    surrogate[0] = U16_LEAD(c);
392
0
                    surrogate[1] = U16_TRAIL(c);
393
0
                    if (U_SUCCESS(status)) {
394
0
                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
395
0
                    }
396
0
                }
397
0
            }
398
0
        } else if (c != DOLLARSIGN) {
399
            // Normal char, not a $.  Copy it out without further checks.
400
0
            if (U_IS_BMP(c)) {
401
0
                char16_t c16 = static_cast<char16_t>(c);
402
0
                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
403
0
            } else {
404
0
                char16_t surrogate[2];
405
0
                surrogate[0] = U16_LEAD(c);
406
0
                surrogate[1] = U16_TRAIL(c);
407
0
                if (U_SUCCESS(status)) {
408
0
                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
409
0
                }
410
0
            }
411
0
        } else {
412
            // We've got a $.  Pick up a capture group name or number if one follows.
413
            // Consume digits so long as the resulting group number <= the number of
414
            // number of capture groups in the pattern.
415
416
0
            int32_t groupNum  = 0;
417
0
            int32_t numDigits = 0;
418
0
            UChar32 nextChar = utext_current32(replacement);
419
0
            if (nextChar == LEFTBRACKET) {
420
                // Scan for a Named Capture Group, ${name}.
421
0
                UnicodeString groupName;
422
0
                utext_next32(replacement);
423
0
                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
424
0
                    nextChar = utext_next32(replacement);
425
0
                    if (nextChar == U_SENTINEL) {
426
0
                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
427
0
                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
428
0
                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
429
0
                               (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
430
0
                        groupName.append(nextChar);
431
0
                    } else if (nextChar == RIGHTBRACKET) {
432
0
                        groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
433
0
                        if (groupNum == 0) {
434
0
                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
435
0
                        }
436
0
                    } else {
437
                        // Character was something other than a name char or a closing '}'
438
0
                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
439
0
                    }
440
0
                }
441
442
0
            } else if (u_isdigit(nextChar)) {
443
                // $n    Scan for a capture group number
444
0
                int32_t numCaptureGroups = fPattern->fGroupMap->size();
445
0
                for (;;) {
446
0
                    nextChar = UTEXT_CURRENT32(replacement);
447
0
                    if (nextChar == U_SENTINEL) {
448
0
                        break;
449
0
                    }
450
0
                    if (u_isdigit(nextChar) == false) {
451
0
                        break;
452
0
                    }
453
0
                    int32_t nextDigitVal = u_charDigitValue(nextChar);
454
0
                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
455
                        // Don't consume the next digit if it makes the capture group number too big.
456
0
                        if (numDigits == 0) {
457
0
                            status = U_INDEX_OUTOFBOUNDS_ERROR;
458
0
                        }
459
0
                        break;
460
0
                    }
461
0
                    (void)UTEXT_NEXT32(replacement);
462
0
                    groupNum=groupNum*10 + nextDigitVal;
463
0
                    ++numDigits;
464
0
                }
465
0
            } else {
466
                // $ not followed by capture group name or number.
467
0
                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
468
0
            }
469
470
0
            if (U_SUCCESS(status)) {
471
0
                destLen += appendGroup(groupNum, dest, status);
472
0
            }
473
0
        }  // End of $ capture group handling
474
0
    }  // End of per-character loop through the replacement string.
475
476
0
    return *this;
477
0
}
478
479
480
481
//--------------------------------------------------------------------------------
482
//
483
//    appendTail     Intended to be used in conjunction with appendReplacement()
484
//                   To the destination string, append everything following
485
//                   the last match position from the input string.
486
//
487
//                   Note:  Match ranges do not affect appendTail or appendReplacement
488
//
489
//--------------------------------------------------------------------------------
490
0
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
491
0
    UErrorCode status = U_ZERO_ERROR;
492
0
    UText resultText = UTEXT_INITIALIZER;
493
0
    utext_openUnicodeString(&resultText, &dest, &status);
494
495
0
    if (U_SUCCESS(status)) {
496
0
        appendTail(&resultText, status);
497
0
        utext_close(&resultText);
498
0
    }
499
500
0
    return dest;
501
0
}
502
503
//
504
//   appendTail, UText mode
505
//
506
0
UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
507
0
    if (U_FAILURE(status)) {
508
0
        return dest;
509
0
    }
510
0
    if (U_FAILURE(fDeferredStatus)) {
511
0
        status = fDeferredStatus;
512
0
        return dest;
513
0
    }
514
515
0
    if (fInputLength > fAppendPosition) {
516
0
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
517
0
            int64_t destLen = utext_nativeLength(dest);
518
0
            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
519
0
                          static_cast<int32_t>(fInputLength - fAppendPosition), &status);
520
0
        } else {
521
0
            int32_t len16;
522
0
            if (UTEXT_USES_U16(fInputText)) {
523
0
                len16 = static_cast<int32_t>(fInputLength - fAppendPosition);
524
0
            } else {
525
0
                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, nullptr, 0, &status);
526
0
                status = U_ZERO_ERROR; // buffer overflow
527
0
            }
528
529
0
            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16)));
530
0
            if (inputChars == nullptr) {
531
0
                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
532
0
            } else {
533
0
                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
534
0
                int64_t destLen = utext_nativeLength(dest);
535
0
                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
536
0
                uprv_free(inputChars);
537
0
            }
538
0
        }
539
0
    }
540
0
    return dest;
541
0
}
542
543
544
545
//--------------------------------------------------------------------------------
546
//
547
//   end
548
//
549
//--------------------------------------------------------------------------------
550
0
int32_t RegexMatcher::end(UErrorCode &err) const {
551
0
    return end(0, err);
552
0
}
553
554
0
int64_t RegexMatcher::end64(UErrorCode &err) const {
555
0
    return end64(0, err);
556
0
}
557
558
0
int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
559
0
    if (U_FAILURE(err)) {
560
0
        return -1;
561
0
    }
562
0
    if (fMatch == false) {
563
0
        err = U_REGEX_INVALID_STATE;
564
0
        return -1;
565
0
    }
566
0
    if (group < 0 || group > fPattern->fGroupMap->size()) {
567
0
        err = U_INDEX_OUTOFBOUNDS_ERROR;
568
0
        return -1;
569
0
    }
570
0
    int64_t e = -1;
571
0
    if (group == 0) {
572
0
        e = fMatchEnd;
573
0
    } else {
574
        // Get the position within the stack frame of the variables for
575
        //    this capture group.
576
0
        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
577
0
        U_ASSERT(groupOffset < fPattern->fFrameSize);
578
0
        U_ASSERT(groupOffset >= 0);
579
0
        e = fFrame->fExtra[groupOffset + 1];
580
0
    }
581
582
0
        return e;
583
0
}
584
585
0
int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
586
0
    return static_cast<int32_t>(end64(group, err));
587
0
}
588
589
//--------------------------------------------------------------------------------
590
//
591
//   findProgressInterrupt  This function is called once for each advance in the target
592
//                          string from the find() function, and calls the user progress callback
593
//                          function if there is one installed.
594
//
595
//         Return:  true if the find operation is to be terminated.
596
//                  false if the find operation is to continue running.
597
//
598
//--------------------------------------------------------------------------------
599
46.4M
UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
600
46.4M
    if (fFindProgressCallbackFn && !(*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
601
0
        status = U_REGEX_STOPPED_BY_CALLER;
602
0
        return true;
603
0
    }
604
46.4M
    return false;
605
46.4M
}
606
607
//--------------------------------------------------------------------------------
608
//
609
//   find()
610
//
611
//--------------------------------------------------------------------------------
612
0
UBool RegexMatcher::find() {
613
0
    if (U_FAILURE(fDeferredStatus)) {
614
0
        return false;
615
0
    }
616
0
    UErrorCode status = U_ZERO_ERROR;
617
0
    UBool result = find(status);
618
0
    return result;
619
0
}
620
621
//--------------------------------------------------------------------------------
622
//
623
//   find()
624
//
625
//--------------------------------------------------------------------------------
626
10.2k
UBool RegexMatcher::find(UErrorCode &status) {
627
    // Start at the position of the last match end.  (Will be zero if the
628
    //   matcher has been reset.)
629
    //
630
10.2k
    if (U_FAILURE(status)) {
631
0
        return false;
632
0
    }
633
10.2k
    if (U_FAILURE(fDeferredStatus)) {
634
0
        status = fDeferredStatus;
635
0
        return false;
636
0
    }
637
638
10.2k
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
639
10.2k
        return findUsingChunk(status);
640
10.2k
    }
641
642
0
    int64_t startPos = fMatchEnd;
643
0
    if (startPos==0) {
644
0
        startPos = fActiveStart;
645
0
    }
646
647
0
    if (fMatch) {
648
        // Save the position of any previous successful match.
649
0
        fLastMatchEnd = fMatchEnd;
650
651
0
        if (fMatchStart == fMatchEnd) {
652
            // Previous match had zero length.  Move start position up one position
653
            //  to avoid sending find() into a loop on zero-length matches.
654
0
            if (startPos >= fActiveLimit) {
655
0
                fMatch = false;
656
0
                fHitEnd = true;
657
0
                return false;
658
0
            }
659
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
660
0
            (void)UTEXT_NEXT32(fInputText);
661
0
            startPos = UTEXT_GETNATIVEINDEX(fInputText);
662
0
        }
663
0
    } else {
664
0
        if (fLastMatchEnd >= 0) {
665
            // A previous find() failed to match.  Don't try again.
666
            //   (without this test, a pattern with a zero-length match
667
            //    could match again at the end of an input string.)
668
0
            fHitEnd = true;
669
0
            return false;
670
0
        }
671
0
    }
672
673
674
    // Compute the position in the input string beyond which a match can not begin, because
675
    //   the minimum length match would extend past the end of the input.
676
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
677
    //          Be aware of possible overflows if making changes here.
678
0
    int64_t testStartLimit;
679
0
    if (UTEXT_USES_U16(fInputText)) {
680
0
        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
681
0
        if (startPos > testStartLimit) {
682
0
            fMatch = false;
683
0
            fHitEnd = true;
684
0
            return false;
685
0
        }
686
0
    } else {
687
        // We don't know exactly how long the minimum match length is in native characters.
688
        // Treat anything > 0 as 1.
689
0
        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
690
0
    }
691
692
0
    UChar32  c;
693
0
    U_ASSERT(startPos >= 0);
694
695
0
    switch (fPattern->fStartType) {
696
0
    case START_NO_INFO:
697
        // No optimization was found.
698
        //  Try a match at each input position.
699
0
        for (;;) {
700
0
            MatchAt(startPos, false, status);
701
0
            if (U_FAILURE(status)) {
702
0
                return false;
703
0
            }
704
0
            if (fMatch) {
705
0
                return true;
706
0
            }
707
0
            if (startPos >= testStartLimit) {
708
0
                fHitEnd = true;
709
0
                return false;
710
0
            }
711
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
712
0
            (void)UTEXT_NEXT32(fInputText);
713
0
            startPos = UTEXT_GETNATIVEINDEX(fInputText);
714
            // Note that it's perfectly OK for a pattern to have a zero-length
715
            //   match at the end of a string, so we must make sure that the loop
716
            //   runs with startPos == testStartLimit the last time through.
717
0
            if  (findProgressInterrupt(startPos, status))
718
0
                return false;
719
0
        }
720
0
        UPRV_UNREACHABLE_EXIT;
721
722
0
    case START_START:
723
        // Matches are only possible at the start of the input string
724
        //   (pattern begins with ^ or \A)
725
0
        if (startPos > fActiveStart) {
726
0
            fMatch = false;
727
0
            return false;
728
0
        }
729
0
        MatchAt(startPos, false, status);
730
0
        if (U_FAILURE(status)) {
731
0
            return false;
732
0
        }
733
0
        return fMatch;
734
735
736
0
    case START_SET:
737
0
        {
738
            // Match may start on any char from a pre-computed set.
739
0
            U_ASSERT(fPattern->fMinMatchLen > 0);
740
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
741
0
            for (;;) {
742
0
                int64_t pos = startPos;
743
0
                c = UTEXT_NEXT32(fInputText);
744
0
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
745
                // c will be -1 (U_SENTINEL) at end of text, in which case we
746
                // skip this next block (so we don't have a negative array index)
747
                // and handle end of text in the following block.
748
0
                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
749
0
                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
750
0
                    MatchAt(pos, false, status);
751
0
                    if (U_FAILURE(status)) {
752
0
                        return false;
753
0
                    }
754
0
                    if (fMatch) {
755
0
                        return true;
756
0
                    }
757
0
                    UTEXT_SETNATIVEINDEX(fInputText, pos);
758
0
                }
759
0
                if (startPos > testStartLimit) {
760
0
                    fMatch = false;
761
0
                    fHitEnd = true;
762
0
                    return false;
763
0
                }
764
0
                if  (findProgressInterrupt(startPos, status))
765
0
                    return false;
766
0
            }
767
0
        }
768
0
        UPRV_UNREACHABLE_EXIT;
769
770
0
    case START_STRING:
771
0
    case START_CHAR:
772
0
        {
773
            // Match starts on exactly one char.
774
0
            U_ASSERT(fPattern->fMinMatchLen > 0);
775
0
            UChar32 theChar = fPattern->fInitialChar;
776
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
777
0
            for (;;) {
778
0
                int64_t pos = startPos;
779
0
                c = UTEXT_NEXT32(fInputText);
780
0
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
781
0
                if (c == theChar) {
782
0
                    MatchAt(pos, false, status);
783
0
                    if (U_FAILURE(status)) {
784
0
                        return false;
785
0
                    }
786
0
                    if (fMatch) {
787
0
                        return true;
788
0
                    }
789
0
                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
790
0
                }
791
0
                if (startPos > testStartLimit) {
792
0
                    fMatch = false;
793
0
                    fHitEnd = true;
794
0
                    return false;
795
0
                }
796
0
                if  (findProgressInterrupt(startPos, status))
797
0
                    return false;
798
0
           }
799
0
        }
800
0
        UPRV_UNREACHABLE_EXIT;
801
802
0
    case START_LINE:
803
0
        {
804
0
            UChar32 ch;
805
0
            if (startPos == fAnchorStart) {
806
0
                MatchAt(startPos, false, status);
807
0
                if (U_FAILURE(status)) {
808
0
                    return false;
809
0
                }
810
0
                if (fMatch) {
811
0
                    return true;
812
0
                }
813
0
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
814
0
                ch = UTEXT_NEXT32(fInputText);
815
0
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
816
0
            } else {
817
0
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
818
0
                ch = UTEXT_PREVIOUS32(fInputText);
819
0
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
820
0
            }
821
822
0
            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
823
0
                for (;;) {
824
0
                    if (ch == 0x0a) {
825
0
                            MatchAt(startPos, false, status);
826
0
                            if (U_FAILURE(status)) {
827
0
                                return false;
828
0
                            }
829
0
                            if (fMatch) {
830
0
                                return true;
831
0
                            }
832
0
                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
833
0
                    }
834
0
                    if (startPos >= testStartLimit) {
835
0
                        fMatch = false;
836
0
                        fHitEnd = true;
837
0
                        return false;
838
0
                    }
839
0
                    ch = UTEXT_NEXT32(fInputText);
840
0
                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
841
                    // Note that it's perfectly OK for a pattern to have a zero-length
842
                    //   match at the end of a string, so we must make sure that the loop
843
                    //   runs with startPos == testStartLimit the last time through.
844
0
                    if  (findProgressInterrupt(startPos, status))
845
0
                        return false;
846
0
                }
847
0
            } else {
848
0
                for (;;) {
849
0
                    if (isLineTerminator(ch)) {
850
0
                        if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
851
0
                            (void)UTEXT_NEXT32(fInputText);
852
0
                            startPos = UTEXT_GETNATIVEINDEX(fInputText);
853
0
                        }
854
0
                        MatchAt(startPos, false, status);
855
0
                        if (U_FAILURE(status)) {
856
0
                            return false;
857
0
                        }
858
0
                        if (fMatch) {
859
0
                            return true;
860
0
                        }
861
0
                        UTEXT_SETNATIVEINDEX(fInputText, startPos);
862
0
                    }
863
0
                    if (startPos >= testStartLimit) {
864
0
                        fMatch = false;
865
0
                        fHitEnd = true;
866
0
                        return false;
867
0
                    }
868
0
                    ch = UTEXT_NEXT32(fInputText);
869
0
                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
870
                    // Note that it's perfectly OK for a pattern to have a zero-length
871
                    //   match at the end of a string, so we must make sure that the loop
872
                    //   runs with startPos == testStartLimit the last time through.
873
0
                    if  (findProgressInterrupt(startPos, status))
874
0
                        return false;
875
0
                }
876
0
            }
877
0
        }
878
879
0
    default:
880
0
        UPRV_UNREACHABLE_ASSERT;
881
        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
882
        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
883
        // See ICU-21669.
884
0
        status = U_INTERNAL_PROGRAM_ERROR;
885
0
        return false;
886
0
    }
887
888
0
    UPRV_UNREACHABLE_EXIT;
889
0
}
890
891
892
893
10.2k
UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
894
10.2k
    if (U_FAILURE(status)) {
895
0
        return false;
896
0
    }
897
10.2k
    if (U_FAILURE(fDeferredStatus)) {
898
0
        status = fDeferredStatus;
899
0
        return false;
900
0
    }
901
10.2k
    this->reset();                        // Note:  Reset() is specified by Java Matcher documentation.
902
                                          //        This will reset the region to be the full input length.
903
10.2k
    if (start < 0) {
904
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
905
0
        return false;
906
0
    }
907
908
10.2k
    int64_t nativeStart = start;
909
10.2k
    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
910
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
911
0
        return false;
912
0
    }
913
10.2k
    fMatchEnd = nativeStart;
914
10.2k
    return find(status);
915
10.2k
}
916
917
918
//--------------------------------------------------------------------------------
919
//
920
//   findUsingChunk() -- like find(), but with the advance knowledge that the
921
//                       entire string is available in the UText's chunk buffer.
922
//
923
//--------------------------------------------------------------------------------
924
10.2k
UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
925
    // Start at the position of the last match end.  (Will be zero if the
926
    //   matcher has been reset.
927
    //
928
929
10.2k
    int32_t startPos = static_cast<int32_t>(fMatchEnd);
930
10.2k
    if (startPos==0) {
931
10.2k
        startPos = static_cast<int32_t>(fActiveStart);
932
10.2k
    }
933
934
10.2k
    const char16_t *inputBuf = fInputText->chunkContents;
935
936
10.2k
    if (fMatch) {
937
        // Save the position of any previous successful match.
938
0
        fLastMatchEnd = fMatchEnd;
939
940
0
        if (fMatchStart == fMatchEnd) {
941
            // Previous match had zero length.  Move start position up one position
942
            //  to avoid sending find() into a loop on zero-length matches.
943
0
            if (startPos >= fActiveLimit) {
944
0
                fMatch = false;
945
0
                fHitEnd = true;
946
0
                return false;
947
0
            }
948
0
            U16_FWD_1(inputBuf, startPos, fInputLength);
949
0
        }
950
10.2k
    } else {
951
10.2k
        if (fLastMatchEnd >= 0) {
952
            // A previous find() failed to match.  Don't try again.
953
            //   (without this test, a pattern with a zero-length match
954
            //    could match again at the end of an input string.)
955
0
            fHitEnd = true;
956
0
            return false;
957
0
        }
958
10.2k
    }
959
960
961
    // Compute the position in the input string beyond which a match can not begin, because
962
    //   the minimum length match would extend past the end of the input.
963
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
964
    //          Be aware of possible overflows if making changes here.
965
    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
966
10.2k
    int32_t testLen = static_cast<int32_t>(fActiveLimit - fPattern->fMinMatchLen);
967
10.2k
    if (startPos > testLen) {
968
159
        fMatch = false;
969
159
        fHitEnd = true;
970
159
        return false;
971
159
    }
972
973
10.0k
    UChar32  c;
974
10.0k
    U_ASSERT(startPos >= 0);
975
976
10.0k
    switch (fPattern->fStartType) {
977
5.33k
    case START_NO_INFO:
978
        // No optimization was found.
979
        //  Try a match at each input position.
980
18.5M
        for (;;) {
981
18.5M
            MatchChunkAt(startPos, false, status);
982
18.5M
            if (U_FAILURE(status)) {
983
402
                return false;
984
402
            }
985
18.5M
            if (fMatch) {
986
1.59k
                return true;
987
1.59k
            }
988
18.5M
            if (startPos >= testLen) {
989
3.33k
                fHitEnd = true;
990
3.33k
                return false;
991
3.33k
            }
992
18.5M
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
993
            // Note that it's perfectly OK for a pattern to have a zero-length
994
            //   match at the end of a string, so we must make sure that the loop
995
            //   runs with startPos == testLen the last time through.
996
18.5M
            if  (findProgressInterrupt(startPos, status))
997
0
                return false;
998
18.5M
        }
999
5.33k
        UPRV_UNREACHABLE_EXIT;
1000
1001
52
    case START_START:
1002
        // Matches are only possible at the start of the input string
1003
        //   (pattern begins with ^ or \A)
1004
52
        if (startPos > fActiveStart) {
1005
0
            fMatch = false;
1006
0
            return false;
1007
0
        }
1008
52
        MatchChunkAt(startPos, false, status);
1009
52
        if (U_FAILURE(status)) {
1010
20
            return false;
1011
20
        }
1012
32
        return fMatch;
1013
1014
1015
2.83k
    case START_SET:
1016
2.83k
    {
1017
        // Match may start on any char from a pre-computed set.
1018
2.83k
        U_ASSERT(fPattern->fMinMatchLen > 0);
1019
6.49M
        for (;;) {
1020
6.49M
            int32_t pos = startPos;
1021
6.49M
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1022
6.49M
            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
1023
5.46M
                (c>=256 && fPattern->fInitialChars->contains(c))) {
1024
1.78M
                MatchChunkAt(pos, false, status);
1025
1.78M
                if (U_FAILURE(status)) {
1026
43
                    return false;
1027
43
                }
1028
1.78M
                if (fMatch) {
1029
949
                    return true;
1030
949
                }
1031
1.78M
            }
1032
6.49M
            if (startPos > testLen) {
1033
1.83k
                fMatch = false;
1034
1.83k
                fHitEnd = true;
1035
1.83k
                return false;
1036
1.83k
            }
1037
6.49M
            if  (findProgressInterrupt(startPos, status))
1038
0
                return false;
1039
6.49M
        }
1040
2.83k
    }
1041
2.83k
    UPRV_UNREACHABLE_EXIT;
1042
1043
42
    case START_STRING:
1044
1.69k
    case START_CHAR:
1045
1.69k
    {
1046
        // Match starts on exactly one char.
1047
1.69k
        U_ASSERT(fPattern->fMinMatchLen > 0);
1048
1.69k
        UChar32 theChar = fPattern->fInitialChar;
1049
16.4M
        for (;;) {
1050
16.4M
            int32_t pos = startPos;
1051
16.4M
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
1052
16.4M
            if (c == theChar) {
1053
347k
                MatchChunkAt(pos, false, status);
1054
347k
                if (U_FAILURE(status)) {
1055
8
                    return false;
1056
8
                }
1057
347k
                if (fMatch) {
1058
107
                    return true;
1059
107
                }
1060
347k
            }
1061
16.4M
            if (startPos > testLen) {
1062
1.58k
                fMatch = false;
1063
1.58k
                fHitEnd = true;
1064
1.58k
                return false;
1065
1.58k
            }
1066
16.4M
            if  (findProgressInterrupt(startPos, status))
1067
0
                return false;
1068
16.4M
        }
1069
1.69k
    }
1070
1.69k
    UPRV_UNREACHABLE_EXIT;
1071
1072
155
    case START_LINE:
1073
155
    {
1074
155
        UChar32 ch;
1075
155
        if (startPos == fAnchorStart) {
1076
155
            MatchChunkAt(startPos, false, status);
1077
155
            if (U_FAILURE(status)) {
1078
3
                return false;
1079
3
            }
1080
152
            if (fMatch) {
1081
16
                return true;
1082
16
            }
1083
152
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
1084
136
        }
1085
1086
136
        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
1087
0
            for (;;) {
1088
0
                ch = inputBuf[startPos-1];
1089
0
                if (ch == 0x0a) {
1090
0
                    MatchChunkAt(startPos, false, status);
1091
0
                    if (U_FAILURE(status)) {
1092
0
                        return false;
1093
0
                    }
1094
0
                    if (fMatch) {
1095
0
                        return true;
1096
0
                    }
1097
0
                }
1098
0
                if (startPos >= testLen) {
1099
0
                    fMatch = false;
1100
0
                    fHitEnd = true;
1101
0
                    return false;
1102
0
                }
1103
0
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1104
                // Note that it's perfectly OK for a pattern to have a zero-length
1105
                //   match at the end of a string, so we must make sure that the loop
1106
                //   runs with startPos == testLen the last time through.
1107
0
                if  (findProgressInterrupt(startPos, status))
1108
0
                    return false;
1109
0
            }
1110
136
        } else {
1111
4.97M
            for (;;) {
1112
4.97M
                ch = inputBuf[startPos-1];
1113
4.97M
                if (isLineTerminator(ch)) {
1114
27.9k
                    if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
1115
1.83k
                        startPos++;
1116
1.83k
                    }
1117
27.9k
                    MatchChunkAt(startPos, false, status);
1118
27.9k
                    if (U_FAILURE(status)) {
1119
2
                        return false;
1120
2
                    }
1121
27.9k
                    if (fMatch) {
1122
7
                        return true;
1123
7
                    }
1124
27.9k
                }
1125
4.97M
                if (startPos >= testLen) {
1126
127
                    fMatch = false;
1127
127
                    fHitEnd = true;
1128
127
                    return false;
1129
127
                }
1130
4.97M
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
1131
                // Note that it's perfectly OK for a pattern to have a zero-length
1132
                //   match at the end of a string, so we must make sure that the loop
1133
                //   runs with startPos == testLen the last time through.
1134
4.97M
                if  (findProgressInterrupt(startPos, status))
1135
0
                    return false;
1136
4.97M
            }
1137
136
        }
1138
136
    }
1139
1140
0
    default:
1141
0
        UPRV_UNREACHABLE_ASSERT;
1142
        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
1143
        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
1144
        // See ICU-21669.
1145
0
        status = U_INTERNAL_PROGRAM_ERROR;
1146
0
        return false;
1147
10.0k
    }
1148
1149
10.0k
    UPRV_UNREACHABLE_EXIT;
1150
10.0k
}
1151
1152
1153
1154
//--------------------------------------------------------------------------------
1155
//
1156
//  group()
1157
//
1158
//--------------------------------------------------------------------------------
1159
0
UnicodeString RegexMatcher::group(UErrorCode &status) const {
1160
0
    return group(0, status);
1161
0
}
1162
1163
//  Return immutable shallow clone
1164
0
UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
1165
0
    return group(0, dest, group_len, status);
1166
0
}
1167
1168
//  Return immutable shallow clone
1169
0
UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
1170
0
    group_len = 0;
1171
0
    if (U_FAILURE(status)) {
1172
0
        return dest;
1173
0
    }
1174
0
    if (U_FAILURE(fDeferredStatus)) {
1175
0
        status = fDeferredStatus;
1176
0
    } else if (fMatch == false) {
1177
0
        status = U_REGEX_INVALID_STATE;
1178
0
    } else if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1179
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1180
0
    }
1181
1182
0
    if (U_FAILURE(status)) {
1183
0
        return dest;
1184
0
    }
1185
1186
0
    int64_t s, e;
1187
0
    if (groupNum == 0) {
1188
0
        s = fMatchStart;
1189
0
        e = fMatchEnd;
1190
0
    } else {
1191
0
        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1192
0
        U_ASSERT(groupOffset < fPattern->fFrameSize);
1193
0
        U_ASSERT(groupOffset >= 0);
1194
0
        s = fFrame->fExtra[groupOffset];
1195
0
        e = fFrame->fExtra[groupOffset+1];
1196
0
    }
1197
1198
0
    if (s < 0) {
1199
        // A capture group wasn't part of the match
1200
0
        return utext_clone(dest, fInputText, false, true, &status);
1201
0
    }
1202
0
    U_ASSERT(s <= e);
1203
0
    group_len = e - s;
1204
1205
0
    dest = utext_clone(dest, fInputText, false, true, &status);
1206
0
    if (dest)
1207
0
        UTEXT_SETNATIVEINDEX(dest, s);
1208
0
    return dest;
1209
0
}
1210
1211
0
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
1212
0
    UnicodeString result;
1213
0
    int64_t groupStart = start64(groupNum, status);
1214
0
    int64_t groupEnd = end64(groupNum, status);
1215
0
    if (U_FAILURE(status) || groupStart == -1 || groupStart == groupEnd) {
1216
0
        return result;
1217
0
    }
1218
1219
    // Get the group length using a utext_extract preflight.
1220
    //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
1221
0
    UErrorCode bufferStatus = U_ZERO_ERROR;
1222
0
    int32_t length = utext_extract(fInputText, groupStart, groupEnd, nullptr, 0, &bufferStatus);
1223
0
    if (bufferStatus != U_BUFFER_OVERFLOW_ERROR) {
1224
0
        if (U_FAILURE(bufferStatus)) {
1225
0
            status = bufferStatus;
1226
0
        }
1227
0
        return result;
1228
0
    }
1229
1230
0
    char16_t *buf = result.getBuffer(length);
1231
0
    if (buf == nullptr) {
1232
0
        status = U_MEMORY_ALLOCATION_ERROR;
1233
0
    } else {
1234
0
        int32_t extractLength = utext_extract(fInputText, groupStart, groupEnd, buf, length, &status);
1235
0
        result.releaseBuffer(extractLength);
1236
0
        U_ASSERT(length == extractLength);
1237
0
    }
1238
0
    return result;
1239
0
}
1240
1241
1242
//--------------------------------------------------------------------------------
1243
//
1244
//  appendGroup() -- currently internal only, appends a group to a UText rather
1245
//                   than replacing its contents
1246
//
1247
//--------------------------------------------------------------------------------
1248
1249
0
int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
1250
0
    if (U_FAILURE(status)) {
1251
0
        return 0;
1252
0
    }
1253
0
    if (U_FAILURE(fDeferredStatus)) {
1254
0
        status = fDeferredStatus;
1255
0
        return 0;
1256
0
    }
1257
0
    int64_t destLen = utext_nativeLength(dest);
1258
1259
0
    if (fMatch == false) {
1260
0
        status = U_REGEX_INVALID_STATE;
1261
0
        return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1262
0
    }
1263
0
    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
1264
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1265
0
        return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1266
0
    }
1267
1268
0
    int64_t s, e;
1269
0
    if (groupNum == 0) {
1270
0
        s = fMatchStart;
1271
0
        e = fMatchEnd;
1272
0
    } else {
1273
0
        int32_t groupOffset = fPattern->fGroupMap->elementAti(groupNum-1);
1274
0
        U_ASSERT(groupOffset < fPattern->fFrameSize);
1275
0
        U_ASSERT(groupOffset >= 0);
1276
0
        s = fFrame->fExtra[groupOffset];
1277
0
        e = fFrame->fExtra[groupOffset+1];
1278
0
    }
1279
1280
0
    if (s < 0) {
1281
        // A capture group wasn't part of the match
1282
0
        return utext_replace(dest, destLen, destLen, nullptr, 0, &status);
1283
0
    }
1284
0
    U_ASSERT(s <= e);
1285
1286
0
    int64_t deltaLen;
1287
0
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1288
0
        U_ASSERT(e <= fInputLength);
1289
0
        deltaLen = utext_replace(dest, destLen, destLen, fInputText->chunkContents + s, static_cast<int32_t>(e - s), &status);
1290
0
    } else {
1291
0
        int32_t len16;
1292
0
        if (UTEXT_USES_U16(fInputText)) {
1293
0
            len16 = static_cast<int32_t>(e - s);
1294
0
        } else {
1295
0
            UErrorCode lengthStatus = U_ZERO_ERROR;
1296
0
            len16 = utext_extract(fInputText, s, e, nullptr, 0, &lengthStatus);
1297
0
        }
1298
0
        char16_t* groupChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (len16 + 1)));
1299
0
        if (groupChars == nullptr) {
1300
0
            status = U_MEMORY_ALLOCATION_ERROR;
1301
0
            return 0;
1302
0
        }
1303
0
        utext_extract(fInputText, s, e, groupChars, len16+1, &status);
1304
1305
0
        deltaLen = utext_replace(dest, destLen, destLen, groupChars, len16, &status);
1306
0
        uprv_free(groupChars);
1307
0
    }
1308
0
    return deltaLen;
1309
0
}
1310
1311
1312
1313
//--------------------------------------------------------------------------------
1314
//
1315
//  groupCount()
1316
//
1317
//--------------------------------------------------------------------------------
1318
0
int32_t RegexMatcher::groupCount() const {
1319
0
    return fPattern->fGroupMap->size();
1320
0
}
1321
1322
//--------------------------------------------------------------------------------
1323
//
1324
//  hasAnchoringBounds()
1325
//
1326
//--------------------------------------------------------------------------------
1327
0
UBool RegexMatcher::hasAnchoringBounds() const {
1328
0
    return fAnchoringBounds;
1329
0
}
1330
1331
1332
//--------------------------------------------------------------------------------
1333
//
1334
//  hasTransparentBounds()
1335
//
1336
//--------------------------------------------------------------------------------
1337
0
UBool RegexMatcher::hasTransparentBounds() const {
1338
0
    return fTransparentBounds;
1339
0
}
1340
1341
1342
1343
//--------------------------------------------------------------------------------
1344
//
1345
//  hitEnd()
1346
//
1347
//--------------------------------------------------------------------------------
1348
0
UBool RegexMatcher::hitEnd() const {
1349
0
    return fHitEnd;
1350
0
}
1351
1352
1353
//--------------------------------------------------------------------------------
1354
//
1355
//  input()
1356
//
1357
//--------------------------------------------------------------------------------
1358
0
const UnicodeString &RegexMatcher::input() const {
1359
0
    if (!fInput) {
1360
0
        UErrorCode status = U_ZERO_ERROR;
1361
0
        int32_t len16;
1362
0
        if (UTEXT_USES_U16(fInputText)) {
1363
0
            len16 = static_cast<int32_t>(fInputLength);
1364
0
        } else {
1365
0
            len16 = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &status);
1366
0
            status = U_ZERO_ERROR; // overflow, length status
1367
0
        }
1368
0
        UnicodeString *result = new UnicodeString(len16, 0, 0);
1369
1370
0
        char16_t *inputChars = result->getBuffer(len16);
1371
0
        utext_extract(fInputText, 0, fInputLength, inputChars, len16, &status); // unterminated warning
1372
0
        result->releaseBuffer(len16);
1373
1374
0
        *const_cast<const UnicodeString**>(&fInput) = result; // pointer assignment, rather than operator=
1375
0
    }
1376
1377
0
    return *fInput;
1378
0
}
1379
1380
//--------------------------------------------------------------------------------
1381
//
1382
//  inputText()
1383
//
1384
//--------------------------------------------------------------------------------
1385
0
UText *RegexMatcher::inputText() const {
1386
0
    return fInputText;
1387
0
}
1388
1389
1390
//--------------------------------------------------------------------------------
1391
//
1392
//  getInput() -- like inputText(), but makes a clone or copies into another UText
1393
//
1394
//--------------------------------------------------------------------------------
1395
0
UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
1396
0
    if (U_FAILURE(status)) {
1397
0
        return dest;
1398
0
    }
1399
0
    if (U_FAILURE(fDeferredStatus)) {
1400
0
        status = fDeferredStatus;
1401
0
        return dest;
1402
0
    }
1403
1404
0
    if (dest) {
1405
0
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1406
0
            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, static_cast<int32_t>(fInputLength), &status);
1407
0
        } else {
1408
0
            int32_t input16Len;
1409
0
            if (UTEXT_USES_U16(fInputText)) {
1410
0
                input16Len = static_cast<int32_t>(fInputLength);
1411
0
            } else {
1412
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
1413
0
                input16Len = utext_extract(fInputText, 0, fInputLength, nullptr, 0, &lengthStatus); // buffer overflow error
1414
0
            }
1415
0
            char16_t* inputChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (input16Len)));
1416
0
            if (inputChars == nullptr) {
1417
0
                return dest;
1418
0
            }
1419
1420
0
            status = U_ZERO_ERROR;
1421
0
            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
1422
0
            status = U_ZERO_ERROR;
1423
0
            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);
1424
1425
0
            uprv_free(inputChars);
1426
0
        }
1427
0
        return dest;
1428
0
    } else {
1429
0
        return utext_clone(nullptr, fInputText, false, true, &status);
1430
0
    }
1431
0
}
1432
1433
1434
static UBool compat_SyncMutableUTextContents(UText *ut);
1435
0
static UBool compat_SyncMutableUTextContents(UText *ut) {
1436
0
    UBool retVal = false;
1437
1438
    //  In the following test, we're really only interested in whether the UText should switch
1439
    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
1440
    //  will still point to the correct data.
1441
0
    if (utext_nativeLength(ut) != ut->nativeIndexingLimit) {
1442
0
        UnicodeString *us=(UnicodeString *)ut->context;
1443
1444
        // Update to the latest length.
1445
        // For example, (utext_nativeLength(ut) != ut->nativeIndexingLimit).
1446
0
        int32_t newLength = us->length();
1447
1448
        // Update the chunk description.
1449
        // The buffer may have switched between stack- and heap-based.
1450
0
        ut->chunkContents    = us->getBuffer();
1451
0
        ut->chunkLength      = newLength;
1452
0
        ut->chunkNativeLimit = newLength;
1453
0
        ut->nativeIndexingLimit = newLength;
1454
0
        retVal = true;
1455
0
    }
1456
1457
0
    return retVal;
1458
0
}
1459
1460
//--------------------------------------------------------------------------------
1461
//
1462
//  lookingAt()
1463
//
1464
//--------------------------------------------------------------------------------
1465
0
UBool RegexMatcher::lookingAt(UErrorCode &status) {
1466
0
    if (U_FAILURE(status)) {
1467
0
        return false;
1468
0
    }
1469
0
    if (U_FAILURE(fDeferredStatus)) {
1470
0
        status = fDeferredStatus;
1471
0
        return false;
1472
0
    }
1473
1474
0
    if (fInputUniStrMaybeMutable) {
1475
0
        if (compat_SyncMutableUTextContents(fInputText)) {
1476
0
        fInputLength = utext_nativeLength(fInputText);
1477
0
        reset();
1478
0
        }
1479
0
    }
1480
0
    else {
1481
0
        resetPreserveRegion();
1482
0
    }
1483
0
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1484
0
        MatchChunkAt(static_cast<int32_t>(fActiveStart), false, status);
1485
0
    } else {
1486
0
        MatchAt(fActiveStart, false, status);
1487
0
    }
1488
0
    return fMatch;
1489
0
}
1490
1491
1492
0
UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
1493
0
    if (U_FAILURE(status)) {
1494
0
        return false;
1495
0
    }
1496
0
    if (U_FAILURE(fDeferredStatus)) {
1497
0
        status = fDeferredStatus;
1498
0
        return false;
1499
0
    }
1500
0
    reset();
1501
1502
0
    if (start < 0) {
1503
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1504
0
        return false;
1505
0
    }
1506
1507
0
    if (fInputUniStrMaybeMutable) {
1508
0
        if (compat_SyncMutableUTextContents(fInputText)) {
1509
0
        fInputLength = utext_nativeLength(fInputText);
1510
0
        reset();
1511
0
        }
1512
0
    }
1513
1514
0
    int64_t nativeStart;
1515
0
    nativeStart = start;
1516
0
    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1517
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1518
0
        return false;
1519
0
    }
1520
1521
0
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1522
0
        MatchChunkAt(static_cast<int32_t>(nativeStart), false, status);
1523
0
    } else {
1524
0
        MatchAt(nativeStart, false, status);
1525
0
    }
1526
0
    return fMatch;
1527
0
}
1528
1529
1530
1531
//--------------------------------------------------------------------------------
1532
//
1533
//  matches()
1534
//
1535
//--------------------------------------------------------------------------------
1536
0
UBool RegexMatcher::matches(UErrorCode &status) {
1537
0
    if (U_FAILURE(status)) {
1538
0
        return false;
1539
0
    }
1540
0
    if (U_FAILURE(fDeferredStatus)) {
1541
0
        status = fDeferredStatus;
1542
0
        return false;
1543
0
    }
1544
1545
0
    if (fInputUniStrMaybeMutable) {
1546
0
        if (compat_SyncMutableUTextContents(fInputText)) {
1547
0
        fInputLength = utext_nativeLength(fInputText);
1548
0
        reset();
1549
0
        }
1550
0
    }
1551
0
    else {
1552
0
        resetPreserveRegion();
1553
0
    }
1554
1555
0
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1556
0
        MatchChunkAt(static_cast<int32_t>(fActiveStart), true, status);
1557
0
    } else {
1558
0
        MatchAt(fActiveStart, true, status);
1559
0
    }
1560
0
    return fMatch;
1561
0
}
1562
1563
1564
0
UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
1565
0
    if (U_FAILURE(status)) {
1566
0
        return false;
1567
0
    }
1568
0
    if (U_FAILURE(fDeferredStatus)) {
1569
0
        status = fDeferredStatus;
1570
0
        return false;
1571
0
    }
1572
0
    reset();
1573
1574
0
    if (start < 0) {
1575
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1576
0
        return false;
1577
0
    }
1578
1579
0
    if (fInputUniStrMaybeMutable) {
1580
0
        if (compat_SyncMutableUTextContents(fInputText)) {
1581
0
        fInputLength = utext_nativeLength(fInputText);
1582
0
        reset();
1583
0
        }
1584
0
    }
1585
1586
0
    int64_t nativeStart;
1587
0
    nativeStart = start;
1588
0
    if (nativeStart < fActiveStart || nativeStart > fActiveLimit) {
1589
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1590
0
        return false;
1591
0
    }
1592
1593
0
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
1594
0
        MatchChunkAt(static_cast<int32_t>(nativeStart), true, status);
1595
0
    } else {
1596
0
        MatchAt(nativeStart, true, status);
1597
0
    }
1598
0
    return fMatch;
1599
0
}
1600
1601
1602
1603
//--------------------------------------------------------------------------------
1604
//
1605
//    pattern
1606
//
1607
//--------------------------------------------------------------------------------
1608
0
const RegexPattern &RegexMatcher::pattern() const {
1609
0
    return *fPattern;
1610
0
}
1611
1612
1613
1614
//--------------------------------------------------------------------------------
1615
//
1616
//    region
1617
//
1618
//--------------------------------------------------------------------------------
1619
0
RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
1620
0
    if (U_FAILURE(status)) {
1621
0
        return *this;
1622
0
    }
1623
1624
0
    if (regionStart>regionLimit || regionStart<0 || regionLimit<0) {
1625
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
1626
0
    }
1627
1628
0
    int64_t nativeStart = regionStart;
1629
0
    int64_t nativeLimit = regionLimit;
1630
0
    if (nativeStart > fInputLength || nativeLimit > fInputLength) {
1631
0
      status = U_ILLEGAL_ARGUMENT_ERROR;
1632
0
    }
1633
1634
0
    if (startIndex == -1)
1635
0
      this->reset();
1636
0
    else
1637
0
      resetPreserveRegion();
1638
1639
0
    fRegionStart = nativeStart;
1640
0
    fRegionLimit = nativeLimit;
1641
0
    fActiveStart = nativeStart;
1642
0
    fActiveLimit = nativeLimit;
1643
1644
0
    if (startIndex != -1) {
1645
0
      if (startIndex < fActiveStart || startIndex > fActiveLimit) {
1646
0
          status = U_INDEX_OUTOFBOUNDS_ERROR;
1647
0
      }
1648
0
      fMatchEnd = startIndex;
1649
0
    }
1650
1651
0
    if (!fTransparentBounds) {
1652
0
        fLookStart = nativeStart;
1653
0
        fLookLimit = nativeLimit;
1654
0
    }
1655
0
    if (fAnchoringBounds) {
1656
0
        fAnchorStart = nativeStart;
1657
0
        fAnchorLimit = nativeLimit;
1658
0
    }
1659
0
    return *this;
1660
0
}
1661
1662
0
RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
1663
0
  return region(start, limit, -1, status);
1664
0
}
1665
1666
//--------------------------------------------------------------------------------
1667
//
1668
//    regionEnd
1669
//
1670
//--------------------------------------------------------------------------------
1671
0
int32_t RegexMatcher::regionEnd() const {
1672
0
    return static_cast<int32_t>(fRegionLimit);
1673
0
}
1674
1675
0
int64_t RegexMatcher::regionEnd64() const {
1676
0
    return fRegionLimit;
1677
0
}
1678
1679
//--------------------------------------------------------------------------------
1680
//
1681
//    regionStart
1682
//
1683
//--------------------------------------------------------------------------------
1684
0
int32_t RegexMatcher::regionStart() const {
1685
0
    return static_cast<int32_t>(fRegionStart);
1686
0
}
1687
1688
0
int64_t RegexMatcher::regionStart64() const {
1689
0
    return fRegionStart;
1690
0
}
1691
1692
1693
//--------------------------------------------------------------------------------
1694
//
1695
//    replaceAll
1696
//
1697
//--------------------------------------------------------------------------------
1698
0
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
1699
0
    UText replacementText = UTEXT_INITIALIZER;
1700
0
    UText resultText = UTEXT_INITIALIZER;
1701
0
    UnicodeString resultString;
1702
0
    if (U_FAILURE(status)) {
1703
0
        return resultString;
1704
0
    }
1705
1706
0
    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1707
0
    utext_openUnicodeString(&resultText, &resultString, &status);
1708
1709
0
    replaceAll(&replacementText, &resultText, status);
1710
1711
0
    utext_close(&resultText);
1712
0
    utext_close(&replacementText);
1713
1714
0
    return resultString;
1715
0
}
1716
1717
1718
//
1719
//    replaceAll, UText mode
1720
//
1721
0
UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
1722
0
    if (U_FAILURE(status)) {
1723
0
        return dest;
1724
0
    }
1725
0
    if (U_FAILURE(fDeferredStatus)) {
1726
0
        status = fDeferredStatus;
1727
0
        return dest;
1728
0
    }
1729
1730
0
    if (dest == nullptr) {
1731
0
        UnicodeString emptyString;
1732
0
        UText empty = UTEXT_INITIALIZER;
1733
1734
0
        utext_openUnicodeString(&empty, &emptyString, &status);
1735
0
        dest = utext_clone(nullptr, &empty, true, false, &status);
1736
0
        utext_close(&empty);
1737
0
    }
1738
1739
0
    if (U_SUCCESS(status)) {
1740
0
        reset();
1741
0
        while (find()) {
1742
0
            appendReplacement(dest, replacement, status);
1743
0
            if (U_FAILURE(status)) {
1744
0
                break;
1745
0
            }
1746
0
        }
1747
0
        appendTail(dest, status);
1748
0
    }
1749
1750
0
    return dest;
1751
0
}
1752
1753
1754
//--------------------------------------------------------------------------------
1755
//
1756
//    replaceFirst
1757
//
1758
//--------------------------------------------------------------------------------
1759
0
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
1760
0
    UText replacementText = UTEXT_INITIALIZER;
1761
0
    UText resultText = UTEXT_INITIALIZER;
1762
0
    UnicodeString resultString;
1763
1764
0
    utext_openConstUnicodeString(&replacementText, &replacement, &status);
1765
0
    utext_openUnicodeString(&resultText, &resultString, &status);
1766
1767
0
    replaceFirst(&replacementText, &resultText, status);
1768
1769
0
    utext_close(&resultText);
1770
0
    utext_close(&replacementText);
1771
1772
0
    return resultString;
1773
0
}
1774
1775
//
1776
//    replaceFirst, UText mode
1777
//
1778
0
UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
1779
0
    if (U_FAILURE(status)) {
1780
0
        return dest;
1781
0
    }
1782
0
    if (U_FAILURE(fDeferredStatus)) {
1783
0
        status = fDeferredStatus;
1784
0
        return dest;
1785
0
    }
1786
1787
0
    reset();
1788
0
    if (!find()) {
1789
0
        return getInput(dest, status);
1790
0
    }
1791
1792
0
    if (dest == nullptr) {
1793
0
        UnicodeString emptyString;
1794
0
        UText empty = UTEXT_INITIALIZER;
1795
1796
0
        utext_openUnicodeString(&empty, &emptyString, &status);
1797
0
        dest = utext_clone(nullptr, &empty, true, false, &status);
1798
0
        utext_close(&empty);
1799
0
    }
1800
1801
0
    appendReplacement(dest, replacement, status);
1802
0
    appendTail(dest, status);
1803
1804
0
    return dest;
1805
0
}
1806
1807
1808
//--------------------------------------------------------------------------------
1809
//
1810
//     requireEnd
1811
//
1812
//--------------------------------------------------------------------------------
1813
0
UBool RegexMatcher::requireEnd() const {
1814
0
    return fRequireEnd;
1815
0
}
1816
1817
1818
//--------------------------------------------------------------------------------
1819
//
1820
//     reset
1821
//
1822
//--------------------------------------------------------------------------------
1823
49.5k
RegexMatcher &RegexMatcher::reset() {
1824
49.5k
    fRegionStart    = 0;
1825
49.5k
    fRegionLimit    = fInputLength;
1826
49.5k
    fActiveStart    = 0;
1827
49.5k
    fActiveLimit    = fInputLength;
1828
49.5k
    fAnchorStart    = 0;
1829
49.5k
    fAnchorLimit    = fInputLength;
1830
49.5k
    fLookStart      = 0;
1831
49.5k
    fLookLimit      = fInputLength;
1832
49.5k
    resetPreserveRegion();
1833
49.5k
    return *this;
1834
49.5k
}
1835
1836
1837
1838
49.5k
void RegexMatcher::resetPreserveRegion() {
1839
49.5k
    fMatchStart     = 0;
1840
49.5k
    fMatchEnd       = 0;
1841
49.5k
    fLastMatchEnd   = -1;
1842
49.5k
    fAppendPosition = 0;
1843
49.5k
    fMatch          = false;
1844
49.5k
    fHitEnd         = false;
1845
49.5k
    fRequireEnd     = false;
1846
49.5k
    fTime           = 0;
1847
49.5k
    fTickCounter    = TIMER_INITIAL_VALUE;
1848
    //resetStack(); // more expensive than it looks...
1849
49.5k
}
1850
1851
1852
10.2k
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
1853
10.2k
    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
1854
10.2k
    if (fPattern->fNeedsAltInput) {
1855
347
        fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
1856
347
    }
1857
10.2k
    if (U_FAILURE(fDeferredStatus)) {
1858
0
        return *this;
1859
0
    }
1860
10.2k
    fInputLength = utext_nativeLength(fInputText);
1861
1862
10.2k
    reset();
1863
10.2k
    delete fInput;
1864
10.2k
    fInput = nullptr;
1865
1866
    //  Do the following for any UnicodeString.
1867
    //  This is for compatibility for those clients who modify the input string "live" during regex operations.
1868
10.2k
    fInputUniStrMaybeMutable = true;
1869
1870
10.2k
#if UCONFIG_NO_BREAK_ITERATION==0
1871
10.2k
    if (fWordBreakItr) {
1872
0
        fWordBreakItr->setText(fInputText, fDeferredStatus);
1873
0
    }
1874
10.2k
    if (fGCBreakItr) {
1875
0
        fGCBreakItr->setText(fInputText, fDeferredStatus);
1876
0
    }
1877
10.2k
#endif
1878
1879
10.2k
    return *this;
1880
10.2k
}
1881
1882
1883
14.5k
RegexMatcher &RegexMatcher::reset(UText *input) {
1884
14.5k
    if (fInputText != input) {
1885
14.5k
        fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus);
1886
14.5k
        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
1887
14.5k
        if (U_FAILURE(fDeferredStatus)) {
1888
0
            return *this;
1889
0
        }
1890
14.5k
        fInputLength = utext_nativeLength(fInputText);
1891
1892
14.5k
        delete fInput;
1893
14.5k
        fInput = nullptr;
1894
1895
14.5k
#if UCONFIG_NO_BREAK_ITERATION==0
1896
14.5k
        if (fWordBreakItr) {
1897
0
            fWordBreakItr->setText(input, fDeferredStatus);
1898
0
        }
1899
14.5k
        if (fGCBreakItr) {
1900
0
            fGCBreakItr->setText(fInputText, fDeferredStatus);
1901
0
        }
1902
14.5k
#endif
1903
14.5k
    }
1904
14.5k
    reset();
1905
14.5k
    fInputUniStrMaybeMutable = false;
1906
1907
14.5k
    return *this;
1908
14.5k
}
1909
1910
/*RegexMatcher &RegexMatcher::reset(const char16_t *) {
1911
    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1912
    return *this;
1913
}*/
1914
1915
0
RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
1916
0
    if (U_FAILURE(status)) {
1917
0
        return *this;
1918
0
    }
1919
0
    reset();       // Reset also resets the region to be the entire string.
1920
1921
0
    if (position < 0 || position > fActiveLimit) {
1922
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
1923
0
        return *this;
1924
0
    }
1925
0
    fMatchEnd = position;
1926
0
    return *this;
1927
0
}
1928
1929
1930
//--------------------------------------------------------------------------------
1931
//
1932
//    refresh
1933
//
1934
//--------------------------------------------------------------------------------
1935
0
RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
1936
0
    if (U_FAILURE(status)) {
1937
0
        return *this;
1938
0
    }
1939
0
    if (input == nullptr) {
1940
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
1941
0
        return *this;
1942
0
    }
1943
0
    if (utext_nativeLength(fInputText) != utext_nativeLength(input)) {
1944
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
1945
0
        return *this;
1946
0
    }
1947
0
    int64_t  pos = utext_getNativeIndex(fInputText);
1948
    //  Shallow read-only clone of the new UText into the existing input UText
1949
0
    fInputText = utext_clone(fInputText, input, false, true, &status);
1950
0
    if (U_FAILURE(status)) {
1951
0
        return *this;
1952
0
    }
1953
0
    utext_setNativeIndex(fInputText, pos);
1954
1955
0
    if (fAltInputText != nullptr) {
1956
0
        pos = utext_getNativeIndex(fAltInputText);
1957
0
        fAltInputText = utext_clone(fAltInputText, input, false, true, &status);
1958
0
        if (U_FAILURE(status)) {
1959
0
            return *this;
1960
0
        }
1961
0
        utext_setNativeIndex(fAltInputText, pos);
1962
0
    }
1963
0
    return *this;
1964
0
}
1965
1966
1967
1968
//--------------------------------------------------------------------------------
1969
//
1970
//    setTrace
1971
//
1972
//--------------------------------------------------------------------------------
1973
0
void RegexMatcher::setTrace(UBool state) {
1974
0
    fTraceDebug = state;
1975
0
}
1976
1977
1978
1979
/**
1980
  *  UText, replace entire contents of the destination UText with a substring of the source UText.
1981
  *
1982
  *     @param src    The source UText
1983
  *     @param dest   The destination UText. Must be writable.
1984
  *                   May be nullptr, in which case a new UText will be allocated.
1985
  *     @param start  Start index of source substring.
1986
  *     @param limit  Limit index of source substring.
1987
  *     @param status An error code.
1988
  */
1989
0
static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
1990
0
    if (U_FAILURE(*status)) {
1991
0
        return dest;
1992
0
    }
1993
0
    if (start == limit) {
1994
0
        if (dest) {
1995
0
            utext_replace(dest, 0, utext_nativeLength(dest), nullptr, 0, status);
1996
0
            return dest;
1997
0
        } else {
1998
0
            return utext_openUChars(nullptr, nullptr, 0, status);
1999
0
        }
2000
0
    }
2001
0
    UErrorCode bufferStatus = U_ZERO_ERROR;
2002
0
    int32_t length = utext_extract(src, start, limit, nullptr, 0, &bufferStatus);
2003
0
    if (bufferStatus != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(bufferStatus)) {
2004
0
        *status = bufferStatus;
2005
0
        return dest;
2006
0
    }
2007
0
    MaybeStackArray<char16_t, 40> buffer;
2008
0
    if (length >= buffer.getCapacity()) {
2009
0
        char16_t *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
2010
0
        if (newBuf == nullptr) {
2011
0
            *status = U_MEMORY_ALLOCATION_ERROR;
2012
0
        }
2013
0
    }
2014
0
    utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
2015
0
    if (dest) {
2016
0
        utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
2017
0
        return dest;
2018
0
    }
2019
2020
    // Caller did not provide a preexisting UText.
2021
    // Open a new one, and have it adopt the text buffer storage.
2022
0
    if (U_FAILURE(*status)) {
2023
0
        return nullptr;
2024
0
    }
2025
0
    int32_t ownedLength = 0;
2026
0
    char16_t *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
2027
0
    if (ownedBuf == nullptr) {
2028
0
        *status = U_MEMORY_ALLOCATION_ERROR;
2029
0
        return nullptr;
2030
0
    }
2031
0
    UText *result = utext_openUChars(nullptr, ownedBuf, length, status);
2032
0
    if (U_FAILURE(*status)) {
2033
0
        uprv_free(ownedBuf);
2034
0
        return nullptr;
2035
0
    }
2036
0
    result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
2037
0
    return result;
2038
0
}
2039
2040
2041
//---------------------------------------------------------------------
2042
//
2043
//   split
2044
//
2045
//---------------------------------------------------------------------
2046
int32_t  RegexMatcher::split(const UnicodeString &input,
2047
        UnicodeString    dest[],
2048
        int32_t          destCapacity,
2049
        UErrorCode      &status)
2050
0
{
2051
0
    UText inputText = UTEXT_INITIALIZER;
2052
0
    utext_openConstUnicodeString(&inputText, &input, &status);
2053
0
    if (U_FAILURE(status)) {
2054
0
        return 0;
2055
0
    }
2056
2057
0
    UText** destText = static_cast<UText**>(uprv_malloc(sizeof(UText*) * destCapacity));
2058
0
    if (destText == nullptr) {
2059
0
        status = U_MEMORY_ALLOCATION_ERROR;
2060
0
        return 0;
2061
0
    }
2062
0
    int32_t i;
2063
0
    for (i = 0; i < destCapacity; i++) {
2064
0
        destText[i] = utext_openUnicodeString(nullptr, &dest[i], &status);
2065
0
    }
2066
2067
0
    int32_t fieldCount = split(&inputText, destText, destCapacity, status);
2068
2069
0
    for (i = 0; i < destCapacity; i++) {
2070
0
        utext_close(destText[i]);
2071
0
    }
2072
2073
0
    uprv_free(destText);
2074
0
    utext_close(&inputText);
2075
0
    return fieldCount;
2076
0
}
2077
2078
//
2079
//   split, UText mode
2080
//
2081
int32_t  RegexMatcher::split(UText *input,
2082
        UText           *dest[],
2083
        int32_t          destCapacity,
2084
        UErrorCode      &status)
2085
0
{
2086
    //
2087
    // Check arguments for validity
2088
    //
2089
0
    if (U_FAILURE(status)) {
2090
0
        return 0;
2091
0
    }
2092
2093
0
    if (destCapacity < 1) {
2094
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
2095
0
        return 0;
2096
0
    }
2097
2098
    //
2099
    // Reset for the input text
2100
    //
2101
0
    reset(input);
2102
0
    int64_t   nextOutputStringStart = 0;
2103
0
    if (fActiveLimit == 0) {
2104
0
        return 0;
2105
0
    }
2106
2107
    //
2108
    // Loop through the input text, searching for the delimiter pattern
2109
    //
2110
0
    int32_t i;
2111
0
    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2112
0
    for (i=0; ; i++) {
2113
0
        if (i>=destCapacity-1) {
2114
            // There is one or zero output string left.
2115
            // Fill the last output string with whatever is left from the input, then exit the loop.
2116
            //  ( i will be == destCapacity if we filled the output array while processing
2117
            //    capture groups of the delimiter expression, in which case we will discard the
2118
            //    last capture group saved in favor of the unprocessed remainder of the
2119
            //    input string.)
2120
0
            i = destCapacity-1;
2121
0
            if (fActiveLimit > nextOutputStringStart) {
2122
0
                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2123
0
                    if (dest[i]) {
2124
0
                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2125
0
                                      input->chunkContents+nextOutputStringStart,
2126
0
                                      static_cast<int32_t>(fActiveLimit - nextOutputStringStart), &status);
2127
0
                    } else {
2128
0
                        UText remainingText = UTEXT_INITIALIZER;
2129
0
                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2130
0
                                         fActiveLimit-nextOutputStringStart, &status);
2131
0
                        dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2132
0
                        utext_close(&remainingText);
2133
0
                    }
2134
0
                } else {
2135
0
                    UErrorCode lengthStatus = U_ZERO_ERROR;
2136
0
                    int32_t remaining16Length =
2137
0
                        utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
2138
0
                    char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
2139
0
                    if (remainingChars == nullptr) {
2140
0
                        status = U_MEMORY_ALLOCATION_ERROR;
2141
0
                        break;
2142
0
                    }
2143
2144
0
                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2145
0
                    if (dest[i]) {
2146
0
                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2147
0
                    } else {
2148
0
                        UText remainingText = UTEXT_INITIALIZER;
2149
0
                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2150
0
                        dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2151
0
                        utext_close(&remainingText);
2152
0
                    }
2153
2154
0
                    uprv_free(remainingChars);
2155
0
                }
2156
0
            }
2157
0
            break;
2158
0
        }
2159
0
        if (find()) {
2160
            // We found another delimiter.  Move everything from where we started looking
2161
            //  up until the start of the delimiter into the next output string.
2162
0
            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2163
0
                if (dest[i]) {
2164
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2165
0
                                  input->chunkContents+nextOutputStringStart,
2166
0
                                  static_cast<int32_t>(fMatchStart - nextOutputStringStart), &status);
2167
0
                } else {
2168
0
                    UText remainingText = UTEXT_INITIALIZER;
2169
0
                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2170
0
                                      fMatchStart-nextOutputStringStart, &status);
2171
0
                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2172
0
                    utext_close(&remainingText);
2173
0
                }
2174
0
            } else {
2175
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
2176
0
                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, nullptr, 0, &lengthStatus);
2177
0
                char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
2178
0
                if (remainingChars == nullptr) {
2179
0
                    status = U_MEMORY_ALLOCATION_ERROR;
2180
0
                    break;
2181
0
                }
2182
0
                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2183
0
                if (dest[i]) {
2184
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2185
0
                } else {
2186
0
                    UText remainingText = UTEXT_INITIALIZER;
2187
0
                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2188
0
                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2189
0
                    utext_close(&remainingText);
2190
0
                }
2191
2192
0
                uprv_free(remainingChars);
2193
0
            }
2194
0
            nextOutputStringStart = fMatchEnd;
2195
2196
            // If the delimiter pattern has capturing parentheses, the captured
2197
            //  text goes out into the next n destination strings.
2198
0
            int32_t groupNum;
2199
0
            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2200
0
                if (i >= destCapacity-2) {
2201
                    // Never fill the last available output string with capture group text.
2202
                    // It will filled with the last field, the remainder of the
2203
                    //  unsplit input text.
2204
0
                    break;
2205
0
                }
2206
0
                i++;
2207
0
                dest[i] = utext_extract_replace(fInputText, dest[i],
2208
0
                                               start64(groupNum, status), end64(groupNum, status), &status);
2209
0
            }
2210
2211
0
            if (nextOutputStringStart == fActiveLimit) {
2212
                // The delimiter was at the end of the string.  We're done, but first
2213
                // we output one last empty string, for the empty field following
2214
                //   the delimiter at the end of input.
2215
0
                if (i+1 < destCapacity) {
2216
0
                    ++i;
2217
0
                    if (dest[i] == nullptr) {
2218
0
                        dest[i] = utext_openUChars(nullptr, nullptr, 0, &status);
2219
0
                    } else {
2220
0
                        static const char16_t emptyString[] = {static_cast<char16_t>(0)};
2221
0
                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2222
0
                    }
2223
0
                }
2224
0
                break;
2225
2226
0
            }
2227
0
        }
2228
0
        else
2229
0
        {
2230
            // We ran off the end of the input while looking for the next delimiter.
2231
            // All the remaining text goes into the current output string.
2232
0
            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2233
0
                if (dest[i]) {
2234
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2235
0
                                  input->chunkContents+nextOutputStringStart,
2236
0
                                  static_cast<int32_t>(fActiveLimit - nextOutputStringStart), &status);
2237
0
                } else {
2238
0
                    UText remainingText = UTEXT_INITIALIZER;
2239
0
                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2240
0
                                     fActiveLimit-nextOutputStringStart, &status);
2241
0
                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2242
0
                    utext_close(&remainingText);
2243
0
                }
2244
0
            } else {
2245
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
2246
0
                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, nullptr, 0, &lengthStatus);
2247
0
                char16_t* remainingChars = static_cast<char16_t*>(uprv_malloc(sizeof(char16_t) * (remaining16Length + 1)));
2248
0
                if (remainingChars == nullptr) {
2249
0
                    status = U_MEMORY_ALLOCATION_ERROR;
2250
0
                    break;
2251
0
                }
2252
2253
0
                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2254
0
                if (dest[i]) {
2255
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2256
0
                } else {
2257
0
                    UText remainingText = UTEXT_INITIALIZER;
2258
0
                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2259
0
                    dest[i] = utext_clone(nullptr, &remainingText, true, false, &status);
2260
0
                    utext_close(&remainingText);
2261
0
                }
2262
2263
0
                uprv_free(remainingChars);
2264
0
            }
2265
0
            break;
2266
0
        }
2267
0
        if (U_FAILURE(status)) {
2268
0
            break;
2269
0
        }
2270
0
    }   // end of for loop
2271
0
    return i+1;
2272
0
}
2273
2274
2275
//--------------------------------------------------------------------------------
2276
//
2277
//     start
2278
//
2279
//--------------------------------------------------------------------------------
2280
0
int32_t RegexMatcher::start(UErrorCode &status) const {
2281
0
    return start(0, status);
2282
0
}
2283
2284
0
int64_t RegexMatcher::start64(UErrorCode &status) const {
2285
0
    return start64(0, status);
2286
0
}
2287
2288
//--------------------------------------------------------------------------------
2289
//
2290
//     start(int32_t group, UErrorCode &status)
2291
//
2292
//--------------------------------------------------------------------------------
2293
2294
0
int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
2295
0
    if (U_FAILURE(status)) {
2296
0
        return -1;
2297
0
    }
2298
0
    if (U_FAILURE(fDeferredStatus)) {
2299
0
        status = fDeferredStatus;
2300
0
        return -1;
2301
0
    }
2302
0
    if (fMatch == false) {
2303
0
        status = U_REGEX_INVALID_STATE;
2304
0
        return -1;
2305
0
    }
2306
0
    if (group < 0 || group > fPattern->fGroupMap->size()) {
2307
0
        status = U_INDEX_OUTOFBOUNDS_ERROR;
2308
0
        return -1;
2309
0
    }
2310
0
    int64_t s;
2311
0
    if (group == 0) {
2312
0
        s = fMatchStart;
2313
0
    } else {
2314
0
        int32_t groupOffset = fPattern->fGroupMap->elementAti(group-1);
2315
0
        U_ASSERT(groupOffset < fPattern->fFrameSize);
2316
0
        U_ASSERT(groupOffset >= 0);
2317
0
        s = fFrame->fExtra[groupOffset];
2318
0
    }
2319
2320
0
    return s;
2321
0
}
2322
2323
2324
0
int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
2325
0
    return static_cast<int32_t>(start64(group, status));
2326
0
}
2327
2328
//--------------------------------------------------------------------------------
2329
//
2330
//     useAnchoringBounds
2331
//
2332
//--------------------------------------------------------------------------------
2333
0
RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
2334
0
    fAnchoringBounds = b;
2335
0
    fAnchorStart = (fAnchoringBounds ? fRegionStart : 0);
2336
0
    fAnchorLimit = (fAnchoringBounds ? fRegionLimit : fInputLength);
2337
0
    return *this;
2338
0
}
2339
2340
2341
//--------------------------------------------------------------------------------
2342
//
2343
//     useTransparentBounds
2344
//
2345
//--------------------------------------------------------------------------------
2346
0
RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
2347
0
    fTransparentBounds = b;
2348
0
    fLookStart = (fTransparentBounds ? 0 : fRegionStart);
2349
0
    fLookLimit = (fTransparentBounds ? fInputLength : fRegionLimit);
2350
0
    return *this;
2351
0
}
2352
2353
//--------------------------------------------------------------------------------
2354
//
2355
//     setTimeLimit
2356
//
2357
//--------------------------------------------------------------------------------
2358
10.2k
void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
2359
10.2k
    if (U_FAILURE(status)) {
2360
0
        return;
2361
0
    }
2362
10.2k
    if (U_FAILURE(fDeferredStatus)) {
2363
0
        status = fDeferredStatus;
2364
0
        return;
2365
0
    }
2366
10.2k
    if (limit < 0) {
2367
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
2368
0
        return;
2369
0
    }
2370
10.2k
    fTimeLimit = limit;
2371
10.2k
}
2372
2373
2374
//--------------------------------------------------------------------------------
2375
//
2376
//     getTimeLimit
2377
//
2378
//--------------------------------------------------------------------------------
2379
0
int32_t RegexMatcher::getTimeLimit() const {
2380
0
    return fTimeLimit;
2381
0
}
2382
2383
2384
//--------------------------------------------------------------------------------
2385
//
2386
//     setStackLimit
2387
//
2388
//--------------------------------------------------------------------------------
2389
14.5k
void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
2390
14.5k
    if (U_FAILURE(status)) {
2391
0
        return;
2392
0
    }
2393
14.5k
    if (U_FAILURE(fDeferredStatus)) {
2394
0
        status = fDeferredStatus;
2395
0
        return;
2396
0
    }
2397
14.5k
    if (limit < 0) {
2398
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
2399
0
        return;
2400
0
    }
2401
2402
    // Reset the matcher.  This is needed here in case there is a current match
2403
    //    whose final stack frame (containing the match results, pointed to by fFrame)
2404
    //    would be lost by resizing to a smaller stack size.
2405
14.5k
    reset();
2406
2407
14.5k
    if (limit == 0) {
2408
        // Unlimited stack expansion
2409
0
        fStack->setMaxCapacity(0);
2410
14.5k
    } else {
2411
        // Change the units of the limit  from bytes to ints, and bump the size up
2412
        //   to be big enough to hold at least one stack frame for the pattern,
2413
        //   if it isn't there already.
2414
14.5k
        int32_t adjustedLimit = limit / sizeof(int32_t);
2415
14.5k
        if (adjustedLimit < fPattern->fFrameSize) {
2416
0
            adjustedLimit = fPattern->fFrameSize;
2417
0
        }
2418
14.5k
        fStack->setMaxCapacity(adjustedLimit);
2419
14.5k
    }
2420
14.5k
    fStackLimit = limit;
2421
14.5k
}
2422
2423
2424
//--------------------------------------------------------------------------------
2425
//
2426
//     getStackLimit
2427
//
2428
//--------------------------------------------------------------------------------
2429
0
int32_t RegexMatcher::getStackLimit() const {
2430
0
    return fStackLimit;
2431
0
}
2432
2433
2434
//--------------------------------------------------------------------------------
2435
//
2436
//     setMatchCallback
2437
//
2438
//--------------------------------------------------------------------------------
2439
void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2440
                                    const void              *context,
2441
0
                                    UErrorCode              &status) {
2442
0
    if (U_FAILURE(status)) {
2443
0
        return;
2444
0
    }
2445
0
    fCallbackFn = callback;
2446
0
    fCallbackContext = context;
2447
0
}
2448
2449
2450
//--------------------------------------------------------------------------------
2451
//
2452
//     getMatchCallback
2453
//
2454
//--------------------------------------------------------------------------------
2455
void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2456
                                  const void              *&context,
2457
0
                                  UErrorCode              &status) {
2458
0
    if (U_FAILURE(status)) {
2459
0
       return;
2460
0
    }
2461
0
    callback = fCallbackFn;
2462
0
    context  = fCallbackContext;
2463
0
}
2464
2465
2466
//--------------------------------------------------------------------------------
2467
//
2468
//     setMatchCallback
2469
//
2470
//--------------------------------------------------------------------------------
2471
void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2472
                                                const void                      *context,
2473
0
                                                UErrorCode                      &status) {
2474
0
    if (U_FAILURE(status)) {
2475
0
        return;
2476
0
    }
2477
0
    fFindProgressCallbackFn = callback;
2478
0
    fFindProgressCallbackContext = context;
2479
0
}
2480
2481
2482
//--------------------------------------------------------------------------------
2483
//
2484
//     getMatchCallback
2485
//
2486
//--------------------------------------------------------------------------------
2487
void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2488
                                                const void                    *&context,
2489
0
                                                UErrorCode                    &status) {
2490
0
    if (U_FAILURE(status)) {
2491
0
       return;
2492
0
    }
2493
0
    callback = fFindProgressCallbackFn;
2494
0
    context  = fFindProgressCallbackContext;
2495
0
}
2496
2497
2498
//================================================================================
2499
//
2500
//    Code following this point in this file is the internal
2501
//    Match Engine Implementation.
2502
//
2503
//================================================================================
2504
2505
2506
//--------------------------------------------------------------------------------
2507
//
2508
//   resetStack
2509
//           Discard any previous contents of the state save stack, and initialize a
2510
//           new stack frame to all -1.  The -1s are needed for capture group limits,
2511
//           where they indicate that a group has not yet matched anything.
2512
//--------------------------------------------------------------------------------
2513
20.6M
REStackFrame *RegexMatcher::resetStack() {
2514
    // Discard any previous contents of the state save stack, and initialize a
2515
    //  new stack frame with all -1 data.  The -1s are needed for capture group limits,
2516
    //  where they indicate that a group has not yet matched anything.
2517
20.6M
    fStack->removeAllElements();
2518
2519
20.6M
    REStackFrame* iFrame = reinterpret_cast<REStackFrame*>(fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus));
2520
20.6M
    if(U_FAILURE(fDeferredStatus)) {
2521
0
        return nullptr;
2522
0
    }
2523
2524
20.6M
    int32_t i;
2525
804M
    for (i=0; i<fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; i++) {
2526
783M
        iFrame->fExtra[i] = -1;
2527
783M
    }
2528
20.6M
    return iFrame;
2529
20.6M
}
2530
2531
2532
2533
//--------------------------------------------------------------------------------
2534
//
2535
//   isWordBoundary
2536
//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2537
//                     For us,
2538
//                       If the current char is a combining mark,
2539
//                          \b is false.
2540
//                       Else Scan backwards to the first non-combining char.
2541
//                            We are at a boundary if the this char and the original chars are
2542
//                               opposite in membership in \w set
2543
//
2544
//          parameters:   pos   - the current position in the input buffer
2545
//
2546
//              TODO:  double-check edge cases at region boundaries.
2547
//
2548
//--------------------------------------------------------------------------------
2549
0
UBool RegexMatcher::isWordBoundary(int64_t pos) {
2550
0
    UBool isBoundary = false;
2551
0
    UBool cIsWord    = false;
2552
2553
0
    if (pos >= fLookLimit) {
2554
0
        fHitEnd = true;
2555
0
    } else {
2556
        // Determine whether char c at current position is a member of the word set of chars.
2557
        // If we're off the end of the string, behave as though we're not at a word char.
2558
0
        UTEXT_SETNATIVEINDEX(fInputText, pos);
2559
0
        UChar32  c = UTEXT_CURRENT32(fInputText);
2560
0
        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2561
            // Current char is a combining one.  Not a boundary.
2562
0
            return false;
2563
0
        }
2564
0
        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2565
0
    }
2566
2567
    // Back up until we come to a non-combining char, determine whether
2568
    //  that char is a word char.
2569
0
    UBool prevCIsWord = false;
2570
0
    for (;;) {
2571
0
        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
2572
0
            break;
2573
0
        }
2574
0
        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
2575
0
        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2576
0
              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2577
0
            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2578
0
            break;
2579
0
        }
2580
0
    }
2581
0
    isBoundary = cIsWord ^ prevCIsWord;
2582
0
    return isBoundary;
2583
0
}
2584
2585
49.5M
UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
2586
49.5M
    UBool isBoundary = false;
2587
49.5M
    UBool cIsWord    = false;
2588
2589
49.5M
    const char16_t *inputBuf = fInputText->chunkContents;
2590
2591
49.5M
    if (pos >= fLookLimit) {
2592
109k
        fHitEnd = true;
2593
49.4M
    } else {
2594
        // Determine whether char c at current position is a member of the word set of chars.
2595
        // If we're off the end of the string, behave as though we're not at a word char.
2596
49.4M
        UChar32 c;
2597
49.4M
        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
2598
49.4M
        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
2599
            // Current char is a combining one.  Not a boundary.
2600
671k
            return false;
2601
671k
        }
2602
48.7M
        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
2603
48.7M
    }
2604
2605
    // Back up until we come to a non-combining char, determine whether
2606
    //  that char is a word char.
2607
48.8M
    UBool prevCIsWord = false;
2608
49.5M
    for (;;) {
2609
49.5M
        if (pos <= fLookStart) {
2610
1.23M
            break;
2611
1.23M
        }
2612
48.2M
        UChar32 prevChar;
2613
48.2M
        U16_PREV(inputBuf, fLookStart, pos, prevChar);
2614
48.2M
        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
2615
47.8M
              || u_charType(prevChar) == U_FORMAT_CHAR)) {
2616
47.6M
            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
2617
47.6M
            break;
2618
47.6M
        }
2619
48.2M
    }
2620
48.8M
    isBoundary = cIsWord ^ prevCIsWord;
2621
48.8M
    return isBoundary;
2622
49.5M
}
2623
2624
//--------------------------------------------------------------------------------
2625
//
2626
//   isUWordBoundary
2627
//
2628
//         Test for a word boundary using RBBI word break.
2629
//
2630
//          parameters:   pos   - the current position in the input buffer
2631
//
2632
//--------------------------------------------------------------------------------
2633
351M
UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
2634
351M
    UBool       returnVal = false;
2635
2636
351M
#if UCONFIG_NO_BREAK_ITERATION==0
2637
    // Note: this point will never be reached if break iteration is configured out.
2638
    //       Regex patterns that would require this function will fail to compile.
2639
2640
    // If we haven't yet created a break iterator for this matcher, do it now.
2641
351M
    if (fWordBreakItr == nullptr) {
2642
2.08k
        fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
2643
2.08k
        if (U_FAILURE(status)) {
2644
0
            return false;
2645
0
        }
2646
2.08k
        fWordBreakItr->setText(fInputText, status);
2647
2.08k
    }
2648
2649
    // Note: zero width boundary tests like \b see through transparent region bounds,
2650
    //       which is why fLookLimit is used here, rather than fActiveLimit.
2651
351M
    if (pos >= fLookLimit) {
2652
712k
        fHitEnd = true;
2653
712k
        returnVal = true;   // With Unicode word rules, only positions within the interior of "real"
2654
                            //    words are not boundaries.  All non-word chars stand by themselves,
2655
                            //    with word boundaries on both sides.
2656
350M
    } else {
2657
350M
        returnVal = fWordBreakItr->isBoundary(static_cast<int32_t>(pos));
2658
350M
    }
2659
351M
#endif
2660
351M
    return   returnVal;
2661
351M
}
2662
2663
2664
153M
int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2665
153M
    int64_t result = pos;
2666
2667
153M
#if UCONFIG_NO_BREAK_ITERATION==0
2668
    // Note: this point will never be reached if break iteration is configured out.
2669
    //       Regex patterns that would require this function will fail to compile.
2670
2671
    // If we haven't yet created a break iterator for this matcher, do it now.
2672
153M
    if (fGCBreakItr == nullptr) {
2673
663
        fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2674
663
        if (U_FAILURE(status)) {
2675
0
            return pos;
2676
0
        }
2677
663
        fGCBreakItr->setText(fInputText, status);
2678
663
    }
2679
153M
    result = fGCBreakItr->following(pos);
2680
153M
    if (result == BreakIterator::DONE) {
2681
0
        result = pos;
2682
0
    }
2683
153M
#endif
2684
153M
    return result;
2685
153M
}
2686
2687
//--------------------------------------------------------------------------------
2688
//
2689
//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2690
//                     saves. Increment the "time" counter, and call the
2691
//                     user callback function if there is one installed.
2692
//
2693
//                     If the match operation needs to be aborted, either for a time-out
2694
//                     or because the user callback asked for it, just set an error status.
2695
//                     The engine will pick that up and stop in its outer loop.
2696
//
2697
//--------------------------------------------------------------------------------
2698
160k
void RegexMatcher::IncrementTime(UErrorCode &status) {
2699
160k
    fTickCounter = TIMER_INITIAL_VALUE;
2700
160k
    fTime++;
2701
160k
    if (fCallbackFn != nullptr) {
2702
0
        if ((*fCallbackFn)(fCallbackContext, fTime) == false) {
2703
0
            status = U_REGEX_STOPPED_BY_CALLER;
2704
0
            return;
2705
0
        }
2706
0
    }
2707
160k
    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
2708
401
        status = U_REGEX_TIME_OUT;
2709
401
    }
2710
160k
}
2711
2712
//--------------------------------------------------------------------------------
2713
//
2714
//   StateSave
2715
//       Make a new stack frame, initialized as a copy of the current stack frame.
2716
//       Set the pattern index in the original stack frame from the operand value
2717
//       in the opcode.  Execution of the engine continues with the state in
2718
//       the newly created stack frame
2719
//
2720
//       Note that reserveBlock() may grow the stack, resulting in the
2721
//       whole thing being relocated in memory.
2722
//
2723
//    Parameters:
2724
//       fp           The top frame pointer when called.  At return, a new
2725
//                    fame will be present
2726
//       savePatIdx   An index into the compiled pattern.  Goes into the original
2727
//                    (not new) frame.  If execution ever back-tracks out of the
2728
//                    new frame, this will be where we continue from in the pattern.
2729
//    Return
2730
//                    The new frame pointer.
2731
//
2732
//--------------------------------------------------------------------------------
2733
1.55G
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
2734
1.55G
    if (U_FAILURE(status)) {
2735
0
        return fp;
2736
0
    }
2737
    // push storage for a new frame.
2738
1.55G
    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
2739
1.55G
    if (U_FAILURE(status)) {
2740
        // Failure on attempted stack expansion.
2741
        //   Stack function set some other error code, change it to a more
2742
        //   specific one for regular expressions.
2743
77
        status = U_REGEX_STACK_OVERFLOW;
2744
        // We need to return a writable stack frame, so just return the
2745
        //    previous frame.  The match operation will stop quickly
2746
        //    because of the error status, after which the frame will never
2747
        //    be looked at again.
2748
77
        return fp;
2749
77
    }
2750
1.55G
    fp = reinterpret_cast<REStackFrame*>(newFP - fFrameSize); // in case of realloc of stack.
2751
2752
    // New stack frame = copy of old top frame.
2753
1.55G
    int64_t* source = reinterpret_cast<int64_t*>(fp);
2754
1.55G
    int64_t *dest   = newFP;
2755
38.0G
    for (;;) {
2756
38.0G
        *dest++ = *source++;
2757
38.0G
        if (source == newFP) {
2758
1.55G
            break;
2759
1.55G
        }
2760
38.0G
    }
2761
2762
1.55G
    fTickCounter--;
2763
1.55G
    if (fTickCounter <= 0) {
2764
154k
       IncrementTime(status);    // Re-initializes fTickCounter
2765
154k
    }
2766
1.55G
    fp->fPatIdx = savePatIdx;
2767
1.55G
    return reinterpret_cast<REStackFrame*>(newFP);
2768
1.55G
}
2769
2770
#if defined(REGEX_DEBUG)
2771
namespace {
2772
UnicodeString StringFromUText(UText *ut) {
2773
    UnicodeString result;
2774
    for (UChar32 c = utext_next32From(ut, 0); c != U_SENTINEL; c = UTEXT_NEXT32(ut)) {
2775
        result.append(c);
2776
    }
2777
    return result;
2778
}
2779
}
2780
#endif // REGEX_DEBUG
2781
2782
2783
//--------------------------------------------------------------------------------
2784
//
2785
//   MatchAt      This is the actual matching engine.
2786
//
2787
//                  startIdx:    begin matching a this index.
2788
//                  toEnd:       if true, match must extend to end of the input region
2789
//
2790
//--------------------------------------------------------------------------------
2791
0
void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2792
0
    UBool       isMatch  = false;      // True if the we have a match.
2793
2794
0
    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2795
2796
0
    int32_t     op;                    // Operation from the compiled pattern, split into
2797
0
    int32_t     opType;                //    the opcode
2798
0
    int32_t     opValue;               //    and the operand value.
2799
2800
#ifdef REGEX_RUN_DEBUG
2801
    if (fTraceDebug) {
2802
        printf("MatchAt(startIdx=%ld)\n", startIdx);
2803
        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2804
        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2805
    }
2806
#endif
2807
2808
0
    if (U_FAILURE(status)) {
2809
0
        return;
2810
0
    }
2811
2812
    //  Cache frequently referenced items from the compiled pattern
2813
    //
2814
0
    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2815
2816
0
    const char16_t      *litText       = fPattern->fLiteralText.getBuffer();
2817
0
    UVector             *fSets         = fPattern->fSets;
2818
2819
0
    fFrameSize = fPattern->fFrameSize;
2820
0
    REStackFrame        *fp            = resetStack();
2821
0
    if (U_FAILURE(fDeferredStatus)) {
2822
0
        status = fDeferredStatus;
2823
0
        return;
2824
0
    }
2825
2826
0
    fp->fPatIdx   = 0;
2827
0
    fp->fInputIdx = startIdx;
2828
2829
    // Zero out the pattern's static data
2830
0
    int32_t i;
2831
0
    for (i = 0; i<fPattern->fDataSize; i++) {
2832
0
        fData[i] = 0;
2833
0
    }
2834
2835
    //
2836
    //  Main loop for interpreting the compiled pattern.
2837
    //  One iteration of the loop per pattern operation performed.
2838
    //
2839
0
    for (;;) {
2840
0
        op = static_cast<int32_t>(pat[fp->fPatIdx]);
2841
0
        opType  = URX_TYPE(op);
2842
0
        opValue = URX_VAL(op);
2843
#ifdef REGEX_RUN_DEBUG
2844
        if (fTraceDebug) {
2845
            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2846
            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2847
                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2848
            fPattern->dumpOp(fp->fPatIdx);
2849
        }
2850
#endif
2851
0
        fp->fPatIdx++;
2852
2853
0
        switch (opType) {
2854
2855
2856
0
        case URX_NOP:
2857
0
            break;
2858
2859
2860
0
        case URX_BACKTRACK:
2861
            // Force a backtrack.  In some circumstances, the pattern compiler
2862
            //   will notice that the pattern can't possibly match anything, and will
2863
            //   emit one of these at that point.
2864
0
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
2865
0
            break;
2866
2867
2868
0
        case URX_ONECHAR:
2869
0
            if (fp->fInputIdx < fActiveLimit) {
2870
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2871
0
                UChar32 c = UTEXT_NEXT32(fInputText);
2872
0
                if (c == opValue) {
2873
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2874
0
                    break;
2875
0
                }
2876
0
            } else {
2877
0
                fHitEnd = true;
2878
0
            }
2879
0
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
2880
0
            break;
2881
2882
2883
0
        case URX_STRING:
2884
0
            {
2885
                // Test input against a literal string.
2886
                // Strings require two slots in the compiled pattern, one for the
2887
                //   offset to the string text, and one for the length.
2888
2889
0
                int32_t   stringStartIdx = opValue;
2890
0
                op = static_cast<int32_t>(pat[fp->fPatIdx]); // Fetch the second operand
2891
0
                fp->fPatIdx++;
2892
0
                opType    = URX_TYPE(op);
2893
0
                int32_t stringLen = URX_VAL(op);
2894
0
                U_ASSERT(opType == URX_STRING_LEN);
2895
0
                U_ASSERT(stringLen >= 2);
2896
2897
0
                const char16_t *patternString = litText+stringStartIdx;
2898
0
                int32_t patternStringIndex = 0;
2899
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2900
0
                UChar32 inputChar;
2901
0
                UChar32 patternChar;
2902
0
                UBool success = true;
2903
0
                while (patternStringIndex < stringLen) {
2904
0
                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2905
0
                        success = false;
2906
0
                        fHitEnd = true;
2907
0
                        break;
2908
0
                    }
2909
0
                    inputChar = UTEXT_NEXT32(fInputText);
2910
0
                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2911
0
                    if (patternChar != inputChar) {
2912
0
                        success = false;
2913
0
                        break;
2914
0
                    }
2915
0
                }
2916
2917
0
                if (success) {
2918
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2919
0
                } else {
2920
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
2921
0
                }
2922
0
            }
2923
0
            break;
2924
2925
2926
0
        case URX_STATE_SAVE:
2927
0
            fp = StateSave(fp, opValue, status);
2928
0
            break;
2929
2930
2931
0
        case URX_END:
2932
            // The match loop will exit via this path on a successful match,
2933
            //   when we reach the end of the pattern.
2934
0
            if (toEnd && fp->fInputIdx != fActiveLimit) {
2935
                // The pattern matched, but not to the end of input.  Try some more.
2936
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
2937
0
                break;
2938
0
            }
2939
0
            isMatch = true;
2940
0
            goto  breakFromLoop;
2941
2942
        // Start and End Capture stack frame variables are laid out out like this:
2943
            //  fp->fExtra[opValue]  - The start of a completed capture group
2944
            //             opValue+1 - The end   of a completed capture group
2945
            //             opValue+2 - the start of a capture group whose end
2946
            //                          has not yet been reached (and might not ever be).
2947
0
        case URX_START_CAPTURE:
2948
0
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2949
0
            fp->fExtra[opValue+2] = fp->fInputIdx;
2950
0
            break;
2951
2952
2953
0
        case URX_END_CAPTURE:
2954
0
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2955
0
            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2956
0
            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2957
0
            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2958
0
            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2959
0
            break;
2960
2961
2962
0
        case URX_DOLLAR:                   //  $, test for End of line
2963
                                           //     or for position before new line at end of input
2964
0
            {
2965
0
                if (fp->fInputIdx >= fAnchorLimit) {
2966
                    // We really are at the end of input.  Success.
2967
0
                    fHitEnd = true;
2968
0
                    fRequireEnd = true;
2969
0
                    break;
2970
0
                }
2971
2972
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2973
2974
                // If we are positioned just before a new-line that is located at the
2975
                //   end of input, succeed.
2976
0
                UChar32 c = UTEXT_NEXT32(fInputText);
2977
0
                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2978
0
                    if (isLineTerminator(c)) {
2979
                        // If not in the middle of a CR/LF sequence
2980
0
                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2981
                            // At new-line at end of input. Success
2982
0
                            fHitEnd = true;
2983
0
                            fRequireEnd = true;
2984
2985
0
                            break;
2986
0
                        }
2987
0
                    }
2988
0
                } else {
2989
0
                    UChar32 nextC = UTEXT_NEXT32(fInputText);
2990
0
                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2991
0
                        fHitEnd = true;
2992
0
                        fRequireEnd = true;
2993
0
                        break;                         // At CR/LF at end of input.  Success
2994
0
                    }
2995
0
                }
2996
2997
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
2998
0
            }
2999
0
            break;
3000
3001
3002
0
         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3003
0
            if (fp->fInputIdx >= fAnchorLimit) {
3004
                // Off the end of input.  Success.
3005
0
                fHitEnd = true;
3006
0
                fRequireEnd = true;
3007
0
                break;
3008
0
            } else {
3009
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3010
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3011
                // Either at the last character of input, or off the end.
3012
0
                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3013
0
                    fHitEnd = true;
3014
0
                    fRequireEnd = true;
3015
0
                    break;
3016
0
                }
3017
0
            }
3018
3019
            // Not at end of input.  Back-track out.
3020
0
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3021
0
            break;
3022
3023
3024
0
         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3025
0
             {
3026
0
                 if (fp->fInputIdx >= fAnchorLimit) {
3027
                     // We really are at the end of input.  Success.
3028
0
                     fHitEnd = true;
3029
0
                     fRequireEnd = true;
3030
0
                     break;
3031
0
                 }
3032
                 // If we are positioned just before a new-line, succeed.
3033
                 // It makes no difference where the new-line is within the input.
3034
0
                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3035
0
                 UChar32 c = UTEXT_CURRENT32(fInputText);
3036
0
                 if (isLineTerminator(c)) {
3037
                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3038
                     //  In multi-line mode, hitting a new-line just before the end of input does not
3039
                     //   set the hitEnd or requireEnd flags
3040
0
                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3041
0
                        break;
3042
0
                     }
3043
0
                 }
3044
                 // not at a new line.  Fail.
3045
0
                 fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3046
0
             }
3047
0
             break;
3048
3049
3050
0
         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3051
0
             {
3052
0
                 if (fp->fInputIdx >= fAnchorLimit) {
3053
                     // We really are at the end of input.  Success.
3054
0
                     fHitEnd = true;
3055
0
                     fRequireEnd = true;  // Java set requireEnd in this case, even though
3056
0
                     break;               //   adding a new-line would not lose the match.
3057
0
                 }
3058
                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3059
                 // It makes no difference where the new-line is within the input.
3060
0
                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3061
0
                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3062
0
                     fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3063
0
                 }
3064
0
             }
3065
0
             break;
3066
3067
3068
0
       case URX_CARET:                    //  ^, test for start of line
3069
0
            if (fp->fInputIdx != fAnchorStart) {
3070
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3071
0
            }
3072
0
            break;
3073
3074
3075
0
       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3076
0
           {
3077
0
               if (fp->fInputIdx == fAnchorStart) {
3078
                   // We are at the start input.  Success.
3079
0
                   break;
3080
0
               }
3081
               // Check whether character just before the current pos is a new-line
3082
               //   unless we are at the end of input
3083
0
               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3084
0
               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3085
0
               if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3086
                   //  It's a new-line.  ^ is true.  Success.
3087
                   //  TODO:  what should be done with positions between a CR and LF?
3088
0
                   break;
3089
0
               }
3090
               // Not at the start of a line.  Fail.
3091
0
               fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3092
0
           }
3093
0
           break;
3094
3095
3096
0
       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3097
0
           {
3098
0
               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3099
0
               if (fp->fInputIdx <= fAnchorStart) {
3100
                   // We are at the start input.  Success.
3101
0
                   break;
3102
0
               }
3103
               // Check whether character just before the current pos is a new-line
3104
0
               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3105
0
               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3106
0
               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3107
0
               if (c != 0x0a) {
3108
                   // Not at the start of a line.  Back-track out.
3109
0
                   fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3110
0
               }
3111
0
           }
3112
0
           break;
3113
3114
0
        case URX_BACKSLASH_B:          // Test for word boundaries
3115
0
            {
3116
0
                UBool success = isWordBoundary(fp->fInputIdx);
3117
0
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
3118
0
                if (!success) {
3119
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3120
0
                }
3121
0
            }
3122
0
            break;
3123
3124
3125
0
        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3126
0
            {
3127
0
                UBool success = isUWordBoundary(fp->fInputIdx, status);
3128
0
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
3129
0
                if (!success) {
3130
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3131
0
                }
3132
0
            }
3133
0
            break;
3134
3135
3136
0
        case URX_BACKSLASH_D:            // Test for decimal digit
3137
0
            {
3138
0
                if (fp->fInputIdx >= fActiveLimit) {
3139
0
                    fHitEnd = true;
3140
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3141
0
                    break;
3142
0
                }
3143
3144
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3145
3146
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3147
0
                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3148
0
                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3149
0
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \D
3150
0
                if (success) {
3151
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3152
0
                } else {
3153
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3154
0
                }
3155
0
            }
3156
0
            break;
3157
3158
3159
0
        case URX_BACKSLASH_G:          // Test for position at end of previous match
3160
0
            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
3161
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3162
0
            }
3163
0
            break;
3164
3165
3166
0
        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3167
0
            {
3168
0
                if (fp->fInputIdx >= fActiveLimit) {
3169
0
                    fHitEnd = true;
3170
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3171
0
                    break;
3172
0
                }
3173
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3174
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3175
0
                int8_t ctype = u_charType(c);
3176
0
                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3177
0
                success ^= static_cast<UBool>(opValue != 0);  // flip sense for \H
3178
0
                if (success) {
3179
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3180
0
                } else {
3181
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3182
0
                }
3183
0
            }
3184
0
            break;
3185
3186
3187
0
        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3188
0
            {
3189
0
                if (fp->fInputIdx >= fActiveLimit) {
3190
0
                    fHitEnd = true;
3191
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3192
0
                    break;
3193
0
                }
3194
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3195
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3196
0
                if (isLineTerminator(c)) {
3197
0
                    if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3198
0
                        utext_next32(fInputText);
3199
0
                    }
3200
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3201
0
                } else {
3202
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3203
0
                }
3204
0
            }
3205
0
            break;
3206
3207
3208
0
        case URX_BACKSLASH_V:            // \v, any single line ending character.
3209
0
            {
3210
0
                if (fp->fInputIdx >= fActiveLimit) {
3211
0
                    fHitEnd = true;
3212
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3213
0
                    break;
3214
0
                }
3215
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3216
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3217
0
                UBool success = isLineTerminator(c);
3218
0
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \V
3219
0
                if (success) {
3220
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3221
0
                } else {
3222
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3223
0
                }
3224
0
            }
3225
0
            break;
3226
3227
3228
0
        case URX_BACKSLASH_X:
3229
            //  Match a Grapheme, as defined by Unicode UAX 29.
3230
3231
            // Fail if at end of input
3232
0
            if (fp->fInputIdx >= fActiveLimit) {
3233
0
                fHitEnd = true;
3234
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3235
0
                break;
3236
0
            }
3237
3238
0
            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3239
0
            if (fp->fInputIdx >= fActiveLimit) {
3240
0
                fHitEnd = true;
3241
0
                fp->fInputIdx = fActiveLimit;
3242
0
            }
3243
0
            break;
3244
3245
3246
0
        case URX_BACKSLASH_Z:          // Test for end of Input
3247
0
            if (fp->fInputIdx < fAnchorLimit) {
3248
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3249
0
            } else {
3250
0
                fHitEnd = true;
3251
0
                fRequireEnd = true;
3252
0
            }
3253
0
            break;
3254
3255
3256
3257
0
        case URX_STATIC_SETREF:
3258
0
            {
3259
                // Test input character against one of the predefined sets
3260
                //    (Word Characters, for example)
3261
                // The high bit of the op value is a flag for the match polarity.
3262
                //    0:   success if input char is in set.
3263
                //    1:   success if input char is not in set.
3264
0
                if (fp->fInputIdx >= fActiveLimit) {
3265
0
                    fHitEnd = true;
3266
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3267
0
                    break;
3268
0
                }
3269
3270
0
                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3271
0
                opValue &= ~URX_NEG_SET;
3272
0
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3273
3274
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3275
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3276
0
                if (c < 256) {
3277
0
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3278
0
                    if (s8.contains(c)) {
3279
0
                        success = !success;
3280
0
                    }
3281
0
                } else {
3282
0
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3283
0
                    if (s.contains(c)) {
3284
0
                        success = !success;
3285
0
                    }
3286
0
                }
3287
0
                if (success) {
3288
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3289
0
                } else {
3290
                    // the character wasn't in the set.
3291
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3292
0
                }
3293
0
            }
3294
0
            break;
3295
3296
3297
0
        case URX_STAT_SETREF_N:
3298
0
            {
3299
                // Test input character for NOT being a member of  one of
3300
                //    the predefined sets (Word Characters, for example)
3301
0
                if (fp->fInputIdx >= fActiveLimit) {
3302
0
                    fHitEnd = true;
3303
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3304
0
                    break;
3305
0
                }
3306
3307
0
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3308
3309
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3310
3311
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3312
0
                if (c < 256) {
3313
0
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3314
0
                    if (s8.contains(c) == false) {
3315
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3316
0
                        break;
3317
0
                    }
3318
0
                } else {
3319
0
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3320
0
                    if (s.contains(c) == false) {
3321
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3322
0
                        break;
3323
0
                    }
3324
0
                }
3325
                // the character wasn't in the set.
3326
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3327
0
            }
3328
0
            break;
3329
3330
3331
0
        case URX_SETREF:
3332
0
            if (fp->fInputIdx >= fActiveLimit) {
3333
0
                fHitEnd = true;
3334
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3335
0
                break;
3336
0
            } else {
3337
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3338
3339
                // There is input left.  Pick up one char and test it for set membership.
3340
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3341
0
                U_ASSERT(opValue > 0 && opValue < fSets->size());
3342
0
                if (c<256) {
3343
0
                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3344
0
                    if (s8->contains(c)) {
3345
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3346
0
                        break;
3347
0
                    }
3348
0
                } else {
3349
0
                    UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
3350
0
                    if (s->contains(c)) {
3351
                        // The character is in the set.  A Match.
3352
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3353
0
                        break;
3354
0
                    }
3355
0
                }
3356
3357
                // the character wasn't in the set.
3358
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3359
0
            }
3360
0
            break;
3361
3362
3363
0
        case URX_DOTANY:
3364
0
            {
3365
                // . matches anything, but stops at end-of-line.
3366
0
                if (fp->fInputIdx >= fActiveLimit) {
3367
                    // At end of input.  Match failed.  Backtrack out.
3368
0
                    fHitEnd = true;
3369
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3370
0
                    break;
3371
0
                }
3372
3373
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3374
3375
                // There is input left.  Advance over one char, unless we've hit end-of-line
3376
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3377
0
                if (isLineTerminator(c)) {
3378
                    // End of line in normal mode.   . does not match.
3379
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3380
0
                    break;
3381
0
                }
3382
0
                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3383
0
            }
3384
0
            break;
3385
3386
3387
0
        case URX_DOTANY_ALL:
3388
0
            {
3389
                // ., in dot-matches-all (including new lines) mode
3390
0
                if (fp->fInputIdx >= fActiveLimit) {
3391
                    // At end of input.  Match failed.  Backtrack out.
3392
0
                    fHitEnd = true;
3393
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3394
0
                    break;
3395
0
                }
3396
3397
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3398
3399
                // There is input left.  Advance over one char, except if we are
3400
                //   at a cr/lf, advance over both of them.
3401
0
                UChar32 c;
3402
0
                c = UTEXT_NEXT32(fInputText);
3403
0
                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3404
0
                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3405
                    // In the case of a CR/LF, we need to advance over both.
3406
0
                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3407
0
                    if (nextc == 0x0a) {
3408
0
                        (void)UTEXT_NEXT32(fInputText);
3409
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3410
0
                    }
3411
0
                }
3412
0
            }
3413
0
            break;
3414
3415
3416
0
        case URX_DOTANY_UNIX:
3417
0
            {
3418
                // '.' operator, matches all, but stops at end-of-line.
3419
                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3420
0
                if (fp->fInputIdx >= fActiveLimit) {
3421
                    // At end of input.  Match failed.  Backtrack out.
3422
0
                    fHitEnd = true;
3423
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3424
0
                    break;
3425
0
                }
3426
3427
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3428
3429
                // There is input left.  Advance over one char, unless we've hit end-of-line
3430
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3431
0
                if (c == 0x0a) {
3432
                    // End of line in normal mode.   '.' does not match the \n
3433
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3434
0
                } else {
3435
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3436
0
                }
3437
0
            }
3438
0
            break;
3439
3440
3441
0
        case URX_JMP:
3442
0
            fp->fPatIdx = opValue;
3443
0
            break;
3444
3445
0
        case URX_FAIL:
3446
0
            isMatch = false;
3447
0
            goto breakFromLoop;
3448
3449
0
        case URX_JMP_SAV:
3450
0
            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3451
0
            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3452
0
            fp->fPatIdx = opValue;                         // Then JMP.
3453
0
            break;
3454
3455
0
        case URX_JMP_SAV_X:
3456
            // This opcode is used with (x)+, when x can match a zero length string.
3457
            // Same as JMP_SAV, except conditional on the match having made forward progress.
3458
            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3459
            //   data address of the input position at the start of the loop.
3460
0
            {
3461
0
                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3462
0
                int32_t stoOp = static_cast<int32_t>(pat[opValue - 1]);
3463
0
                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3464
0
                int32_t  frameLoc = URX_VAL(stoOp);
3465
0
                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3466
0
                int64_t prevInputIdx = fp->fExtra[frameLoc];
3467
0
                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3468
0
                if (prevInputIdx < fp->fInputIdx) {
3469
                    // The match did make progress.  Repeat the loop.
3470
0
                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3471
0
                    fp->fPatIdx = opValue;
3472
0
                    fp->fExtra[frameLoc] = fp->fInputIdx;
3473
0
                }
3474
                // If the input position did not advance, we do nothing here,
3475
                //   execution will fall out of the loop.
3476
0
            }
3477
0
            break;
3478
3479
0
        case URX_CTR_INIT:
3480
0
            {
3481
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3482
0
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3483
3484
                // Pick up the three extra operands that CTR_INIT has, and
3485
                //    skip the pattern location counter past
3486
0
                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
3487
0
                fp->fPatIdx += 3;
3488
0
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3489
0
                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
3490
0
                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
3491
0
                U_ASSERT(minCount>=0);
3492
0
                U_ASSERT(maxCount>=minCount || maxCount==-1);
3493
0
                U_ASSERT(loopLoc>=fp->fPatIdx);
3494
3495
0
                if (minCount == 0) {
3496
0
                    fp = StateSave(fp, loopLoc+1, status);
3497
0
                }
3498
0
                if (maxCount == -1) {
3499
0
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3500
0
                } else if (maxCount == 0) {
3501
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3502
0
                }
3503
0
            }
3504
0
            break;
3505
3506
0
        case URX_CTR_LOOP:
3507
0
            {
3508
0
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3509
0
                int32_t initOp = static_cast<int32_t>(pat[opValue]);
3510
0
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3511
0
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3512
0
                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
3513
0
                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
3514
0
                (*pCounter)++;
3515
0
                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
3516
0
                    U_ASSERT(*pCounter == maxCount);
3517
0
                    break;
3518
0
                }
3519
0
                if (*pCounter >= minCount) {
3520
0
                    if (maxCount == -1) {
3521
                        // Loop has no hard upper bound.
3522
                        // Check that it is progressing through the input, break if it is not.
3523
0
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3524
0
                        if (fp->fInputIdx == *pLastInputIdx) {
3525
0
                            break;
3526
0
                        } else {
3527
0
                            *pLastInputIdx = fp->fInputIdx;
3528
0
                        }
3529
0
                    }
3530
0
                    fp = StateSave(fp, fp->fPatIdx, status);
3531
0
                } else {
3532
                    // Increment time-out counter. (StateSave() does it if count >= minCount)
3533
0
                    fTickCounter--;
3534
0
                    if (fTickCounter <= 0) {
3535
0
                        IncrementTime(status);    // Re-initializes fTickCounter
3536
0
                    }
3537
0
                }
3538
3539
0
                fp->fPatIdx = opValue + 4;    // Loop back.
3540
0
            }
3541
0
            break;
3542
3543
0
        case URX_CTR_INIT_NG:
3544
0
            {
3545
                // Initialize a non-greedy loop
3546
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3547
0
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3548
3549
                // Pick up the three extra operands that CTR_INIT_NG has, and
3550
                //    skip the pattern location counter past
3551
0
                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
3552
0
                fp->fPatIdx += 3;
3553
0
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3554
0
                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
3555
0
                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
3556
0
                U_ASSERT(minCount>=0);
3557
0
                U_ASSERT(maxCount>=minCount || maxCount==-1);
3558
0
                U_ASSERT(loopLoc>fp->fPatIdx);
3559
0
                if (maxCount == -1) {
3560
0
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3561
0
                }
3562
3563
0
                if (minCount == 0) {
3564
0
                    if (maxCount != 0) {
3565
0
                        fp = StateSave(fp, fp->fPatIdx, status);
3566
0
                    }
3567
0
                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3568
0
                }
3569
0
            }
3570
0
            break;
3571
3572
0
        case URX_CTR_LOOP_NG:
3573
0
            {
3574
                // Non-greedy {min, max} loops
3575
0
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3576
0
                int32_t initOp = static_cast<int32_t>(pat[opValue]);
3577
0
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3578
0
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3579
0
                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
3580
0
                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
3581
3582
0
                (*pCounter)++;
3583
0
                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
3584
                    // The loop has matched the maximum permitted number of times.
3585
                    //   Break out of here with no action.  Matching will
3586
                    //   continue with the following pattern.
3587
0
                    U_ASSERT(*pCounter == maxCount);
3588
0
                    break;
3589
0
                }
3590
3591
0
                if (*pCounter < minCount) {
3592
                    // We haven't met the minimum number of matches yet.
3593
                    //   Loop back for another one.
3594
0
                    fp->fPatIdx = opValue + 4;    // Loop back.
3595
                    // Increment time-out counter. (StateSave() does it if count >= minCount)
3596
0
                    fTickCounter--;
3597
0
                    if (fTickCounter <= 0) {
3598
0
                        IncrementTime(status);    // Re-initializes fTickCounter
3599
0
                    }
3600
0
                } else {
3601
                    // We do have the minimum number of matches.
3602
3603
                    // If there is no upper bound on the loop iterations, check that the input index
3604
                    // is progressing, and stop the loop if it is not.
3605
0
                    if (maxCount == -1) {
3606
0
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3607
0
                        if (fp->fInputIdx == *pLastInputIdx) {
3608
0
                            break;
3609
0
                        }
3610
0
                        *pLastInputIdx = fp->fInputIdx;
3611
0
                    }
3612
3613
                    // Loop Continuation: we will fall into the pattern following the loop
3614
                    //   (non-greedy, don't execute loop body first), but first do
3615
                    //   a state save to the top of the loop, so that a match failure
3616
                    //   in the following pattern will try another iteration of the loop.
3617
0
                    fp = StateSave(fp, opValue + 4, status);
3618
0
                }
3619
0
            }
3620
0
            break;
3621
3622
0
        case URX_STO_SP:
3623
0
            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3624
0
            fData[opValue] = fStack->size();
3625
0
            break;
3626
3627
0
        case URX_LD_SP:
3628
0
            {
3629
0
                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3630
0
                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
3631
0
                U_ASSERT(newStackSize <= fStack->size());
3632
0
                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3633
0
                if (newFP == reinterpret_cast<int64_t*>(fp)) {
3634
0
                    break;
3635
0
                }
3636
0
                int32_t j;
3637
0
                for (j=0; j<fFrameSize; j++) {
3638
0
                    newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
3639
0
                }
3640
0
                fp = reinterpret_cast<REStackFrame*>(newFP);
3641
0
                fStack->setSize(newStackSize);
3642
0
            }
3643
0
            break;
3644
3645
0
        case URX_BACKREF:
3646
0
            {
3647
0
                U_ASSERT(opValue < fFrameSize);
3648
0
                int64_t groupStartIdx = fp->fExtra[opValue];
3649
0
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3650
0
                U_ASSERT(groupStartIdx <= groupEndIdx);
3651
0
                if (groupStartIdx < 0) {
3652
                    // This capture group has not participated in the match thus far,
3653
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
3654
0
                    break;
3655
0
                }
3656
0
                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3657
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3658
3659
                //   Note: if the capture group match was of an empty string the backref
3660
                //         match succeeds.  Verified by testing:  Perl matches succeed
3661
                //         in this case, so we do too.
3662
3663
0
                UBool success = true;
3664
0
                for (;;) {
3665
0
                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3666
0
                        success = true;
3667
0
                        break;
3668
0
                    }
3669
0
                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3670
0
                        success = false;
3671
0
                        fHitEnd = true;
3672
0
                        break;
3673
0
                    }
3674
0
                    UChar32 captureGroupChar = utext_next32(fAltInputText);
3675
0
                    UChar32 inputChar = utext_next32(fInputText);
3676
0
                    if (inputChar != captureGroupChar) {
3677
0
                        success = false;
3678
0
                        break;
3679
0
                    }
3680
0
                }
3681
3682
0
                if (success) {
3683
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3684
0
                } else {
3685
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3686
0
                }
3687
0
            }
3688
0
            break;
3689
3690
3691
3692
0
        case URX_BACKREF_I:
3693
0
            {
3694
0
                U_ASSERT(opValue < fFrameSize);
3695
0
                int64_t groupStartIdx = fp->fExtra[opValue];
3696
0
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3697
0
                U_ASSERT(groupStartIdx <= groupEndIdx);
3698
0
                if (groupStartIdx < 0) {
3699
                    // This capture group has not participated in the match thus far,
3700
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
3701
0
                    break;
3702
0
                }
3703
0
                utext_setNativeIndex(fAltInputText, groupStartIdx);
3704
0
                utext_setNativeIndex(fInputText, fp->fInputIdx);
3705
0
                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3706
0
                CaseFoldingUTextIterator inputItr(*fInputText);
3707
3708
                //   Note: if the capture group match was of an empty string the backref
3709
                //         match succeeds.  Verified by testing:  Perl matches succeed
3710
                //         in this case, so we do too.
3711
3712
0
                UBool success = true;
3713
0
                for (;;) {
3714
0
                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3715
0
                        success = true;
3716
0
                        break;
3717
0
                    }
3718
0
                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3719
0
                        success = false;
3720
0
                        fHitEnd = true;
3721
0
                        break;
3722
0
                    }
3723
0
                    UChar32 captureGroupChar = captureGroupItr.next();
3724
0
                    UChar32 inputChar = inputItr.next();
3725
0
                    if (inputChar != captureGroupChar) {
3726
0
                        success = false;
3727
0
                        break;
3728
0
                    }
3729
0
                }
3730
3731
0
                if (success && inputItr.inExpansion()) {
3732
                    // We obtained a match by consuming part of a string obtained from
3733
                    // case-folding a single code point of the input text.
3734
                    // This does not count as an overall match.
3735
0
                    success = false;
3736
0
                }
3737
3738
0
                if (success) {
3739
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3740
0
                } else {
3741
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3742
0
                }
3743
3744
0
            }
3745
0
            break;
3746
3747
0
        case URX_STO_INP_LOC:
3748
0
            {
3749
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3750
0
                fp->fExtra[opValue] = fp->fInputIdx;
3751
0
            }
3752
0
            break;
3753
3754
0
        case URX_JMPX:
3755
0
            {
3756
0
                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
3757
0
                fp->fPatIdx += 1;
3758
0
                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3759
0
                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3760
0
                int64_t savedInputIdx = fp->fExtra[dataLoc];
3761
0
                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3762
0
                if (savedInputIdx < fp->fInputIdx) {
3763
0
                    fp->fPatIdx = opValue;                               // JMP
3764
0
                } else {
3765
0
                     fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no progress in loop.
3766
0
                }
3767
0
            }
3768
0
            break;
3769
3770
0
        case URX_LA_START:
3771
0
            {
3772
                // Entering a look around block.
3773
                // Save Stack Ptr, Input Pos.
3774
0
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3775
0
                fData[opValue]   = fStack->size();
3776
0
                fData[opValue+1] = fp->fInputIdx;
3777
0
                fData[opValue+2] = fActiveStart;
3778
0
                fData[opValue+3] = fActiveLimit;
3779
0
                fActiveStart     = fLookStart;          // Set the match region change for
3780
0
                fActiveLimit     = fLookLimit;          //   transparent bounds.
3781
0
            }
3782
0
            break;
3783
3784
0
        case URX_LA_END:
3785
0
            {
3786
                // Leaving a look-ahead block.
3787
                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3788
0
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3789
0
                int32_t stackSize = fStack->size();
3790
0
                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
3791
0
                U_ASSERT(stackSize >= newStackSize);
3792
0
                if (stackSize > newStackSize) {
3793
                    // Copy the current top frame back to the new (cut back) top frame.
3794
                    //   This makes the capture groups from within the look-ahead
3795
                    //   expression available.
3796
0
                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3797
0
                    int32_t j;
3798
0
                    for (j=0; j<fFrameSize; j++) {
3799
0
                        newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
3800
0
                    }
3801
0
                    fp = reinterpret_cast<REStackFrame*>(newFP);
3802
0
                    fStack->setSize(newStackSize);
3803
0
                }
3804
0
                fp->fInputIdx = fData[opValue+1];
3805
3806
                // Restore the active region bounds in the input string; they may have
3807
                //    been changed because of transparent bounds on a Region.
3808
0
                fActiveStart = fData[opValue+2];
3809
0
                fActiveLimit = fData[opValue+3];
3810
0
                U_ASSERT(fActiveStart >= 0);
3811
0
                U_ASSERT(fActiveLimit <= fInputLength);
3812
0
            }
3813
0
            break;
3814
3815
0
        case URX_ONECHAR_I:
3816
            // Case insensitive one char.  The char from the pattern is already case folded.
3817
            // Input text is not, but case folding the input can not reduce two or more code
3818
            // points to one.
3819
0
            if (fp->fInputIdx < fActiveLimit) {
3820
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3821
3822
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3823
0
                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3824
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3825
0
                    break;
3826
0
                }
3827
0
            } else {
3828
0
                fHitEnd = true;
3829
0
            }
3830
3831
0
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3832
0
            break;
3833
3834
0
        case URX_STRING_I:
3835
0
            {
3836
                // Case-insensitive test input against a literal string.
3837
                // Strings require two slots in the compiled pattern, one for the
3838
                //   offset to the string text, and one for the length.
3839
                //   The compiled string has already been case folded.
3840
0
                {
3841
0
                    const char16_t *patternString = litText + opValue;
3842
0
                    int32_t      patternStringIdx  = 0;
3843
3844
0
                    op = static_cast<int32_t>(pat[fp->fPatIdx]);
3845
0
                    fp->fPatIdx++;
3846
0
                    opType  = URX_TYPE(op);
3847
0
                    opValue = URX_VAL(op);
3848
0
                    U_ASSERT(opType == URX_STRING_LEN);
3849
0
                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3850
3851
3852
0
                    UChar32   cPattern;
3853
0
                    UChar32   cText;
3854
0
                    UBool     success = true;
3855
3856
0
                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3857
0
                    CaseFoldingUTextIterator inputIterator(*fInputText);
3858
0
                    while (patternStringIdx < patternStringLen) {
3859
0
                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3860
0
                            success = false;
3861
0
                            fHitEnd = true;
3862
0
                            break;
3863
0
                        }
3864
0
                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3865
0
                        cText = inputIterator.next();
3866
0
                        if (cText != cPattern) {
3867
0
                            success = false;
3868
0
                            break;
3869
0
                        }
3870
0
                    }
3871
0
                    if (inputIterator.inExpansion()) {
3872
0
                        success = false;
3873
0
                    }
3874
3875
0
                    if (success) {
3876
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3877
0
                    } else {
3878
0
                        fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3879
0
                    }
3880
0
                }
3881
0
            }
3882
0
            break;
3883
3884
0
        case URX_LB_START:
3885
0
            {
3886
                // Entering a look-behind block.
3887
                // Save Stack Ptr, Input Pos and active input region.
3888
                //   TODO:  implement transparent bounds.  Ticket #6067
3889
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3890
0
                fData[opValue]   = fStack->size();
3891
0
                fData[opValue+1] = fp->fInputIdx;
3892
                // Save input string length, then reset to pin any matches to end at
3893
                //   the current position.
3894
0
                fData[opValue+2] = fActiveStart;
3895
0
                fData[opValue+3] = fActiveLimit;
3896
0
                fActiveStart     = fRegionStart;
3897
0
                fActiveLimit     = fp->fInputIdx;
3898
                // Init the variable containing the start index for attempted matches.
3899
0
                fData[opValue+4] = -1;
3900
0
            }
3901
0
            break;
3902
3903
3904
0
        case URX_LB_CONT:
3905
0
            {
3906
                // Positive Look-Behind, at top of loop checking for matches of LB expression
3907
                //    at all possible input starting positions.
3908
3909
                // Fetch the min and max possible match lengths.  They are the operands
3910
                //   of this op in the pattern.
3911
0
                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
3912
0
                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
3913
0
                if (!UTEXT_USES_U16(fInputText)) {
3914
                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3915
                    // The max length need not be exact; it just needs to be >= actual maximum.
3916
0
                    maxML *= 3;
3917
0
                }
3918
0
                U_ASSERT(minML <= maxML);
3919
0
                U_ASSERT(minML >= 0);
3920
3921
                // Fetch (from data) the last input index where a match was attempted.
3922
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3923
0
                int64_t  &lbStartIdx = fData[opValue+4];
3924
0
                if (lbStartIdx < 0) {
3925
                    // First time through loop.
3926
0
                    lbStartIdx = fp->fInputIdx - minML;
3927
0
                    if (lbStartIdx > 0) {
3928
                        // move index to a code point boundary, if it's not on one already.
3929
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3930
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3931
0
                    }
3932
0
                } else {
3933
                    // 2nd through nth time through the loop.
3934
                    // Back up start position for match by one.
3935
0
                    if (lbStartIdx == 0) {
3936
0
                        (lbStartIdx)--;
3937
0
                    } else {
3938
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3939
0
                        (void)UTEXT_PREVIOUS32(fInputText);
3940
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3941
0
                    }
3942
0
                }
3943
3944
0
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3945
                    // We have tried all potential match starting points without
3946
                    //  getting a match.  Backtrack out, and out of the
3947
                    //   Look Behind altogether.
3948
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3949
0
                    fActiveStart = fData[opValue+2];
3950
0
                    fActiveLimit = fData[opValue+3];
3951
0
                    U_ASSERT(fActiveStart >= 0);
3952
0
                    U_ASSERT(fActiveLimit <= fInputLength);
3953
0
                    break;
3954
0
                }
3955
3956
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3957
                //      (successful match will fall off the end of the loop.)
3958
0
                fp = StateSave(fp, fp->fPatIdx-3, status);
3959
0
                fp->fInputIdx = lbStartIdx;
3960
0
            }
3961
0
            break;
3962
3963
0
        case URX_LB_END:
3964
            // End of a look-behind block, after a successful match.
3965
0
            {
3966
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3967
0
                if (fp->fInputIdx != fActiveLimit) {
3968
                    //  The look-behind expression matched, but the match did not
3969
                    //    extend all the way to the point that we are looking behind from.
3970
                    //  FAIL out of here, which will take us back to the LB_CONT, which
3971
                    //     will retry the match starting at another position or fail
3972
                    //     the look-behind altogether, whichever is appropriate.
3973
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
3974
0
                    break;
3975
0
                }
3976
3977
                // Look-behind match is good.  Restore the original input string region,
3978
                //   which had been truncated to pin the end of the lookbehind match to the
3979
                //   position being looked-behind.
3980
0
                fActiveStart = fData[opValue+2];
3981
0
                fActiveLimit = fData[opValue+3];
3982
0
                U_ASSERT(fActiveStart >= 0);
3983
0
                U_ASSERT(fActiveLimit <= fInputLength);
3984
0
            }
3985
0
            break;
3986
3987
3988
0
        case URX_LBN_CONT:
3989
0
            {
3990
                // Negative Look-Behind, at top of loop checking for matches of LB expression
3991
                //    at all possible input starting positions.
3992
3993
                // Fetch the extra parameters of this op.
3994
0
                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
3995
0
                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
3996
0
                if (!UTEXT_USES_U16(fInputText)) {
3997
                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3998
                    // The max length need not be exact; it just needs to be >= actual maximum.
3999
0
                    maxML *= 3;
4000
0
                }
4001
0
                int32_t continueLoc = static_cast<int32_t>(pat[fp->fPatIdx++]);
4002
0
                        continueLoc = URX_VAL(continueLoc);
4003
0
                U_ASSERT(minML <= maxML);
4004
0
                U_ASSERT(minML >= 0);
4005
0
                U_ASSERT(continueLoc > fp->fPatIdx);
4006
4007
                // Fetch (from data) the last input index where a match was attempted.
4008
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4009
0
                int64_t  &lbStartIdx = fData[opValue+4];
4010
0
                if (lbStartIdx < 0) {
4011
                    // First time through loop.
4012
0
                    lbStartIdx = fp->fInputIdx - minML;
4013
0
                    if (lbStartIdx > 0) {
4014
                        // move index to a code point boundary, if it's not on one already.
4015
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4016
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4017
0
                    }
4018
0
                } else {
4019
                    // 2nd through nth time through the loop.
4020
                    // Back up start position for match by one.
4021
0
                    if (lbStartIdx == 0) {
4022
0
                        (lbStartIdx)--;
4023
0
                    } else {
4024
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4025
0
                        (void)UTEXT_PREVIOUS32(fInputText);
4026
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4027
0
                    }
4028
0
                }
4029
4030
0
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4031
                    // We have tried all potential match starting points without
4032
                    //  getting a match, which means that the negative lookbehind as
4033
                    //  a whole has succeeded.  Jump forward to the continue location
4034
0
                    fActiveStart = fData[opValue+2];
4035
0
                    fActiveLimit = fData[opValue+3];
4036
0
                    U_ASSERT(fActiveStart >= 0);
4037
0
                    U_ASSERT(fActiveLimit <= fInputLength);
4038
0
                    fp->fPatIdx = continueLoc;
4039
0
                    break;
4040
0
                }
4041
4042
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4043
                //      (successful match will cause a FAIL out of the loop altogether.)
4044
0
                fp = StateSave(fp, fp->fPatIdx-4, status);
4045
0
                fp->fInputIdx = lbStartIdx;
4046
0
            }
4047
0
            break;
4048
4049
0
        case URX_LBN_END:
4050
            // End of a negative look-behind block, after a successful match.
4051
0
            {
4052
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4053
0
                if (fp->fInputIdx != fActiveLimit) {
4054
                    //  The look-behind expression matched, but the match did not
4055
                    //    extend all the way to the point that we are looking behind from.
4056
                    //  FAIL out of here, which will take us back to the LB_CONT, which
4057
                    //     will retry the match starting at another position or succeed
4058
                    //     the look-behind altogether, whichever is appropriate.
4059
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4060
0
                    break;
4061
0
                }
4062
4063
                // Look-behind expression matched, which means look-behind test as
4064
                //   a whole Fails
4065
4066
                //   Restore the original input string length, which had been truncated
4067
                //   inorder to pin the end of the lookbehind match
4068
                //   to the position being looked-behind.
4069
0
                fActiveStart = fData[opValue+2];
4070
0
                fActiveLimit = fData[opValue+3];
4071
0
                U_ASSERT(fActiveStart >= 0);
4072
0
                U_ASSERT(fActiveLimit <= fInputLength);
4073
4074
                // Restore original stack position, discarding any state saved
4075
                //   by the successful pattern match.
4076
0
                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4077
0
                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
4078
0
                U_ASSERT(fStack->size() > newStackSize);
4079
0
                fStack->setSize(newStackSize);
4080
4081
                //  FAIL, which will take control back to someplace
4082
                //  prior to entering the look-behind test.
4083
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4084
0
            }
4085
0
            break;
4086
4087
4088
0
        case URX_LOOP_SR_I:
4089
            // Loop Initialization for the optimized implementation of
4090
            //     [some character set]*
4091
            //   This op scans through all matching input.
4092
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4093
0
            {
4094
0
                U_ASSERT(opValue > 0 && opValue < fSets->size());
4095
0
                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4096
0
                UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
4097
4098
                // Loop through input, until either the input is exhausted or
4099
                //   we reach a character that is not a member of the set.
4100
0
                int64_t ix = fp->fInputIdx;
4101
0
                UTEXT_SETNATIVEINDEX(fInputText, ix);
4102
0
                for (;;) {
4103
0
                    if (ix >= fActiveLimit) {
4104
0
                        fHitEnd = true;
4105
0
                        break;
4106
0
                    }
4107
0
                    UChar32 c = UTEXT_NEXT32(fInputText);
4108
0
                    if (c<256) {
4109
0
                        if (s8->contains(c) == false) {
4110
0
                            break;
4111
0
                        }
4112
0
                    } else {
4113
0
                        if (s->contains(c) == false) {
4114
0
                            break;
4115
0
                        }
4116
0
                    }
4117
0
                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4118
0
                }
4119
4120
                // If there were no matching characters, skip over the loop altogether.
4121
                //   The loop doesn't run at all, a * op always succeeds.
4122
0
                if (ix == fp->fInputIdx) {
4123
0
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4124
0
                    break;
4125
0
                }
4126
4127
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4128
                //   must follow.  It's operand is the stack location
4129
                //   that holds the starting input index for the match of this [set]*
4130
0
                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
4131
0
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4132
0
                int32_t stackLoc = URX_VAL(loopcOp);
4133
0
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4134
0
                fp->fExtra[stackLoc] = fp->fInputIdx;
4135
0
                fp->fInputIdx = ix;
4136
4137
                // Save State to the URX_LOOP_C op that follows this one,
4138
                //   so that match failures in the following code will return to there.
4139
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4140
0
                fp = StateSave(fp, fp->fPatIdx, status);
4141
0
                fp->fPatIdx++;
4142
0
            }
4143
0
            break;
4144
4145
4146
0
        case URX_LOOP_DOT_I:
4147
            // Loop Initialization for the optimized implementation of .*
4148
            //   This op scans through all remaining input.
4149
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4150
0
            {
4151
                // Loop through input until the input is exhausted (we reach an end-of-line)
4152
                // In DOTALL mode, we can just go straight to the end of the input.
4153
0
                int64_t ix;
4154
0
                if ((opValue & 1) == 1) {
4155
                    // Dot-matches-All mode.  Jump straight to the end of the string.
4156
0
                    ix = fActiveLimit;
4157
0
                    fHitEnd = true;
4158
0
                } else {
4159
                    // NOT DOT ALL mode.  Line endings do not match '.'
4160
                    // Scan forward until a line ending or end of input.
4161
0
                    ix = fp->fInputIdx;
4162
0
                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4163
0
                    for (;;) {
4164
0
                        if (ix >= fActiveLimit) {
4165
0
                            fHitEnd = true;
4166
0
                            break;
4167
0
                        }
4168
0
                        UChar32 c = UTEXT_NEXT32(fInputText);
4169
0
                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4170
0
                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4171
0
                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4172
0
                                    isLineTerminator(c))) {
4173
                                //  char is a line ending.  Exit the scanning loop.
4174
0
                                break;
4175
0
                            }
4176
0
                        }
4177
0
                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4178
0
                    }
4179
0
                }
4180
4181
                // If there were no matching characters, skip over the loop altogether.
4182
                //   The loop doesn't run at all, a * op always succeeds.
4183
0
                if (ix == fp->fInputIdx) {
4184
0
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4185
0
                    break;
4186
0
                }
4187
4188
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4189
                //   must follow.  It's operand is the stack location
4190
                //   that holds the starting input index for the match of this .*
4191
0
                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
4192
0
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4193
0
                int32_t stackLoc = URX_VAL(loopcOp);
4194
0
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4195
0
                fp->fExtra[stackLoc] = fp->fInputIdx;
4196
0
                fp->fInputIdx = ix;
4197
4198
                // Save State to the URX_LOOP_C op that follows this one,
4199
                //   so that match failures in the following code will return to there.
4200
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4201
0
                fp = StateSave(fp, fp->fPatIdx, status);
4202
0
                fp->fPatIdx++;
4203
0
            }
4204
0
            break;
4205
4206
4207
0
        case URX_LOOP_C:
4208
0
            {
4209
0
                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4210
0
                backSearchIndex = fp->fExtra[opValue];
4211
0
                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4212
0
                if (backSearchIndex == fp->fInputIdx) {
4213
                    // We've backed up the input idx to the point that the loop started.
4214
                    // The loop is done.  Leave here without saving state.
4215
                    //  Subsequent failures won't come back here.
4216
0
                    break;
4217
0
                }
4218
                // Set up for the next iteration of the loop, with input index
4219
                //   backed up by one from the last time through,
4220
                //   and a state save to this instruction in case the following code fails again.
4221
                //   (We're going backwards because this loop emulates stack unwinding, not
4222
                //    the initial scan forward.)
4223
0
                U_ASSERT(fp->fInputIdx > 0);
4224
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4225
0
                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4226
0
                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4227
4228
0
                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4229
0
                if (prevC == 0x0a &&
4230
0
                    fp->fInputIdx > backSearchIndex &&
4231
0
                    twoPrevC == 0x0d) {
4232
0
                    int32_t prevOp = static_cast<int32_t>(pat[fp->fPatIdx - 2]);
4233
0
                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4234
                        // .*, stepping back over CRLF pair.
4235
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4236
0
                    }
4237
0
                }
4238
4239
4240
0
                fp = StateSave(fp, fp->fPatIdx-1, status);
4241
0
            }
4242
0
            break;
4243
4244
4245
4246
0
        default:
4247
            // Trouble.  The compiled pattern contains an entry with an
4248
            //           unrecognized type tag.
4249
0
            UPRV_UNREACHABLE_ASSERT;
4250
            // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4251
            // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4252
            // See ICU-21669.
4253
0
            status = U_INTERNAL_PROGRAM_ERROR;
4254
0
        }
4255
4256
0
        if (U_FAILURE(status)) {
4257
0
            isMatch = false;
4258
0
            break;
4259
0
        }
4260
0
    }
4261
4262
0
breakFromLoop:
4263
0
    fMatch = isMatch;
4264
0
    if (isMatch) {
4265
0
        fLastMatchEnd = fMatchEnd;
4266
0
        fMatchStart   = startIdx;
4267
0
        fMatchEnd     = fp->fInputIdx;
4268
0
    }
4269
4270
#ifdef REGEX_RUN_DEBUG
4271
    if (fTraceDebug) {
4272
        if (isMatch) {
4273
            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4274
        } else {
4275
            printf("No match\n\n");
4276
        }
4277
    }
4278
#endif
4279
4280
0
    fFrame = fp;                // The active stack frame when the engine stopped.
4281
                                //   Contains the capture group results that we need to
4282
                                //    access later.
4283
0
}
4284
4285
4286
//--------------------------------------------------------------------------------
4287
//
4288
//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4289
//                  assumption that the entire string is available in the UText's
4290
//                  chunk buffer. For now, that means we can use int32_t indexes,
4291
//                  except for anything that needs to be saved (like group starts
4292
//                  and ends).
4293
//
4294
//                  startIdx:    begin matching a this index.
4295
//                  toEnd:       if true, match must extend to end of the input region
4296
//
4297
//--------------------------------------------------------------------------------
4298
20.6M
void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4299
20.6M
    UBool       isMatch  = false;      // True if the we have a match.
4300
4301
20.6M
    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4302
4303
20.6M
    int32_t     op;                    // Operation from the compiled pattern, split into
4304
20.6M
    int32_t     opType;                //    the opcode
4305
20.6M
    int32_t     opValue;               //    and the operand value.
4306
4307
#ifdef REGEX_RUN_DEBUG
4308
    if (fTraceDebug) {
4309
        printf("MatchAt(startIdx=%d)\n", startIdx);
4310
        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4311
        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4312
    }
4313
#endif
4314
4315
20.6M
    if (U_FAILURE(status)) {
4316
0
        return;
4317
0
    }
4318
4319
    //  Cache frequently referenced items from the compiled pattern
4320
    //
4321
20.6M
    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4322
4323
20.6M
    const char16_t      *litText       = fPattern->fLiteralText.getBuffer();
4324
20.6M
    UVector             *fSets         = fPattern->fSets;
4325
4326
20.6M
    const char16_t      *inputBuf      = fInputText->chunkContents;
4327
4328
20.6M
    fFrameSize = fPattern->fFrameSize;
4329
20.6M
    REStackFrame        *fp            = resetStack();
4330
20.6M
    if (U_FAILURE(fDeferredStatus)) {
4331
0
        status = fDeferredStatus;
4332
0
        return;
4333
0
    }
4334
4335
20.6M
    fp->fPatIdx   = 0;
4336
20.6M
    fp->fInputIdx = startIdx;
4337
4338
    // Zero out the pattern's static data
4339
20.6M
    int32_t i;
4340
1.44G
    for (i = 0; i<fPattern->fDataSize; i++) {
4341
1.42G
        fData[i] = 0;
4342
1.42G
    }
4343
4344
    //
4345
    //  Main loop for interpreting the compiled pattern.
4346
    //  One iteration of the loop per pattern operation performed.
4347
    //
4348
4.67G
    for (;;) {
4349
4.67G
        op = static_cast<int32_t>(pat[fp->fPatIdx]);
4350
4.67G
        opType  = URX_TYPE(op);
4351
4.67G
        opValue = URX_VAL(op);
4352
#ifdef REGEX_RUN_DEBUG
4353
        if (fTraceDebug) {
4354
            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4355
            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4356
                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4357
            fPattern->dumpOp(fp->fPatIdx);
4358
        }
4359
#endif
4360
4.67G
        fp->fPatIdx++;
4361
4362
4.67G
        switch (opType) {
4363
4364
4365
0
        case URX_NOP:
4366
0
            break;
4367
4368
4369
4.15M
        case URX_BACKTRACK:
4370
            // Force a backtrack.  In some circumstances, the pattern compiler
4371
            //   will notice that the pattern can't possibly match anything, and will
4372
            //   emit one of these at that point.
4373
4.15M
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4374
4.15M
            break;
4375
4376
4377
273M
        case URX_ONECHAR:
4378
273M
            if (fp->fInputIdx < fActiveLimit) {
4379
259M
                UChar32 c;
4380
259M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4381
259M
                if (c == opValue) {
4382
28.0M
                    break;
4383
28.0M
                }
4384
259M
            } else {
4385
13.4M
                fHitEnd = true;
4386
13.4M
            }
4387
244M
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4388
244M
            break;
4389
4390
4391
20.2M
        case URX_STRING:
4392
20.2M
            {
4393
                // Test input against a literal string.
4394
                // Strings require two slots in the compiled pattern, one for the
4395
                //   offset to the string text, and one for the length.
4396
20.2M
                int32_t   stringStartIdx = opValue;
4397
20.2M
                int32_t   stringLen;
4398
4399
20.2M
                op = static_cast<int32_t>(pat[fp->fPatIdx]); // Fetch the second operand
4400
20.2M
                fp->fPatIdx++;
4401
20.2M
                opType    = URX_TYPE(op);
4402
20.2M
                stringLen = URX_VAL(op);
4403
20.2M
                U_ASSERT(opType == URX_STRING_LEN);
4404
20.2M
                U_ASSERT(stringLen >= 2);
4405
4406
20.2M
                const char16_t * pInp = inputBuf + fp->fInputIdx;
4407
20.2M
                const char16_t * pInpLimit = inputBuf + fActiveLimit;
4408
20.2M
                const char16_t * pPat = litText+stringStartIdx;
4409
20.2M
                const char16_t * pEnd = pInp + stringLen;
4410
20.2M
                UBool success = true;
4411
20.3M
                while (pInp < pEnd) {
4412
20.3M
                    if (pInp >= pInpLimit) {
4413
66.8k
                        fHitEnd = true;
4414
66.8k
                        success = false;
4415
66.8k
                        break;
4416
66.8k
                    }
4417
20.2M
                    if (*pInp++ != *pPat++) {
4418
20.1M
                        success = false;
4419
20.1M
                        break;
4420
20.1M
                    }
4421
20.2M
                }
4422
4423
                // If the pattern string ends with an unpaired lead surrogate that
4424
                // matched the lead surrogate of a valid pair in the input text,
4425
                // this does not count as a match.
4426
20.2M
                if (success && U16_IS_LEAD(*(pInp-1)) &&
4427
15.5k
                        pInp < pInpLimit && U16_IS_TRAIL(*(pInp))) {
4428
13.3k
                    success = false;
4429
13.3k
                }
4430
4431
20.2M
                if (success) {
4432
18.2k
                    fp->fInputIdx += stringLen;
4433
20.2M
                } else {
4434
20.2M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4435
20.2M
                }
4436
20.2M
            }
4437
20.2M
            break;
4438
4439
4440
302M
        case URX_STATE_SAVE:
4441
302M
            fp = StateSave(fp, opValue, status);
4442
302M
            break;
4443
4444
4445
2.69k
        case URX_END:
4446
            // The match loop will exit via this path on a successful match,
4447
            //   when we reach the end of the pattern.
4448
2.69k
            if (toEnd && fp->fInputIdx != fActiveLimit) {
4449
                // The pattern matched, but not to the end of input.  Try some more.
4450
0
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4451
0
                break;
4452
0
            }
4453
2.69k
            isMatch = true;
4454
2.69k
            goto  breakFromLoop;
4455
4456
            // Start and End Capture stack frame variables are laid out out like this:
4457
            //  fp->fExtra[opValue]  - The start of a completed capture group
4458
            //             opValue+1 - The end   of a completed capture group
4459
            //             opValue+2 - the start of a capture group whose end
4460
            //                          has not yet been reached (and might not ever be).
4461
152M
        case URX_START_CAPTURE:
4462
152M
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4463
152M
            fp->fExtra[opValue+2] = fp->fInputIdx;
4464
152M
            break;
4465
4466
4467
187M
        case URX_END_CAPTURE:
4468
187M
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4469
187M
            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4470
187M
            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4471
187M
            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4472
187M
            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4473
187M
            break;
4474
4475
4476
50.5M
        case URX_DOLLAR:                   //  $, test for End of line
4477
            //     or for position before new line at end of input
4478
50.5M
            if (fp->fInputIdx < fAnchorLimit-2) {
4479
                // We are no where near the end of input.  Fail.
4480
                //   This is the common case.  Keep it first.
4481
46.0M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4482
46.0M
                break;
4483
46.0M
            }
4484
4.47M
            if (fp->fInputIdx >= fAnchorLimit) {
4485
                // We really are at the end of input.  Success.
4486
1.13M
                fHitEnd = true;
4487
1.13M
                fRequireEnd = true;
4488
1.13M
                break;
4489
1.13M
            }
4490
4491
            // If we are positioned just before a new-line that is located at the
4492
            //   end of input, succeed.
4493
3.34M
            if (fp->fInputIdx == fAnchorLimit-1) {
4494
3.17M
                UChar32 c;
4495
3.17M
                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4496
4497
3.17M
                if (isLineTerminator(c)) {
4498
3.03M
                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4499
                        // At new-line at end of input. Success
4500
3.03M
                        fHitEnd = true;
4501
3.03M
                        fRequireEnd = true;
4502
3.03M
                        break;
4503
3.03M
                    }
4504
3.03M
                }
4505
3.17M
            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4506
170k
                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4507
294
                    fHitEnd = true;
4508
294
                    fRequireEnd = true;
4509
294
                    break;                         // At CR/LF at end of input.  Success
4510
294
            }
4511
4512
304k
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4513
4514
304k
            break;
4515
4516
4517
4.77M
        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4518
4.77M
            if (fp->fInputIdx >= fAnchorLimit-1) {
4519
                // Either at the last character of input, or off the end.
4520
190k
                if (fp->fInputIdx == fAnchorLimit-1) {
4521
                    // At last char of input.  Success if it's a new line.
4522
36.1k
                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4523
1.10k
                        fHitEnd = true;
4524
1.10k
                        fRequireEnd = true;
4525
1.10k
                        break;
4526
1.10k
                    }
4527
154k
                } else {
4528
                    // Off the end of input.  Success.
4529
154k
                    fHitEnd = true;
4530
154k
                    fRequireEnd = true;
4531
154k
                    break;
4532
154k
                }
4533
190k
            }
4534
4535
            // Not at end of input.  Back-track out.
4536
4.62M
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4537
4.62M
            break;
4538
4539
4540
25.0M
        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4541
25.0M
            {
4542
25.0M
                if (fp->fInputIdx >= fAnchorLimit) {
4543
                    // We really are at the end of input.  Success.
4544
7.67k
                    fHitEnd = true;
4545
7.67k
                    fRequireEnd = true;
4546
7.67k
                    break;
4547
7.67k
                }
4548
                // If we are positioned just before a new-line, succeed.
4549
                // It makes no difference where the new-line is within the input.
4550
25.0M
                UChar32 c = inputBuf[fp->fInputIdx];
4551
25.0M
                if (isLineTerminator(c)) {
4552
                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4553
                    //  In multi-line mode, hitting a new-line just before the end of input does not
4554
                    //   set the hitEnd or requireEnd flags
4555
8.46M
                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4556
8.46M
                        break;
4557
8.46M
                    }
4558
8.46M
                }
4559
                // not at a new line.  Fail.
4560
16.5M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4561
16.5M
            }
4562
0
            break;
4563
4564
4565
17.2M
        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4566
17.2M
            {
4567
17.2M
                if (fp->fInputIdx >= fAnchorLimit) {
4568
                    // We really are at the end of input.  Success.
4569
7.56k
                    fHitEnd = true;
4570
7.56k
                    fRequireEnd = true;  // Java set requireEnd in this case, even though
4571
7.56k
                    break;               //   adding a new-line would not lose the match.
4572
7.56k
                }
4573
                // If we are not positioned just before a new-line, the test fails; backtrack out.
4574
                // It makes no difference where the new-line is within the input.
4575
17.2M
                if (inputBuf[fp->fInputIdx] != 0x0a) {
4576
17.2M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4577
17.2M
                }
4578
17.2M
            }
4579
0
            break;
4580
4581
4582
61.5M
        case URX_CARET:                    //  ^, test for start of line
4583
61.5M
            if (fp->fInputIdx != fAnchorStart) {
4584
18.8M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4585
18.8M
            }
4586
61.5M
            break;
4587
4588
4589
8.67M
        case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
4590
8.67M
            {
4591
8.67M
                if (fp->fInputIdx == fAnchorStart) {
4592
                    // We are at the start input.  Success.
4593
4.00M
                    break;
4594
4.00M
                }
4595
                // Check whether character just before the current pos is a new-line
4596
                //   unless we are at the end of input
4597
4.67M
                char16_t  c = inputBuf[fp->fInputIdx - 1];
4598
4.67M
                if ((fp->fInputIdx < fAnchorLimit) &&
4599
4.66M
                    isLineTerminator(c)) {
4600
                    //  It's a new-line.  ^ is true.  Success.
4601
                    //  TODO:  what should be done with positions between a CR and LF?
4602
20.5k
                    break;
4603
20.5k
                }
4604
                // Not at the start of a line.  Fail.
4605
4.65M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4606
4.65M
            }
4607
0
            break;
4608
4609
4610
4.37M
        case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
4611
4.37M
            {
4612
4.37M
                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4613
4.37M
                if (fp->fInputIdx <= fAnchorStart) {
4614
                    // We are at the start input.  Success.
4615
3.00M
                    break;
4616
3.00M
                }
4617
                // Check whether character just before the current pos is a new-line
4618
1.37M
                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4619
1.37M
                char16_t  c = inputBuf[fp->fInputIdx - 1];
4620
1.37M
                if (c != 0x0a) {
4621
                    // Not at the start of a line.  Back-track out.
4622
1.35M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4623
1.35M
                }
4624
1.37M
            }
4625
0
            break;
4626
4627
49.5M
        case URX_BACKSLASH_B:          // Test for word boundaries
4628
49.5M
            {
4629
49.5M
                UBool success = isChunkWordBoundary(static_cast<int32_t>(fp->fInputIdx));
4630
49.5M
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
4631
49.5M
                if (!success) {
4632
30.2M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4633
30.2M
                }
4634
49.5M
            }
4635
49.5M
            break;
4636
4637
4638
351M
        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4639
351M
            {
4640
351M
                UBool success = isUWordBoundary(fp->fInputIdx, status);
4641
351M
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \B
4642
351M
                if (!success) {
4643
252M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4644
252M
                }
4645
351M
            }
4646
351M
            break;
4647
4648
4649
98.3M
        case URX_BACKSLASH_D:            // Test for decimal digit
4650
98.3M
            {
4651
98.3M
                if (fp->fInputIdx >= fActiveLimit) {
4652
2.44M
                    fHitEnd = true;
4653
2.44M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4654
2.44M
                    break;
4655
2.44M
                }
4656
4657
95.8M
                UChar32 c;
4658
95.8M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4659
95.8M
                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4660
95.8M
                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4661
95.8M
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \D
4662
95.8M
                if (!success) {
4663
566k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4664
566k
                }
4665
95.8M
            }
4666
0
            break;
4667
4668
4669
4.84M
        case URX_BACKSLASH_G:          // Test for position at end of previous match
4670
4.84M
            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
4671
4.84M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4672
4.84M
            }
4673
4.84M
            break;
4674
4675
4676
22.8M
        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4677
22.8M
            {
4678
22.8M
                if (fp->fInputIdx >= fActiveLimit) {
4679
1.91M
                    fHitEnd = true;
4680
1.91M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4681
1.91M
                    break;
4682
1.91M
                }
4683
20.9M
                UChar32 c;
4684
20.9M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4685
20.9M
                int8_t ctype = u_charType(c);
4686
20.9M
                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4687
20.9M
                success ^= static_cast<UBool>(opValue != 0);  // flip sense for \H
4688
20.9M
                if (!success) {
4689
55.9k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4690
55.9k
                }
4691
20.9M
            }
4692
0
            break;
4693
4694
4695
6.52M
        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4696
6.52M
            {
4697
6.52M
                if (fp->fInputIdx >= fActiveLimit) {
4698
5.04k
                    fHitEnd = true;
4699
5.04k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4700
5.04k
                    break;
4701
5.04k
                }
4702
6.52M
                UChar32 c;
4703
6.52M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4704
6.52M
                if (isLineTerminator(c)) {
4705
16.8k
                    if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4706
                        // Check for CR/LF sequence. Consume both together when found.
4707
3.02k
                        char16_t c2;
4708
3.02k
                        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4709
3.02k
                        if (c2 != 0x0a) {
4710
1.51k
                            U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4711
1.51k
                        }
4712
3.02k
                    }
4713
6.50M
                } else {
4714
6.50M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4715
6.50M
                }
4716
6.52M
            }
4717
0
            break;
4718
4719
4720
45.5M
        case URX_BACKSLASH_V:         // Any single code point line ending.
4721
45.5M
            {
4722
45.5M
                if (fp->fInputIdx >= fActiveLimit) {
4723
367k
                    fHitEnd = true;
4724
367k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4725
367k
                    break;
4726
367k
                }
4727
45.2M
                UChar32 c;
4728
45.2M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4729
45.2M
                UBool success = isLineTerminator(c);
4730
45.2M
                success ^= static_cast<UBool>(opValue != 0); // flip sense for \V
4731
45.2M
                if (!success) {
4732
1.86M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4733
1.86M
                }
4734
45.2M
            }
4735
0
            break;
4736
4737
4738
154M
        case URX_BACKSLASH_X:
4739
            //  Match a Grapheme, as defined by Unicode UAX 29.
4740
4741
            // Fail if at end of input
4742
154M
            if (fp->fInputIdx >= fActiveLimit) {
4743
975k
                fHitEnd = true;
4744
975k
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4745
975k
                break;
4746
975k
            }
4747
4748
153M
            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4749
153M
            if (fp->fInputIdx >= fActiveLimit) {
4750
2.03M
                fHitEnd = true;
4751
2.03M
                fp->fInputIdx = fActiveLimit;
4752
2.03M
            }
4753
153M
            break;
4754
4755
4756
1.48M
        case URX_BACKSLASH_Z:          // Test for end of Input
4757
1.48M
            if (fp->fInputIdx < fAnchorLimit) {
4758
1.47M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4759
1.47M
            } else {
4760
6.63k
                fHitEnd = true;
4761
6.63k
                fRequireEnd = true;
4762
6.63k
            }
4763
1.48M
            break;
4764
4765
4766
4767
12.1M
        case URX_STATIC_SETREF:
4768
12.1M
            {
4769
                // Test input character against one of the predefined sets
4770
                //    (Word Characters, for example)
4771
                // The high bit of the op value is a flag for the match polarity.
4772
                //    0:   success if input char is in set.
4773
                //    1:   success if input char is not in set.
4774
12.1M
                if (fp->fInputIdx >= fActiveLimit) {
4775
73.7k
                    fHitEnd = true;
4776
73.7k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4777
73.7k
                    break;
4778
73.7k
                }
4779
4780
12.0M
                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4781
12.0M
                opValue &= ~URX_NEG_SET;
4782
12.0M
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4783
4784
12.0M
                UChar32 c;
4785
12.0M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4786
12.0M
                if (c < 256) {
4787
2.50M
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4788
2.50M
                    if (s8.contains(c)) {
4789
34.4k
                        success = !success;
4790
34.4k
                    }
4791
9.53M
                } else {
4792
9.53M
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4793
9.53M
                    if (s.contains(c)) {
4794
1.64M
                        success = !success;
4795
1.64M
                    }
4796
9.53M
                }
4797
12.0M
                if (!success) {
4798
10.3M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4799
10.3M
                }
4800
12.0M
            }
4801
0
            break;
4802
4803
4804
80.1M
        case URX_STAT_SETREF_N:
4805
80.1M
            {
4806
                // Test input character for NOT being a member of  one of
4807
                //    the predefined sets (Word Characters, for example)
4808
80.1M
                if (fp->fInputIdx >= fActiveLimit) {
4809
91.2k
                    fHitEnd = true;
4810
91.2k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4811
91.2k
                    break;
4812
91.2k
                }
4813
4814
80.0M
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4815
4816
80.0M
                UChar32  c;
4817
80.0M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4818
80.0M
                if (c < 256) {
4819
30.0M
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4820
30.0M
                    if (s8.contains(c) == false) {
4821
29.3M
                        break;
4822
29.3M
                    }
4823
49.9M
                } else {
4824
49.9M
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4825
49.9M
                    if (s.contains(c) == false) {
4826
42.6M
                        break;
4827
42.6M
                    }
4828
49.9M
                }
4829
8.02M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4830
8.02M
            }
4831
0
            break;
4832
4833
4834
37.5M
        case URX_SETREF:
4835
37.5M
            {
4836
37.5M
                if (fp->fInputIdx >= fActiveLimit) {
4837
157k
                    fHitEnd = true;
4838
157k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4839
157k
                    break;
4840
157k
                }
4841
4842
37.3M
                U_ASSERT(opValue > 0 && opValue < fSets->size());
4843
4844
                // There is input left.  Pick up one char and test it for set membership.
4845
37.3M
                UChar32  c;
4846
37.3M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4847
37.3M
                if (c<256) {
4848
19.0M
                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4849
19.0M
                    if (s8->contains(c)) {
4850
                        // The character is in the set.  A Match.
4851
11.6M
                        break;
4852
11.6M
                    }
4853
19.0M
                } else {
4854
18.3M
                    UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
4855
18.3M
                    if (s->contains(c)) {
4856
                        // The character is in the set.  A Match.
4857
4.60M
                        break;
4858
4.60M
                    }
4859
18.3M
                }
4860
4861
                // the character wasn't in the set.
4862
21.1M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4863
21.1M
            }
4864
0
            break;
4865
4866
4867
246M
        case URX_DOTANY:
4868
246M
            {
4869
                // . matches anything, but stops at end-of-line.
4870
246M
                if (fp->fInputIdx >= fActiveLimit) {
4871
                    // At end of input.  Match failed.  Backtrack out.
4872
8.46M
                    fHitEnd = true;
4873
8.46M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4874
8.46M
                    break;
4875
8.46M
                }
4876
4877
                // There is input left.  Advance over one char, unless we've hit end-of-line
4878
237M
                UChar32  c;
4879
237M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4880
237M
                if (isLineTerminator(c)) {
4881
                    // End of line in normal mode.   . does not match.
4882
3.17M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4883
3.17M
                    break;
4884
3.17M
                }
4885
237M
            }
4886
234M
            break;
4887
4888
4889
234M
        case URX_DOTANY_ALL:
4890
24.4M
            {
4891
                // . in dot-matches-all (including new lines) mode
4892
24.4M
                if (fp->fInputIdx >= fActiveLimit) {
4893
                    // At end of input.  Match failed.  Backtrack out.
4894
6.02M
                    fHitEnd = true;
4895
6.02M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4896
6.02M
                    break;
4897
6.02M
                }
4898
4899
                // There is input left.  Advance over one char, except if we are
4900
                //   at a cr/lf, advance over both of them.
4901
18.4M
                UChar32 c;
4902
18.4M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4903
18.4M
                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4904
                    // In the case of a CR/LF, we need to advance over both.
4905
9.40k
                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4906
1.84k
                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4907
1.84k
                    }
4908
9.40k
                }
4909
18.4M
            }
4910
0
            break;
4911
4912
4913
1.55M
        case URX_DOTANY_UNIX:
4914
1.55M
            {
4915
                // '.' operator, matches all, but stops at end-of-line.
4916
                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4917
1.55M
                if (fp->fInputIdx >= fActiveLimit) {
4918
                    // At end of input.  Match failed.  Backtrack out.
4919
29.5k
                    fHitEnd = true;
4920
29.5k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4921
29.5k
                    break;
4922
29.5k
                }
4923
4924
                // There is input left.  Advance over one char, unless we've hit end-of-line
4925
1.52M
                UChar32 c;
4926
1.52M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4927
1.52M
                if (c == 0x0a) {
4928
                    // End of line in normal mode.   '.' does not match the \n
4929
898
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4930
898
                }
4931
1.52M
            }
4932
0
            break;
4933
4934
4935
90.0M
        case URX_JMP:
4936
90.0M
            fp->fPatIdx = opValue;
4937
90.0M
            break;
4938
4939
20.6M
        case URX_FAIL:
4940
20.6M
            isMatch = false;
4941
20.6M
            goto breakFromLoop;
4942
4943
194M
        case URX_JMP_SAV:
4944
194M
            U_ASSERT(opValue < fPattern->fCompiledPat->size());
4945
194M
            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4946
194M
            fp->fPatIdx = opValue;                         // Then JMP.
4947
194M
            break;
4948
4949
29.8M
        case URX_JMP_SAV_X:
4950
            // This opcode is used with (x)+, when x can match a zero length string.
4951
            // Same as JMP_SAV, except conditional on the match having made forward progress.
4952
            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4953
            //   data address of the input position at the start of the loop.
4954
29.8M
            {
4955
29.8M
                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4956
29.8M
                int32_t stoOp = static_cast<int32_t>(pat[opValue - 1]);
4957
29.8M
                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4958
29.8M
                int32_t  frameLoc = URX_VAL(stoOp);
4959
29.8M
                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4960
29.8M
                int32_t prevInputIdx = static_cast<int32_t>(fp->fExtra[frameLoc]);
4961
29.8M
                U_ASSERT(prevInputIdx <= fp->fInputIdx);
4962
29.8M
                if (prevInputIdx < fp->fInputIdx) {
4963
                    // The match did make progress.  Repeat the loop.
4964
17.9M
                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4965
17.9M
                    fp->fPatIdx = opValue;
4966
17.9M
                    fp->fExtra[frameLoc] = fp->fInputIdx;
4967
17.9M
                }
4968
                // If the input position did not advance, we do nothing here,
4969
                //   execution will fall out of the loop.
4970
29.8M
            }
4971
29.8M
            break;
4972
4973
16.1M
        case URX_CTR_INIT:
4974
16.1M
            {
4975
16.1M
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4976
16.1M
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4977
4978
                // Pick up the three extra operands that CTR_INIT has, and
4979
                //    skip the pattern location counter past
4980
16.1M
                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
4981
16.1M
                fp->fPatIdx += 3;
4982
16.1M
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4983
16.1M
                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
4984
16.1M
                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
4985
16.1M
                U_ASSERT(minCount>=0);
4986
16.1M
                U_ASSERT(maxCount>=minCount || maxCount==-1);
4987
16.1M
                U_ASSERT(loopLoc>=fp->fPatIdx);
4988
4989
16.1M
                if (minCount == 0) {
4990
15.4M
                    fp = StateSave(fp, loopLoc+1, status);
4991
15.4M
                }
4992
16.1M
                if (maxCount == -1) {
4993
2.49M
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
4994
13.6M
                } else if (maxCount == 0) {
4995
12.8M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
4996
12.8M
                }
4997
16.1M
            }
4998
16.1M
            break;
4999
5000
102M
        case URX_CTR_LOOP:
5001
102M
            {
5002
102M
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5003
102M
                int32_t initOp = static_cast<int32_t>(pat[opValue]);
5004
102M
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
5005
102M
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5006
102M
                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
5007
102M
                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
5008
102M
                (*pCounter)++;
5009
102M
                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
5010
1.40M
                    U_ASSERT(*pCounter == maxCount);
5011
1.40M
                    break;
5012
1.40M
                }
5013
101M
                if (*pCounter >= minCount) {
5014
66.0M
                    if (maxCount == -1) {
5015
                        // Loop has no hard upper bound.
5016
                        // Check that it is progressing through the input, break if it is not.
5017
44.6M
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5018
44.6M
                        if (fp->fInputIdx == *pLastInputIdx) {
5019
18.3M
                            break;
5020
26.2M
                        } else {
5021
26.2M
                            *pLastInputIdx = fp->fInputIdx;
5022
26.2M
                        }
5023
44.6M
                    }
5024
47.6M
                    fp = StateSave(fp, fp->fPatIdx, status);
5025
47.6M
                } else {
5026
                    // Increment time-out counter. (StateSave() does it if count >= minCount)
5027
35.1M
                    fTickCounter--;
5028
35.1M
                    if (fTickCounter <= 0) {
5029
3.51k
                        IncrementTime(status);    // Re-initializes fTickCounter
5030
3.51k
                    }
5031
35.1M
                }
5032
82.8M
                fp->fPatIdx = opValue + 4;    // Loop back.
5033
82.8M
            }
5034
0
            break;
5035
5036
18.7M
        case URX_CTR_INIT_NG:
5037
18.7M
            {
5038
                // Initialize a non-greedy loop
5039
18.7M
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5040
18.7M
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5041
5042
                // Pick up the three extra operands that CTR_INIT_NG has, and
5043
                //    skip the pattern location counter past
5044
18.7M
                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
5045
18.7M
                fp->fPatIdx += 3;
5046
18.7M
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5047
18.7M
                int32_t minCount = static_cast<int32_t>(pat[instrOperandLoc + 1]);
5048
18.7M
                int32_t maxCount = static_cast<int32_t>(pat[instrOperandLoc + 2]);
5049
18.7M
                U_ASSERT(minCount>=0);
5050
18.7M
                U_ASSERT(maxCount>=minCount || maxCount==-1);
5051
18.7M
                U_ASSERT(loopLoc>fp->fPatIdx);
5052
18.7M
                if (maxCount == -1) {
5053
17.0M
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5054
17.0M
                }
5055
5056
18.7M
                if (minCount == 0) {
5057
16.7M
                    if (maxCount != 0) {
5058
16.7M
                        fp = StateSave(fp, fp->fPatIdx, status);
5059
16.7M
                    }
5060
16.7M
                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5061
16.7M
                }
5062
18.7M
            }
5063
18.7M
            break;
5064
5065
57.3M
        case URX_CTR_LOOP_NG:
5066
57.3M
            {
5067
                // Non-greedy {min, max} loops
5068
57.3M
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5069
57.3M
                int32_t initOp = static_cast<int32_t>(pat[opValue]);
5070
57.3M
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5071
57.3M
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5072
57.3M
                int32_t minCount = static_cast<int32_t>(pat[opValue + 2]);
5073
57.3M
                int32_t maxCount = static_cast<int32_t>(pat[opValue + 3]);
5074
5075
57.3M
                (*pCounter)++;
5076
57.3M
                if (static_cast<uint64_t>(*pCounter) >= static_cast<uint32_t>(maxCount) && maxCount != -1) {
5077
                    // The loop has matched the maximum permitted number of times.
5078
                    //   Break out of here with no action.  Matching will
5079
                    //   continue with the following pattern.
5080
791k
                    U_ASSERT(*pCounter == maxCount);
5081
791k
                    break;
5082
791k
                }
5083
5084
56.5M
                if (*pCounter < minCount) {
5085
                    // We haven't met the minimum number of matches yet.
5086
                    //   Loop back for another one.
5087
21.0M
                    fp->fPatIdx = opValue + 4;    // Loop back.
5088
21.0M
                    fTickCounter--;
5089
21.0M
                    if (fTickCounter <= 0) {
5090
2.26k
                        IncrementTime(status);    // Re-initializes fTickCounter
5091
2.26k
                    }
5092
35.5M
                } else {
5093
                    // We do have the minimum number of matches.
5094
5095
                    // If there is no upper bound on the loop iterations, check that the input index
5096
                    // is progressing, and stop the loop if it is not.
5097
35.5M
                    if (maxCount == -1) {
5098
26.3M
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5099
26.3M
                        if (fp->fInputIdx == *pLastInputIdx) {
5100
1.42M
                            break;
5101
1.42M
                        }
5102
24.8M
                        *pLastInputIdx = fp->fInputIdx;
5103
24.8M
                    }
5104
5105
                    // Loop Continuation: we will fall into the pattern following the loop
5106
                    //   (non-greedy, don't execute loop body first), but first do
5107
                    //   a state save to the top of the loop, so that a match failure
5108
                    //   in the following pattern will try another iteration of the loop.
5109
34.1M
                    fp = StateSave(fp, opValue + 4, status);
5110
34.1M
                }
5111
56.5M
            }
5112
55.1M
            break;
5113
5114
55.1M
        case URX_STO_SP:
5115
28.7M
            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5116
28.7M
            fData[opValue] = fStack->size();
5117
28.7M
            break;
5118
5119
23.7M
        case URX_LD_SP:
5120
23.7M
            {
5121
23.7M
                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5122
23.7M
                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
5123
23.7M
                U_ASSERT(newStackSize <= fStack->size());
5124
23.7M
                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5125
23.7M
                if (newFP == reinterpret_cast<int64_t*>(fp)) {
5126
22.8M
                    break;
5127
22.8M
                }
5128
882k
                int32_t j;
5129
120M
                for (j=0; j<fFrameSize; j++) {
5130
119M
                    newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
5131
119M
                }
5132
882k
                fp = reinterpret_cast<REStackFrame*>(newFP);
5133
882k
                fStack->setSize(newStackSize);
5134
882k
            }
5135
0
            break;
5136
5137
62.9M
        case URX_BACKREF:
5138
62.9M
            {
5139
62.9M
                U_ASSERT(opValue < fFrameSize);
5140
62.9M
                int64_t groupStartIdx = fp->fExtra[opValue];
5141
62.9M
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5142
62.9M
                U_ASSERT(groupStartIdx <= groupEndIdx);
5143
62.9M
                int64_t inputIndex = fp->fInputIdx;
5144
62.9M
                if (groupStartIdx < 0) {
5145
                    // This capture group has not participated in the match thus far,
5146
23.3k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
5147
23.3k
                    break;
5148
23.3k
                }
5149
62.9M
                UBool success = true;
5150
76.1M
                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5151
62.9M
                    if (inputIndex >= fActiveLimit) {
5152
134k
                        success = false;
5153
134k
                        fHitEnd = true;
5154
134k
                        break;
5155
134k
                    }
5156
62.7M
                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5157
49.5M
                        success = false;
5158
49.5M
                        break;
5159
49.5M
                    }
5160
62.7M
                }
5161
62.9M
                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5162
20.4k
                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5163
                    // Capture group ended with an unpaired lead surrogate.
5164
                    // Back reference is not permitted to match lead only of a surrogatge pair.
5165
503
                    success = false;
5166
503
                }
5167
62.9M
                if (success) {
5168
13.2M
                    fp->fInputIdx = inputIndex;
5169
49.6M
                } else {
5170
49.6M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5171
49.6M
                }
5172
62.9M
            }
5173
0
            break;
5174
5175
39.6M
        case URX_BACKREF_I:
5176
39.6M
            {
5177
39.6M
                U_ASSERT(opValue < fFrameSize);
5178
39.6M
                int64_t groupStartIdx = fp->fExtra[opValue];
5179
39.6M
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5180
39.6M
                U_ASSERT(groupStartIdx <= groupEndIdx);
5181
39.6M
                if (groupStartIdx < 0) {
5182
                    // This capture group has not participated in the match thus far,
5183
165k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no match.
5184
165k
                    break;
5185
165k
                }
5186
39.5M
                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5187
39.5M
                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5188
5189
                //   Note: if the capture group match was of an empty string the backref
5190
                //         match succeeds.  Verified by testing:  Perl matches succeed
5191
                //         in this case, so we do too.
5192
5193
39.5M
                UBool success = true;
5194
49.8M
                for (;;) {
5195
49.8M
                    UChar32 captureGroupChar = captureGroupItr.next();
5196
49.8M
                    if (captureGroupChar == U_SENTINEL) {
5197
10.3M
                        success = true;
5198
10.3M
                        break;
5199
10.3M
                    }
5200
39.5M
                    UChar32 inputChar = inputItr.next();
5201
39.5M
                    if (inputChar == U_SENTINEL) {
5202
38.0k
                        success = false;
5203
38.0k
                        fHitEnd = true;
5204
38.0k
                        break;
5205
38.0k
                    }
5206
39.4M
                    if (inputChar != captureGroupChar) {
5207
29.1M
                        success = false;
5208
29.1M
                        break;
5209
29.1M
                    }
5210
39.4M
                }
5211
5212
39.5M
                if (success && inputItr.inExpansion()) {
5213
                    // We obtained a match by consuming part of a string obtained from
5214
                    // case-folding a single code point of the input text.
5215
                    // This does not count as an overall match.
5216
279
                    success = false;
5217
279
                }
5218
5219
39.5M
                if (success) {
5220
10.3M
                    fp->fInputIdx = inputItr.getIndex();
5221
29.1M
                } else {
5222
29.1M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5223
29.1M
                }
5224
39.5M
            }
5225
0
            break;
5226
5227
45.7M
        case URX_STO_INP_LOC:
5228
45.7M
            {
5229
45.7M
                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5230
45.7M
                fp->fExtra[opValue] = fp->fInputIdx;
5231
45.7M
            }
5232
45.7M
            break;
5233
5234
0
        case URX_JMPX:
5235
0
            {
5236
0
                int32_t instrOperandLoc = static_cast<int32_t>(fp->fPatIdx);
5237
0
                fp->fPatIdx += 1;
5238
0
                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5239
0
                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5240
0
                int32_t savedInputIdx = static_cast<int32_t>(fp->fExtra[dataLoc]);
5241
0
                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5242
0
                if (savedInputIdx < fp->fInputIdx) {
5243
0
                    fp->fPatIdx = opValue;                               // JMP
5244
0
                } else {
5245
0
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize)); // FAIL, no progress in loop.
5246
0
                }
5247
0
            }
5248
0
            break;
5249
5250
2.54M
        case URX_LA_START:
5251
2.54M
            {
5252
                // Entering a look around block.
5253
                // Save Stack Ptr, Input Pos.
5254
2.54M
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5255
2.54M
                fData[opValue]   = fStack->size();
5256
2.54M
                fData[opValue+1] = fp->fInputIdx;
5257
2.54M
                fData[opValue+2] = fActiveStart;
5258
2.54M
                fData[opValue+3] = fActiveLimit;
5259
2.54M
                fActiveStart     = fLookStart;          // Set the match region change for
5260
2.54M
                fActiveLimit     = fLookLimit;          //   transparent bounds.
5261
2.54M
            }
5262
2.54M
            break;
5263
5264
2.56M
        case URX_LA_END:
5265
2.56M
            {
5266
                // Leaving a look around block.
5267
                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5268
2.56M
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5269
2.56M
                int32_t stackSize = fStack->size();
5270
2.56M
                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
5271
2.56M
                U_ASSERT(stackSize >= newStackSize);
5272
2.56M
                if (stackSize > newStackSize) {
5273
                    // Copy the current top frame back to the new (cut back) top frame.
5274
                    //   This makes the capture groups from within the look-ahead
5275
                    //   expression available.
5276
25.6k
                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5277
25.6k
                    int32_t j;
5278
160k
                    for (j=0; j<fFrameSize; j++) {
5279
134k
                        newFP[j] = reinterpret_cast<int64_t*>(fp)[j];
5280
134k
                    }
5281
25.6k
                    fp = reinterpret_cast<REStackFrame*>(newFP);
5282
25.6k
                    fStack->setSize(newStackSize);
5283
25.6k
                }
5284
2.56M
                fp->fInputIdx = fData[opValue+1];
5285
5286
                // Restore the active region bounds in the input string; they may have
5287
                //    been changed because of transparent bounds on a Region.
5288
2.56M
                fActiveStart = fData[opValue+2];
5289
2.56M
                fActiveLimit = fData[opValue+3];
5290
2.56M
                U_ASSERT(fActiveStart >= 0);
5291
2.56M
                U_ASSERT(fActiveLimit <= fInputLength);
5292
2.56M
            }
5293
2.56M
            break;
5294
5295
126M
        case URX_ONECHAR_I:
5296
126M
            if (fp->fInputIdx < fActiveLimit) {
5297
123M
                UChar32 c;
5298
123M
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5299
123M
                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5300
945k
                    break;
5301
945k
                }
5302
123M
            } else {
5303
3.33M
                fHitEnd = true;
5304
3.33M
            }
5305
126M
            fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5306
126M
            break;
5307
5308
525M
        case URX_STRING_I:
5309
            // Case-insensitive test input against a literal string.
5310
            // Strings require two slots in the compiled pattern, one for the
5311
            //   offset to the string text, and one for the length.
5312
            //   The compiled string has already been case folded.
5313
525M
            {
5314
525M
                const char16_t *patternString = litText + opValue;
5315
5316
525M
                op = static_cast<int32_t>(pat[fp->fPatIdx]);
5317
525M
                fp->fPatIdx++;
5318
525M
                opType  = URX_TYPE(op);
5319
525M
                opValue = URX_VAL(op);
5320
525M
                U_ASSERT(opType == URX_STRING_LEN);
5321
525M
                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5322
5323
525M
                UChar32      cText;
5324
525M
                UChar32      cPattern;
5325
525M
                UBool        success = true;
5326
525M
                int32_t      patternStringIdx  = 0;
5327
525M
                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5328
826M
                while (patternStringIdx < patternStringLen) {
5329
824M
                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5330
824M
                    cText = inputIterator.next();
5331
824M
                    if (cText != cPattern) {
5332
523M
                        success = false;
5333
523M
                        if (cText == U_SENTINEL) {
5334
14.6M
                            fHitEnd = true;
5335
14.6M
                        }
5336
523M
                        break;
5337
523M
                    }
5338
824M
                }
5339
525M
                if (inputIterator.inExpansion()) {
5340
13.2M
                    success = false;
5341
13.2M
                }
5342
5343
525M
                if (success) {
5344
2.28M
                    fp->fInputIdx = inputIterator.getIndex();
5345
523M
                } else {
5346
523M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5347
523M
                }
5348
525M
            }
5349
525M
            break;
5350
5351
17.6M
        case URX_LB_START:
5352
17.6M
            {
5353
                // Entering a look-behind block.
5354
                // Save Stack Ptr, Input Pos and active input region.
5355
                //   TODO:  implement transparent bounds.  Ticket #6067
5356
17.6M
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5357
17.6M
                fData[opValue]   = fStack->size();
5358
17.6M
                fData[opValue+1] = fp->fInputIdx;
5359
                // Save input string length, then reset to pin any matches to end at
5360
                //   the current position.
5361
17.6M
                fData[opValue+2] = fActiveStart;
5362
17.6M
                fData[opValue+3] = fActiveLimit;
5363
17.6M
                fActiveStart     = fRegionStart;
5364
17.6M
                fActiveLimit     = fp->fInputIdx;
5365
                // Init the variable containing the start index for attempted matches.
5366
17.6M
                fData[opValue+4] = -1;
5367
17.6M
            }
5368
17.6M
            break;
5369
5370
5371
31.6M
        case URX_LB_CONT:
5372
31.6M
            {
5373
                // Positive Look-Behind, at top of loop checking for matches of LB expression
5374
                //    at all possible input starting positions.
5375
5376
                // Fetch the min and max possible match lengths.  They are the operands
5377
                //   of this op in the pattern.
5378
31.6M
                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
5379
31.6M
                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
5380
31.6M
                U_ASSERT(minML <= maxML);
5381
31.6M
                U_ASSERT(minML >= 0);
5382
5383
                // Fetch (from data) the last input index where a match was attempted.
5384
31.6M
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5385
31.6M
                int64_t  &lbStartIdx = fData[opValue+4];
5386
31.6M
                if (lbStartIdx < 0) {
5387
                    // First time through loop.
5388
5.01M
                    lbStartIdx = fp->fInputIdx - minML;
5389
5.01M
                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5390
5.00M
                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5391
5.00M
                    }
5392
26.5M
                } else {
5393
                    // 2nd through nth time through the loop.
5394
                    // Back up start position for match by one.
5395
26.5M
                    if (lbStartIdx == 0) {
5396
411k
                        lbStartIdx--;
5397
26.1M
                    } else {
5398
26.1M
                        U16_BACK_1(inputBuf, 0, lbStartIdx);
5399
26.1M
                    }
5400
26.5M
                }
5401
5402
31.6M
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5403
                    // We have tried all potential match starting points without
5404
                    //  getting a match.  Backtrack out, and out of the
5405
                    //   Look Behind altogether.
5406
4.99M
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5407
4.99M
                    fActiveStart = fData[opValue+2];
5408
4.99M
                    fActiveLimit = fData[opValue+3];
5409
4.99M
                    U_ASSERT(fActiveStart >= 0);
5410
4.99M
                    U_ASSERT(fActiveLimit <= fInputLength);
5411
4.99M
                    break;
5412
4.99M
                }
5413
5414
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5415
                //      (successful match will fall off the end of the loop.)
5416
26.6M
                fp = StateSave(fp, fp->fPatIdx-3, status);
5417
26.6M
                fp->fInputIdx =  lbStartIdx;
5418
26.6M
            }
5419
0
            break;
5420
5421
100k
        case URX_LB_END:
5422
            // End of a look-behind block, after a successful match.
5423
100k
            {
5424
100k
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5425
100k
                if (fp->fInputIdx != fActiveLimit) {
5426
                    //  The look-behind expression matched, but the match did not
5427
                    //    extend all the way to the point that we are looking behind from.
5428
                    //  FAIL out of here, which will take us back to the LB_CONT, which
5429
                    //     will retry the match starting at another position or fail
5430
                    //     the look-behind altogether, whichever is appropriate.
5431
78.9k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5432
78.9k
                    break;
5433
78.9k
                }
5434
5435
                // Look-behind match is good.  Restore the original input string region,
5436
                //   which had been truncated to pin the end of the lookbehind match to the
5437
                //   position being looked-behind.
5438
21.6k
                fActiveStart = fData[opValue+2];
5439
21.6k
                fActiveLimit = fData[opValue+3];
5440
21.6k
                U_ASSERT(fActiveStart >= 0);
5441
21.6k
                U_ASSERT(fActiveLimit <= fInputLength);
5442
21.6k
            }
5443
0
            break;
5444
5445
5446
65.4M
        case URX_LBN_CONT:
5447
65.4M
            {
5448
                // Negative Look-Behind, at top of loop checking for matches of LB expression
5449
                //    at all possible input starting positions.
5450
5451
                // Fetch the extra parameters of this op.
5452
65.4M
                int32_t minML = static_cast<int32_t>(pat[fp->fPatIdx++]);
5453
65.4M
                int32_t maxML = static_cast<int32_t>(pat[fp->fPatIdx++]);
5454
65.4M
                int32_t continueLoc = static_cast<int32_t>(pat[fp->fPatIdx++]);
5455
65.4M
                continueLoc = URX_VAL(continueLoc);
5456
65.4M
                U_ASSERT(minML <= maxML);
5457
65.4M
                U_ASSERT(minML >= 0);
5458
65.4M
                U_ASSERT(continueLoc > fp->fPatIdx);
5459
5460
                // Fetch (from data) the last input index where a match was attempted.
5461
65.4M
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5462
65.4M
                int64_t  &lbStartIdx = fData[opValue+4];
5463
65.4M
                if (lbStartIdx < 0) {
5464
                    // First time through loop.
5465
12.5M
                    lbStartIdx = fp->fInputIdx - minML;
5466
12.5M
                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5467
9.56M
                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5468
9.56M
                    }
5469
52.8M
                } else {
5470
                    // 2nd through nth time through the loop.
5471
                    // Back up start position for match by one.
5472
52.8M
                    if (lbStartIdx == 0) {
5473
9.26k
                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5474
52.8M
                    } else {
5475
52.8M
                        U16_BACK_1(inputBuf, 0, lbStartIdx);
5476
52.8M
                    }
5477
52.8M
                }
5478
5479
65.4M
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5480
                    // We have tried all potential match starting points without
5481
                    //  getting a match, which means that the negative lookbehind as
5482
                    //  a whole has succeeded.  Jump forward to the continue location
5483
11.5M
                    fActiveStart = fData[opValue+2];
5484
11.5M
                    fActiveLimit = fData[opValue+3];
5485
11.5M
                    U_ASSERT(fActiveStart >= 0);
5486
11.5M
                    U_ASSERT(fActiveLimit <= fInputLength);
5487
11.5M
                    fp->fPatIdx = continueLoc;
5488
11.5M
                    break;
5489
11.5M
                }
5490
5491
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5492
                //      (successful match will cause a FAIL out of the loop altogether.)
5493
53.8M
                fp = StateSave(fp, fp->fPatIdx-4, status);
5494
53.8M
                fp->fInputIdx =  lbStartIdx;
5495
53.8M
            }
5496
0
            break;
5497
5498
1.45M
        case URX_LBN_END:
5499
            // End of a negative look-behind block, after a successful match.
5500
1.45M
            {
5501
1.45M
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5502
1.45M
                if (fp->fInputIdx != fActiveLimit) {
5503
                    //  The look-behind expression matched, but the match did not
5504
                    //    extend all the way to the point that we are looking behind from.
5505
                    //  FAIL out of here, which will take us back to the LB_CONT, which
5506
                    //     will retry the match starting at another position or succeed
5507
                    //     the look-behind altogether, whichever is appropriate.
5508
401k
                    fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5509
401k
                    break;
5510
401k
                }
5511
5512
                // Look-behind expression matched, which means look-behind test as
5513
                //   a whole Fails
5514
5515
                //   Restore the original input string length, which had been truncated
5516
                //   inorder to pin the end of the lookbehind match
5517
                //   to the position being looked-behind.
5518
1.05M
                fActiveStart = fData[opValue+2];
5519
1.05M
                fActiveLimit = fData[opValue+3];
5520
1.05M
                U_ASSERT(fActiveStart >= 0);
5521
1.05M
                U_ASSERT(fActiveLimit <= fInputLength);
5522
5523
                // Restore original stack position, discarding any state saved
5524
                //   by the successful pattern match.
5525
1.05M
                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5526
1.05M
                int32_t newStackSize = static_cast<int32_t>(fData[opValue]);
5527
1.05M
                U_ASSERT(fStack->size() > newStackSize);
5528
1.05M
                fStack->setSize(newStackSize);
5529
5530
                //  FAIL, which will take control back to someplace
5531
                //  prior to entering the look-behind test.
5532
1.05M
                fp = reinterpret_cast<REStackFrame*>(fStack->popFrame(fFrameSize));
5533
1.05M
            }
5534
0
            break;
5535
5536
5537
35.4M
        case URX_LOOP_SR_I:
5538
            // Loop Initialization for the optimized implementation of
5539
            //     [some character set]*
5540
            //   This op scans through all matching input.
5541
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5542
35.4M
            {
5543
35.4M
                U_ASSERT(opValue > 0 && opValue < fSets->size());
5544
35.4M
                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5545
35.4M
                UnicodeSet* s = static_cast<UnicodeSet*>(fSets->elementAt(opValue));
5546
5547
                // Loop through input, until either the input is exhausted or
5548
                //   we reach a character that is not a member of the set.
5549
35.4M
                int32_t ix = static_cast<int32_t>(fp->fInputIdx);
5550
69.7M
                for (;;) {
5551
69.7M
                    if (ix >= fActiveLimit) {
5552
923k
                        fHitEnd = true;
5553
923k
                        break;
5554
923k
                    }
5555
68.8M
                    UChar32   c;
5556
68.8M
                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5557
68.8M
                    if (c<256) {
5558
35.2M
                        if (s8->contains(c) == false) {
5559
16.6M
                            U16_BACK_1(inputBuf, 0, ix);
5560
16.6M
                            break;
5561
16.6M
                        }
5562
35.2M
                    } else {
5563
33.5M
                        if (s->contains(c) == false) {
5564
17.8M
                            U16_BACK_1(inputBuf, 0, ix);
5565
17.8M
                            break;
5566
17.8M
                        }
5567
33.5M
                    }
5568
68.8M
                }
5569
5570
                // If there were no matching characters, skip over the loop altogether.
5571
                //   The loop doesn't run at all, a * op always succeeds.
5572
35.4M
                if (ix == fp->fInputIdx) {
5573
27.2M
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5574
27.2M
                    break;
5575
27.2M
                }
5576
5577
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5578
                //   must follow.  It's operand is the stack location
5579
                //   that holds the starting input index for the match of this [set]*
5580
8.22M
                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
5581
8.22M
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5582
8.22M
                int32_t stackLoc = URX_VAL(loopcOp);
5583
8.22M
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5584
8.22M
                fp->fExtra[stackLoc] = fp->fInputIdx;
5585
8.22M
                fp->fInputIdx = ix;
5586
5587
                // Save State to the URX_LOOP_C op that follows this one,
5588
                //   so that match failures in the following code will return to there.
5589
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5590
8.22M
                fp = StateSave(fp, fp->fPatIdx, status);
5591
8.22M
                fp->fPatIdx++;
5592
8.22M
            }
5593
0
            break;
5594
5595
5596
43.9M
        case URX_LOOP_DOT_I:
5597
            // Loop Initialization for the optimized implementation of .*
5598
            //   This op scans through all remaining input.
5599
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5600
43.9M
            {
5601
                // Loop through input until the input is exhausted (we reach an end-of-line)
5602
                // In DOTALL mode, we can just go straight to the end of the input.
5603
43.9M
                int32_t ix;
5604
43.9M
                if ((opValue & 1) == 1) {
5605
                    // Dot-matches-All mode.  Jump straight to the end of the string.
5606
12.0M
                    ix = static_cast<int32_t>(fActiveLimit);
5607
12.0M
                    fHitEnd = true;
5608
31.8M
                } else {
5609
                    // NOT DOT ALL mode.  Line endings do not match '.'
5610
                    // Scan forward until a line ending or end of input.
5611
31.8M
                    ix = static_cast<int32_t>(fp->fInputIdx);
5612
787M
                    for (;;) {
5613
787M
                        if (ix >= fActiveLimit) {
5614
28.4M
                            fHitEnd = true;
5615
28.4M
                            break;
5616
28.4M
                        }
5617
758M
                        UChar32   c;
5618
758M
                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5619
758M
                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5620
330M
                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5621
329M
                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5622
322M
                                   isLineTerminator(c))) {
5623
                                //  char is a line ending.  Put the input pos back to the
5624
                                //    line ending char, and exit the scanning loop.
5625
3.43M
                                U16_BACK_1(inputBuf, 0, ix);
5626
3.43M
                                break;
5627
3.43M
                            }
5628
330M
                        }
5629
758M
                    }
5630
31.8M
                }
5631
5632
                // If there were no matching characters, skip over the loop altogether.
5633
                //   The loop doesn't run at all, a * op always succeeds.
5634
43.9M
                if (ix == fp->fInputIdx) {
5635
22.7M
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5636
22.7M
                    break;
5637
22.7M
                }
5638
5639
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5640
                //   must follow.  It's operand is the stack location
5641
                //   that holds the starting input index for the match of this .*
5642
21.1M
                int32_t loopcOp = static_cast<int32_t>(pat[fp->fPatIdx]);
5643
21.1M
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5644
21.1M
                int32_t stackLoc = URX_VAL(loopcOp);
5645
21.1M
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5646
21.1M
                fp->fExtra[stackLoc] = fp->fInputIdx;
5647
21.1M
                fp->fInputIdx = ix;
5648
5649
                // Save State to the URX_LOOP_C op that follows this one,
5650
                //   so that match failures in the following code will return to there.
5651
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5652
21.1M
                fp = StateSave(fp, fp->fPatIdx, status);
5653
21.1M
                fp->fPatIdx++;
5654
21.1M
            }
5655
0
            break;
5656
5657
5658
846M
        case URX_LOOP_C:
5659
846M
            {
5660
846M
                U_ASSERT(opValue>=0 && opValue<fFrameSize);
5661
846M
                backSearchIndex = static_cast<int32_t>(fp->fExtra[opValue]);
5662
846M
                U_ASSERT(backSearchIndex <= fp->fInputIdx);
5663
846M
                if (backSearchIndex == fp->fInputIdx) {
5664
                    // We've backed up the input idx to the point that the loop started.
5665
                    // The loop is done.  Leave here without saving state.
5666
                    //  Subsequent failures won't come back here.
5667
29.3M
                    break;
5668
29.3M
                }
5669
                // Set up for the next iteration of the loop, with input index
5670
                //   backed up by one from the last time through,
5671
                //   and a state save to this instruction in case the following code fails again.
5672
                //   (We're going backwards because this loop emulates stack unwinding, not
5673
                //    the initial scan forward.)
5674
816M
                U_ASSERT(fp->fInputIdx > 0);
5675
816M
                UChar32 prevC;
5676
816M
                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5677
5678
816M
                if (prevC == 0x0a &&
5679
1.04M
                    fp->fInputIdx > backSearchIndex &&
5680
991k
                    inputBuf[fp->fInputIdx-1] == 0x0d) {
5681
73.3k
                    int32_t prevOp = static_cast<int32_t>(pat[fp->fPatIdx - 2]);
5682
73.3k
                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5683
                        // .*, stepping back over CRLF pair.
5684
7.88k
                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5685
7.88k
                    }
5686
73.3k
                }
5687
5688
5689
816M
                fp = StateSave(fp, fp->fPatIdx-1, status);
5690
816M
            }
5691
0
            break;
5692
5693
5694
5695
0
        default:
5696
            // Trouble.  The compiled pattern contains an entry with an
5697
            //           unrecognized type tag.
5698
0
            UPRV_UNREACHABLE_ASSERT;
5699
            // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5700
            // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5701
            // See ICU-21669.
5702
0
            status = U_INTERNAL_PROGRAM_ERROR;
5703
4.67G
        }
5704
5705
4.65G
        if (U_FAILURE(status)) {
5706
478
            isMatch = false;
5707
478
            break;
5708
478
        }
5709
4.65G
    }
5710
5711
20.6M
breakFromLoop:
5712
20.6M
    fMatch = isMatch;
5713
20.6M
    if (isMatch) {
5714
2.69k
        fLastMatchEnd = fMatchEnd;
5715
2.69k
        fMatchStart   = startIdx;
5716
2.69k
        fMatchEnd     = fp->fInputIdx;
5717
2.69k
    }
5718
5719
#ifdef REGEX_RUN_DEBUG
5720
    if (fTraceDebug) {
5721
        if (isMatch) {
5722
            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5723
        } else {
5724
            printf("No match\n\n");
5725
        }
5726
    }
5727
#endif
5728
5729
20.6M
    fFrame = fp;                // The active stack frame when the engine stopped.
5730
                                //   Contains the capture group results that we need to
5731
                                //    access later.
5732
20.6M
}
5733
5734
5735
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5736
5737
U_NAMESPACE_END
5738
5739
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5740