Coverage Report

Created: 2022-11-20 06:20

/src/icu/icu4c/source/i18n/rematch.cpp
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**************************************************************************
5
*   Copyright (C) 2002-2016 International Business Machines Corporation
6
*   and others. All rights reserved.
7
**************************************************************************
8
*/
9
//
10
//  file:  rematch.cpp
11
//
12
//         Contains the implementation of class RegexMatcher,
13
//         which is one of the main API classes for the ICU regular expression package.
14
//
15
16
#include "unicode/utypes.h"
17
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
18
19
#include "unicode/regex.h"
20
#include "unicode/uniset.h"
21
#include "unicode/uchar.h"
22
#include "unicode/ustring.h"
23
#include "unicode/rbbi.h"
24
#include "unicode/utf.h"
25
#include "unicode/utf16.h"
26
#include "uassert.h"
27
#include "cmemory.h"
28
#include "cstr.h"
29
#include "uvector.h"
30
#include "uvectr32.h"
31
#include "uvectr64.h"
32
#include "regeximp.h"
33
#include "regexst.h"
34
#include "regextxt.h"
35
#include "ucase.h"
36
37
// #include <malloc.h>        // Needed for heapcheck testing
38
39
40
U_NAMESPACE_BEGIN
41
42
// Default limit for the size of the back track stack, to avoid system
43
//    failures caused by heap exhaustion.  Units are in 32 bit words, not bytes.
44
// This value puts ICU's limits higher than most other regexp implementations,
45
//    which use recursion rather than the heap, and take more storage per
46
//    backtrack point.
47
//
48
static const int32_t DEFAULT_BACKTRACK_STACK_CAPACITY = 8000000;
49
50
// Time limit counter constant.
51
//   Time limits for expression evaluation are in terms of quanta of work by
52
//   the engine, each of which is 10,000 state saves.
53
//   This constant sets the number of state saves per tick.
54
static const int32_t TIMER_INITIAL_VALUE = 10000;
55
56
57
// Test for any of the Unicode line terminating characters.
58
0
static inline UBool isLineTerminator(UChar32 c) {
    // Quick rejection: a code point that has any bit set outside the union of
    // the terminators' bit patterns cannot be one of them.
    const UChar32 terminatorBits = 0x0a | 0x0b | 0x0c | 0x0d | 0x85 | 0x2028 | 0x2029;
    if ((c & ~terminatorBits) != 0) {
        return false;
    }

    // Exact test against the accepted values.
    switch (c) {
    case 0x0a:
    case 0x0b:
    case 0x0c:
    case 0x0d:
    case 0x85:
    case 0x2028:
    case 0x2029:
        return true;
    default:
        return false;
    }
}
64
65
//-----------------------------------------------------------------------------
66
//
67
//   Constructor and Destructor
68
//
69
//-----------------------------------------------------------------------------
70
3.94k
RegexMatcher::RegexMatcher(const RegexPattern *pat)  {
    // This constructor has no status parameter, so any failure is recorded
    // in fDeferredStatus.
    fDeferredStatus = U_ZERO_ERROR;
    init(fDeferredStatus);

    if (U_SUCCESS(fDeferredStatus)) {
        if (pat != NULL) {
            // The pattern is only referenced here; fPatternOwned stays NULL.
            fPattern = pat;
            init2(RegexStaticSets::gStaticSets->fEmptyText, fDeferredStatus);
        } else {
            fDeferredStatus = U_ILLEGAL_ARGUMENT_ERROR;
        }
    }
}
83
84
85
86
RegexMatcher::RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
                           uint32_t flags, UErrorCode &status) {
    // Bring every field to a known state before anything else happens.
    init(status);
    if (U_SUCCESS(status)) {
        // Compile the expression; the result is stored in fPatternOwned.
        UParseError parseErr;
        fPatternOwned = RegexPattern::compile(regexp, flags, parseErr, status);
        fPattern      = fPatternOwned;

        // Wrap the input string in a stack UText for the duration of init2().
        // A compile failure is carried in status and picked up by init2().
        UText inputView = UTEXT_INITIALIZER;
        utext_openConstUnicodeString(&inputView, &input, &status);
        init2(&inputView, status);
        utext_close(&inputView);

        fInputUniStrMaybeMutable = true;
    }
}
103
104
105
RegexMatcher::RegexMatcher(UText *regexp, UText *input,
                           uint32_t flags, UErrorCode &status) {
    // Bring every field to a known state before anything else happens.
    init(status);
    if (U_SUCCESS(status)) {
        UParseError parseErr;
        fPatternOwned = RegexPattern::compile(regexp, flags, parseErr, status);
        if (U_SUCCESS(status)) {
            // Only a successfully compiled pattern is installed and used.
            fPattern = fPatternOwned;
            init2(input, status);
        }
    }
}
120
121
122
RegexMatcher::RegexMatcher(const UnicodeString &regexp,
                           uint32_t flags, UErrorCode &status) {
    // Bring every field to a known state before anything else happens.
    init(status);
    if (U_SUCCESS(status)) {
        UParseError parseErr;
        fPatternOwned = RegexPattern::compile(regexp, flags, parseErr, status);
        if (U_SUCCESS(status)) {
            // No input was supplied; start out on the shared empty text.
            fPattern = fPatternOwned;
            init2(RegexStaticSets::gStaticSets->fEmptyText, status);
        }
    }
}
136
137
RegexMatcher::RegexMatcher(UText *regexp,
                           uint32_t flags, UErrorCode &status) {
    // Bring every field to a known state before anything else happens.
    init(status);
    if (U_SUCCESS(status)) {
        UParseError parseErr;
        fPatternOwned = RegexPattern::compile(regexp, flags, parseErr, status);
        if (U_SUCCESS(status)) {
            // No input was supplied; start out on the shared empty text.
            fPattern = fPatternOwned;
            init2(RegexStaticSets::gStaticSets->fEmptyText, status);
        }
    }
}
152
153
154
155
156
3.94k
RegexMatcher::~RegexMatcher() {
    delete fStack;

    // fData points at the in-object fSmallData array unless a larger buffer
    // was allocated; only the allocated buffer may be freed.
    if (fData != fSmallData) {
        uprv_free(fData);
        fData = NULL;
    }

    // Release the pattern only when this matcher holds it in fPatternOwned.
    if (fPatternOwned != NULL) {
        delete fPatternOwned;
        fPatternOwned = NULL;
        fPattern = NULL;
    }

    // Deleting a null pointer is a no-op, so no guard is needed here.
    delete fInput;

    if (fInputText != NULL) {
        utext_close(fInputText);
    }
    if (fAltInputText != NULL) {
        utext_close(fAltInputText);
    }

#if !UCONFIG_NO_BREAK_ITERATION
    delete fWordBreakItr;
    delete fGCBreakItr;
#endif
}
183
184
//
185
//   init()   common initialization for use by all constructors.
186
//            Initialize all fields, get the object into a consistent state.
187
//            This must be done even when the initial status shows an error,
188
//            so that the object is initialized sufficiently well for the destructor
189
//            to run safely.
190
//
191
3.94k
void RegexMatcher::init(UErrorCode &status) {
    // Nothing here can fail and nothing is skipped when status already holds
    // an error: every field gets a defined value so the destructor can run.

    // Pattern association.
    fPatternOwned      = NULL;
    fPattern           = NULL;
    fFrameSize         = 0;

    // Region, anchoring, look-around and active ranges.
    fRegionStart       = fRegionLimit = 0;
    fAnchorStart       = fAnchorLimit = 0;
    fLookStart         = fLookLimit   = 0;
    fActiveStart       = fActiveLimit = 0;
    fTransparentBounds = false;
    fAnchoringBounds   = true;

    // Results of the most recent match attempt.
    fMatch             = false;
    fMatchStart        = fMatchEnd = 0;
    fLastMatchEnd      = -1;
    fAppendPosition    = 0;
    fHitEnd            = false;
    fRequireEnd        = false;

    // Backtracking state and per-pattern data.
    fStack             = NULL;
    fFrame             = NULL;
    fStackLimit        = DEFAULT_BACKTRACK_STACK_CAPACITY;
    fData              = fSmallData;

    // Time limit bookkeeping.
    fTimeLimit         = 0;
    fTime              = 0;
    fTickCounter       = 0;

    // User callbacks.
    fCallbackFn                  = NULL;
    fCallbackContext             = NULL;
    fFindProgressCallbackFn      = NULL;
    fFindProgressCallbackContext = NULL;

    fTraceDebug        = false;
    fDeferredStatus    = status;

    // Break iterators.
    fWordBreakItr      = NULL;
    fGCBreakItr        = NULL;

    // Input text.
    fInputText         = NULL;
    fAltInputText      = NULL;
    fInput             = NULL;
    fInputLength       = 0;
    fInputUniStrMaybeMutable = false;
}
235
236
//
237
//  init2()   Common initialization for use by RegexMatcher constructors, part 2.
238
//            This handles the common setup to be done after the Pattern is available.
239
//
240
3.94k
void RegexMatcher::init2(UText *input, UErrorCode &status) {
    // An incoming failure is recorded on the matcher and nothing else is done.
    if (U_FAILURE(status)) {
        fDeferredStatus = status;
        return;
    }

    // Patterns needing more data slots than fSmallData offers get a heap buffer.
    const bool needsHeapData = fPattern->fDataSize > UPRV_LENGTHOF(fSmallData);
    if (needsHeapData) {
        fData = static_cast<int64_t *>(uprv_malloc(fPattern->fDataSize * sizeof(int64_t)));
        if (fData == NULL) {
            fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
            status = U_MEMORY_ALLOCATION_ERROR;
            return;
        }
    }

    // Backtracking stack.
    fStack = new UVector64(status);
    if (fStack == NULL) {
        fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
        status = U_MEMORY_ALLOCATION_ERROR;
        return;
    }

    reset(input);
    setStackLimit(DEFAULT_BACKTRACK_STACK_CAPACITY, status);
    if (U_FAILURE(status)) {
        fDeferredStatus = status;
    }
}
267
268
269
// UTF-16 code units compared against while scanning replacement text
// in appendReplacement().
static const UChar BACKSLASH  = 0x5c;     // backslash, introduces an escape
270
static const UChar DOLLARSIGN = 0x24;     // '$', introduces a capture group reference
271
static const UChar LEFTBRACKET = 0x7b;    // '{' (a curly brace, despite the name)
272
static const UChar RIGHTBRACKET = 0x7d;   // '}' (a curly brace, despite the name)
273
274
//--------------------------------------------------------------------------------
275
//
276
//    appendReplacement
277
//
278
//--------------------------------------------------------------------------------
279
RegexMatcher &RegexMatcher::appendReplacement(UnicodeString &dest,
                                              const UnicodeString &replacement,
                                              UErrorCode &status) {
    // UnicodeString flavor: wrap both strings in stack UTexts and delegate to
    // the UText implementation.
    UText replacementView = UTEXT_INITIALIZER;
    utext_openConstUnicodeString(&replacementView, &replacement, &status);
    if (U_FAILURE(status)) {
        return *this;
    }

    UText destView = UTEXT_INITIALIZER;
    utext_openUnicodeString(&destView, &dest, &status);
    if (U_SUCCESS(status)) {
        appendReplacement(&destView, &replacementView, status);
        utext_close(&destView);
    }
    utext_close(&replacementView);

    return *this;
}
298
299
//
300
//    appendReplacement, UText mode
301
//
302
RegexMatcher &RegexMatcher::appendReplacement(UText *dest,
303
                                              UText *replacement,
304
0
                                              UErrorCode &status) {
305
0
    if (U_FAILURE(status)) {
306
0
        return *this;
307
0
    }
308
0
    if (U_FAILURE(fDeferredStatus)) {
309
0
        status = fDeferredStatus;
310
0
        return *this;
311
0
    }
312
0
    if (fMatch == false) {
313
0
        status = U_REGEX_INVALID_STATE;
314
0
        return *this;
315
0
    }
316
317
    // Copy input string from the end of previous match to start of current match
318
0
    int64_t  destLen = utext_nativeLength(dest);
319
0
    if (fMatchStart > fAppendPosition) {
320
0
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
321
0
            destLen += utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
322
0
                                     (int32_t)(fMatchStart-fAppendPosition), &status);
323
0
        } else {
324
0
            int32_t len16;
325
0
            if (UTEXT_USES_U16(fInputText)) {
326
0
                len16 = (int32_t)(fMatchStart-fAppendPosition);
327
0
            } else {
328
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
329
0
                len16 = utext_extract(fInputText, fAppendPosition, fMatchStart, NULL, 0, &lengthStatus);
330
0
            }
331
0
            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
332
0
            if (inputChars == NULL) {
333
0
                status = U_MEMORY_ALLOCATION_ERROR;
334
0
                return *this;
335
0
            }
336
0
            utext_extract(fInputText, fAppendPosition, fMatchStart, inputChars, len16+1, &status);
337
0
            destLen += utext_replace(dest, destLen, destLen, inputChars, len16, &status);
338
0
            uprv_free(inputChars);
339
0
        }
340
0
    }
341
0
    fAppendPosition = fMatchEnd;
342
343
344
    // scan the replacement text, looking for substitutions ($n) and \escapes.
345
    //  TODO:  optimize this loop by efficiently scanning for '$' or '\',
346
    //         move entire ranges not containing substitutions.
347
0
    UTEXT_SETNATIVEINDEX(replacement, 0);
348
0
    for (UChar32 c = UTEXT_NEXT32(replacement); U_SUCCESS(status) && c != U_SENTINEL;  c = UTEXT_NEXT32(replacement)) {
349
0
        if (c == BACKSLASH) {
350
            // Backslash Escape.  Copy the following char out without further checks.
351
            //                    Note:  Surrogate pairs don't need any special handling
352
            //                           The second half wont be a '$' or a '\', and
353
            //                           will move to the dest normally on the next
354
            //                           loop iteration.
355
0
            c = UTEXT_CURRENT32(replacement);
356
0
            if (c == U_SENTINEL) {
357
0
                break;
358
0
            }
359
360
0
            if (c==0x55/*U*/ || c==0x75/*u*/) {
361
                // We have a \udddd or \Udddddddd escape sequence.
362
0
                int32_t offset = 0;
363
0
                struct URegexUTextUnescapeCharContext context = U_REGEX_UTEXT_UNESCAPE_CONTEXT(replacement);
364
0
                UChar32 escapedChar = u_unescapeAt(uregex_utext_unescape_charAt, &offset, INT32_MAX, &context);
365
0
                if (escapedChar != (UChar32)0xFFFFFFFF) {
366
0
                    if (U_IS_BMP(escapedChar)) {
367
0
                        UChar c16 = (UChar)escapedChar;
368
0
                        destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
369
0
                    } else {
370
0
                        UChar surrogate[2];
371
0
                        surrogate[0] = U16_LEAD(escapedChar);
372
0
                        surrogate[1] = U16_TRAIL(escapedChar);
373
0
                        if (U_SUCCESS(status)) {
374
0
                            destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
375
0
                        }
376
0
                    }
377
                    // TODO:  Report errors for mal-formed \u escapes?
378
                    //        As this is, the original sequence is output, which may be OK.
379
0
                    if (context.lastOffset == offset) {
380
0
                        (void)UTEXT_PREVIOUS32(replacement);
381
0
                    } else if (context.lastOffset != offset-1) {
382
0
                        utext_moveIndex32(replacement, offset - context.lastOffset - 1);
383
0
                    }
384
0
                }
385
0
            } else {
386
0
                (void)UTEXT_NEXT32(replacement);
387
                // Plain backslash escape.  Just put out the escaped character.
388
0
                if (U_IS_BMP(c)) {
389
0
                    UChar c16 = (UChar)c;
390
0
                    destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
391
0
                } else {
392
0
                    UChar surrogate[2];
393
0
                    surrogate[0] = U16_LEAD(c);
394
0
                    surrogate[1] = U16_TRAIL(c);
395
0
                    if (U_SUCCESS(status)) {
396
0
                        destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
397
0
                    }
398
0
                }
399
0
            }
400
0
        } else if (c != DOLLARSIGN) {
401
            // Normal char, not a $.  Copy it out without further checks.
402
0
            if (U_IS_BMP(c)) {
403
0
                UChar c16 = (UChar)c;
404
0
                destLen += utext_replace(dest, destLen, destLen, &c16, 1, &status);
405
0
            } else {
406
0
                UChar surrogate[2];
407
0
                surrogate[0] = U16_LEAD(c);
408
0
                surrogate[1] = U16_TRAIL(c);
409
0
                if (U_SUCCESS(status)) {
410
0
                    destLen += utext_replace(dest, destLen, destLen, surrogate, 2, &status);
411
0
                }
412
0
            }
413
0
        } else {
414
            // We've got a $.  Pick up a capture group name or number if one follows.
415
            // Consume digits so long as the resulting group number <= the number of
416
            // number of capture groups in the pattern.
417
418
0
            int32_t groupNum  = 0;
419
0
            int32_t numDigits = 0;
420
0
            UChar32 nextChar = utext_current32(replacement);
421
0
            if (nextChar == LEFTBRACKET) {
422
                // Scan for a Named Capture Group, ${name}.
423
0
                UnicodeString groupName;
424
0
                utext_next32(replacement);
425
0
                while(U_SUCCESS(status) && nextChar != RIGHTBRACKET) {
426
0
                    nextChar = utext_next32(replacement);
427
0
                    if (nextChar == U_SENTINEL) {
428
0
                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
429
0
                    } else if ((nextChar >= 0x41 && nextChar <= 0x5a) ||       // A..Z
430
0
                               (nextChar >= 0x61 && nextChar <= 0x7a) ||       // a..z
431
0
                               (nextChar >= 0x31 && nextChar <= 0x39)) {       // 0..9
432
0
                        groupName.append(nextChar);
433
0
                    } else if (nextChar == RIGHTBRACKET) {
434
0
                        groupNum = fPattern->fNamedCaptureMap ? uhash_geti(fPattern->fNamedCaptureMap, &groupName) : 0;
435
0
                        if (groupNum == 0) {
436
0
                            status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
437
0
                        }
438
0
                    } else {
439
                        // Character was something other than a name char or a closing '}'
440
0
                        status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
441
0
                    }
442
0
                }
443
444
0
            } else if (u_isdigit(nextChar)) {
445
                // $n    Scan for a capture group number
446
0
                int32_t numCaptureGroups = fPattern->fGroupMap->size();
447
0
                for (;;) {
448
0
                    nextChar = UTEXT_CURRENT32(replacement);
449
0
                    if (nextChar == U_SENTINEL) {
450
0
                        break;
451
0
                    }
452
0
                    if (u_isdigit(nextChar) == false) {
453
0
                        break;
454
0
                    }
455
0
                    int32_t nextDigitVal = u_charDigitValue(nextChar);
456
0
                    if (groupNum*10 + nextDigitVal > numCaptureGroups) {
457
                        // Don't consume the next digit if it makes the capture group number too big.
458
0
                        if (numDigits == 0) {
459
0
                            status = U_INDEX_OUTOFBOUNDS_ERROR;
460
0
                        }
461
0
                        break;
462
0
                    }
463
0
                    (void)UTEXT_NEXT32(replacement);
464
0
                    groupNum=groupNum*10 + nextDigitVal;
465
0
                    ++numDigits;
466
0
                }
467
0
            } else {
468
                // $ not followed by capture group name or number.
469
0
                status = U_REGEX_INVALID_CAPTURE_GROUP_NAME;
470
0
            }
471
472
0
            if (U_SUCCESS(status)) {
473
0
                destLen += appendGroup(groupNum, dest, status);
474
0
            }
475
0
        }  // End of $ capture group handling
476
0
    }  // End of per-character loop through the replacement string.
477
478
0
    return *this;
479
0
}
480
481
482
483
//--------------------------------------------------------------------------------
484
//
485
//    appendTail     Intended to be used in conjunction with appendReplacement()
486
//                   To the destination string, append everything following
487
//                   the last match position from the input string.
488
//
489
//                   Note:  Match ranges do not affect appendTail or appendReplacement
490
//
491
//--------------------------------------------------------------------------------
492
0
UnicodeString &RegexMatcher::appendTail(UnicodeString &dest) {
    // UnicodeString flavor: wrap dest in a stack UText and delegate to the
    // UText implementation.  There is no status parameter, so a local one is
    // used and its final value is not reported.
    UErrorCode localStatus = U_ZERO_ERROR;
    UText destView = UTEXT_INITIALIZER;
    utext_openUnicodeString(&destView, &dest, &localStatus);
    if (U_FAILURE(localStatus)) {
        return dest;
    }

    appendTail(&destView, localStatus);
    utext_close(&destView);
    return dest;
}
504
505
//
506
//   appendTail, UText mode
507
//
508
0
//
//   appendTail, UText mode
//
//   Appends to the end of dest the input text from fAppendPosition up to
//   fInputLength.
//
//   dest     Text being built up; content is added at its end.
//   status   Receives a failure stored in fDeferredStatus, any error reported
//            by the UText calls, or U_MEMORY_ALLOCATION_ERROR when the
//            temporary buffer cannot be allocated.
//   Returns dest.
//
UText *RegexMatcher::appendTail(UText *dest, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return dest;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return dest;
    }

    if (fInputLength > fAppendPosition) {
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
            // The remaining input can be read straight out of the chunk buffer.
            int64_t destLen = utext_nativeLength(dest);
            utext_replace(dest, destLen, destLen, fInputText->chunkContents+fAppendPosition,
                          (int32_t)(fInputLength-fAppendPosition), &status);
        } else {
            int32_t len16;
            if (UTEXT_USES_U16(fInputText)) {
                len16 = (int32_t)(fInputLength-fAppendPosition);
            } else {
                // Preflight to learn the UTF-16 length.  A scratch status keeps
                // the expected buffer-overflow result away from the caller,
                // as the matching code in appendReplacement() does.
                UErrorCode lengthStatus = U_ZERO_ERROR;
                len16 = utext_extract(fInputText, fAppendPosition, fInputLength, NULL, 0, &lengthStatus);
            }

            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16));
            if (inputChars == NULL) {
                // Record the failure on the matcher, as before, and also report
                // it to the caller instead of returning success with nothing
                // appended.
                fDeferredStatus = U_MEMORY_ALLOCATION_ERROR;
                status = U_MEMORY_ALLOCATION_ERROR;
            } else {
                utext_extract(fInputText, fAppendPosition, fInputLength, inputChars, len16, &status); // unterminated
                int64_t destLen = utext_nativeLength(dest);
                utext_replace(dest, destLen, destLen, inputChars, len16, &status);
                uprv_free(inputChars);
            }
        }
    }
    return dest;
}
544
545
546
547
//--------------------------------------------------------------------------------
548
//
549
//   end
550
//
551
//--------------------------------------------------------------------------------
552
0
int32_t RegexMatcher::end(UErrorCode &err) const {
    // Forwards to the group-indexed overload with group number 0.
    const int32_t groupZero = 0;
    return end(groupZero, err);
}
555
556
0
int64_t RegexMatcher::end64(UErrorCode &err) const {
    // Forwards to the group-indexed overload with group number 0.
    const int32_t groupZero = 0;
    return end64(groupZero, err);
}
559
560
0
int64_t RegexMatcher::end64(int32_t group, UErrorCode &err) const {
    // Returns the end value recorded for the given group of the current match,
    // or -1 with err set when there is no match or group is out of range.
    if (U_FAILURE(err)) {
        return -1;
    }
    if (!fMatch) {
        err = U_REGEX_INVALID_STATE;
        return -1;
    }
    if (group < 0 || group > fPattern->fGroupMap->size()) {
        err = U_INDEX_OUTOFBOUNDS_ERROR;
        return -1;
    }

    // Group 0 is answered directly from fMatchEnd.
    if (group == 0) {
        return fMatchEnd;
    }

    // Other groups: fGroupMap gives the offset of the group's variables within
    // the stack frame; the end value sits one slot past that offset.
    const int32_t frameOffset = fPattern->fGroupMap->elementAti(group-1);
    U_ASSERT(frameOffset < fPattern->fFrameSize);
    U_ASSERT(frameOffset >= 0);
    return fFrame->fExtra[frameOffset + 1];
}
586
587
0
int32_t RegexMatcher::end(int32_t group, UErrorCode &err) const {
    // 32-bit flavor: obtain the 64-bit value and narrow it.
    const int64_t wideEnd = end64(group, err);
    return (int32_t)wideEnd;
}
590
591
//--------------------------------------------------------------------------------
592
//
593
//   findProgressInterrupt  This function is called once for each advance in the target
594
//                          string from the find() function, and calls the user progress callback
595
//                          function if there is one installed.
596
//
597
//         Return:  true if the find operation is to be terminated.
598
//                  false if the find operation is to continue running.
599
//
600
//--------------------------------------------------------------------------------
601
0
UBool RegexMatcher::findProgressInterrupt(int64_t pos, UErrorCode &status) {
    // With no callback installed the find operation always continues.
    if (fFindProgressCallbackFn == NULL) {
        return false;
    }

    // A true result from the callback means "keep going".
    if ((*fFindProgressCallbackFn)(fFindProgressCallbackContext, pos)) {
        return false;
    }

    // The callback asked to stop.
    status = U_REGEX_STOPPED_BY_CALLER;
    return true;
}
608
609
//--------------------------------------------------------------------------------
610
//
611
//   find()
612
//
613
//--------------------------------------------------------------------------------
614
0
UBool RegexMatcher::find() {
    // Status-less flavor: a matcher carrying a deferred failure never matches,
    // and any error raised during the search is not reported to the caller.
    if (U_FAILURE(fDeferredStatus)) {
        return false;
    }
    UErrorCode localStatus = U_ZERO_ERROR;
    return find(localStatus);
}
622
623
//--------------------------------------------------------------------------------
624
//
625
//   find()
626
//
627
//--------------------------------------------------------------------------------
628
0
UBool RegexMatcher::find(UErrorCode &status) {
629
    // Start at the position of the last match end.  (Will be zero if the
630
    //   matcher has been reset.)
631
    //
632
0
    if (U_FAILURE(status)) {
633
0
        return false;
634
0
    }
635
0
    if (U_FAILURE(fDeferredStatus)) {
636
0
        status = fDeferredStatus;
637
0
        return false;
638
0
    }
639
640
0
    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
641
0
        return findUsingChunk(status);
642
0
    }
643
644
0
    int64_t startPos = fMatchEnd;
645
0
    if (startPos==0) {
646
0
        startPos = fActiveStart;
647
0
    }
648
649
0
    if (fMatch) {
650
        // Save the position of any previous successful match.
651
0
        fLastMatchEnd = fMatchEnd;
652
653
0
        if (fMatchStart == fMatchEnd) {
654
            // Previous match had zero length.  Move start position up one position
655
            //  to avoid sending find() into a loop on zero-length matches.
656
0
            if (startPos >= fActiveLimit) {
657
0
                fMatch = false;
658
0
                fHitEnd = true;
659
0
                return false;
660
0
            }
661
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
662
0
            (void)UTEXT_NEXT32(fInputText);
663
0
            startPos = UTEXT_GETNATIVEINDEX(fInputText);
664
0
        }
665
0
    } else {
666
0
        if (fLastMatchEnd >= 0) {
667
            // A previous find() failed to match.  Don't try again.
668
            //   (without this test, a pattern with a zero-length match
669
            //    could match again at the end of an input string.)
670
0
            fHitEnd = true;
671
0
            return false;
672
0
        }
673
0
    }
674
675
676
    // Compute the position in the input string beyond which a match can not begin, because
677
    //   the minimum length match would extend past the end of the input.
678
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
679
    //          Be aware of possible overflows if making changes here.
680
0
    int64_t testStartLimit;
681
0
    if (UTEXT_USES_U16(fInputText)) {
682
0
        testStartLimit = fActiveLimit - fPattern->fMinMatchLen;
683
0
        if (startPos > testStartLimit) {
684
0
            fMatch = false;
685
0
            fHitEnd = true;
686
0
            return false;
687
0
        }
688
0
    } else {
689
        // We don't know exactly how long the minimum match length is in native characters.
690
        // Treat anything > 0 as 1.
691
0
        testStartLimit = fActiveLimit - (fPattern->fMinMatchLen > 0 ? 1 : 0);
692
0
    }
693
694
0
    UChar32  c;
695
0
    U_ASSERT(startPos >= 0);
696
697
0
    switch (fPattern->fStartType) {
698
0
    case START_NO_INFO:
699
        // No optimization was found.
700
        //  Try a match at each input position.
701
0
        for (;;) {
702
0
            MatchAt(startPos, false, status);
703
0
            if (U_FAILURE(status)) {
704
0
                return false;
705
0
            }
706
0
            if (fMatch) {
707
0
                return true;
708
0
            }
709
0
            if (startPos >= testStartLimit) {
710
0
                fHitEnd = true;
711
0
                return false;
712
0
            }
713
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
714
0
            (void)UTEXT_NEXT32(fInputText);
715
0
            startPos = UTEXT_GETNATIVEINDEX(fInputText);
716
            // Note that it's perfectly OK for a pattern to have a zero-length
717
            //   match at the end of a string, so we must make sure that the loop
718
            //   runs with startPos == testStartLimit the last time through.
719
0
            if  (findProgressInterrupt(startPos, status))
720
0
                return false;
721
0
        }
722
0
        UPRV_UNREACHABLE_EXIT;
723
724
0
    case START_START:
725
        // Matches are only possible at the start of the input string
726
        //   (pattern begins with ^ or \A)
727
0
        if (startPos > fActiveStart) {
728
0
            fMatch = false;
729
0
            return false;
730
0
        }
731
0
        MatchAt(startPos, false, status);
732
0
        if (U_FAILURE(status)) {
733
0
            return false;
734
0
        }
735
0
        return fMatch;
736
737
738
0
    case START_SET:
739
0
        {
740
            // Match may start on any char from a pre-computed set.
741
0
            U_ASSERT(fPattern->fMinMatchLen > 0);
742
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
743
0
            for (;;) {
744
0
                int64_t pos = startPos;
745
0
                c = UTEXT_NEXT32(fInputText);
746
0
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
747
                // c will be -1 (U_SENTINEL) at end of text, in which case we
748
                // skip this next block (so we don't have a negative array index)
749
                // and handle end of text in the following block.
750
0
                if (c >= 0 && ((c<256 && fPattern->fInitialChars8->contains(c)) ||
751
0
                              (c>=256 && fPattern->fInitialChars->contains(c)))) {
752
0
                    MatchAt(pos, false, status);
753
0
                    if (U_FAILURE(status)) {
754
0
                        return false;
755
0
                    }
756
0
                    if (fMatch) {
757
0
                        return true;
758
0
                    }
759
0
                    UTEXT_SETNATIVEINDEX(fInputText, pos);
760
0
                }
761
0
                if (startPos > testStartLimit) {
762
0
                    fMatch = false;
763
0
                    fHitEnd = true;
764
0
                    return false;
765
0
                }
766
0
                if  (findProgressInterrupt(startPos, status))
767
0
                    return false;
768
0
            }
769
0
        }
770
0
        UPRV_UNREACHABLE_EXIT;
771
772
0
    case START_STRING:
773
0
    case START_CHAR:
774
0
        {
775
            // Match starts on exactly one char.
776
0
            U_ASSERT(fPattern->fMinMatchLen > 0);
777
0
            UChar32 theChar = fPattern->fInitialChar;
778
0
            UTEXT_SETNATIVEINDEX(fInputText, startPos);
779
0
            for (;;) {
780
0
                int64_t pos = startPos;
781
0
                c = UTEXT_NEXT32(fInputText);
782
0
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
783
0
                if (c == theChar) {
784
0
                    MatchAt(pos, false, status);
785
0
                    if (U_FAILURE(status)) {
786
0
                        return false;
787
0
                    }
788
0
                    if (fMatch) {
789
0
                        return true;
790
0
                    }
791
0
                    UTEXT_SETNATIVEINDEX(fInputText, startPos);
792
0
                }
793
0
                if (startPos > testStartLimit) {
794
0
                    fMatch = false;
795
0
                    fHitEnd = true;
796
0
                    return false;
797
0
                }
798
0
                if  (findProgressInterrupt(startPos, status))
799
0
                    return false;
800
0
           }
801
0
        }
802
0
        UPRV_UNREACHABLE_EXIT;
803
804
0
    case START_LINE:
805
0
        {
806
0
            UChar32 ch;
807
0
            if (startPos == fAnchorStart) {
808
0
                MatchAt(startPos, false, status);
809
0
                if (U_FAILURE(status)) {
810
0
                    return false;
811
0
                }
812
0
                if (fMatch) {
813
0
                    return true;
814
0
                }
815
0
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
816
0
                ch = UTEXT_NEXT32(fInputText);
817
0
                startPos = UTEXT_GETNATIVEINDEX(fInputText);
818
0
            } else {
819
0
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
820
0
                ch = UTEXT_PREVIOUS32(fInputText);
821
0
                UTEXT_SETNATIVEINDEX(fInputText, startPos);
822
0
            }
823
824
0
            if (fPattern->fFlags & UREGEX_UNIX_LINES) {
825
0
                for (;;) {
826
0
                    if (ch == 0x0a) {
827
0
                            MatchAt(startPos, false, status);
828
0
                            if (U_FAILURE(status)) {
829
0
                                return false;
830
0
                            }
831
0
                            if (fMatch) {
832
0
                                return true;
833
0
                            }
834
0
                            UTEXT_SETNATIVEINDEX(fInputText, startPos);
835
0
                    }
836
0
                    if (startPos >= testStartLimit) {
837
0
                        fMatch = false;
838
0
                        fHitEnd = true;
839
0
                        return false;
840
0
                    }
841
0
                    ch = UTEXT_NEXT32(fInputText);
842
0
                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
843
                    // Note that it's perfectly OK for a pattern to have a zero-length
844
                    //   match at the end of a string, so we must make sure that the loop
845
                    //   runs with startPos == testStartLimit the last time through.
846
0
                    if  (findProgressInterrupt(startPos, status))
847
0
                        return false;
848
0
                }
849
0
            } else {
850
0
                for (;;) {
851
0
                    if (isLineTerminator(ch)) {
852
0
                        if (ch == 0x0d && startPos < fActiveLimit && UTEXT_CURRENT32(fInputText) == 0x0a) {
853
0
                            (void)UTEXT_NEXT32(fInputText);
854
0
                            startPos = UTEXT_GETNATIVEINDEX(fInputText);
855
0
                        }
856
0
                        MatchAt(startPos, false, status);
857
0
                        if (U_FAILURE(status)) {
858
0
                            return false;
859
0
                        }
860
0
                        if (fMatch) {
861
0
                            return true;
862
0
                        }
863
0
                        UTEXT_SETNATIVEINDEX(fInputText, startPos);
864
0
                    }
865
0
                    if (startPos >= testStartLimit) {
866
0
                        fMatch = false;
867
0
                        fHitEnd = true;
868
0
                        return false;
869
0
                    }
870
0
                    ch = UTEXT_NEXT32(fInputText);
871
0
                    startPos = UTEXT_GETNATIVEINDEX(fInputText);
872
                    // Note that it's perfectly OK for a pattern to have a zero-length
873
                    //   match at the end of a string, so we must make sure that the loop
874
                    //   runs with startPos == testStartLimit the last time through.
875
0
                    if  (findProgressInterrupt(startPos, status))
876
0
                        return false;
877
0
                }
878
0
            }
879
0
        }
880
881
0
    default:
882
0
        UPRV_UNREACHABLE_ASSERT;
883
        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
884
        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
885
        // See ICU-21669.
886
0
        status = U_INTERNAL_PROGRAM_ERROR;
887
0
        return false;
888
0
    }
889
890
0
    UPRV_UNREACHABLE_EXIT;
891
0
}
892
893
894
895
0
//  find(start, status)
//
//    Resets the matcher, then searches for the next match beginning at the
//    native index `start`.
//
//    Returns false without searching when `status` already holds a failure,
//    when the matcher carries a deferred failure (which is copied to `status`),
//    or when `start` is negative or outside [fActiveStart, fActiveLimit]
//    (status is set to U_INDEX_OUTOFBOUNDS_ERROR).  Otherwise returns the
//    result of find(status).
UBool RegexMatcher::find(int64_t start, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }

    // The reset happens before the index is validated.
    // Note:  Reset() is specified by Java Matcher documentation.
    //        This will reset the region to be the full input length.
    reset();

    const bool startInRange = (start >= 0) &&
                              (start >= fActiveStart) &&
                              (start <= fActiveLimit);
    if (!startInRange) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }

    // find(status) begins its search at fMatchEnd.
    fMatchEnd = start;
    return find(status);
}
918
919
920
//--------------------------------------------------------------------------------
921
//
922
//   findUsingChunk() -- like find(), but with the advance knowledge that the
923
//                       entire string is available in the UText's chunk buffer.
924
//
925
//--------------------------------------------------------------------------------
926
0
//  Searches for the next match by indexing directly into the UText chunk
//  buffer (fInputText->chunkContents) with 32-bit UTF-16 offsets.
//
//  The search starts at fMatchEnd (or fActiveStart when fMatchEnd is zero) and
//  uses fPattern->fStartType to decide which input positions are worth a
//  MatchChunkAt() attempt.
//
//  Returns true when MatchChunkAt() leaves fMatch set.  Returns false when no
//  candidate position matches (fHitEnd is set on most of those paths) or when
//  `status` reports a failure.  An fStartType outside the handled cases sets
//  status to U_INTERNAL_PROGRAM_ERROR.
UBool RegexMatcher::findUsingChunk(UErrorCode &status) {
    // Start at the position of the last match end.  (Will be zero if the
    //   matcher has been reset.)
    //

    int32_t startPos = (int32_t)fMatchEnd;
    if (startPos==0) {
        startPos = (int32_t)fActiveStart;
    }

    const UChar *inputBuf = fInputText->chunkContents;

    if (fMatch) {
        // Save the position of any previous successful match.
        fLastMatchEnd = fMatchEnd;

        if (fMatchStart == fMatchEnd) {
            // Previous match had zero length.  Move start position up one position
            //  to avoid sending find() into a loop on zero-length matches.
            if (startPos >= fActiveLimit) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            U16_FWD_1(inputBuf, startPos, fInputLength);
        }
    } else {
        if (fLastMatchEnd >= 0) {
            // A previous find() failed to match.  Don't try again.
            //   (without this test, a pattern with a zero-length match
            //    could match again at the end of an input string.)
            fHitEnd = true;
            return false;
        }
    }


    // Compute the position in the input string beyond which a match can not begin, because
    //   the minimum length match would extend past the end of the input.
    //   Note:  some patterns that cannot match anything will have fMinMatchLength==Max Int.
    //          Be aware of possible overflows if making changes here.
    //   Note:  a match can begin at inputBuf + testLen; it is an inclusive limit.
    int32_t testLen  = (int32_t)(fActiveLimit - fPattern->fMinMatchLen);
    if (startPos > testLen) {
        fMatch = false;
        fHitEnd = true;
        return false;
    }

    UChar32  c;
    U_ASSERT(startPos >= 0);

    switch (fPattern->fStartType) {
    case START_NO_INFO:
        // No optimization was found.
        //  Try a match at each input position.
        for (;;) {
            MatchChunkAt(startPos, false, status);
            if (U_FAILURE(status)) {
                return false;
            }
            if (fMatch) {
                return true;
            }
            if (startPos >= testLen) {
                fHitEnd = true;
                return false;
            }
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
            // Note that it's perfectly OK for a pattern to have a zero-length
            //   match at the end of a string, so we must make sure that the loop
            //   runs with startPos == testLen the last time through.
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
        UPRV_UNREACHABLE_EXIT;

    case START_START:
        // Matches are only possible at the start of the input string
        //   (pattern begins with ^ or \A)
        if (startPos > fActiveStart) {
            fMatch = false;
            return false;
        }
        MatchChunkAt(startPos, false, status);
        if (U_FAILURE(status)) {
            return false;
        }
        return fMatch;


    case START_SET:
    {
        // Match may start on any char from a pre-computed set.
        U_ASSERT(fPattern->fMinMatchLen > 0);
        for (;;) {
            int32_t pos = startPos;
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
            if ((c<256 && fPattern->fInitialChars8->contains(c)) ||
                (c>=256 && fPattern->fInitialChars->contains(c))) {
                MatchChunkAt(pos, false, status);
                if (U_FAILURE(status)) {
                    return false;
                }
                if (fMatch) {
                    return true;
                }
            }
            if (startPos > testLen) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
    }
    UPRV_UNREACHABLE_EXIT;

    case START_STRING:
    case START_CHAR:
    {
        // Match starts on exactly one char.
        U_ASSERT(fPattern->fMinMatchLen > 0);
        UChar32 theChar = fPattern->fInitialChar;
        for (;;) {
            int32_t pos = startPos;
            U16_NEXT(inputBuf, startPos, fActiveLimit, c);  // like c = inputBuf[startPos++];
            if (c == theChar) {
                MatchChunkAt(pos, false, status);
                if (U_FAILURE(status)) {
                    return false;
                }
                if (fMatch) {
                    return true;
                }
            }
            if (startPos > testLen) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            if  (findProgressInterrupt(startPos, status))
                return false;
        }
    }
    UPRV_UNREACHABLE_EXIT;

    case START_LINE:
    {
        UChar32 ch;
        if (startPos == fAnchorStart) {
            MatchChunkAt(startPos, false, status);
            if (U_FAILURE(status)) {
                return false;
            }
            if (fMatch) {
                return true;
            }
            // U16_FWD_1 reads inputBuf[startPos] unconditionally, so it must not
            //   run once startPos has reached the end of the active range.  With
            //   nothing left to advance over, no later line start can exist;
            //   report the same "no match, hit end" result the loops below give.
            if (startPos >= fActiveLimit) {
                fMatch = false;
                fHitEnd = true;
                return false;
            }
            U16_FWD_1(inputBuf, startPos, fActiveLimit);
        }

        if (fPattern->fFlags & UREGEX_UNIX_LINES) {
            for (;;) {
                // A line can only start immediately after a line feed.
                ch = inputBuf[startPos-1];
                if (ch == 0x0a) {
                    MatchChunkAt(startPos, false, status);
                    if (U_FAILURE(status)) {
                        return false;
                    }
                    if (fMatch) {
                        return true;
                    }
                }
                if (startPos >= testLen) {
                    fMatch = false;
                    fHitEnd = true;
                    return false;
                }
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
                // Note that it's perfectly OK for a pattern to have a zero-length
                //   match at the end of a string, so we must make sure that the loop
                //   runs with startPos == testLen the last time through.
                if  (findProgressInterrupt(startPos, status))
                    return false;
            }
        } else {
            for (;;) {
                ch = inputBuf[startPos-1];
                if (isLineTerminator(ch)) {
                    // Treat CR LF as a single terminator: step over the LF as well.
                    if (ch == 0x0d && startPos < fActiveLimit && inputBuf[startPos] == 0x0a) {
                        startPos++;
                    }
                    MatchChunkAt(startPos, false, status);
                    if (U_FAILURE(status)) {
                        return false;
                    }
                    if (fMatch) {
                        return true;
                    }
                }
                if (startPos >= testLen) {
                    fMatch = false;
                    fHitEnd = true;
                    return false;
                }
                U16_FWD_1(inputBuf, startPos, fActiveLimit);
                // Note that it's perfectly OK for a pattern to have a zero-length
                //   match at the end of a string, so we must make sure that the loop
                //   runs with startPos == testLen the last time through.
                if  (findProgressInterrupt(startPos, status))
                    return false;
            }
        }
    }

    default:
        UPRV_UNREACHABLE_ASSERT;
        // Unknown value in fPattern->fStartType, should be from StartOfMatch enum. But
        // we have reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
        // See ICU-21669.
        status = U_INTERNAL_PROGRAM_ERROR;
        return false;
    }

    UPRV_UNREACHABLE_EXIT;
}
1153
1154
1155
1156
//--------------------------------------------------------------------------------
1157
//
1158
//  group()
1159
//
1160
//--------------------------------------------------------------------------------
1161
0
//  Convenience overload: forwards to group(groupNum, status) with group
//  number 0, which the numbered overloads treat as the whole match.
UnicodeString RegexMatcher::group(UErrorCode &status) const {
    const int32_t wholeMatchGroup = 0;
    return group(wholeMatchGroup, status);
}
1164
1165
//  Return immutable shallow clone
1166
0
//  Convenience overload: forwards to group(groupNum, dest, group_len, status)
//  with group number 0, which that overload maps to fMatchStart..fMatchEnd.
UText *RegexMatcher::group(UText *dest, int64_t &group_len, UErrorCode &status) const {
    const int32_t wholeMatchGroup = 0;
    return group(wholeMatchGroup, dest, group_len, status);
}
1169
1170
//  Return immutable shallow clone
1171
0
//  Returns a clone of the input text (made with utext_clone) positioned at the
//  start of capture group `groupNum`, and stores the group's native length in
//  `group_len`.
//
//    groupNum   0 selects the whole match; 1..fGroupMap->size() select capture groups.
//    dest       passed through to utext_clone(); returned unchanged on error.
//    group_len  always zeroed first; set to (end - start) only for a group
//               that took part in the match.
//    status     U_REGEX_INVALID_STATE when there is no current match,
//               U_INDEX_OUTOFBOUNDS_ERROR for a bad group number, or the
//               matcher's deferred failure.
UText *RegexMatcher::group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const {
    group_len = 0;
    if (U_FAILURE(status)) {
        return dest;
    }

    // Precondition checks, in priority order; each one reports and bails out.
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return dest;
    }
    if (!fMatch) {
        status = U_REGEX_INVALID_STATE;
        return dest;
    }
    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return dest;
    }

    // Locate the group's bounds.  Group 0 is the overall match; other groups
    // keep their start/end in two adjacent slots of the current stack frame.
    int64_t groupStart = fMatchStart;
    int64_t groupLimit = fMatchEnd;
    if (groupNum != 0) {
        const int32_t frameSlot = fPattern->fGroupMap->elementAti(groupNum-1);
        U_ASSERT(frameSlot < fPattern->fFrameSize);
        U_ASSERT(frameSlot >= 0);
        groupStart = fFrame->fExtra[frameSlot];
        groupLimit = fFrame->fExtra[frameSlot+1];
    }

    if (groupStart < 0) {
        // A capture group wasn't part of the match
        return utext_clone(dest, fInputText, false, true, &status);
    }
    U_ASSERT(groupStart <= groupLimit);
    group_len = groupLimit - groupStart;

    dest = utext_clone(dest, fInputText, false, true, &status);
    if (dest) {
        UTEXT_SETNATIVEINDEX(dest, groupStart);
    }
    return dest;
}
1212
1213
0
//  Returns the text of capture group `groupNum` as a UnicodeString.
//
//  The result is empty when status is (or becomes) a failure, when the group
//  start is -1, or when the group is zero length.  U_MEMORY_ALLOCATION_ERROR
//  is reported if the result buffer cannot be obtained.
UnicodeString RegexMatcher::group(int32_t groupNum, UErrorCode &status) const {
    UnicodeString text;
    const int64_t from = start64(groupNum, status);
    const int64_t to   = end64(groupNum, status);
    if (U_FAILURE(status) || from == -1 || from == to) {
        return text;
    }

    // Get the group length using a utext_extract preflight.
    //    UText is actually pretty efficient at this when underlying encoding is UTF-16.
    // With a zero-capacity destination the only outcome that lets us proceed
    // is U_BUFFER_OVERFLOW_ERROR; anything else returns the empty string.
    const int32_t needed = utext_extract(fInputText, from, to, NULL, 0, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR) {
        return text;
    }
    status = U_ZERO_ERROR;

    UChar *storage = text.getBuffer(needed);
    if (storage == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return text;
    }

    const int32_t copied = utext_extract(fInputText, from, to, storage, needed, &status);
    text.releaseBuffer(copied);
    U_ASSERT(needed == copied);
    return text;
}
1239
1240
1241
//--------------------------------------------------------------------------------
1242
//
1243
//  appendGroup() -- currently internal only, appends a group to a UText rather
1244
//                   than replacing its contents
1245
//
1246
//--------------------------------------------------------------------------------
1247
1248
0
//  Appends the text of capture group `groupNum` to the end of `dest` and
//  returns the value produced by utext_replace().
//
//    Returns 0 when status is already a failure, when the matcher has a
//    deferred failure, or when the temporary buffer cannot be allocated
//    (U_MEMORY_ALLOCATION_ERROR).
//    Sets U_REGEX_INVALID_STATE when there is no current match and
//    U_INDEX_OUTOFBOUNDS_ERROR for a bad group number; in both cases the
//    return value is that of a utext_replace() call made with the failed status.
int64_t RegexMatcher::appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return 0;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return 0;
    }

    // Everything is inserted at the current end of dest.
    const int64_t appendAt = utext_nativeLength(dest);

    if (!fMatch) {
        status = U_REGEX_INVALID_STATE;
        return utext_replace(dest, appendAt, appendAt, NULL, 0, &status);
    }
    if (groupNum < 0 || groupNum > fPattern->fGroupMap->size()) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return utext_replace(dest, appendAt, appendAt, NULL, 0, &status);
    }

    // Group 0 is the overall match; other groups keep their bounds in two
    // adjacent slots of the current stack frame.
    int64_t groupStart = fMatchStart;
    int64_t groupLimit = fMatchEnd;
    if (groupNum != 0) {
        const int32_t frameSlot = fPattern->fGroupMap->elementAti(groupNum-1);
        U_ASSERT(frameSlot < fPattern->fFrameSize);
        U_ASSERT(frameSlot >= 0);
        groupStart = fFrame->fExtra[frameSlot];
        groupLimit = fFrame->fExtra[frameSlot+1];
    }

    if (groupStart < 0) {
        // A capture group wasn't part of the match
        return utext_replace(dest, appendAt, appendAt, NULL, 0, &status);
    }
    U_ASSERT(groupStart <= groupLimit);

    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
        // The whole input sits in the chunk buffer: append straight from it.
        U_ASSERT(groupLimit <= fInputLength);
        return utext_replace(dest, appendAt, appendAt,
                             fInputText->chunkContents+groupStart,
                             (int32_t)(groupLimit-groupStart), &status);
    }

    // Otherwise copy the group into a temporary UTF-16 buffer first.
    int32_t len16;
    if (UTEXT_USES_U16(fInputText)) {
        len16 = (int32_t)(groupLimit-groupStart);
    } else {
        // Preflight for the UTF-16 length; its status is deliberately kept
        // apart from the caller's status.
        UErrorCode lengthStatus = U_ZERO_ERROR;
        len16 = utext_extract(fInputText, groupStart, groupLimit, NULL, 0, &lengthStatus);
    }

    UChar *groupChars = (UChar *)uprv_malloc(sizeof(UChar)*(len16+1));
    if (groupChars == NULL) {
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    utext_extract(fInputText, groupStart, groupLimit, groupChars, len16+1, &status);

    const int64_t deltaLen = utext_replace(dest, appendAt, appendAt, groupChars, len16, &status);
    uprv_free(groupChars);
    return deltaLen;
}
1309
1310
1311
1312
//--------------------------------------------------------------------------------
1313
//
1314
//  groupCount()
1315
//
1316
//--------------------------------------------------------------------------------
1317
0
//  Returns the number of entries in the pattern's capture group map.
int32_t RegexMatcher::groupCount() const {
    const int32_t numGroups = fPattern->fGroupMap->size();
    return numGroups;
}
1320
1321
//--------------------------------------------------------------------------------
1322
//
1323
//  hasAnchoringBounds()
1324
//
1325
//--------------------------------------------------------------------------------
1326
0
//  Returns the current value of the fAnchoringBounds flag.
UBool RegexMatcher::hasAnchoringBounds() const {
    const UBool anchoring = fAnchoringBounds;
    return anchoring;
}
1329
1330
1331
//--------------------------------------------------------------------------------
1332
//
1333
//  hasTransparentBounds()
1334
//
1335
//--------------------------------------------------------------------------------
1336
0
//  Returns the current value of the fTransparentBounds flag.
UBool RegexMatcher::hasTransparentBounds() const {
    const UBool transparent = fTransparentBounds;
    return transparent;
}
1339
1340
1341
1342
//--------------------------------------------------------------------------------
1343
//
1344
//  hitEnd()
1345
//
1346
//--------------------------------------------------------------------------------
1347
0
//  Returns the fHitEnd flag as left by the most recent match operation.
UBool RegexMatcher::hitEnd() const {
    const UBool reachedEnd = fHitEnd;
    return reachedEnd;
}
1350
1351
1352
//--------------------------------------------------------------------------------
1353
//
1354
//  input()
1355
//
1356
//--------------------------------------------------------------------------------
1357
0
//  Returns the input text as a UnicodeString.
//
//  When fInput is not set, a UnicodeString copy of fInputText is built on
//  first use and cached in fInput; later calls return the cached object.
//
//  NOTE(review): the result of `new` is used without a null check, and errors
//  from utext_extract() are not reported to the caller.
const UnicodeString &RegexMatcher::input() const {
    if (fInput) {
        return *fInput;
    }

    UErrorCode status = U_ZERO_ERROR;

    // Work out how many UTF-16 code units the copy needs.
    int32_t len16;
    if (UTEXT_USES_U16(fInputText)) {
        len16 = (int32_t)fInputLength;
    } else {
        // Preflight: expected to end in a buffer overflow status, which is cleared.
        len16 = utext_extract(fInputText, 0, fInputLength, NULL, 0, &status);
        status = U_ZERO_ERROR; // overflow, length status
    }

    UnicodeString *copy = new UnicodeString(len16, 0, 0);

    UChar *copyChars = copy->getBuffer(len16);
    utext_extract(fInputText, 0, fInputLength, copyChars, len16, &status); // unterminated warning
    copy->releaseBuffer(len16);

    // This is a const method, so the cache member is written through a cast.
    (*(const UnicodeString **)&fInput) = copy; // pointer assignment, rather than operator=

    return *fInput;
}
1378
1379
//--------------------------------------------------------------------------------
1380
//
1381
//  inputText()
1382
//
1383
//--------------------------------------------------------------------------------
1384
0
//  Returns the matcher's own fInputText pointer (no clone is made).
UText *RegexMatcher::inputText() const {
    UText *text = fInputText;
    return text;
}
1387
1388
1389
//--------------------------------------------------------------------------------
1390
//
1391
//  getInput() -- like inputText(), but makes a clone or copies into another UText
1392
//
1393
//--------------------------------------------------------------------------------
1394
0
//  Returns the input text, either copied into `dest` or as a new clone.
//
//    dest    When non-NULL, its whole contents are replaced with the input
//            text and `dest` is returned.  When NULL, the result of
//            utext_clone() on fInputText is returned.
//    status  Returned untouched if already a failure.  Receives the matcher's
//            deferred failure if there is one, and U_MEMORY_ALLOCATION_ERROR
//            if the temporary copy buffer cannot be allocated.
UText *RegexMatcher::getInput (UText *dest, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return dest;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return dest;
    }

    if (dest) {
        if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
            // The whole input sits in the chunk buffer: copy straight from it.
            utext_replace(dest, 0, utext_nativeLength(dest), fInputText->chunkContents, (int32_t)fInputLength, &status);
        } else {
            // Find out how many UTF-16 code units the input needs.
            int32_t input16Len;
            if (UTEXT_USES_U16(fInputText)) {
                input16Len = (int32_t)fInputLength;
            } else {
                UErrorCode lengthStatus = U_ZERO_ERROR;
                input16Len = utext_extract(fInputText, 0, fInputLength, NULL, 0, &lengthStatus); // buffer overflow error
            }
            UChar *inputChars = (UChar *)uprv_malloc(sizeof(UChar)*(input16Len));
            if (inputChars == NULL) {
                // Report the failure; otherwise the caller would see success
                // while dest still holds its old contents.
                status = U_MEMORY_ALLOCATION_ERROR;
                return dest;
            }

            status = U_ZERO_ERROR;
            utext_extract(fInputText, 0, fInputLength, inputChars, input16Len, &status); // not terminated warning
            // NOTE(review): this clears whatever utext_extract reported, not
            // only the expected "not terminated" warning.
            status = U_ZERO_ERROR;
            utext_replace(dest, 0, utext_nativeLength(dest), inputChars, input16Len, &status);

            uprv_free(inputChars);
        }
        return dest;
    } else {
        return utext_clone(NULL, fInputText, false, true, &status);
    }
}
1431
1432
1433
//  Re-synchronizes a UText's chunk description with the UnicodeString stored
//  in ut->context when the string's length no longer matches what the UText
//  recorded.
//
//  Returns true when the chunk fields were refreshed, false when the lengths
//  already agreed and nothing was touched.
static UBool compat_SyncMutableUTextContents(UText *ut);
static UBool compat_SyncMutableUTextContents(UText *ut) {
    //  In the following test, we're really only interested in whether the UText should switch
    //  between heap and stack allocation.  If length hasn't changed, we won't, so the chunkContents
    //  will still point to the correct data.
    if (utext_nativeLength(ut) == ut->nativeIndexingLimit) {
        return false;
    }

    const UnicodeString *backing = static_cast<const UnicodeString *>(ut->context);

    // Pick up the string's current length ...
    const int32_t currentLength = backing->length();

    // ... and rebuild the chunk description around it.
    // The buffer may have switched between stack- and heap-based.
    ut->chunkContents       = backing->getBuffer();
    ut->chunkLength         = currentLength;
    ut->chunkNativeLimit    = currentLength;
    ut->nativeIndexingLimit = currentLength;

    return true;
}
1458
1459
//--------------------------------------------------------------------------------
1460
//
1461
//  lookingAt()
1462
//
1463
//--------------------------------------------------------------------------------
1464
0
//  Attempts a match starting exactly at fActiveStart.  The match is run with
//  the "to end" flag false, so it need not consume the whole active range.
//
//  Returns fMatch after the attempt, or false immediately if `status` is
//  already a failure or the matcher has a deferred failure (copied to status).
UBool RegexMatcher::lookingAt(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }

    if (!fInputUniStrMaybeMutable) {
        resetPreserveRegion();
    } else if (compat_SyncMutableUTextContents(fInputText)) {
        // The underlying string changed length: pick up the new length and
        // do a full reset.
        fInputLength = utext_nativeLength(fInputText);
        reset();
    }

    const bool wholeTextInChunk = UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength);
    if (wholeTextInChunk) {
        MatchChunkAt((int32_t)fActiveStart, false, status);
    } else {
        MatchAt(fActiveStart, false, status);
    }
    return fMatch;
}
1489
1490
1491
0
//  Resets the matcher, then attempts a match starting exactly at the native
//  index `start`.  The match is run with the "to end" flag false.
//
//  Returns fMatch after the attempt.  Returns false without matching when
//  `status` is already a failure, when the matcher has a deferred failure,
//  or when `start` is negative or outside [fActiveStart, fActiveLimit]
//  (U_INDEX_OUTOFBOUNDS_ERROR).
UBool RegexMatcher::lookingAt(int64_t start, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }
    reset();

    if (start < 0) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }

    // If the underlying string changed length, pick up the new length and
    // reset again so the active range reflects it.
    if (fInputUniStrMaybeMutable && compat_SyncMutableUTextContents(fInputText)) {
        fInputLength = utext_nativeLength(fInputText);
        reset();
    }

    const bool startInRange = (start >= fActiveStart) && (start <= fActiveLimit);
    if (!startInRange) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }

    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
        MatchChunkAt((int32_t)start, false, status);
    } else {
        MatchAt(start, false, status);
    }
    return fMatch;
}
1527
1528
1529
1530
//--------------------------------------------------------------------------------
1531
//
1532
//  matches()
1533
//
1534
//--------------------------------------------------------------------------------
1535
0
//  Attempts a match starting at fActiveStart with the "to end" flag true.
//
//  Returns fMatch after the attempt, or false immediately if `status` is
//  already a failure or the matcher has a deferred failure (copied to status).
UBool RegexMatcher::matches(UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }

    if (!fInputUniStrMaybeMutable) {
        resetPreserveRegion();
    } else if (compat_SyncMutableUTextContents(fInputText)) {
        // The underlying string changed length: pick up the new length and
        // do a full reset.
        fInputLength = utext_nativeLength(fInputText);
        reset();
    }

    const bool wholeTextInChunk = UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength);
    if (wholeTextInChunk) {
        MatchChunkAt((int32_t)fActiveStart, true, status);
    } else {
        MatchAt(fActiveStart, true, status);
    }
    return fMatch;
}
1561
1562
1563
0
//  Resets the matcher, then attempts a match starting at the native index
//  `start` with the "to end" flag true.
//
//  Returns fMatch after the attempt.  Returns false without matching when
//  `status` is already a failure, when the matcher has a deferred failure,
//  or when `start` is negative or outside [fActiveStart, fActiveLimit]
//  (U_INDEX_OUTOFBOUNDS_ERROR).
UBool RegexMatcher::matches(int64_t start, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return false;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return false;
    }
    reset();

    if (start < 0) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }

    // If the underlying string changed length, pick up the new length and
    // reset again so the active range reflects it.
    if (fInputUniStrMaybeMutable && compat_SyncMutableUTextContents(fInputText)) {
        fInputLength = utext_nativeLength(fInputText);
        reset();
    }

    const bool startInRange = (start >= fActiveStart) && (start <= fActiveLimit);
    if (!startInRange) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return false;
    }

    if (UTEXT_FULL_TEXT_IN_CHUNK(fInputText, fInputLength)) {
        MatchChunkAt((int32_t)start, true, status);
    } else {
        MatchAt(start, true, status);
    }
    return fMatch;
}
1599
1600
1601
1602
//--------------------------------------------------------------------------------
1603
//
1604
//    pattern
1605
//
1606
//--------------------------------------------------------------------------------
1607
0
//  Return a reference to the RegexPattern object held in fPattern.
const RegexPattern &RegexMatcher::pattern() const {
    const RegexPattern &compiledPattern = *fPattern;
    return compiledPattern;
}
1610
1611
1612
1613
//--------------------------------------------------------------------------------
1614
//
1615
//    region
1616
//
1617
//--------------------------------------------------------------------------------
1618
0
//  Set the region of the input that matching operates on.
//
//    @param regionStart  Native index of the start of the region.
//    @param regionLimit  Native index just past the end of the region.
//    @param startIndex   Position stored in fMatchEnd, or -1 to perform a
//                        full reset() instead.
//    @param status       In/out error code.
//                        U_ILLEGAL_ARGUMENT_ERROR: the region is inverted,
//                        negative, or extends past fInputLength.
//                        U_INDEX_OUTOFBOUNDS_ERROR: startIndex (when not -1)
//                        lies outside [regionStart, regionLimit].
//    @return             *this
//
//  All arguments are validated before any matcher state is touched, so a
//  rejected call leaves the matcher exactly as it was.  (Previously the error
//  code was set but the invalid bounds were installed anyway.)
RegexMatcher &RegexMatcher::region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }

    if (regionStart > regionLimit || regionStart < 0 || regionLimit < 0 ||
            regionStart > fInputLength || regionLimit > fInputLength) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    const UBool hasStartIndex = (startIndex != -1);
    if (hasStartIndex && (startIndex < regionStart || startIndex > regionLimit)) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return *this;
    }

    if (hasStartIndex) {
        resetPreserveRegion();
    } else {
        reset();
    }

    fRegionStart = regionStart;
    fRegionLimit = regionLimit;
    fActiveStart = regionStart;
    fActiveLimit = regionLimit;

    if (hasStartIndex) {
        fMatchEnd = startIndex;
    }

    // The look and anchor bounds follow the region only for the
    // corresponding bounds mode.
    if (!fTransparentBounds) {
        fLookStart = regionStart;
        fLookLimit = regionLimit;
    }
    if (fAnchoringBounds) {
        fAnchorStart = regionStart;
        fAnchorLimit = regionLimit;
    }
    return *this;
}
1660
1661
0
//  Three-argument form: forwards to the four-argument region() with a
//  startIndex of -1.
RegexMatcher &RegexMatcher::region(int64_t start, int64_t limit, UErrorCode &status) {
    const int64_t noStartIndex = -1;
    return region(start, limit, noStartIndex, status);
}
1664
1665
//--------------------------------------------------------------------------------
1666
//
1667
//    regionEnd
1668
//
1669
//--------------------------------------------------------------------------------
1670
0
//  Return fRegionLimit converted to 32 bits.
int32_t RegexMatcher::regionEnd() const {
    return static_cast<int32_t>(fRegionLimit);
}
1673
1674
0
//  Return fRegionLimit at full 64 bit width.
int64_t RegexMatcher::regionEnd64() const {
    const int64_t regionLimit = fRegionLimit;
    return regionLimit;
}
1677
1678
//--------------------------------------------------------------------------------
1679
//
1680
//    regionStart
1681
//
1682
//--------------------------------------------------------------------------------
1683
0
//  Return fRegionStart converted to 32 bits.
int32_t RegexMatcher::regionStart() const {
    return static_cast<int32_t>(fRegionStart);
}
1686
1687
0
//  Return fRegionStart at full 64 bit width.
int64_t RegexMatcher::regionStart64() const {
    const int64_t regionStart = fRegionStart;
    return regionStart;
}
1690
1691
1692
//--------------------------------------------------------------------------------
1693
//
1694
//    replaceAll
1695
//
1696
//--------------------------------------------------------------------------------
1697
0
//  UnicodeString flavor of replaceAll.  Wraps the replacement string and a
//  fresh result string in stack UText objects and delegates to the UText
//  overload.
//
//    @param replacement  The replacement text.
//    @param status       In/out error code; an incoming failure returns an
//                        empty string without doing any work.
//    @return             The result string filled in by the UText overload.
UnicodeString RegexMatcher::replaceAll(const UnicodeString &replacement, UErrorCode &status) {
    UnicodeString output;
    if (U_FAILURE(status)) {
        return output;
    }

    UText replacementUT = UTEXT_INITIALIZER;
    UText outputUT      = UTEXT_INITIALIZER;
    utext_openConstUnicodeString(&replacementUT, &replacement, &status);
    utext_openUnicodeString(&outputUT, &output, &status);

    replaceAll(&replacementUT, &outputUT, status);

    // Close in reverse order of opening.
    utext_close(&outputUT);
    utext_close(&replacementUT);

    return output;
}
1715
1716
1717
//
1718
//    replaceAll, UText mode
1719
//
1720
0
//  UText flavor of replaceAll.  Resets the matcher, then for every find()
//  appends the replacement to dest, and finally appends the tail.
//
//    @param replacement  The replacement text.
//    @param dest         Destination UText.  When NULL, a new UText is created
//                        by deep-cloning an empty string.
//    @param status       In/out error code.
//    @return             dest (possibly newly created).
UText *RegexMatcher::replaceAll(UText *replacement, UText *dest, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return dest;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return dest;
    }

    if (dest == NULL) {
        // No destination supplied: build one from an empty UnicodeString.
        UnicodeString blank;
        UText blankUT = UTEXT_INITIALIZER;

        utext_openUnicodeString(&blankUT, &blank, &status);
        dest = utext_clone(NULL, &blankUT, true, false, &status);
        utext_close(&blankUT);
    }

    if (U_FAILURE(status)) {
        return dest;
    }

    reset();
    // Stop searching as soon as an append reports a failure; the status test
    // comes first so find() is not called again after an error.
    while (U_SUCCESS(status) && find()) {
        appendReplacement(dest, replacement, status);
    }
    appendTail(dest, status);

    return dest;
}
1751
1752
1753
//--------------------------------------------------------------------------------
1754
//
1755
//    replaceFirst
1756
//
1757
//--------------------------------------------------------------------------------
1758
0
//  UnicodeString flavor of replaceFirst.  Wraps the replacement string and a
//  fresh result string in stack UText objects and delegates to the UText
//  overload.
//
//    @param replacement  The replacement text.
//    @param status       In/out error code; an incoming failure returns an
//                        empty string without doing any work, matching the
//                        behavior of replaceAll().
//    @return             The result string filled in by the UText overload.
UnicodeString RegexMatcher::replaceFirst(const UnicodeString &replacement, UErrorCode &status) {
    UText replacementText = UTEXT_INITIALIZER;
    UText resultText = UTEXT_INITIALIZER;
    UnicodeString resultString;
    if (U_FAILURE(status)) {
        // Same explicit guard as replaceAll(); avoids opening and closing
        // UTexts for a call that cannot succeed.
        return resultString;
    }

    utext_openConstUnicodeString(&replacementText, &replacement, &status);
    utext_openUnicodeString(&resultText, &resultString, &status);

    replaceFirst(&replacementText, &resultText, status);

    utext_close(&resultText);
    utext_close(&replacementText);

    return resultString;
}
1773
1774
//
1775
//    replaceFirst, UText mode
1776
//
1777
0
//  UText flavor of replaceFirst.  Resets the matcher and looks for one match.
//  Without a match the result of getInput(dest, status) is returned; with a
//  match the replacement and then the tail are appended to dest.
//
//    @param replacement  The replacement text.
//    @param dest         Destination UText.  When NULL (and a match exists),
//                        a new UText is created by deep-cloning an empty string.
//    @param status       In/out error code.
//    @return             The destination UText.
UText *RegexMatcher::replaceFirst(UText *replacement, UText *dest, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return dest;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return dest;
    }

    reset();
    const UBool foundMatch = find();
    if (!foundMatch) {
        return getInput(dest, status);
    }

    if (dest == NULL) {
        // No destination supplied: build one from an empty UnicodeString.
        UnicodeString blank;
        UText blankUT = UTEXT_INITIALIZER;

        utext_openUnicodeString(&blankUT, &blank, &status);
        dest = utext_clone(NULL, &blankUT, true, false, &status);
        utext_close(&blankUT);
    }

    appendReplacement(dest, replacement, status);
    appendTail(dest, status);

    return dest;
}
1805
1806
1807
//--------------------------------------------------------------------------------
1808
//
1809
//     requireEnd
1810
//
1811
//--------------------------------------------------------------------------------
1812
0
//  Return the current value of the fRequireEnd flag.
UBool RegexMatcher::requireEnd() const {
    const UBool flag = fRequireEnd;
    return flag;
}
1815
1816
1817
//--------------------------------------------------------------------------------
1818
//
1819
//     reset
1820
//
1821
//--------------------------------------------------------------------------------
1822
7.89k
//  Full reset: every range (region, active, anchor, look) is widened to
//  [0, fInputLength], then the per-match state is cleared.
RegexMatcher &RegexMatcher::reset() {
    // All range starts go back to the beginning of the input ...
    fRegionStart = fActiveStart = fAnchorStart = fLookStart = 0;
    // ... and all range limits to the end of the input.
    fRegionLimit = fActiveLimit = fAnchorLimit = fLookLimit = fInputLength;

    resetPreserveRegion();
    return *this;
}
1834
1835
1836
1837
7.89k
void RegexMatcher::resetPreserveRegion() {
1838
7.89k
    fMatchStart     = 0;
1839
7.89k
    fMatchEnd       = 0;
1840
7.89k
    fLastMatchEnd   = -1;
1841
7.89k
    fAppendPosition = 0;
1842
7.89k
    fMatch          = false;
1843
7.89k
    fHitEnd         = false;
1844
7.89k
    fRequireEnd     = false;
1845
7.89k
    fTime           = 0;
1846
7.89k
    fTickCounter    = TIMER_INITIAL_VALUE;
1847
    //resetStack(); // more expensive than it looks...
1848
7.89k
}
1849
1850
1851
0
//  Switch the matcher to a new UnicodeString input and reset it.
//  Failures are recorded in fDeferredStatus rather than reported directly.
//
//    @param input  The new input string.
//    @return       *this
RegexMatcher &RegexMatcher::reset(const UnicodeString &input) {
    fInputText = utext_openConstUnicodeString(fInputText, &input, &fDeferredStatus);
    if (fPattern->fNeedsAltInput) {
        // The pattern asked for a second, independent view of the input.
        fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
    }
    if (U_FAILURE(fDeferredStatus)) {
        return *this;
    }

    fInputLength = utext_nativeLength(fInputText);
    reset();

    delete fInput;
    fInput = NULL;

    //  Set for every UnicodeString input, for compatibility with clients
    //  that modify the input string "live" during regex operations.
    fInputUniStrMaybeMutable = true;

#if UCONFIG_NO_BREAK_ITERATION==0
    // Point any existing break iterators at the new text.
    if (fWordBreakItr) {
        fWordBreakItr->setText(fInputText, fDeferredStatus);
    }
    if (fGCBreakItr) {
        fGCBreakItr->setText(fInputText, fDeferredStatus);
    }
#endif

    return *this;
}
1880
1881
1882
3.94k
//  Switch the matcher to a new UText input and reset it.
//  When `input` is the very UText object already held in fInputText, only
//  the reset is performed.  Failures are recorded in fDeferredStatus.
//
//    @param input  The new input text.
//    @return       *this
RegexMatcher &RegexMatcher::reset(UText *input) {
    if (fInputText != input) {
        fInputText = utext_clone(fInputText, input, false, true, &fDeferredStatus);
        if (fPattern->fNeedsAltInput) fAltInputText = utext_clone(fAltInputText, fInputText, false, true, &fDeferredStatus);
        if (U_FAILURE(fDeferredStatus)) {
            return *this;
        }
        fInputLength = utext_nativeLength(fInputText);

        delete fInput;
        fInput = NULL;

#if UCONFIG_NO_BREAK_ITERATION==0
        // Both break iterators are handed the matcher's own clone of the
        // text, as reset(const UnicodeString &) does, instead of mixing the
        // caller's UText for one and the clone for the other.
        if (fWordBreakItr) {
            fWordBreakItr->setText(fInputText, fDeferredStatus);
        }
        if (fGCBreakItr) {
            fGCBreakItr->setText(fInputText, fDeferredStatus);
        }
#endif
    }
    reset();
    // A UText input is not subject to the mutable-UnicodeString compatibility path.
    fInputUniStrMaybeMutable = false;

    return *this;
}
1908
1909
/*RegexMatcher &RegexMatcher::reset(const UChar *) {
1910
    fDeferredStatus = U_INTERNAL_PROGRAM_ERROR;
1911
    return *this;
1912
}*/
1913
1914
0
//  Reset the matcher and store `position` in fMatchEnd.
//
//    @param position  Native index; must lie in [0, fActiveLimit] as it
//                     stands after the reset.
//    @param status    In/out error code; U_INDEX_OUTOFBOUNDS_ERROR for a bad
//                     position (the matcher has still been reset).
//    @return          *this
RegexMatcher &RegexMatcher::reset(int64_t position, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }

    // reset() also widens the region to the entire input.
    reset();

    const UBool positionInRange = (0 <= position && position <= fActiveLimit);
    if (positionInRange) {
        fMatchEnd = position;
    } else {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
    }
    return *this;
}
1927
1928
1929
//--------------------------------------------------------------------------------
1930
//
1931
//    refreshInputText
1932
//
1933
//--------------------------------------------------------------------------------
1934
0
//  Re-point the matcher's input UText(s) at `input` while keeping their
//  current native iteration index.  No match state is reset.
//
//    @param input   Replacement UText; must be non-NULL and have the same
//                   native length as the current input, otherwise
//                   U_ILLEGAL_ARGUMENT_ERROR is set.
//    @param status  In/out error code.
//    @return        *this
RegexMatcher &RegexMatcher::refreshInputText(UText *input, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return *this;
    }
    // The NULL test short-circuits, so a NULL input is never passed on to
    // utext_nativeLength().
    if (input == NULL || utext_nativeLength(fInputText) != utext_nativeLength(input)) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return *this;
    }

    //  Shallow read-only clone of the new UText into the existing input UText,
    //  restoring the saved index afterwards.
    const int64_t savedInputIndex = utext_getNativeIndex(fInputText);
    fInputText = utext_clone(fInputText, input, false, true, &status);
    if (U_FAILURE(status)) {
        return *this;
    }
    utext_setNativeIndex(fInputText, savedInputIndex);

    if (fAltInputText == NULL) {
        return *this;
    }

    // Same treatment for the alternate input, when one exists.
    const int64_t savedAltIndex = utext_getNativeIndex(fAltInputText);
    fAltInputText = utext_clone(fAltInputText, input, false, true, &status);
    if (U_FAILURE(status)) {
        return *this;
    }
    utext_setNativeIndex(fAltInputText, savedAltIndex);
    return *this;
}
1964
1965
1966
1967
//--------------------------------------------------------------------------------
1968
//
1969
//    setTrace
1970
//
1971
//--------------------------------------------------------------------------------
1972
0
//  Store the requested trace setting in fTraceDebug.
void RegexMatcher::setTrace(UBool state) {
    this->fTraceDebug = state;
}
1975
1976
1977
1978
/**
1979
  *  UText, replace entire contents of the destination UText with a substring of the source UText.
1980
  *
1981
  *     @param src    The source UText
1982
  *     @param dest   The destination UText. Must be writable.
1983
  *                   May be NULL, in which case a new UText will be allocated.
1984
  *     @param start  Start index of source substring.
1985
  *     @param limit  Limit index of source substring.
1986
  *     @param status An error code.
1987
  */
1988
0
//  Replace the entire contents of `dest` with the [start, limit) substring of
//  `src`.  When dest is NULL a new UText is opened instead; for a non-empty
//  substring that new UText is flagged as owning its text buffer.
//  Returns dest (or the new UText); returns NULL when a new UText was needed
//  but could not be produced.
static UText *utext_extract_replace(UText *src, UText *dest, int64_t start, int64_t limit, UErrorCode *status) {
    if (U_FAILURE(*status)) {
        return dest;
    }
    if (start == limit) {
        // Empty substring: just empty the destination (or open an empty UText).
        if (dest) {
            utext_replace(dest, 0, utext_nativeLength(dest), NULL, 0, status);
            return dest;
        } else {
            return utext_openUChars(NULL, NULL, 0, status);
        }
    }

    // Preflight: with a zero capacity buffer the call only reports the length,
    // so a buffer overflow status here is expected and is cleared below.
    int32_t length = utext_extract(src, start, limit, NULL, 0, status);
    if (*status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(*status)) {
        return dest;
    }
    *status = U_ZERO_ERROR;

    MaybeStackArray<UChar, 40> buffer;
    if (length >= buffer.getCapacity()) {
        UChar *newBuf = buffer.resize(length+1);   // Leave space for terminating Nul.
        if (newBuf == NULL) {
            // Stop right here: the buffer is still too small for the
            // extraction below, so do not hand it to utext_extract() and
            // depend on its status check.  dest is returned untouched
            // (NULL when the caller supplied none).
            *status = U_MEMORY_ALLOCATION_ERROR;
            return dest;
        }
    }
    utext_extract(src, start, limit, buffer.getAlias(), length+1, status);
    if (dest) {
        utext_replace(dest, 0, utext_nativeLength(dest), buffer.getAlias(), length, status);
        return dest;
    }

    // Caller did not provide a preexisting UText.
    // Open a new one, and have it adopt the text buffer storage.
    if (U_FAILURE(*status)) {
        return NULL;
    }
    int32_t ownedLength = 0;
    UChar *ownedBuf = buffer.orphanOrClone(length+1, ownedLength);
    if (ownedBuf == NULL) {
        *status = U_MEMORY_ALLOCATION_ERROR;
        return NULL;
    }
    UText *result = utext_openUChars(NULL, ownedBuf, length, status);
    if (U_FAILURE(*status)) {
        // Opening failed, so nothing adopted the buffer; release it here.
        uprv_free(ownedBuf);
        return NULL;
    }
    result->providerProperties |= (1 << UTEXT_PROVIDER_OWNS_TEXT);
    return result;
}
2037
2038
2039
//---------------------------------------------------------------------
2040
//
2041
//   split
2042
//
2043
//---------------------------------------------------------------------
2044
//  UnicodeString flavor of split.  Wraps the input and every destination
//  string in a UText and delegates to the UText overload.
//
//    @param input         The string to be split.
//    @param dest          Array of destination strings.
//    @param destCapacity  Number of elements in dest; must be at least 1,
//                         otherwise U_ILLEGAL_ARGUMENT_ERROR is set.
//    @param status        In/out error code.
//    @return              The field count reported by the UText overload,
//                         or 0 on error.
int32_t  RegexMatcher::split(const UnicodeString &input,
        UnicodeString    dest[],
        int32_t          destCapacity,
        UErrorCode      &status)
{
    if (U_FAILURE(status)) {
        return 0;
    }
    // Reject a bad capacity before it is used to size an allocation; this is
    // the same error the UText overload reports for the same condition.
    if (destCapacity < 1) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return 0;
    }

    UText inputText = UTEXT_INITIALIZER;
    utext_openConstUnicodeString(&inputText, &input, &status);
    if (U_FAILURE(status)) {
        return 0;
    }

    UText **destText = (UText **)uprv_malloc(sizeof(UText*) * (size_t)destCapacity);
    if (destText == NULL) {
        utext_close(&inputText);      // do not leave the input wrapper open
        status = U_MEMORY_ALLOCATION_ERROR;
        return 0;
    }
    int32_t i;
    for (i = 0; i < destCapacity; i++) {
        destText[i] = utext_openUnicodeString(NULL, &dest[i], &status);
    }

    int32_t fieldCount = split(&inputText, destText, destCapacity, status);

    for (i = 0; i < destCapacity; i++) {
        utext_close(destText[i]);
    }

    uprv_free(destText);
    utext_close(&inputText);
    return fieldCount;
}
2075
2076
//
2077
//   split, UText mode
2078
//
2079
int32_t  RegexMatcher::split(UText *input,
2080
        UText           *dest[],
2081
        int32_t          destCapacity,
2082
        UErrorCode      &status)
2083
0
{
2084
    //
2085
    // Check arguments for validity
2086
    //
2087
0
    if (U_FAILURE(status)) {
2088
0
        return 0;
2089
0
    }
2090
2091
0
    if (destCapacity < 1) {
2092
0
        status = U_ILLEGAL_ARGUMENT_ERROR;
2093
0
        return 0;
2094
0
    }
2095
2096
    //
2097
    // Reset for the input text
2098
    //
2099
0
    reset(input);
2100
0
    int64_t   nextOutputStringStart = 0;
2101
0
    if (fActiveLimit == 0) {
2102
0
        return 0;
2103
0
    }
2104
2105
    //
2106
    // Loop through the input text, searching for the delimiter pattern
2107
    //
2108
0
    int32_t i;
2109
0
    int32_t numCaptureGroups = fPattern->fGroupMap->size();
2110
0
    for (i=0; ; i++) {
2111
0
        if (i>=destCapacity-1) {
2112
            // There is one or zero output string left.
2113
            // Fill the last output string with whatever is left from the input, then exit the loop.
2114
            //  ( i will be == destCapacity if we filled the output array while processing
2115
            //    capture groups of the delimiter expression, in which case we will discard the
2116
            //    last capture group saved in favor of the unprocessed remainder of the
2117
            //    input string.)
2118
0
            i = destCapacity-1;
2119
0
            if (fActiveLimit > nextOutputStringStart) {
2120
0
                if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2121
0
                    if (dest[i]) {
2122
0
                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2123
0
                                      input->chunkContents+nextOutputStringStart,
2124
0
                                      (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2125
0
                    } else {
2126
0
                        UText remainingText = UTEXT_INITIALIZER;
2127
0
                        utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2128
0
                                         fActiveLimit-nextOutputStringStart, &status);
2129
0
                        dest[i] = utext_clone(NULL, &remainingText, true, false, &status);
2130
0
                        utext_close(&remainingText);
2131
0
                    }
2132
0
                } else {
2133
0
                    UErrorCode lengthStatus = U_ZERO_ERROR;
2134
0
                    int32_t remaining16Length =
2135
0
                        utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2136
0
                    UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2137
0
                    if (remainingChars == NULL) {
2138
0
                        status = U_MEMORY_ALLOCATION_ERROR;
2139
0
                        break;
2140
0
                    }
2141
2142
0
                    utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2143
0
                    if (dest[i]) {
2144
0
                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2145
0
                    } else {
2146
0
                        UText remainingText = UTEXT_INITIALIZER;
2147
0
                        utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2148
0
                        dest[i] = utext_clone(NULL, &remainingText, true, false, &status);
2149
0
                        utext_close(&remainingText);
2150
0
                    }
2151
2152
0
                    uprv_free(remainingChars);
2153
0
                }
2154
0
            }
2155
0
            break;
2156
0
        }
2157
0
        if (find()) {
2158
            // We found another delimiter.  Move everything from where we started looking
2159
            //  up until the start of the delimiter into the next output string.
2160
0
            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2161
0
                if (dest[i]) {
2162
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2163
0
                                  input->chunkContents+nextOutputStringStart,
2164
0
                                  (int32_t)(fMatchStart-nextOutputStringStart), &status);
2165
0
                } else {
2166
0
                    UText remainingText = UTEXT_INITIALIZER;
2167
0
                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2168
0
                                      fMatchStart-nextOutputStringStart, &status);
2169
0
                    dest[i] = utext_clone(NULL, &remainingText, true, false, &status);
2170
0
                    utext_close(&remainingText);
2171
0
                }
2172
0
            } else {
2173
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
2174
0
                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fMatchStart, NULL, 0, &lengthStatus);
2175
0
                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2176
0
                if (remainingChars == NULL) {
2177
0
                    status = U_MEMORY_ALLOCATION_ERROR;
2178
0
                    break;
2179
0
                }
2180
0
                utext_extract(input, nextOutputStringStart, fMatchStart, remainingChars, remaining16Length+1, &status);
2181
0
                if (dest[i]) {
2182
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2183
0
                } else {
2184
0
                    UText remainingText = UTEXT_INITIALIZER;
2185
0
                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2186
0
                    dest[i] = utext_clone(NULL, &remainingText, true, false, &status);
2187
0
                    utext_close(&remainingText);
2188
0
                }
2189
2190
0
                uprv_free(remainingChars);
2191
0
            }
2192
0
            nextOutputStringStart = fMatchEnd;
2193
2194
            // If the delimiter pattern has capturing parentheses, the captured
2195
            //  text goes out into the next n destination strings.
2196
0
            int32_t groupNum;
2197
0
            for (groupNum=1; groupNum<=numCaptureGroups; groupNum++) {
2198
0
                if (i >= destCapacity-2) {
2199
                    // Never fill the last available output string with capture group text.
2200
                    // It will filled with the last field, the remainder of the
2201
                    //  unsplit input text.
2202
0
                    break;
2203
0
                }
2204
0
                i++;
2205
0
                dest[i] = utext_extract_replace(fInputText, dest[i],
2206
0
                                               start64(groupNum, status), end64(groupNum, status), &status);
2207
0
            }
2208
2209
0
            if (nextOutputStringStart == fActiveLimit) {
2210
                // The delimiter was at the end of the string.  We're done, but first
2211
                // we output one last empty string, for the empty field following
2212
                //   the delimiter at the end of input.
2213
0
                if (i+1 < destCapacity) {
2214
0
                    ++i;
2215
0
                    if (dest[i] == NULL) {
2216
0
                        dest[i] = utext_openUChars(NULL, NULL, 0, &status);
2217
0
                    } else {
2218
0
                        static const UChar emptyString[] = {(UChar)0};
2219
0
                        utext_replace(dest[i], 0, utext_nativeLength(dest[i]), emptyString, 0, &status);
2220
0
                    }
2221
0
                }
2222
0
                break;
2223
2224
0
            }
2225
0
        }
2226
0
        else
2227
0
        {
2228
            // We ran off the end of the input while looking for the next delimiter.
2229
            // All the remaining text goes into the current output string.
2230
0
            if (UTEXT_FULL_TEXT_IN_CHUNK(input, fInputLength)) {
2231
0
                if (dest[i]) {
2232
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]),
2233
0
                                  input->chunkContents+nextOutputStringStart,
2234
0
                                  (int32_t)(fActiveLimit-nextOutputStringStart), &status);
2235
0
                } else {
2236
0
                    UText remainingText = UTEXT_INITIALIZER;
2237
0
                    utext_openUChars(&remainingText, input->chunkContents+nextOutputStringStart,
2238
0
                                     fActiveLimit-nextOutputStringStart, &status);
2239
0
                    dest[i] = utext_clone(NULL, &remainingText, true, false, &status);
2240
0
                    utext_close(&remainingText);
2241
0
                }
2242
0
            } else {
2243
0
                UErrorCode lengthStatus = U_ZERO_ERROR;
2244
0
                int32_t remaining16Length = utext_extract(input, nextOutputStringStart, fActiveLimit, NULL, 0, &lengthStatus);
2245
0
                UChar *remainingChars = (UChar *)uprv_malloc(sizeof(UChar)*(remaining16Length+1));
2246
0
                if (remainingChars == NULL) {
2247
0
                    status = U_MEMORY_ALLOCATION_ERROR;
2248
0
                    break;
2249
0
                }
2250
2251
0
                utext_extract(input, nextOutputStringStart, fActiveLimit, remainingChars, remaining16Length+1, &status);
2252
0
                if (dest[i]) {
2253
0
                    utext_replace(dest[i], 0, utext_nativeLength(dest[i]), remainingChars, remaining16Length, &status);
2254
0
                } else {
2255
0
                    UText remainingText = UTEXT_INITIALIZER;
2256
0
                    utext_openUChars(&remainingText, remainingChars, remaining16Length, &status);
2257
0
                    dest[i] = utext_clone(NULL, &remainingText, true, false, &status);
2258
0
                    utext_close(&remainingText);
2259
0
                }
2260
2261
0
                uprv_free(remainingChars);
2262
0
            }
2263
0
            break;
2264
0
        }
2265
0
        if (U_FAILURE(status)) {
2266
0
            break;
2267
0
        }
2268
0
    }   // end of for loop
2269
0
    return i+1;
2270
0
}
2271
2272
2273
//--------------------------------------------------------------------------------
2274
//
2275
//     start
2276
//
2277
//--------------------------------------------------------------------------------
2278
0
//  Start position for group 0; forwards to start(group, status).
int32_t RegexMatcher::start(UErrorCode &status) const {
    const int32_t groupZero = 0;
    return start(groupZero, status);
}
2281
2282
0
//  64 bit start position for group 0; forwards to start64(group, status).
int64_t RegexMatcher::start64(UErrorCode &status) const {
    const int32_t groupZero = 0;
    return start64(groupZero, status);
}
2285
2286
//--------------------------------------------------------------------------------
2287
//
2288
//     start(int32_t group, UErrorCode &status)
2289
//
2290
//--------------------------------------------------------------------------------
2291
2292
0
//  Start position recorded for a group of the current match.
//
//    @param group   Group number; 0 selects fMatchStart.  Must lie in
//                   [0, fPattern->fGroupMap->size()].
//    @param status  In/out error code.
//                   U_REGEX_INVALID_STATE: fMatch is false.
//                   U_INDEX_OUTOFBOUNDS_ERROR: group is out of range.
//    @return        The start position, or -1 on any error.
int64_t RegexMatcher::start64(int32_t group, UErrorCode &status) const {
    if (U_FAILURE(status)) {
        return -1;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return -1;
    }
    if (!fMatch) {
        status = U_REGEX_INVALID_STATE;
        return -1;
    }

    const UBool groupInRange = (0 <= group && group <= fPattern->fGroupMap->size());
    if (!groupInRange) {
        status = U_INDEX_OUTOFBOUNDS_ERROR;
        return -1;
    }

    if (group == 0) {
        return fMatchStart;
    }

    // Other groups: the group map yields an offset into the frame's fExtra
    // array, where the value for this group is read.
    int32_t frameSlot = fPattern->fGroupMap->elementAti(group-1);
    U_ASSERT(frameSlot < fPattern->fFrameSize);
    U_ASSERT(frameSlot >= 0);
    return fFrame->fExtra[frameSlot];
}
2320
2321
2322
0
//  32 bit form of start64(group, status); the 64 bit result is converted to int32_t.
int32_t RegexMatcher::start(int32_t group, UErrorCode &status) const {
    const int64_t wideStart = start64(group, status);
    return static_cast<int32_t>(wideStart);
}
2325
2326
//--------------------------------------------------------------------------------
2327
//
2328
//     useAnchoringBounds
2329
//
2330
//--------------------------------------------------------------------------------
2331
0
//  Record the anchoring-bounds setting and recompute the anchor range:
//  the region when enabled, the whole input [0, fInputLength] otherwise.
RegexMatcher &RegexMatcher::useAnchoringBounds(UBool b) {
    fAnchoringBounds = b;
    if (fAnchoringBounds) {
        fAnchorStart = fRegionStart;
        fAnchorLimit = fRegionLimit;
    } else {
        fAnchorStart = 0;
        fAnchorLimit = fInputLength;
    }
    return *this;
}
2337
2338
2339
//--------------------------------------------------------------------------------
2340
//
2341
//     useTransparentBounds
2342
//
2343
//--------------------------------------------------------------------------------
2344
0
//  Record the transparent-bounds setting and recompute the look range:
//  the whole input [0, fInputLength] when enabled, the region otherwise.
RegexMatcher &RegexMatcher::useTransparentBounds(UBool b) {
    fTransparentBounds = b;
    if (fTransparentBounds) {
        fLookStart = 0;
        fLookLimit = fInputLength;
    } else {
        fLookStart = fRegionStart;
        fLookLimit = fRegionLimit;
    }
    return *this;
}
2350
2351
//--------------------------------------------------------------------------------
2352
//
2353
//     setTimeLimit
2354
//
2355
//--------------------------------------------------------------------------------
2356
0
//  Store a new time limit in fTimeLimit.
//
//    @param limit   The new limit; a negative value is rejected with
//                   U_ILLEGAL_ARGUMENT_ERROR.
//    @param status  In/out error code; a deferred failure on this matcher is
//                   reported here and nothing is stored.
void RegexMatcher::setTimeLimit(int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return;
    }

    if (limit >= 0) {
        fTimeLimit = limit;
    } else {
        status = U_ILLEGAL_ARGUMENT_ERROR;
    }
}
2370
2371
2372
//--------------------------------------------------------------------------------
2373
//
2374
//     getTimeLimit
2375
//
2376
//--------------------------------------------------------------------------------
2377
0
//  Return the value last stored in fTimeLimit.
int32_t RegexMatcher::getTimeLimit() const {
    const int32_t currentLimit = fTimeLimit;
    return currentLimit;
}
2380
2381
2382
//--------------------------------------------------------------------------------
2383
//
2384
//     setStackLimit
2385
//
2386
//--------------------------------------------------------------------------------
2387
3.94k
//  Set the limit on the backtracking stack.
//
//    @param limit   New limit in bytes; 0 requests unlimited stack expansion;
//                   a negative value is rejected with U_ILLEGAL_ARGUMENT_ERROR.
//    @param status  In/out error code; a deferred failure on this matcher is
//                   reported here and nothing is changed.
void RegexMatcher::setStackLimit(int32_t limit, UErrorCode &status) {
    if (U_FAILURE(status)) {
        return;
    }
    if (U_FAILURE(fDeferredStatus)) {
        status = fDeferredStatus;
        return;
    }
    if (limit < 0) {
        status = U_ILLEGAL_ARGUMENT_ERROR;
        return;
    }

    // Reset the matcher.  This is needed here in case there is a current match
    //    whose final stack frame (containing the match results, pointed to by fFrame)
    //    would be lost by resizing to a smaller stack size.
    reset();

    // A max capacity of 0 stands for unlimited stack expansion.
    int32_t maxCapacity = 0;
    if (limit != 0) {
        // Change the units of the limit from bytes to ints, and bump the size up
        //   to be big enough to hold at least one stack frame for the pattern,
        //   if it isn't there already.
        maxCapacity = limit / sizeof(int32_t);
        if (maxCapacity < fPattern->fFrameSize) {
            maxCapacity = fPattern->fFrameSize;
        }
    }
    fStack->setMaxCapacity(maxCapacity);
    fStackLimit = limit;
}
2420
2421
2422
//--------------------------------------------------------------------------------
2423
//
2424
//     getStackLimit
2425
//
2426
//--------------------------------------------------------------------------------
2427
0
int32_t RegexMatcher::getStackLimit() const {
2428
0
    return fStackLimit;
2429
0
}
2430
2431
2432
//--------------------------------------------------------------------------------
2433
//
2434
//     setMatchCallback
2435
//
2436
//--------------------------------------------------------------------------------
2437
void RegexMatcher::setMatchCallback(URegexMatchCallback     *callback,
2438
                                    const void              *context,
2439
0
                                    UErrorCode              &status) {
2440
0
    if (U_FAILURE(status)) {
2441
0
        return;
2442
0
    }
2443
0
    fCallbackFn = callback;
2444
0
    fCallbackContext = context;
2445
0
}
2446
2447
2448
//--------------------------------------------------------------------------------
2449
//
2450
//     getMatchCallback
2451
//
2452
//--------------------------------------------------------------------------------
2453
void RegexMatcher::getMatchCallback(URegexMatchCallback   *&callback,
2454
                                  const void              *&context,
2455
0
                                  UErrorCode              &status) {
2456
0
    if (U_FAILURE(status)) {
2457
0
       return;
2458
0
    }
2459
0
    callback = fCallbackFn;
2460
0
    context  = fCallbackContext;
2461
0
}
2462
2463
2464
//--------------------------------------------------------------------------------
2465
//
2466
//     setFindProgressCallback
2467
//
2468
//--------------------------------------------------------------------------------
2469
void RegexMatcher::setFindProgressCallback(URegexFindProgressCallback      *callback,
2470
                                                const void                      *context,
2471
0
                                                UErrorCode                      &status) {
2472
0
    if (U_FAILURE(status)) {
2473
0
        return;
2474
0
    }
2475
0
    fFindProgressCallbackFn = callback;
2476
0
    fFindProgressCallbackContext = context;
2477
0
}
2478
2479
2480
//--------------------------------------------------------------------------------
2481
//
2482
//     getFindProgressCallback
2483
//
2484
//--------------------------------------------------------------------------------
2485
void RegexMatcher::getFindProgressCallback(URegexFindProgressCallback    *&callback,
2486
                                                const void                    *&context,
2487
0
                                                UErrorCode                    &status) {
2488
0
    if (U_FAILURE(status)) {
2489
0
       return;
2490
0
    }
2491
0
    callback = fFindProgressCallbackFn;
2492
0
    context  = fFindProgressCallbackContext;
2493
0
}
2494
2495
2496
//================================================================================
2497
//
2498
//    Code following this point in this file is the internal
2499
//    Match Engine Implementation.
2500
//
2501
//================================================================================
2502
2503
2504
//--------------------------------------------------------------------------------
2505
//
2506
//   resetStack
2507
//           Discard any previous contents of the state save stack, and initialize a
2508
//           new stack frame to all -1.  The -1s are needed for capture group limits,
2509
//           where they indicate that a group has not yet matched anything.
2510
//--------------------------------------------------------------------------------
2511
0
REStackFrame *RegexMatcher::resetStack() {
    // Throw away whatever the state save stack currently holds.
    fStack->removeAllElements();

    // Reserve one frame of the size this pattern requires.  A failure is recorded
    //  in fDeferredStatus and reported to the caller as a null frame pointer.
    REStackFrame *frame = (REStackFrame *)fStack->reserveBlock(fPattern->fFrameSize, fDeferredStatus);
    if (U_FAILURE(fDeferredStatus)) {
        return nullptr;
    }

    // Fill every slot following the frame header with -1.  For capture group
    //  limits, -1 means that the group has not yet matched anything.
    for (int32_t slot = 0; slot < fPattern->fFrameSize-RESTACKFRAME_HDRCOUNT; ++slot) {
        frame->fExtra[slot] = -1;
    }
    return frame;
}
2528
2529
2530
2531
//--------------------------------------------------------------------------------
2532
//
2533
//   isWordBoundary
2534
//                     in perl, "xab..cd..", \b is true at positions 0,3,5,7
2535
//                     For us,
2536
//                       If the current char is a combining mark,
2537
//                          \b is false.
2538
//                       Else Scan backwards to the first non-combining char.
2539
//                            We are at a boundary if this char and the original char are
2540
//                               opposite in membership in \w set
2541
//
2542
//          parameters:   pos   - the current position in the input buffer
2543
//
2544
//              TODO:  double-check edge cases at region boundaries.
2545
//
2546
//--------------------------------------------------------------------------------
2547
0
UBool RegexMatcher::isWordBoundary(int64_t pos) {
    // Test whether 'pos' is a \b style word boundary in fInputText.
    //
    //   pos      native index into the input text.
    //   returns  true when the character at pos and the nearest preceding
    //            non-combining character differ in membership of the \w set;
    //            false when the character at pos is itself a combining one.
    //
    // Side effects: sets fHitEnd when pos is at or beyond fLookLimit, and moves
    // the iteration position of fInputText.
    UBool isBoundary = false;
    UBool cIsWord    = false;

    // Always position the text iterator at pos.  The backwards scan further down
    //   starts from the iterator's current position, so it has to be set even when
    //   pos is at or beyond fLookLimit.  Without this, the scan would begin wherever
    //   an earlier operation happened to leave the iterator, unlike
    //   isChunkWordBoundary(), which always scans back from pos itself.
    UTEXT_SETNATIVEINDEX(fInputText, pos);

    if (pos >= fLookLimit) {
        // Off the end of the string: behave as though we're not at a word char.
        fHitEnd = true;
    } else {
        // Determine whether char c at current position is a member of the word set of chars.
        UChar32  c = UTEXT_CURRENT32(fInputText);
        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
            // Current char is a combining one.  Not a boundary.
            return false;
        }
        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
    }

    // Back up until we come to a non-combining char, determine whether
    //  that char is a word char.  If fLookStart is reached first, the
    //  preceding position is treated as a non-word char.
    UBool prevCIsWord = false;
    for (;;) {
        if (UTEXT_GETNATIVEINDEX(fInputText) <= fLookStart) {
            break;
        }
        UChar32 prevChar = UTEXT_PREVIOUS32(fInputText);
        if (!(u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
              || u_charType(prevChar) == U_FORMAT_CHAR)) {
            prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
            break;
        }
    }
    isBoundary = cIsWord ^ prevCIsWord;
    return isBoundary;
}
2582
2583
0
UBool RegexMatcher::isChunkWordBoundary(int32_t pos) {
    // Word boundary test that reads directly from the chunk buffer of fInputText.
    //   The position is a boundary when the char at pos and the nearest preceding
    //   non-combining char differ in membership of the \w set.
    const UChar *inputBuf = fInputText->chunkContents;
    UBool cIsWord = false;

    if (pos < fLookLimit) {
        // Classify the char at the current position.
        UChar32 c;
        U16_GET(inputBuf, fLookStart, pos, fLookLimit, c);
        if (u_hasBinaryProperty(c, UCHAR_GRAPHEME_EXTEND) || u_charType(c) == U_FORMAT_CHAR) {
            // A combining char at the current position is never a boundary.
            return false;
        }
        cIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(c);
    } else {
        // Off the end of the string; treated as not being at a word char.
        fHitEnd = true;
    }

    // Walk backwards over combining chars to the first non-combining one and
    //  classify it.  Reaching fLookStart first leaves prevCIsWord false.
    UBool prevCIsWord = false;
    while (pos > fLookStart) {
        UChar32 prevChar;
        U16_PREV(inputBuf, fLookStart, pos, prevChar);
        if (u_hasBinaryProperty(prevChar, UCHAR_GRAPHEME_EXTEND)
              || u_charType(prevChar) == U_FORMAT_CHAR) {
            continue;
        }
        prevCIsWord = RegexStaticSets::gStaticSets->fPropSets[URX_ISWORD_SET].contains(prevChar);
        break;
    }

    const UBool isBoundary = cIsWord ^ prevCIsWord;
    return isBoundary;
}
2621
2622
//--------------------------------------------------------------------------------
2623
//
2624
//   isUWordBoundary
2625
//
2626
//         Test for a word boundary using RBBI word break.
2627
//
2628
//          parameters:   pos   - the current position in the input buffer
2629
//
2630
//--------------------------------------------------------------------------------
2631
0
UBool RegexMatcher::isUWordBoundary(int64_t pos, UErrorCode &status) {
    UBool       returnVal = false;

#if UCONFIG_NO_BREAK_ITERATION==0
    // Note: when break iteration is configured out this code is never reached,
    //       because regex patterns needing this function fail to compile.

    // The word break iterator is created on first use and kept on the matcher.
    if (fWordBreakItr == nullptr) {
        fWordBreakItr = BreakIterator::createWordInstance(Locale::getEnglish(), status);
        if (U_FAILURE(status)) {
            return false;
        }
        fWordBreakItr->setText(fInputText, status);
    }

    // Note: zero width boundary tests like \b see through transparent region bounds,
    //       which is why fLookLimit is used here, rather than fActiveLimit.
    if (pos < fLookLimit) {
        returnVal = fWordBreakItr->isBoundary((int32_t)pos);
    } else {
        fHitEnd = true;
        // With Unicode word rules, only positions within the interior of "real"
        //    words are not boundaries.  All non-word chars stand by themselves,
        //    with word boundaries on both sides.
        returnVal = true;
    }
#endif
    return   returnVal;
}
2660
2661
2662
0
int64_t RegexMatcher::followingGCBoundary(int64_t pos, UErrorCode &status) {
2663
0
    int64_t result = pos;
2664
2665
0
#if UCONFIG_NO_BREAK_ITERATION==0
2666
    // Note: this point will never be reached if break iteration is configured out.
2667
    //       Regex patterns that would require this function will fail to compile.
2668
2669
    // If we haven't yet created a break iterator for this matcher, do it now.
2670
0
    if (fGCBreakItr == nullptr) {
2671
0
        fGCBreakItr = BreakIterator::createCharacterInstance(Locale::getEnglish(), status);
2672
0
        if (U_FAILURE(status)) {
2673
0
            return pos;
2674
0
        }
2675
0
        fGCBreakItr->setText(fInputText, status);
2676
0
    }
2677
0
    result = fGCBreakItr->following(pos);
2678
0
    if (result == BreakIterator::DONE) {
2679
0
        result = pos;
2680
0
    }
2681
0
#endif
2682
0
    return result;
2683
0
}
2684
2685
//--------------------------------------------------------------------------------
2686
//
2687
//   IncrementTime     This function is called once each TIMER_INITIAL_VALUE state
2688
//                     saves. Increment the "time" counter, and call the
2689
//                     user callback function if there is one installed.
2690
//
2691
//                     If the match operation needs to be aborted, either for a time-out
2692
//                     or because the user callback asked for it, just set an error status.
2693
//                     The engine will pick that up and stop in its outer loop.
2694
//
2695
//--------------------------------------------------------------------------------
2696
0
void RegexMatcher::IncrementTime(UErrorCode &status) {
    // Re-arm the tick counter and advance the "time" by one.
    fTickCounter = TIMER_INITIAL_VALUE;
    ++fTime;

    // Give an installed user callback the chance to stop the match.
    //   A false return from the callback requests the stop.
    if (fCallbackFn != nullptr && (*fCallbackFn)(fCallbackContext, fTime) == false) {
        status = U_REGEX_STOPPED_BY_CALLER;
        return;
    }

    // A time limit applies only when it is greater than zero.
    if (fTimeLimit > 0 && fTime >= fTimeLimit) {
        status = U_REGEX_TIME_OUT;
    }
}
2709
2710
//--------------------------------------------------------------------------------
2711
//
2712
//   StateSave
2713
//       Make a new stack frame, initialized as a copy of the current stack frame.
2714
//       Set the pattern index in the original stack frame from the operand value
2715
//       in the opcode.  Execution of the engine continues with the state in
2716
//       the newly created stack frame
2717
//
2718
//       Note that reserveBlock() may grow the stack, resulting in the
2719
//       whole thing being relocated in memory.
2720
//
2721
//    Parameters:
2722
//       fp           The top frame pointer when called.  At return, a new
2723
//                    frame will be present
2724
//       savePatIdx   An index into the compiled pattern.  Goes into the original
2725
//                    (not new) frame.  If execution ever back-tracks out of the
2726
//                    new frame, this will be where we continue from in the pattern.
2727
//    Return
2728
//                    The new frame pointer.
2729
//
2730
//--------------------------------------------------------------------------------
2731
0
inline REStackFrame *RegexMatcher::StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status) {
    // With an error already pending, leave the stack alone and keep the current frame.
    if (U_FAILURE(status)) {
        return fp;
    }

    // Reserve room for one more frame on top of the stack.
    int64_t *newFP = fStack->reserveBlock(fFrameSize, status);
    if (U_FAILURE(status)) {
        // The stack could not be expanded.  Replace whatever error code the stack
        //   function set with the more specific regular expression one.
        status = U_REGEX_STACK_OVERFLOW;
        // A writable frame still has to be returned, so hand back the previous
        //    one.  The error status stops the match operation quickly, and the
        //    frame is never looked at again after that.
        return fp;
    }

    // Recompute the old top frame from the new one, in case reserveBlock()
    //   relocated the stack in memory.
    fp = (REStackFrame *)(newFP - fFrameSize);

    // Copy the old top frame, word by word, into the new frame.
    int64_t *source = (int64_t *)fp;
    int64_t *dest   = newFP;
    do {
        *dest++ = *source++;
    } while (source != newFP);

    // Count this state save; IncrementTime() re-initializes fTickCounter.
    fTickCounter--;
    if (fTickCounter <= 0) {
        IncrementTime(status);
    }

    // The original frame records where to resume in the pattern if execution
    //   back-tracks out of the new frame.
    fp->fPatIdx = savePatIdx;
    return (REStackFrame *)newFP;
}
2767
2768
#if defined(REGEX_DEBUG)
namespace {
// Debug-only helper: collect every code point of a UText, starting from
//   native index 0, into a UnicodeString.
UnicodeString StringFromUText(UText *ut) {
    UnicodeString result;
    UChar32 c = utext_next32From(ut, 0);
    while (c != U_SENTINEL) {
        result.append(c);
        c = UTEXT_NEXT32(ut);
    }
    return result;
}
}
#endif // REGEX_DEBUG
2779
2780
2781
//--------------------------------------------------------------------------------
2782
//
2783
//   MatchAt      This is the actual matching engine.
2784
//
2785
//                  startIdx:    begin matching at this index.
2786
//                  toEnd:       if true, match must extend to end of the input region
2787
//
2788
//--------------------------------------------------------------------------------
2789
0
void RegexMatcher::MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status) {
2790
0
    UBool       isMatch  = false;      // True if the we have a match.
2791
2792
0
    int64_t     backSearchIndex = U_INT64_MAX; // used after greedy single-character matches for searching backwards
2793
2794
0
    int32_t     op;                    // Operation from the compiled pattern, split into
2795
0
    int32_t     opType;                //    the opcode
2796
0
    int32_t     opValue;               //    and the operand value.
2797
2798
#ifdef REGEX_RUN_DEBUG
2799
    if (fTraceDebug) {
2800
        printf("MatchAt(startIdx=%ld)\n", startIdx);
2801
        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
2802
        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
2803
    }
2804
#endif
2805
2806
0
    if (U_FAILURE(status)) {
2807
0
        return;
2808
0
    }
2809
2810
    //  Cache frequently referenced items from the compiled pattern
2811
    //
2812
0
    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
2813
2814
0
    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
2815
0
    UVector             *fSets         = fPattern->fSets;
2816
2817
0
    fFrameSize = fPattern->fFrameSize;
2818
0
    REStackFrame        *fp            = resetStack();
2819
0
    if (U_FAILURE(fDeferredStatus)) {
2820
0
        status = fDeferredStatus;
2821
0
        return;
2822
0
    }
2823
2824
0
    fp->fPatIdx   = 0;
2825
0
    fp->fInputIdx = startIdx;
2826
2827
    // Zero out the pattern's static data
2828
0
    int32_t i;
2829
0
    for (i = 0; i<fPattern->fDataSize; i++) {
2830
0
        fData[i] = 0;
2831
0
    }
2832
2833
    //
2834
    //  Main loop for interpreting the compiled pattern.
2835
    //  One iteration of the loop per pattern operation performed.
2836
    //
2837
0
    for (;;) {
2838
0
        op      = (int32_t)pat[fp->fPatIdx];
2839
0
        opType  = URX_TYPE(op);
2840
0
        opValue = URX_VAL(op);
2841
#ifdef REGEX_RUN_DEBUG
2842
        if (fTraceDebug) {
2843
            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2844
            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
2845
                UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
2846
            fPattern->dumpOp(fp->fPatIdx);
2847
        }
2848
#endif
2849
0
        fp->fPatIdx++;
2850
2851
0
        switch (opType) {
2852
2853
2854
0
        case URX_NOP:
2855
0
            break;
2856
2857
2858
0
        case URX_BACKTRACK:
2859
            // Force a backtrack.  In some circumstances, the pattern compiler
2860
            //   will notice that the pattern can't possibly match anything, and will
2861
            //   emit one of these at that point.
2862
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2863
0
            break;
2864
2865
2866
0
        case URX_ONECHAR:
2867
0
            if (fp->fInputIdx < fActiveLimit) {
2868
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2869
0
                UChar32 c = UTEXT_NEXT32(fInputText);
2870
0
                if (c == opValue) {
2871
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2872
0
                    break;
2873
0
                }
2874
0
            } else {
2875
0
                fHitEnd = true;
2876
0
            }
2877
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2878
0
            break;
2879
2880
2881
0
        case URX_STRING:
2882
0
            {
2883
                // Test input against a literal string.
2884
                // Strings require two slots in the compiled pattern, one for the
2885
                //   offset to the string text, and one for the length.
2886
2887
0
                int32_t   stringStartIdx = opValue;
2888
0
                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
2889
0
                fp->fPatIdx++;
2890
0
                opType    = URX_TYPE(op);
2891
0
                int32_t stringLen = URX_VAL(op);
2892
0
                U_ASSERT(opType == URX_STRING_LEN);
2893
0
                U_ASSERT(stringLen >= 2);
2894
2895
0
                const UChar *patternString = litText+stringStartIdx;
2896
0
                int32_t patternStringIndex = 0;
2897
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2898
0
                UChar32 inputChar;
2899
0
                UChar32 patternChar;
2900
0
                UBool success = true;
2901
0
                while (patternStringIndex < stringLen) {
2902
0
                    if (UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
2903
0
                        success = false;
2904
0
                        fHitEnd = true;
2905
0
                        break;
2906
0
                    }
2907
0
                    inputChar = UTEXT_NEXT32(fInputText);
2908
0
                    U16_NEXT(patternString, patternStringIndex, stringLen, patternChar);
2909
0
                    if (patternChar != inputChar) {
2910
0
                        success = false;
2911
0
                        break;
2912
0
                    }
2913
0
                }
2914
2915
0
                if (success) {
2916
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
2917
0
                } else {
2918
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2919
0
                }
2920
0
            }
2921
0
            break;
2922
2923
2924
0
        case URX_STATE_SAVE:
2925
0
            fp = StateSave(fp, opValue, status);
2926
0
            break;
2927
2928
2929
0
        case URX_END:
2930
            // The match loop will exit via this path on a successful match,
2931
            //   when we reach the end of the pattern.
2932
0
            if (toEnd && fp->fInputIdx != fActiveLimit) {
2933
                // The pattern matched, but not to the end of input.  Try some more.
2934
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2935
0
                break;
2936
0
            }
2937
0
            isMatch = true;
2938
0
            goto  breakFromLoop;
2939
2940
        // Start and End Capture stack frame variables are laid out out like this:
2941
            //  fp->fExtra[opValue]  - The start of a completed capture group
2942
            //             opValue+1 - The end   of a completed capture group
2943
            //             opValue+2 - the start of a capture group whose end
2944
            //                          has not yet been reached (and might not ever be).
2945
0
        case URX_START_CAPTURE:
2946
0
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2947
0
            fp->fExtra[opValue+2] = fp->fInputIdx;
2948
0
            break;
2949
2950
2951
0
        case URX_END_CAPTURE:
2952
0
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
2953
0
            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
2954
0
            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
2955
0
            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
2956
0
            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
2957
0
            break;
2958
2959
2960
0
        case URX_DOLLAR:                   //  $, test for End of line
2961
                                           //     or for position before new line at end of input
2962
0
            {
2963
0
                if (fp->fInputIdx >= fAnchorLimit) {
2964
                    // We really are at the end of input.  Success.
2965
0
                    fHitEnd = true;
2966
0
                    fRequireEnd = true;
2967
0
                    break;
2968
0
                }
2969
2970
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
2971
2972
                // If we are positioned just before a new-line that is located at the
2973
                //   end of input, succeed.
2974
0
                UChar32 c = UTEXT_NEXT32(fInputText);
2975
0
                if (UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2976
0
                    if (isLineTerminator(c)) {
2977
                        // If not in the middle of a CR/LF sequence
2978
0
                        if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && ((void)UTEXT_PREVIOUS32(fInputText), UTEXT_PREVIOUS32(fInputText))==0x0d)) {
2979
                            // At new-line at end of input. Success
2980
0
                            fHitEnd = true;
2981
0
                            fRequireEnd = true;
2982
2983
0
                            break;
2984
0
                        }
2985
0
                    }
2986
0
                } else {
2987
0
                    UChar32 nextC = UTEXT_NEXT32(fInputText);
2988
0
                    if (c == 0x0d && nextC == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) >= fAnchorLimit) {
2989
0
                        fHitEnd = true;
2990
0
                        fRequireEnd = true;
2991
0
                        break;                         // At CR/LF at end of input.  Success
2992
0
                    }
2993
0
                }
2994
2995
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
2996
0
            }
2997
0
            break;
2998
2999
3000
0
         case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
3001
0
            if (fp->fInputIdx >= fAnchorLimit) {
3002
                // Off the end of input.  Success.
3003
0
                fHitEnd = true;
3004
0
                fRequireEnd = true;
3005
0
                break;
3006
0
            } else {
3007
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3008
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3009
                // Either at the last character of input, or off the end.
3010
0
                if (c == 0x0a && UTEXT_GETNATIVEINDEX(fInputText) == fAnchorLimit) {
3011
0
                    fHitEnd = true;
3012
0
                    fRequireEnd = true;
3013
0
                    break;
3014
0
                }
3015
0
            }
3016
3017
            // Not at end of input.  Back-track out.
3018
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3019
0
            break;
3020
3021
3022
0
         case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
3023
0
             {
3024
0
                 if (fp->fInputIdx >= fAnchorLimit) {
3025
                     // We really are at the end of input.  Success.
3026
0
                     fHitEnd = true;
3027
0
                     fRequireEnd = true;
3028
0
                     break;
3029
0
                 }
3030
                 // If we are positioned just before a new-line, succeed.
3031
                 // It makes no difference where the new-line is within the input.
3032
0
                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3033
0
                 UChar32 c = UTEXT_CURRENT32(fInputText);
3034
0
                 if (isLineTerminator(c)) {
3035
                     // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
3036
                     //  In multi-line mode, hitting a new-line just before the end of input does not
3037
                     //   set the hitEnd or requireEnd flags
3038
0
                     if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && UTEXT_PREVIOUS32(fInputText)==0x0d)) {
3039
0
                        break;
3040
0
                     }
3041
0
                 }
3042
                 // not at a new line.  Fail.
3043
0
                 fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3044
0
             }
3045
0
             break;
3046
3047
3048
0
         case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
3049
0
             {
3050
0
                 if (fp->fInputIdx >= fAnchorLimit) {
3051
                     // We really are at the end of input.  Success.
3052
0
                     fHitEnd = true;
3053
0
                     fRequireEnd = true;  // Java set requireEnd in this case, even though
3054
0
                     break;               //   adding a new-line would not lose the match.
3055
0
                 }
3056
                 // If we are not positioned just before a new-line, the test fails; backtrack out.
3057
                 // It makes no difference where the new-line is within the input.
3058
0
                 UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3059
0
                 if (UTEXT_CURRENT32(fInputText) != 0x0a) {
3060
0
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3061
0
                 }
3062
0
             }
3063
0
             break;
3064
3065
3066
0
       case URX_CARET:                    //  ^, test for start of line
3067
0
            if (fp->fInputIdx != fAnchorStart) {
3068
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3069
0
            }
3070
0
            break;
3071
3072
3073
0
       case URX_CARET_M:                   //  ^, test for start of line in mulit-line mode
3074
0
           {
3075
0
               if (fp->fInputIdx == fAnchorStart) {
3076
                   // We are at the start input.  Success.
3077
0
                   break;
3078
0
               }
3079
               // Check whether character just before the current pos is a new-line
3080
               //   unless we are at the end of input
3081
0
               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3082
0
               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3083
0
               if ((fp->fInputIdx < fAnchorLimit) && isLineTerminator(c)) {
3084
                   //  It's a new-line.  ^ is true.  Success.
3085
                   //  TODO:  what should be done with positions between a CR and LF?
3086
0
                   break;
3087
0
               }
3088
               // Not at the start of a line.  Fail.
3089
0
               fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3090
0
           }
3091
0
           break;
3092
3093
3094
0
       case URX_CARET_M_UNIX:       //  ^, test for start of line in mulit-line + Unix-line mode
3095
0
           {
3096
0
               U_ASSERT(fp->fInputIdx >= fAnchorStart);
3097
0
               if (fp->fInputIdx <= fAnchorStart) {
3098
                   // We are at the start input.  Success.
3099
0
                   break;
3100
0
               }
3101
               // Check whether character just before the current pos is a new-line
3102
0
               U_ASSERT(fp->fInputIdx <= fAnchorLimit);
3103
0
               UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3104
0
               UChar32  c = UTEXT_PREVIOUS32(fInputText);
3105
0
               if (c != 0x0a) {
3106
                   // Not at the start of a line.  Back-track out.
3107
0
                   fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3108
0
               }
3109
0
           }
3110
0
           break;
3111
3112
0
        case URX_BACKSLASH_B:          // Test for word boundaries
3113
0
            {
3114
0
                UBool success = isWordBoundary(fp->fInputIdx);
3115
0
                success ^= (UBool)(opValue != 0);     // flip sense for \B
3116
0
                if (!success) {
3117
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3118
0
                }
3119
0
            }
3120
0
            break;
3121
3122
3123
0
        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
3124
0
            {
3125
0
                UBool success = isUWordBoundary(fp->fInputIdx, status);
3126
0
                success ^= (UBool)(opValue != 0);     // flip sense for \B
3127
0
                if (!success) {
3128
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3129
0
                }
3130
0
            }
3131
0
            break;
3132
3133
3134
0
        case URX_BACKSLASH_D:            // Test for decimal digit
3135
0
            {
3136
0
                if (fp->fInputIdx >= fActiveLimit) {
3137
0
                    fHitEnd = true;
3138
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3139
0
                    break;
3140
0
                }
3141
3142
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3143
3144
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3145
0
                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
3146
0
                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
3147
0
                success ^= (UBool)(opValue != 0);        // flip sense for \D
3148
0
                if (success) {
3149
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3150
0
                } else {
3151
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3152
0
                }
3153
0
            }
3154
0
            break;
3155
3156
3157
0
        case URX_BACKSLASH_G:          // Test for position at end of previous match
3158
0
            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
3159
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3160
0
            }
3161
0
            break;
3162
3163
3164
0
        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
3165
0
            {
3166
0
                if (fp->fInputIdx >= fActiveLimit) {
3167
0
                    fHitEnd = true;
3168
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3169
0
                    break;
3170
0
                }
3171
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3172
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3173
0
                int8_t ctype = u_charType(c);
3174
0
                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
3175
0
                success ^= (UBool)(opValue != 0);        // flip sense for \H
3176
0
                if (success) {
3177
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3178
0
                } else {
3179
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3180
0
                }
3181
0
            }
3182
0
            break;
3183
3184
3185
0
        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
3186
0
            {
3187
0
                if (fp->fInputIdx >= fActiveLimit) {
3188
0
                    fHitEnd = true;
3189
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3190
0
                    break;
3191
0
                }
3192
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3193
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3194
0
                if (isLineTerminator(c)) {
3195
0
                    if (c == 0x0d && utext_current32(fInputText) == 0x0a) {
3196
0
                        utext_next32(fInputText);
3197
0
                    }
3198
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3199
0
                } else {
3200
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3201
0
                }
3202
0
            }
3203
0
            break;
3204
3205
3206
0
        case URX_BACKSLASH_V:            // \v, any single line ending character.
3207
0
            {
3208
0
                if (fp->fInputIdx >= fActiveLimit) {
3209
0
                    fHitEnd = true;
3210
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3211
0
                    break;
3212
0
                }
3213
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3214
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3215
0
                UBool success = isLineTerminator(c);
3216
0
                success ^= (UBool)(opValue != 0);        // flip sense for \V
3217
0
                if (success) {
3218
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3219
0
                } else {
3220
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3221
0
                }
3222
0
            }
3223
0
            break;
3224
3225
3226
0
        case URX_BACKSLASH_X:
3227
            //  Match a Grapheme, as defined by Unicode UAX 29.
3228
3229
            // Fail if at end of input
3230
0
            if (fp->fInputIdx >= fActiveLimit) {
3231
0
                fHitEnd = true;
3232
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3233
0
                break;
3234
0
            }
3235
3236
0
            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
3237
0
            if (fp->fInputIdx >= fActiveLimit) {
3238
0
                fHitEnd = true;
3239
0
                fp->fInputIdx = fActiveLimit;
3240
0
            }
3241
0
            break;
3242
3243
3244
0
        case URX_BACKSLASH_Z:          // Test for end of Input
3245
0
            if (fp->fInputIdx < fAnchorLimit) {
3246
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3247
0
            } else {
3248
0
                fHitEnd = true;
3249
0
                fRequireEnd = true;
3250
0
            }
3251
0
            break;
3252
3253
3254
3255
0
        case URX_STATIC_SETREF:
3256
0
            {
3257
                // Test input character against one of the predefined sets
3258
                //    (Word Characters, for example)
3259
                // The high bit of the op value is a flag for the match polarity.
3260
                //    0:   success if input char is in set.
3261
                //    1:   success if input char is not in set.
3262
0
                if (fp->fInputIdx >= fActiveLimit) {
3263
0
                    fHitEnd = true;
3264
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3265
0
                    break;
3266
0
                }
3267
3268
0
                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
3269
0
                opValue &= ~URX_NEG_SET;
3270
0
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3271
3272
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3273
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3274
0
                if (c < 256) {
3275
0
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3276
0
                    if (s8.contains(c)) {
3277
0
                        success = !success;
3278
0
                    }
3279
0
                } else {
3280
0
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3281
0
                    if (s.contains(c)) {
3282
0
                        success = !success;
3283
0
                    }
3284
0
                }
3285
0
                if (success) {
3286
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3287
0
                } else {
3288
                    // the character wasn't in the set.
3289
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3290
0
                }
3291
0
            }
3292
0
            break;
3293
3294
3295
0
        case URX_STAT_SETREF_N:
3296
0
            {
3297
                // Test input character for NOT being a member of  one of
3298
                //    the predefined sets (Word Characters, for example)
3299
0
                if (fp->fInputIdx >= fActiveLimit) {
3300
0
                    fHitEnd = true;
3301
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3302
0
                    break;
3303
0
                }
3304
3305
0
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
3306
3307
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3308
3309
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3310
0
                if (c < 256) {
3311
0
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
3312
0
                    if (s8.contains(c) == false) {
3313
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3314
0
                        break;
3315
0
                    }
3316
0
                } else {
3317
0
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
3318
0
                    if (s.contains(c) == false) {
3319
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3320
0
                        break;
3321
0
                    }
3322
0
                }
3323
                // the character wasn't in the set.
3324
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3325
0
            }
3326
0
            break;
3327
3328
3329
0
        case URX_SETREF:
3330
0
            if (fp->fInputIdx >= fActiveLimit) {
3331
0
                fHitEnd = true;
3332
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3333
0
                break;
3334
0
            } else {
3335
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3336
3337
                // There is input left.  Pick up one char and test it for set membership.
3338
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3339
0
                U_ASSERT(opValue > 0 && opValue < fSets->size());
3340
0
                if (c<256) {
3341
0
                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
3342
0
                    if (s8->contains(c)) {
3343
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3344
0
                        break;
3345
0
                    }
3346
0
                } else {
3347
0
                    UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
3348
0
                    if (s->contains(c)) {
3349
                        // The character is in the set.  A Match.
3350
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3351
0
                        break;
3352
0
                    }
3353
0
                }
3354
3355
                // the character wasn't in the set.
3356
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3357
0
            }
3358
0
            break;
3359
3360
3361
0
        case URX_DOTANY:
3362
0
            {
3363
                // . matches anything, but stops at end-of-line.
3364
0
                if (fp->fInputIdx >= fActiveLimit) {
3365
                    // At end of input.  Match failed.  Backtrack out.
3366
0
                    fHitEnd = true;
3367
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3368
0
                    break;
3369
0
                }
3370
3371
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3372
3373
                // There is input left.  Advance over one char, unless we've hit end-of-line
3374
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3375
0
                if (isLineTerminator(c)) {
3376
                    // End of line in normal mode.   . does not match.
3377
0
                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3378
0
                    break;
3379
0
                }
3380
0
                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3381
0
            }
3382
0
            break;
3383
3384
3385
0
        case URX_DOTANY_ALL:
3386
0
            {
3387
                // ., in dot-matches-all (including new lines) mode
3388
0
                if (fp->fInputIdx >= fActiveLimit) {
3389
                    // At end of input.  Match failed.  Backtrack out.
3390
0
                    fHitEnd = true;
3391
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3392
0
                    break;
3393
0
                }
3394
3395
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3396
3397
                // There is input left.  Advance over one char, except if we are
3398
                //   at a cr/lf, advance over both of them.
3399
0
                UChar32 c;
3400
0
                c = UTEXT_NEXT32(fInputText);
3401
0
                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3402
0
                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
3403
                    // In the case of a CR/LF, we need to advance over both.
3404
0
                    UChar32 nextc = UTEXT_CURRENT32(fInputText);
3405
0
                    if (nextc == 0x0a) {
3406
0
                        (void)UTEXT_NEXT32(fInputText);
3407
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3408
0
                    }
3409
0
                }
3410
0
            }
3411
0
            break;
3412
3413
3414
0
        case URX_DOTANY_UNIX:
3415
0
            {
3416
                // '.' operator, matches all, but stops at end-of-line.
3417
                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
3418
0
                if (fp->fInputIdx >= fActiveLimit) {
3419
                    // At end of input.  Match failed.  Backtrack out.
3420
0
                    fHitEnd = true;
3421
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3422
0
                    break;
3423
0
                }
3424
3425
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3426
3427
                // There is input left.  Advance over one char, unless we've hit end-of-line
3428
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3429
0
                if (c == 0x0a) {
3430
                    // End of line in UNIX_LINES mode.   '.' does not match the \n
3431
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3432
0
                } else {
3433
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3434
0
                }
3435
0
            }
3436
0
            break;
3437
3438
3439
0
        case URX_JMP:
3440
0
            fp->fPatIdx = opValue;
3441
0
            break;
3442
3443
0
        case URX_FAIL:
3444
0
            isMatch = false;
3445
0
            goto breakFromLoop;
3446
3447
0
        case URX_JMP_SAV:
3448
0
            U_ASSERT(opValue < fPattern->fCompiledPat->size());
3449
0
            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
3450
0
            fp->fPatIdx = opValue;                         // Then JMP.
3451
0
            break;
3452
3453
0
        case URX_JMP_SAV_X:
3454
            // This opcode is used with (x)+, when x can match a zero length string.
3455
            // Same as JMP_SAV, except conditional on the match having made forward progress.
3456
            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
3457
            //   data address of the input position at the start of the loop.
3458
0
            {
3459
0
                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
3460
0
                int32_t  stoOp = (int32_t)pat[opValue-1];
3461
0
                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
3462
0
                int32_t  frameLoc = URX_VAL(stoOp);
3463
0
                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
3464
0
                int64_t prevInputIdx = fp->fExtra[frameLoc];
3465
0
                U_ASSERT(prevInputIdx <= fp->fInputIdx);
3466
0
                if (prevInputIdx < fp->fInputIdx) {
3467
                    // The match did make progress.  Repeat the loop.
3468
0
                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
3469
0
                    fp->fPatIdx = opValue;
3470
0
                    fp->fExtra[frameLoc] = fp->fInputIdx;
3471
0
                }
3472
                // If the input position did not advance, we do nothing here,
3473
                //   execution will fall out of the loop.
3474
0
            }
3475
0
            break;
3476
3477
0
        case URX_CTR_INIT:
3478
0
            {
3479
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3480
0
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3481
3482
                // Pick up the three extra operands that CTR_INIT has, and
3483
                //    skip the pattern location counter past them.
3484
0
                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3485
0
                fp->fPatIdx += 3;
3486
0
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3487
0
                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3488
0
                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3489
0
                U_ASSERT(minCount>=0);
3490
0
                U_ASSERT(maxCount>=minCount || maxCount==-1);
3491
0
                U_ASSERT(loopLoc>=fp->fPatIdx);
3492
3493
0
                if (minCount == 0) {
3494
0
                    fp = StateSave(fp, loopLoc+1, status);
3495
0
                }
3496
0
                if (maxCount == -1) {
3497
0
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
3498
0
                } else if (maxCount == 0) {
3499
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3500
0
                }
3501
0
            }
3502
0
            break;
3503
3504
0
        case URX_CTR_LOOP:
3505
0
            {
3506
0
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3507
0
                int32_t initOp = (int32_t)pat[opValue];
3508
0
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
3509
0
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3510
0
                int32_t minCount  = (int32_t)pat[opValue+2];
3511
0
                int32_t maxCount  = (int32_t)pat[opValue+3];
3512
0
                (*pCounter)++;
3513
0
                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3514
0
                    U_ASSERT(*pCounter == maxCount);
3515
0
                    break;
3516
0
                }
3517
0
                if (*pCounter >= minCount) {
3518
0
                    if (maxCount == -1) {
3519
                        // Loop has no hard upper bound.
3520
                        // Check that it is progressing through the input, break if it is not.
3521
0
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3522
0
                        if (fp->fInputIdx == *pLastInputIdx) {
3523
0
                            break;
3524
0
                        } else {
3525
0
                            *pLastInputIdx = fp->fInputIdx;
3526
0
                        }
3527
0
                    }
3528
0
                    fp = StateSave(fp, fp->fPatIdx, status);
3529
0
                } else {
3530
                    // Charge this iteration to the time-out counter, which counts down. (StateSave() does it if count >= minCount)
3531
0
                    fTickCounter--;
3532
0
                    if (fTickCounter <= 0) {
3533
0
                        IncrementTime(status);    // Re-initializes fTickCounter
3534
0
                    }
3535
0
                }
3536
3537
0
                fp->fPatIdx = opValue + 4;    // Loop back.
3538
0
            }
3539
0
            break;
3540
3541
0
        case URX_CTR_INIT_NG:
3542
0
            {
3543
                // Initialize a non-greedy loop
3544
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
3545
0
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
3546
3547
                // Pick up the three extra operands that CTR_INIT_NG has, and
3548
                //    skip the pattern location counter past them.
3549
0
                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3550
0
                fp->fPatIdx += 3;
3551
0
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
3552
0
                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
3553
0
                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
3554
0
                U_ASSERT(minCount>=0);
3555
0
                U_ASSERT(maxCount>=minCount || maxCount==-1);
3556
0
                U_ASSERT(loopLoc>fp->fPatIdx);
3557
0
                if (maxCount == -1) {
3558
0
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
3559
0
                }
3560
3561
0
                if (minCount == 0) {
3562
0
                    if (maxCount != 0) {
3563
0
                        fp = StateSave(fp, fp->fPatIdx, status);
3564
0
                    }
3565
0
                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
3566
0
                }
3567
0
            }
3568
0
            break;
3569
3570
0
        case URX_CTR_LOOP_NG:
3571
0
            {
3572
                // Non-greedy {min, max} loops
3573
0
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
3574
0
                int32_t initOp = (int32_t)pat[opValue];
3575
0
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
3576
0
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
3577
0
                int32_t minCount  = (int32_t)pat[opValue+2];
3578
0
                int32_t maxCount  = (int32_t)pat[opValue+3];
3579
3580
0
                (*pCounter)++;
3581
0
                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
3582
                    // The loop has matched the maximum permitted number of times.
3583
                    //   Break out of here with no action.  Matching will
3584
                    //   continue with the following pattern.
3585
0
                    U_ASSERT(*pCounter == maxCount);
3586
0
                    break;
3587
0
                }
3588
3589
0
                if (*pCounter < minCount) {
3590
                    // We haven't met the minimum number of matches yet.
3591
                    //   Loop back for another one.
3592
0
                    fp->fPatIdx = opValue + 4;    // Loop back.
3593
                    // Charge this iteration to the time-out counter, which counts down. (StateSave() does it if count >= minCount)
3594
0
                    fTickCounter--;
3595
0
                    if (fTickCounter <= 0) {
3596
0
                        IncrementTime(status);    // Re-initializes fTickCounter
3597
0
                    }
3598
0
                } else {
3599
                    // We do have the minimum number of matches.
3600
3601
                    // If there is no upper bound on the loop iterations, check that the input index
3602
                    // is progressing, and stop the loop if it is not.
3603
0
                    if (maxCount == -1) {
3604
0
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
3605
0
                        if (fp->fInputIdx == *pLastInputIdx) {
3606
0
                            break;
3607
0
                        }
3608
0
                        *pLastInputIdx = fp->fInputIdx;
3609
0
                    }
3610
3611
                    // Loop Continuation: we will fall into the pattern following the loop
3612
                    //   (non-greedy, don't execute loop body first), but first do
3613
                    //   a state save to the top of the loop, so that a match failure
3614
                    //   in the following pattern will try another iteration of the loop.
3615
0
                    fp = StateSave(fp, opValue + 4, status);
3616
0
                }
3617
0
            }
3618
0
            break;
3619
3620
0
        case URX_STO_SP:
3621
0
            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3622
0
            fData[opValue] = fStack->size();
3623
0
            break;
3624
3625
0
        case URX_LD_SP:
3626
0
            {
3627
0
                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
3628
0
                int32_t newStackSize = (int32_t)fData[opValue];
3629
0
                U_ASSERT(newStackSize <= fStack->size());
3630
0
                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3631
0
                if (newFP == (int64_t *)fp) {
3632
0
                    break;
3633
0
                }
3634
0
                int32_t j;
3635
0
                for (j=0; j<fFrameSize; j++) {
3636
0
                    newFP[j] = ((int64_t *)fp)[j];
3637
0
                }
3638
0
                fp = (REStackFrame *)newFP;
3639
0
                fStack->setSize(newStackSize);
3640
0
            }
3641
0
            break;
3642
3643
0
        case URX_BACKREF:
3644
0
            {
3645
0
                U_ASSERT(opValue < fFrameSize);
3646
0
                int64_t groupStartIdx = fp->fExtra[opValue];
3647
0
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3648
0
                U_ASSERT(groupStartIdx <= groupEndIdx);
3649
0
                if (groupStartIdx < 0) {
3650
                    // This capture group has not participated in the match thus far,
3651
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3652
0
                    break;
3653
0
                }
3654
0
                UTEXT_SETNATIVEINDEX(fAltInputText, groupStartIdx);
3655
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3656
3657
                //   Note: if the capture group match was of an empty string the backref
3658
                //         match succeeds.  Verified by testing:  Perl matches succeed
3659
                //         in this case, so we do too.
3660
3661
0
                UBool success = true;
3662
0
                for (;;) {
3663
0
                    if (utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3664
0
                        success = true;
3665
0
                        break;
3666
0
                    }
3667
0
                    if (utext_getNativeIndex(fInputText) >= fActiveLimit) {
3668
0
                        success = false;
3669
0
                        fHitEnd = true;
3670
0
                        break;
3671
0
                    }
3672
0
                    UChar32 captureGroupChar = utext_next32(fAltInputText);
3673
0
                    UChar32 inputChar = utext_next32(fInputText);
3674
0
                    if (inputChar != captureGroupChar) {
3675
0
                        success = false;
3676
0
                        break;
3677
0
                    }
3678
0
                }
3679
3680
0
                if (success) {
3681
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3682
0
                } else {
3683
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3684
0
                }
3685
0
            }
3686
0
            break;
3687
3688
3689
3690
0
        case URX_BACKREF_I:
3691
0
            {
3692
0
                U_ASSERT(opValue < fFrameSize);
3693
0
                int64_t groupStartIdx = fp->fExtra[opValue];
3694
0
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
3695
0
                U_ASSERT(groupStartIdx <= groupEndIdx);
3696
0
                if (groupStartIdx < 0) {
3697
                    // This capture group has not participated in the match thus far,
3698
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
3699
0
                    break;
3700
0
                }
3701
0
                utext_setNativeIndex(fAltInputText, groupStartIdx);
3702
0
                utext_setNativeIndex(fInputText, fp->fInputIdx);
3703
0
                CaseFoldingUTextIterator captureGroupItr(*fAltInputText);
3704
0
                CaseFoldingUTextIterator inputItr(*fInputText);
3705
3706
                //   Note: if the capture group match was of an empty string the backref
3707
                //         match succeeds.  Verified by testing:  Perl matches succeed
3708
                //         in this case, so we do too.
3709
3710
0
                UBool success = true;
3711
0
                for (;;) {
3712
0
                    if (!captureGroupItr.inExpansion() && utext_getNativeIndex(fAltInputText) >= groupEndIdx) {
3713
0
                        success = true;
3714
0
                        break;
3715
0
                    }
3716
0
                    if (!inputItr.inExpansion() && utext_getNativeIndex(fInputText) >= fActiveLimit) {
3717
0
                        success = false;
3718
0
                        fHitEnd = true;
3719
0
                        break;
3720
0
                    }
3721
0
                    UChar32 captureGroupChar = captureGroupItr.next();
3722
0
                    UChar32 inputChar = inputItr.next();
3723
0
                    if (inputChar != captureGroupChar) {
3724
0
                        success = false;
3725
0
                        break;
3726
0
                    }
3727
0
                }
3728
3729
0
                if (success && inputItr.inExpansion()) {
3730
                    // We obtained a match by consuming part of a string obtained from
3731
                    // case-folding a single code point of the input text.
3732
                    // This does not count as an overall match.
3733
0
                    success = false;
3734
0
                }
3735
3736
0
                if (success) {
3737
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3738
0
                } else {
3739
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3740
0
                }
3741
3742
0
            }
3743
0
            break;
3744
3745
0
        case URX_STO_INP_LOC:
3746
0
            {
3747
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
3748
0
                fp->fExtra[opValue] = fp->fInputIdx;
3749
0
            }
3750
0
            break;
3751
3752
0
        case URX_JMPX:
3753
0
            {
3754
0
                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
3755
0
                fp->fPatIdx += 1;
3756
0
                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
3757
0
                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
3758
0
                int64_t savedInputIdx = fp->fExtra[dataLoc];
3759
0
                U_ASSERT(savedInputIdx <= fp->fInputIdx);
3760
0
                if (savedInputIdx < fp->fInputIdx) {
3761
0
                    fp->fPatIdx = opValue;                               // JMP
3762
0
                } else {
3763
0
                     fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
3764
0
                }
3765
0
            }
3766
0
            break;
3767
3768
0
        case URX_LA_START:
3769
0
            {
3770
                // Entering a look around block.
3771
                // Save Stack Ptr, Input Pos.
3772
0
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3773
0
                fData[opValue]   = fStack->size();
3774
0
                fData[opValue+1] = fp->fInputIdx;
3775
0
                fData[opValue+2] = fActiveStart;
3776
0
                fData[opValue+3] = fActiveLimit;
3777
0
                fActiveStart     = fLookStart;          // Set the match region change for
3778
0
                fActiveLimit     = fLookLimit;          //   transparent bounds.
3779
0
            }
3780
0
            break;
3781
3782
0
        case URX_LA_END:
3783
0
            {
3784
                // Leaving a look-ahead block.
3785
                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
3786
0
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
3787
0
                int32_t stackSize = fStack->size();
3788
0
                int32_t newStackSize =(int32_t)fData[opValue];
3789
0
                U_ASSERT(stackSize >= newStackSize);
3790
0
                if (stackSize > newStackSize) {
3791
                    // Copy the current top frame back to the new (cut back) top frame.
3792
                    //   This makes the capture groups from within the look-ahead
3793
                    //   expression available.
3794
0
                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
3795
0
                    int32_t j;
3796
0
                    for (j=0; j<fFrameSize; j++) {
3797
0
                        newFP[j] = ((int64_t *)fp)[j];
3798
0
                    }
3799
0
                    fp = (REStackFrame *)newFP;
3800
0
                    fStack->setSize(newStackSize);
3801
0
                }
3802
0
                fp->fInputIdx = fData[opValue+1];
3803
3804
                // Restore the active region bounds in the input string; they may have
3805
                //    been changed because of transparent bounds on a Region.
3806
0
                fActiveStart = fData[opValue+2];
3807
0
                fActiveLimit = fData[opValue+3];
3808
0
                U_ASSERT(fActiveStart >= 0);
3809
0
                U_ASSERT(fActiveLimit <= fInputLength);
3810
0
            }
3811
0
            break;
3812
3813
0
        case URX_ONECHAR_I:
3814
            // Case insensitive one char.  The char from the pattern is already case folded.
3815
            // Input text is not, but case folding the input can not reduce two or more code
3816
            // points to one.
3817
0
            if (fp->fInputIdx < fActiveLimit) {
3818
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3819
3820
0
                UChar32 c = UTEXT_NEXT32(fInputText);
3821
0
                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
3822
0
                    fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3823
0
                    break;
3824
0
                }
3825
0
            } else {
3826
0
                fHitEnd = true;
3827
0
            }
3828
3829
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3830
0
            break;
3831
3832
0
        case URX_STRING_I:
3833
0
            {
3834
                // Case-insensitive test input against a literal string.
3835
                // Strings require two slots in the compiled pattern, one for the
3836
                //   offset to the string text, and one for the length.
3837
                //   The compiled string has already been case folded.
3838
0
                {
3839
0
                    const UChar *patternString = litText + opValue;
3840
0
                    int32_t      patternStringIdx  = 0;
3841
3842
0
                    op      = (int32_t)pat[fp->fPatIdx];
3843
0
                    fp->fPatIdx++;
3844
0
                    opType  = URX_TYPE(op);
3845
0
                    opValue = URX_VAL(op);
3846
0
                    U_ASSERT(opType == URX_STRING_LEN);
3847
0
                    int32_t patternStringLen = opValue;  // Length of the string from the pattern.
3848
3849
3850
0
                    UChar32   cPattern;
3851
0
                    UChar32   cText;
3852
0
                    UBool     success = true;
3853
3854
0
                    UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
3855
0
                    CaseFoldingUTextIterator inputIterator(*fInputText);
3856
0
                    while (patternStringIdx < patternStringLen) {
3857
0
                        if (!inputIterator.inExpansion() && UTEXT_GETNATIVEINDEX(fInputText) >= fActiveLimit) {
3858
0
                            success = false;
3859
0
                            fHitEnd = true;
3860
0
                            break;
3861
0
                        }
3862
0
                        U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
3863
0
                        cText = inputIterator.next();
3864
0
                        if (cText != cPattern) {
3865
0
                            success = false;
3866
0
                            break;
3867
0
                        }
3868
0
                    }
3869
0
                    if (inputIterator.inExpansion()) {
3870
0
                        success = false;
3871
0
                    }
3872
3873
0
                    if (success) {
3874
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
3875
0
                    } else {
3876
0
                        fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3877
0
                    }
3878
0
                }
3879
0
            }
3880
0
            break;
3881
3882
0
        case URX_LB_START:
3883
0
            {
3884
                // Entering a look-behind block.
3885
                // Save Stack Ptr, Input Pos and active input region.
3886
                //   TODO:  implement transparent bounds.  Ticket #6067
3887
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3888
0
                fData[opValue]   = fStack->size();
3889
0
                fData[opValue+1] = fp->fInputIdx;
3890
                // Save input string length, then reset to pin any matches to end at
3891
                //   the current position.
3892
0
                fData[opValue+2] = fActiveStart;
3893
0
                fData[opValue+3] = fActiveLimit;
3894
0
                fActiveStart     = fRegionStart;
3895
0
                fActiveLimit     = fp->fInputIdx;
3896
                // Init the variable containing the start index for attempted matches.
3897
0
                fData[opValue+4] = -1;
3898
0
            }
3899
0
            break;
3900
3901
3902
0
        case URX_LB_CONT:
3903
0
            {
3904
                // Positive Look-Behind, at top of loop checking for matches of LB expression
3905
                //    at all possible input starting positions.
3906
3907
                // Fetch the min and max possible match lengths.  They are the operands
3908
                //   of this op in the pattern.
3909
0
                int32_t minML = (int32_t)pat[fp->fPatIdx++];
3910
0
                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
3911
0
                if (!UTEXT_USES_U16(fInputText)) {
3912
                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3913
                    // The max length need not be exact; it just needs to be >= actual maximum.
3914
0
                    maxML *= 3;
3915
0
                }
3916
0
                U_ASSERT(minML <= maxML);
3917
0
                U_ASSERT(minML >= 0);
3918
3919
                // Fetch (from data) the last input index where a match was attempted.
3920
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3921
0
                int64_t  &lbStartIdx = fData[opValue+4];
3922
0
                if (lbStartIdx < 0) {
3923
                    // First time through loop.
3924
0
                    lbStartIdx = fp->fInputIdx - minML;
3925
0
                    if (lbStartIdx > 0) {
3926
                        // move index to a code point boundary, if it's not on one already.
3927
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3928
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3929
0
                    }
3930
0
                } else {
3931
                    // 2nd through nth time through the loop.
3932
                    // Back up start position for match by one.
3933
0
                    if (lbStartIdx == 0) {
3934
0
                        (lbStartIdx)--;
3935
0
                    } else {
3936
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
3937
0
                        (void)UTEXT_PREVIOUS32(fInputText);
3938
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
3939
0
                    }
3940
0
                }
3941
3942
0
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
3943
                    // We have tried all potential match starting points without
3944
                    //  getting a match.  Backtrack out, and out of the
3945
                    //   Look Behind altogether.
3946
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3947
0
                    fActiveStart = fData[opValue+2];
3948
0
                    fActiveLimit = fData[opValue+3];
3949
0
                    U_ASSERT(fActiveStart >= 0);
3950
0
                    U_ASSERT(fActiveLimit <= fInputLength);
3951
0
                    break;
3952
0
                }
3953
3954
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
3955
                //      (successful match will fall off the end of the loop.)
3956
0
                fp = StateSave(fp, fp->fPatIdx-3, status);
3957
0
                fp->fInputIdx = lbStartIdx;
3958
0
            }
3959
0
            break;
3960
3961
0
        case URX_LB_END:
3962
            // End of a look-behind block, after a successful match.
3963
0
            {
3964
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
3965
0
                if (fp->fInputIdx != fActiveLimit) {
3966
                    //  The look-behind expression matched, but the match did not
3967
                    //    extend all the way to the point that we are looking behind from.
3968
                    //  FAIL out of here, which will take us back to the LB_CONT, which
3969
                    //     will retry the match starting at another position or fail
3970
                    //     the look-behind altogether, whichever is appropriate.
3971
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
3972
0
                    break;
3973
0
                }
3974
3975
                // Look-behind match is good.  Restore the original input string region,
3976
                //   which had been truncated to pin the end of the lookbehind match to the
3977
                //   position being looked-behind.
3978
0
                fActiveStart = fData[opValue+2];
3979
0
                fActiveLimit = fData[opValue+3];
3980
0
                U_ASSERT(fActiveStart >= 0);
3981
0
                U_ASSERT(fActiveLimit <= fInputLength);
3982
0
            }
3983
0
            break;
3984
3985
3986
0
        case URX_LBN_CONT:
3987
0
            {
3988
                // Negative Look-Behind, at top of loop checking for matches of LB expression
3989
                //    at all possible input starting positions.
3990
3991
                // Fetch the extra parameters of this op.
3992
0
                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
3993
0
                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
3994
0
                if (!UTEXT_USES_U16(fInputText)) {
3995
                    // utf-8 fix to maximum match length. The pattern compiler assumes utf-16.
3996
                    // The max length need not be exact; it just needs to be >= actual maximum.
3997
0
                    maxML *= 3;
3998
0
                }
3999
0
                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
4000
0
                        continueLoc = URX_VAL(continueLoc);
4001
0
                U_ASSERT(minML <= maxML);
4002
0
                U_ASSERT(minML >= 0);
4003
0
                U_ASSERT(continueLoc > fp->fPatIdx);
4004
4005
                // Fetch (from data) the last input index where a match was attempted.
4006
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4007
0
                int64_t  &lbStartIdx = fData[opValue+4];
4008
0
                if (lbStartIdx < 0) {
4009
                    // First time through loop.
4010
0
                    lbStartIdx = fp->fInputIdx - minML;
4011
0
                    if (lbStartIdx > 0) {
4012
                        // move index to a code point boundary, if it's not on one already.
4013
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4014
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4015
0
                    }
4016
0
                } else {
4017
                    // 2nd through nth time through the loop.
4018
                    // Back up start position for match by one.
4019
0
                    if (lbStartIdx == 0) {
4020
0
                        (lbStartIdx)--;
4021
0
                    } else {
4022
0
                        UTEXT_SETNATIVEINDEX(fInputText, lbStartIdx);
4023
0
                        (void)UTEXT_PREVIOUS32(fInputText);
4024
0
                        lbStartIdx = UTEXT_GETNATIVEINDEX(fInputText);
4025
0
                    }
4026
0
                }
4027
4028
0
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
4029
                    // We have tried all potential match starting points without
4030
                    //  getting a match, which means that the negative lookbehind as
4031
                    //  a whole has succeeded.  Jump forward to the continue location
4032
0
                    fActiveStart = fData[opValue+2];
4033
0
                    fActiveLimit = fData[opValue+3];
4034
0
                    U_ASSERT(fActiveStart >= 0);
4035
0
                    U_ASSERT(fActiveLimit <= fInputLength);
4036
0
                    fp->fPatIdx = continueLoc;
4037
0
                    break;
4038
0
                }
4039
4040
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
4041
                //      (successful match will cause a FAIL out of the loop altogether.)
4042
0
                fp = StateSave(fp, fp->fPatIdx-4, status);
4043
0
                fp->fInputIdx = lbStartIdx;
4044
0
            }
4045
0
            break;
4046
4047
0
        case URX_LBN_END:
4048
            // End of a negative look-behind block, after a successful match.
4049
0
            {
4050
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
4051
0
                if (fp->fInputIdx != fActiveLimit) {
4052
                    //  The look-behind expression matched, but the match did not
4053
                    //    extend all the way to the point that we are looking behind from.
4054
                    //  FAIL out of here, which will take us back to the LB_CONT, which
4055
                    //     will retry the match starting at another position or succeed
4056
                    //     the look-behind altogether, whichever is appropriate.
4057
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4058
0
                    break;
4059
0
                }
4060
4061
                // Look-behind expression matched, which means look-behind test as
4062
                //   a whole Fails
4063
4064
                //   Restore the original input string length, which had been truncated
4065
                //   inorder to pin the end of the lookbehind match
4066
                //   to the position being looked-behind.
4067
0
                fActiveStart = fData[opValue+2];
4068
0
                fActiveLimit = fData[opValue+3];
4069
0
                U_ASSERT(fActiveStart >= 0);
4070
0
                U_ASSERT(fActiveLimit <= fInputLength);
4071
4072
                // Restore original stack position, discarding any state saved
4073
                //   by the successful pattern match.
4074
0
                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
4075
0
                int32_t newStackSize = (int32_t)fData[opValue];
4076
0
                U_ASSERT(fStack->size() > newStackSize);
4077
0
                fStack->setSize(newStackSize);
4078
4079
                //  FAIL, which will take control back to someplace
4080
                //  prior to entering the look-behind test.
4081
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4082
0
            }
4083
0
            break;
4084
4085
4086
0
        case URX_LOOP_SR_I:
4087
            // Loop Initialization for the optimized implementation of
4088
            //     [some character set]*
4089
            //   This op scans through all matching input.
4090
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4091
0
            {
4092
0
                U_ASSERT(opValue > 0 && opValue < fSets->size());
4093
0
                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4094
0
                UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
4095
4096
                // Loop through input, until either the input is exhausted or
4097
                //   we reach a character that is not a member of the set.
4098
0
                int64_t ix = fp->fInputIdx;
4099
0
                UTEXT_SETNATIVEINDEX(fInputText, ix);
4100
0
                for (;;) {
4101
0
                    if (ix >= fActiveLimit) {
4102
0
                        fHitEnd = true;
4103
0
                        break;
4104
0
                    }
4105
0
                    UChar32 c = UTEXT_NEXT32(fInputText);
4106
0
                    if (c<256) {
4107
0
                        if (s8->contains(c) == false) {
4108
0
                            break;
4109
0
                        }
4110
0
                    } else {
4111
0
                        if (s->contains(c) == false) {
4112
0
                            break;
4113
0
                        }
4114
0
                    }
4115
0
                    ix = UTEXT_GETNATIVEINDEX(fInputText);
4116
0
                }
4117
4118
                // If there were no matching characters, skip over the loop altogether.
4119
                //   The loop doesn't run at all, a * op always succeeds.
4120
0
                if (ix == fp->fInputIdx) {
4121
0
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4122
0
                    break;
4123
0
                }
4124
4125
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4126
                //   must follow.  It's operand is the stack location
4127
                //   that holds the starting input index for the match of this [set]*
4128
0
                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4129
0
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4130
0
                int32_t stackLoc = URX_VAL(loopcOp);
4131
0
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4132
0
                fp->fExtra[stackLoc] = fp->fInputIdx;
4133
0
                fp->fInputIdx = ix;
4134
4135
                // Save State to the URX_LOOP_C op that follows this one,
4136
                //   so that match failures in the following code will return to there.
4137
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4138
0
                fp = StateSave(fp, fp->fPatIdx, status);
4139
0
                fp->fPatIdx++;
4140
0
            }
4141
0
            break;
4142
4143
4144
0
        case URX_LOOP_DOT_I:
4145
            // Loop Initialization for the optimized implementation of .*
4146
            //   This op scans through all remaining input.
4147
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
4148
0
            {
4149
                // Loop through input until the input is exhausted (we reach an end-of-line)
4150
                // In DOTALL mode, we can just go straight to the end of the input.
4151
0
                int64_t ix;
4152
0
                if ((opValue & 1) == 1) {
4153
                    // Dot-matches-All mode.  Jump straight to the end of the string.
4154
0
                    ix = fActiveLimit;
4155
0
                    fHitEnd = true;
4156
0
                } else {
4157
                    // NOT DOT ALL mode.  Line endings do not match '.'
4158
                    // Scan forward until a line ending or end of input.
4159
0
                    ix = fp->fInputIdx;
4160
0
                    UTEXT_SETNATIVEINDEX(fInputText, ix);
4161
0
                    for (;;) {
4162
0
                        if (ix >= fActiveLimit) {
4163
0
                            fHitEnd = true;
4164
0
                            break;
4165
0
                        }
4166
0
                        UChar32 c = UTEXT_NEXT32(fInputText);
4167
0
                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
4168
0
                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
4169
0
                               (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
4170
0
                                    isLineTerminator(c))) {
4171
                                //  char is a line ending.  Exit the scanning loop.
4172
0
                                break;
4173
0
                            }
4174
0
                        }
4175
0
                        ix = UTEXT_GETNATIVEINDEX(fInputText);
4176
0
                    }
4177
0
                }
4178
4179
                // If there were no matching characters, skip over the loop altogether.
4180
                //   The loop doesn't run at all, a * op always succeeds.
4181
0
                if (ix == fp->fInputIdx) {
4182
0
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
4183
0
                    break;
4184
0
                }
4185
4186
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
4187
                //   must follow.  It's operand is the stack location
4188
                //   that holds the starting input index for the match of this .*
4189
0
                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
4190
0
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
4191
0
                int32_t stackLoc = URX_VAL(loopcOp);
4192
0
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
4193
0
                fp->fExtra[stackLoc] = fp->fInputIdx;
4194
0
                fp->fInputIdx = ix;
4195
4196
                // Save State to the URX_LOOP_C op that follows this one,
4197
                //   so that match failures in the following code will return to there.
4198
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
4199
0
                fp = StateSave(fp, fp->fPatIdx, status);
4200
0
                fp->fPatIdx++;
4201
0
            }
4202
0
            break;
4203
4204
4205
0
        case URX_LOOP_C:
4206
0
            {
4207
0
                U_ASSERT(opValue>=0 && opValue<fFrameSize);
4208
0
                backSearchIndex = fp->fExtra[opValue];
4209
0
                U_ASSERT(backSearchIndex <= fp->fInputIdx);
4210
0
                if (backSearchIndex == fp->fInputIdx) {
4211
                    // We've backed up the input idx to the point that the loop started.
4212
                    // The loop is done.  Leave here without saving state.
4213
                    //  Subsequent failures won't come back here.
4214
0
                    break;
4215
0
                }
4216
                // Set up for the next iteration of the loop, with input index
4217
                //   backed up by one from the last time through,
4218
                //   and a state save to this instruction in case the following code fails again.
4219
                //   (We're going backwards because this loop emulates stack unwinding, not
4220
                //    the initial scan forward.)
4221
0
                U_ASSERT(fp->fInputIdx > 0);
4222
0
                UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4223
0
                UChar32 prevC = UTEXT_PREVIOUS32(fInputText);
4224
0
                fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4225
4226
0
                UChar32 twoPrevC = UTEXT_PREVIOUS32(fInputText);
4227
0
                if (prevC == 0x0a &&
4228
0
                    fp->fInputIdx > backSearchIndex &&
4229
0
                    twoPrevC == 0x0d) {
4230
0
                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
4231
0
                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
4232
                        // .*, stepping back over CRLF pair.
4233
0
                        fp->fInputIdx = UTEXT_GETNATIVEINDEX(fInputText);
4234
0
                    }
4235
0
                }
4236
4237
4238
0
                fp = StateSave(fp, fp->fPatIdx-1, status);
4239
0
            }
4240
0
            break;
4241
4242
4243
4244
0
        default:
4245
            // Trouble.  The compiled pattern contains an entry with an
4246
            //           unrecognized type tag.
4247
0
            UPRV_UNREACHABLE_ASSERT;
4248
            // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
4249
            // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
4250
            // See ICU-21669.
4251
0
            status = U_INTERNAL_PROGRAM_ERROR;
4252
0
        }
4253
4254
0
        if (U_FAILURE(status)) {
4255
0
            isMatch = false;
4256
0
            break;
4257
0
        }
4258
0
    }
4259
4260
0
breakFromLoop:
4261
0
    fMatch = isMatch;
4262
0
    if (isMatch) {
4263
0
        fLastMatchEnd = fMatchEnd;
4264
0
        fMatchStart   = startIdx;
4265
0
        fMatchEnd     = fp->fInputIdx;
4266
0
    }
4267
4268
#ifdef REGEX_RUN_DEBUG
4269
    if (fTraceDebug) {
4270
        if (isMatch) {
4271
            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
4272
        } else {
4273
            printf("No match\n\n");
4274
        }
4275
    }
4276
#endif
4277
4278
0
    fFrame = fp;                // The active stack frame when the engine stopped.
4279
                                //   Contains the capture group results that we need to
4280
                                //    access later.
4281
0
    return;
4282
0
}
4283
4284
4285
//--------------------------------------------------------------------------------
4286
//
4287
//   MatchChunkAt   This is the actual matching engine. Like MatchAt, but with the
4288
//                  assumption that the entire string is available in the UText's
4289
//                  chunk buffer. For now, that means we can use int32_t indexes,
4290
//                  except for anything that needs to be saved (like group starts
4291
//                  and ends).
4292
//
4293
//                  startIdx:    begin matching at this index.
4294
//                  toEnd:       if true, match must extend to end of the input region
4295
//
4296
//--------------------------------------------------------------------------------
4297
0
void RegexMatcher::MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status) {
4298
0
    UBool       isMatch  = false;      // True if the we have a match.
4299
4300
0
    int32_t     backSearchIndex = INT32_MAX; // used after greedy single-character matches for searching backwards
4301
4302
0
    int32_t     op;                    // Operation from the compiled pattern, split into
4303
0
    int32_t     opType;                //    the opcode
4304
0
    int32_t     opValue;               //    and the operand value.
4305
4306
#ifdef REGEX_RUN_DEBUG
4307
    if (fTraceDebug) {
4308
        printf("MatchAt(startIdx=%d)\n", startIdx);
4309
        printf("Original Pattern: \"%s\"\n", CStr(StringFromUText(fPattern->fPattern))());
4310
        printf("Input String:     \"%s\"\n\n", CStr(StringFromUText(fInputText))());
4311
    }
4312
#endif
4313
4314
0
    if (U_FAILURE(status)) {
4315
0
        return;
4316
0
    }
4317
4318
    //  Cache frequently referenced items from the compiled pattern
4319
    //
4320
0
    int64_t             *pat           = fPattern->fCompiledPat->getBuffer();
4321
4322
0
    const UChar         *litText       = fPattern->fLiteralText.getBuffer();
4323
0
    UVector             *fSets         = fPattern->fSets;
4324
4325
0
    const UChar         *inputBuf      = fInputText->chunkContents;
4326
4327
0
    fFrameSize = fPattern->fFrameSize;
4328
0
    REStackFrame        *fp            = resetStack();
4329
0
    if (U_FAILURE(fDeferredStatus)) {
4330
0
        status = fDeferredStatus;
4331
0
        return;
4332
0
    }
4333
4334
0
    fp->fPatIdx   = 0;
4335
0
    fp->fInputIdx = startIdx;
4336
4337
    // Zero out the pattern's static data
4338
0
    int32_t i;
4339
0
    for (i = 0; i<fPattern->fDataSize; i++) {
4340
0
        fData[i] = 0;
4341
0
    }
4342
4343
    //
4344
    //  Main loop for interpreting the compiled pattern.
4345
    //  One iteration of the loop per pattern operation performed.
4346
    //
4347
0
    for (;;) {
4348
0
        op      = (int32_t)pat[fp->fPatIdx];
4349
0
        opType  = URX_TYPE(op);
4350
0
        opValue = URX_VAL(op);
4351
#ifdef REGEX_RUN_DEBUG
4352
        if (fTraceDebug) {
4353
            UTEXT_SETNATIVEINDEX(fInputText, fp->fInputIdx);
4354
            printf("inputIdx=%ld   inputChar=%x   sp=%3ld   activeLimit=%ld  ", fp->fInputIdx,
4355
                   UTEXT_CURRENT32(fInputText), (int64_t *)fp-fStack->getBuffer(), fActiveLimit);
4356
            fPattern->dumpOp(fp->fPatIdx);
4357
        }
4358
#endif
4359
0
        fp->fPatIdx++;
4360
4361
0
        switch (opType) {
4362
4363
4364
0
        case URX_NOP:
4365
0
            break;
4366
4367
4368
0
        case URX_BACKTRACK:
4369
            // Force a backtrack.  In some circumstances, the pattern compiler
4370
            //   will notice that the pattern can't possibly match anything, and will
4371
            //   emit one of these at that point.
4372
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4373
0
            break;
4374
4375
4376
0
        case URX_ONECHAR:
4377
0
            if (fp->fInputIdx < fActiveLimit) {
4378
0
                UChar32 c;
4379
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4380
0
                if (c == opValue) {
4381
0
                    break;
4382
0
                }
4383
0
            } else {
4384
0
                fHitEnd = true;
4385
0
            }
4386
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4387
0
            break;
4388
4389
4390
0
        case URX_STRING:
4391
0
            {
4392
                // Test input against a literal string.
4393
                // Strings require two slots in the compiled pattern, one for the
4394
                //   offset to the string text, and one for the length.
4395
0
                int32_t   stringStartIdx = opValue;
4396
0
                int32_t   stringLen;
4397
4398
0
                op      = (int32_t)pat[fp->fPatIdx];     // Fetch the second operand
4399
0
                fp->fPatIdx++;
4400
0
                opType    = URX_TYPE(op);
4401
0
                stringLen = URX_VAL(op);
4402
0
                U_ASSERT(opType == URX_STRING_LEN);
4403
0
                U_ASSERT(stringLen >= 2);
4404
4405
0
                const UChar * pInp = inputBuf + fp->fInputIdx;
4406
0
                const UChar * pInpLimit = inputBuf + fActiveLimit;
4407
0
                const UChar * pPat = litText+stringStartIdx;
4408
0
                const UChar * pEnd = pInp + stringLen;
4409
0
                UBool success = true;
4410
0
                while (pInp < pEnd) {
4411
0
                    if (pInp >= pInpLimit) {
4412
0
                        fHitEnd = true;
4413
0
                        success = false;
4414
0
                        break;
4415
0
                    }
4416
0
                    if (*pInp++ != *pPat++) {
4417
0
                        success = false;
4418
0
                        break;
4419
0
                    }
4420
0
                }
4421
4422
0
                if (success) {
4423
0
                    fp->fInputIdx += stringLen;
4424
0
                } else {
4425
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4426
0
                }
4427
0
            }
4428
0
            break;
4429
4430
4431
0
        case URX_STATE_SAVE:
4432
0
            fp = StateSave(fp, opValue, status);
4433
0
            break;
4434
4435
4436
0
        case URX_END:
4437
            // The match loop will exit via this path on a successful match,
4438
            //   when we reach the end of the pattern.
4439
0
            if (toEnd && fp->fInputIdx != fActiveLimit) {
4440
                // The pattern matched, but not to the end of input.  Try some more.
4441
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4442
0
                break;
4443
0
            }
4444
0
            isMatch = true;
4445
0
            goto  breakFromLoop;
4446
4447
            // Start and End Capture stack frame variables are laid out like this:
4448
            //  fp->fExtra[opValue]  - The start of a completed capture group
4449
            //             opValue+1 - The end   of a completed capture group
4450
            //             opValue+2 - the start of a capture group whose end
4451
            //                          has not yet been reached (and might not ever be).
4452
0
        case URX_START_CAPTURE:
4453
0
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4454
0
            fp->fExtra[opValue+2] = fp->fInputIdx;
4455
0
            break;
4456
4457
4458
0
        case URX_END_CAPTURE:
4459
0
            U_ASSERT(opValue >= 0 && opValue < fFrameSize-3);
4460
0
            U_ASSERT(fp->fExtra[opValue+2] >= 0);            // Start pos for this group must be set.
4461
0
            fp->fExtra[opValue]   = fp->fExtra[opValue+2];   // Tentative start becomes real.
4462
0
            fp->fExtra[opValue+1] = fp->fInputIdx;           // End position
4463
0
            U_ASSERT(fp->fExtra[opValue] <= fp->fExtra[opValue+1]);
4464
0
            break;
4465
4466
4467
0
        case URX_DOLLAR:                   //  $, test for End of line
4468
            //     or for position before new line at end of input
4469
0
            if (fp->fInputIdx < fAnchorLimit-2) {
4470
                // We are nowhere near the end of input.  Fail.
4471
                //   This is the common case.  Keep it first.
4472
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4473
0
                break;
4474
0
            }
4475
0
            if (fp->fInputIdx >= fAnchorLimit) {
4476
                // We really are at the end of input.  Success.
4477
0
                fHitEnd = true;
4478
0
                fRequireEnd = true;
4479
0
                break;
4480
0
            }
4481
4482
            // If we are positioned just before a new-line that is located at the
4483
            //   end of input, succeed.
4484
0
            if (fp->fInputIdx == fAnchorLimit-1) {
4485
0
                UChar32 c;
4486
0
                U16_GET(inputBuf, fAnchorStart, fp->fInputIdx, fAnchorLimit, c);
4487
4488
0
                if (isLineTerminator(c)) {
4489
0
                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4490
                        // At new-line at end of input. Success
4491
0
                        fHitEnd = true;
4492
0
                        fRequireEnd = true;
4493
0
                        break;
4494
0
                    }
4495
0
                }
4496
0
            } else if (fp->fInputIdx == fAnchorLimit-2 &&
4497
0
                inputBuf[fp->fInputIdx]==0x0d && inputBuf[fp->fInputIdx+1]==0x0a) {
4498
0
                    fHitEnd = true;
4499
0
                    fRequireEnd = true;
4500
0
                    break;                         // At CR/LF at end of input.  Success
4501
0
            }
4502
4503
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4504
4505
0
            break;
4506
4507
4508
0
        case URX_DOLLAR_D:                   //  $, test for End of Line, in UNIX_LINES mode.
4509
0
            if (fp->fInputIdx >= fAnchorLimit-1) {
4510
                // Either at the last character of input, or off the end.
4511
0
                if (fp->fInputIdx == fAnchorLimit-1) {
4512
                    // At last char of input.  Success if it's a new line.
4513
0
                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4514
0
                        fHitEnd = true;
4515
0
                        fRequireEnd = true;
4516
0
                        break;
4517
0
                    }
4518
0
                } else {
4519
                    // Off the end of input.  Success.
4520
0
                    fHitEnd = true;
4521
0
                    fRequireEnd = true;
4522
0
                    break;
4523
0
                }
4524
0
            }
4525
4526
            // Not at end of input.  Back-track out.
4527
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4528
0
            break;
4529
4530
4531
0
        case URX_DOLLAR_M:                //  $, test for End of line in multi-line mode
4532
0
            {
4533
0
                if (fp->fInputIdx >= fAnchorLimit) {
4534
                    // We really are at the end of input.  Success.
4535
0
                    fHitEnd = true;
4536
0
                    fRequireEnd = true;
4537
0
                    break;
4538
0
                }
4539
                // If we are positioned just before a new-line, succeed.
4540
                // It makes no difference where the new-line is within the input.
4541
0
                UChar32 c = inputBuf[fp->fInputIdx];
4542
0
                if (isLineTerminator(c)) {
4543
                    // At a line end, except for the odd chance of  being in the middle of a CR/LF sequence
4544
                    //  In multi-line mode, hitting a new-line just before the end of input does not
4545
                    //   set the hitEnd or requireEnd flags
4546
0
                    if ( !(c==0x0a && fp->fInputIdx>fAnchorStart && inputBuf[fp->fInputIdx-1]==0x0d)) {
4547
0
                        break;
4548
0
                    }
4549
0
                }
4550
                // not at a new line.  Fail.
4551
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4552
0
            }
4553
0
            break;
4554
4555
4556
0
        case URX_DOLLAR_MD:                //  $, test for End of line in multi-line and UNIX_LINES mode
4557
0
            {
4558
0
                if (fp->fInputIdx >= fAnchorLimit) {
4559
                    // We really are at the end of input.  Success.
4560
0
                    fHitEnd = true;
4561
0
                    fRequireEnd = true;  // Java set requireEnd in this case, even though
4562
0
                    break;               //   adding a new-line would not lose the match.
4563
0
                }
4564
                // If we are not positioned just before a new-line, the test fails; backtrack out.
4565
                // It makes no difference where the new-line is within the input.
4566
0
                if (inputBuf[fp->fInputIdx] != 0x0a) {
4567
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4568
0
                }
4569
0
            }
4570
0
            break;
4571
4572
4573
0
        case URX_CARET:                    //  ^, test for start of line
4574
0
            if (fp->fInputIdx != fAnchorStart) {
4575
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4576
0
            }
4577
0
            break;
4578
4579
4580
0
        case URX_CARET_M:                   //  ^, test for start of line in multi-line mode
4581
0
            {
4582
0
                if (fp->fInputIdx == fAnchorStart) {
4583
                    // We are at the start of input.  Success.
4584
0
                    break;
4585
0
                }
4586
                // Check whether character just before the current pos is a new-line
4587
                //   unless we are at the end of input
4588
0
                UChar  c = inputBuf[fp->fInputIdx - 1];
4589
0
                if ((fp->fInputIdx < fAnchorLimit) &&
4590
0
                    isLineTerminator(c)) {
4591
                    //  It's a new-line.  ^ is true.  Success.
4592
                    //  TODO:  what should be done with positions between a CR and LF?
4593
0
                    break;
4594
0
                }
4595
                // Not at the start of a line.  Fail.
4596
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4597
0
            }
4598
0
            break;
4599
4600
4601
0
        case URX_CARET_M_UNIX:       //  ^, test for start of line in multi-line + Unix-line mode
4602
0
            {
4603
0
                U_ASSERT(fp->fInputIdx >= fAnchorStart);
4604
0
                if (fp->fInputIdx <= fAnchorStart) {
4605
                    // We are at the start of input.  Success.
4606
0
                    break;
4607
0
                }
4608
                // Check whether character just before the current pos is a new-line
4609
0
                U_ASSERT(fp->fInputIdx <= fAnchorLimit);
4610
0
                UChar  c = inputBuf[fp->fInputIdx - 1];
4611
0
                if (c != 0x0a) {
4612
                    // Not at the start of a line.  Back-track out.
4613
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4614
0
                }
4615
0
            }
4616
0
            break;
4617
4618
0
        case URX_BACKSLASH_B:          // Test for word boundaries
4619
0
            {
4620
0
                UBool success = isChunkWordBoundary((int32_t)fp->fInputIdx);
4621
0
                success ^= (UBool)(opValue != 0);     // flip sense for \B
4622
0
                if (!success) {
4623
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4624
0
                }
4625
0
            }
4626
0
            break;
4627
4628
4629
0
        case URX_BACKSLASH_BU:          // Test for word boundaries, Unicode-style
4630
0
            {
4631
0
                UBool success = isUWordBoundary(fp->fInputIdx, status);
4632
0
                success ^= (UBool)(opValue != 0);     // flip sense for \B
4633
0
                if (!success) {
4634
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4635
0
                }
4636
0
            }
4637
0
            break;
4638
4639
4640
0
        case URX_BACKSLASH_D:            // Test for decimal digit
4641
0
            {
4642
0
                if (fp->fInputIdx >= fActiveLimit) {
4643
0
                    fHitEnd = true;
4644
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4645
0
                    break;
4646
0
                }
4647
4648
0
                UChar32 c;
4649
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4650
0
                int8_t ctype = u_charType(c);     // TODO:  make a unicode set for this.  Will be faster.
4651
0
                UBool success = (ctype == U_DECIMAL_DIGIT_NUMBER);
4652
0
                success ^= (UBool)(opValue != 0);        // flip sense for \D
4653
0
                if (!success) {
4654
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4655
0
                }
4656
0
            }
4657
0
            break;
4658
4659
4660
0
        case URX_BACKSLASH_G:          // Test for position at end of previous match
4661
0
            if (!((fMatch && fp->fInputIdx==fMatchEnd) || (fMatch==false && fp->fInputIdx==fActiveStart))) {
4662
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4663
0
            }
4664
0
            break;
4665
4666
4667
0
        case URX_BACKSLASH_H:            // Test for \h, horizontal white space.
4668
0
            {
4669
0
                if (fp->fInputIdx >= fActiveLimit) {
4670
0
                    fHitEnd = true;
4671
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4672
0
                    break;
4673
0
                }
4674
0
                UChar32 c;
4675
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4676
0
                int8_t ctype = u_charType(c);
4677
0
                UBool success = (ctype == U_SPACE_SEPARATOR || c == 9);  // SPACE_SEPARATOR || TAB
4678
0
                success ^= (UBool)(opValue != 0);        // flip sense for \H
4679
0
                if (!success) {
4680
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4681
0
                }
4682
0
            }
4683
0
            break;
4684
4685
4686
0
        case URX_BACKSLASH_R:            // Test for \R, any line break sequence.
4687
0
            {
4688
0
                if (fp->fInputIdx >= fActiveLimit) {
4689
0
                    fHitEnd = true;
4690
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4691
0
                    break;
4692
0
                }
4693
0
                UChar32 c;
4694
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4695
0
                if (isLineTerminator(c)) {
4696
0
                    if (c == 0x0d && fp->fInputIdx < fActiveLimit) {
4697
                        // Check for CR/LF sequence. Consume both together when found.
4698
0
                        UChar c2;
4699
0
                        U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c2);
4700
0
                        if (c2 != 0x0a) {
4701
0
                            U16_PREV(inputBuf, 0, fp->fInputIdx, c2);
4702
0
                        }
4703
0
                    }
4704
0
                } else {
4705
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4706
0
                }
4707
0
            }
4708
0
            break;
4709
4710
4711
0
        case URX_BACKSLASH_V:         // Any single code point line ending.
4712
0
            {
4713
0
                if (fp->fInputIdx >= fActiveLimit) {
4714
0
                    fHitEnd = true;
4715
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4716
0
                    break;
4717
0
                }
4718
0
                UChar32 c;
4719
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4720
0
                UBool success = isLineTerminator(c);
4721
0
                success ^= (UBool)(opValue != 0);        // flip sense for \V
4722
0
                if (!success) {
4723
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4724
0
                }
4725
0
            }
4726
0
            break;
4727
4728
4729
0
        case URX_BACKSLASH_X:
4730
            //  Match a Grapheme, as defined by Unicode UAX 29.
4731
4732
            // Fail if at end of input
4733
0
            if (fp->fInputIdx >= fActiveLimit) {
4734
0
                fHitEnd = true;
4735
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4736
0
                break;
4737
0
            }
4738
4739
0
            fp->fInputIdx = followingGCBoundary(fp->fInputIdx, status);
4740
0
            if (fp->fInputIdx >= fActiveLimit) {
4741
0
                fHitEnd = true;
4742
0
                fp->fInputIdx = fActiveLimit;
4743
0
            }
4744
0
            break;
4745
4746
4747
0
        case URX_BACKSLASH_Z:          // Test for end of Input
4748
0
            if (fp->fInputIdx < fAnchorLimit) {
4749
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4750
0
            } else {
4751
0
                fHitEnd = true;
4752
0
                fRequireEnd = true;
4753
0
            }
4754
0
            break;
4755
4756
4757
4758
0
        case URX_STATIC_SETREF:
4759
0
            {
4760
                // Test input character against one of the predefined sets
4761
                //    (Word Characters, for example)
4762
                // The high bit of the op value is a flag for the match polarity.
4763
                //    0:   success if input char is in set.
4764
                //    1:   success if input char is not in set.
4765
0
                if (fp->fInputIdx >= fActiveLimit) {
4766
0
                    fHitEnd = true;
4767
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4768
0
                    break;
4769
0
                }
4770
4771
0
                UBool success = ((opValue & URX_NEG_SET) == URX_NEG_SET);
4772
0
                opValue &= ~URX_NEG_SET;
4773
0
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4774
4775
0
                UChar32 c;
4776
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4777
0
                if (c < 256) {
4778
0
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4779
0
                    if (s8.contains(c)) {
4780
0
                        success = !success;
4781
0
                    }
4782
0
                } else {
4783
0
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4784
0
                    if (s.contains(c)) {
4785
0
                        success = !success;
4786
0
                    }
4787
0
                }
4788
0
                if (!success) {
4789
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4790
0
                }
4791
0
            }
4792
0
            break;
4793
4794
4795
0
        case URX_STAT_SETREF_N:
4796
0
            {
4797
                // Test input character for NOT being a member of  one of
4798
                //    the predefined sets (Word Characters, for example)
4799
0
                if (fp->fInputIdx >= fActiveLimit) {
4800
0
                    fHitEnd = true;
4801
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4802
0
                    break;
4803
0
                }
4804
4805
0
                U_ASSERT(opValue > 0 && opValue < URX_LAST_SET);
4806
4807
0
                UChar32  c;
4808
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4809
0
                if (c < 256) {
4810
0
                    Regex8BitSet &s8 = RegexStaticSets::gStaticSets->fPropSets8[opValue];
4811
0
                    if (s8.contains(c) == false) {
4812
0
                        break;
4813
0
                    }
4814
0
                } else {
4815
0
                    const UnicodeSet &s = RegexStaticSets::gStaticSets->fPropSets[opValue];
4816
0
                    if (s.contains(c) == false) {
4817
0
                        break;
4818
0
                    }
4819
0
                }
4820
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4821
0
            }
4822
0
            break;
4823
4824
4825
0
        case URX_SETREF:
4826
0
            {
4827
0
                if (fp->fInputIdx >= fActiveLimit) {
4828
0
                    fHitEnd = true;
4829
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4830
0
                    break;
4831
0
                }
4832
4833
0
                U_ASSERT(opValue > 0 && opValue < fSets->size());
4834
4835
                // There is input left.  Pick up one char and test it for set membership.
4836
0
                UChar32  c;
4837
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4838
0
                if (c<256) {
4839
0
                    Regex8BitSet *s8 = &fPattern->fSets8[opValue];
4840
0
                    if (s8->contains(c)) {
4841
                        // The character is in the set.  A Match.
4842
0
                        break;
4843
0
                    }
4844
0
                } else {
4845
0
                    UnicodeSet *s = (UnicodeSet *)fSets->elementAt(opValue);
4846
0
                    if (s->contains(c)) {
4847
                        // The character is in the set.  A Match.
4848
0
                        break;
4849
0
                    }
4850
0
                }
4851
4852
                // the character wasn't in the set.
4853
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4854
0
            }
4855
0
            break;
4856
4857
4858
0
        case URX_DOTANY:
4859
0
            {
4860
                // . matches anything, but stops at end-of-line.
4861
0
                if (fp->fInputIdx >= fActiveLimit) {
4862
                    // At end of input.  Match failed.  Backtrack out.
4863
0
                    fHitEnd = true;
4864
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4865
0
                    break;
4866
0
                }
4867
4868
                // There is input left.  Advance over one char, unless we've hit end-of-line
4869
0
                UChar32  c;
4870
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4871
0
                if (isLineTerminator(c)) {
4872
                    // End of line in normal mode.   . does not match.
4873
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4874
0
                    break;
4875
0
                }
4876
0
            }
4877
0
            break;
4878
4879
4880
0
        case URX_DOTANY_ALL:
4881
0
            {
4882
                // . in dot-matches-all (including new lines) mode
4883
0
                if (fp->fInputIdx >= fActiveLimit) {
4884
                    // At end of input.  Match failed.  Backtrack out.
4885
0
                    fHitEnd = true;
4886
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4887
0
                    break;
4888
0
                }
4889
4890
                // There is input left.  Advance over one char, except if we are
4891
                //   at a cr/lf, advance over both of them.
4892
0
                UChar32 c;
4893
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4894
0
                if (c==0x0d && fp->fInputIdx < fActiveLimit) {
4895
                    // In the case of a CR/LF, we need to advance over both.
4896
0
                    if (inputBuf[fp->fInputIdx] == 0x0a) {
4897
0
                        U16_FWD_1(inputBuf, fp->fInputIdx, fActiveLimit);
4898
0
                    }
4899
0
                }
4900
0
            }
4901
0
            break;
4902
4903
4904
0
        case URX_DOTANY_UNIX:
4905
0
            {
4906
                // '.' operator, matches all, but stops at end-of-line.
4907
                //   UNIX_LINES mode, so 0x0a is the only recognized line ending.
4908
0
                if (fp->fInputIdx >= fActiveLimit) {
4909
                    // At end of input.  Match failed.  Backtrack out.
4910
0
                    fHitEnd = true;
4911
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4912
0
                    break;
4913
0
                }
4914
4915
                // There is input left.  Advance over one char, unless we've hit end-of-line
4916
0
                UChar32 c;
4917
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
4918
0
                if (c == 0x0a) {
4919
                    // End of line in UNIX_LINES mode.   '.' does not match the \n
4920
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4921
0
                }
4922
0
            }
4923
0
            break;
4924
4925
4926
0
        case URX_JMP:
4927
0
            fp->fPatIdx = opValue;
4928
0
            break;
4929
4930
0
        case URX_FAIL:
4931
0
            isMatch = false;
4932
0
            goto breakFromLoop;
4933
4934
0
        case URX_JMP_SAV:
4935
0
            U_ASSERT(opValue < fPattern->fCompiledPat->size());
4936
0
            fp = StateSave(fp, fp->fPatIdx, status);       // State save to loc following current
4937
0
            fp->fPatIdx = opValue;                         // Then JMP.
4938
0
            break;
4939
4940
0
        case URX_JMP_SAV_X:
4941
            // This opcode is used with (x)+, when x can match a zero length string.
4942
            // Same as JMP_SAV, except conditional on the match having made forward progress.
4943
            // Destination of the JMP must be a URX_STO_INP_LOC, from which we get the
4944
            //   data address of the input position at the start of the loop.
4945
0
            {
4946
0
                U_ASSERT(opValue > 0 && opValue < fPattern->fCompiledPat->size());
4947
0
                int32_t  stoOp = (int32_t)pat[opValue-1];
4948
0
                U_ASSERT(URX_TYPE(stoOp) == URX_STO_INP_LOC);
4949
0
                int32_t  frameLoc = URX_VAL(stoOp);
4950
0
                U_ASSERT(frameLoc >= 0 && frameLoc < fFrameSize);
4951
0
                int32_t prevInputIdx = (int32_t)fp->fExtra[frameLoc];
4952
0
                U_ASSERT(prevInputIdx <= fp->fInputIdx);
4953
0
                if (prevInputIdx < fp->fInputIdx) {
4954
                    // The match did make progress.  Repeat the loop.
4955
0
                    fp = StateSave(fp, fp->fPatIdx, status);  // State save to loc following current
4956
0
                    fp->fPatIdx = opValue;
4957
0
                    fp->fExtra[frameLoc] = fp->fInputIdx;
4958
0
                }
4959
                // If the input position did not advance, we do nothing here,
4960
                //   execution will fall out of the loop.
4961
0
            }
4962
0
            break;
4963
4964
0
        case URX_CTR_INIT:
4965
0
            {
4966
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
4967
0
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
4968
4969
                // Pick up the three extra operands that CTR_INIT has, and
4970
                //    skip the pattern location counter past them.
4971
0
                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
4972
0
                fp->fPatIdx += 3;
4973
0
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
4974
0
                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
4975
0
                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
4976
0
                U_ASSERT(minCount>=0);
4977
0
                U_ASSERT(maxCount>=minCount || maxCount==-1);
4978
0
                U_ASSERT(loopLoc>=fp->fPatIdx);
4979
4980
0
                if (minCount == 0) {
4981
0
                    fp = StateSave(fp, loopLoc+1, status);
4982
0
                }
4983
0
                if (maxCount == -1) {
4984
0
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  For loop breaking.
4985
0
                } else if (maxCount == 0) {
4986
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
4987
0
                }
4988
0
            }
4989
0
            break;
4990
4991
0
        case URX_CTR_LOOP:
4992
0
            {
4993
0
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
4994
0
                int32_t initOp = (int32_t)pat[opValue];
4995
0
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT);
4996
0
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
4997
0
                int32_t minCount  = (int32_t)pat[opValue+2];
4998
0
                int32_t maxCount  = (int32_t)pat[opValue+3];
4999
0
                (*pCounter)++;
5000
0
                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5001
0
                    U_ASSERT(*pCounter == maxCount);
5002
0
                    break;
5003
0
                }
5004
0
                if (*pCounter >= minCount) {
5005
0
                    if (maxCount == -1) {
5006
                        // Loop has no hard upper bound.
5007
                        // Check that it is progressing through the input, break if it is not.
5008
0
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5009
0
                        if (fp->fInputIdx == *pLastInputIdx) {
5010
0
                            break;
5011
0
                        } else {
5012
0
                            *pLastInputIdx = fp->fInputIdx;
5013
0
                        }
5014
0
                    }
5015
0
                    fp = StateSave(fp, fp->fPatIdx, status);
5016
0
                } else {
5017
                    // Increment time-out counter. (StateSave() does it if count >= minCount)
5018
0
                    fTickCounter--;
5019
0
                    if (fTickCounter <= 0) {
5020
0
                        IncrementTime(status);    // Re-initializes fTickCounter
5021
0
                    }
5022
0
                }
5023
0
                fp->fPatIdx = opValue + 4;    // Loop back.
5024
0
            }
5025
0
            break;
5026
5027
0
        case URX_CTR_INIT_NG:
5028
0
            {
5029
                // Initialize a non-greedy loop
5030
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize-2);
5031
0
                fp->fExtra[opValue] = 0;                 //  Set the loop counter variable to zero
5032
5033
                // Pick up the three extra operands that CTR_INIT_NG has, and
5034
                //    skip the pattern location counter past them.
5035
0
                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5036
0
                fp->fPatIdx += 3;
5037
0
                int32_t loopLoc  = URX_VAL(pat[instrOperandLoc]);
5038
0
                int32_t minCount = (int32_t)pat[instrOperandLoc+1];
5039
0
                int32_t maxCount = (int32_t)pat[instrOperandLoc+2];
5040
0
                U_ASSERT(minCount>=0);
5041
0
                U_ASSERT(maxCount>=minCount || maxCount==-1);
5042
0
                U_ASSERT(loopLoc>fp->fPatIdx);
5043
0
                if (maxCount == -1) {
5044
0
                    fp->fExtra[opValue+1] = fp->fInputIdx;   //  Save initial input index for loop breaking.
5045
0
                }
5046
5047
0
                if (minCount == 0) {
5048
0
                    if (maxCount != 0) {
5049
0
                        fp = StateSave(fp, fp->fPatIdx, status);
5050
0
                    }
5051
0
                    fp->fPatIdx = loopLoc+1;   // Continue with stuff after repeated block
5052
0
                }
5053
0
            }
5054
0
            break;
5055
5056
0
        case URX_CTR_LOOP_NG:
5057
0
            {
5058
                // Non-greedy {min, max} loops
5059
0
                U_ASSERT(opValue>0 && opValue < fp->fPatIdx-2);
5060
0
                int32_t initOp = (int32_t)pat[opValue];
5061
0
                U_ASSERT(URX_TYPE(initOp) == URX_CTR_INIT_NG);
5062
0
                int64_t *pCounter = &fp->fExtra[URX_VAL(initOp)];
5063
0
                int32_t minCount  = (int32_t)pat[opValue+2];
5064
0
                int32_t maxCount  = (int32_t)pat[opValue+3];
5065
5066
0
                (*pCounter)++;
5067
0
                if ((uint64_t)*pCounter >= (uint32_t)maxCount && maxCount != -1) {
5068
                    // The loop has matched the maximum permitted number of times.
5069
                    //   Break out of here with no action.  Matching will
5070
                    //   continue with the following pattern.
5071
0
                    U_ASSERT(*pCounter == maxCount);
5072
0
                    break;
5073
0
                }
5074
5075
0
                if (*pCounter < minCount) {
5076
                    // We haven't met the minimum number of matches yet.
5077
                    //   Loop back for another one.
5078
0
                    fp->fPatIdx = opValue + 4;    // Loop back.
5079
0
                    fTickCounter--;
5080
0
                    if (fTickCounter <= 0) {
5081
0
                        IncrementTime(status);    // Re-initializes fTickCounter
5082
0
                    }
5083
0
                } else {
5084
                    // We do have the minimum number of matches.
5085
5086
                    // If there is no upper bound on the loop iterations, check that the input index
5087
                    // is progressing, and stop the loop if it is not.
5088
0
                    if (maxCount == -1) {
5089
0
                        int64_t *pLastInputIdx =  &fp->fExtra[URX_VAL(initOp) + 1];
5090
0
                        if (fp->fInputIdx == *pLastInputIdx) {
5091
0
                            break;
5092
0
                        }
5093
0
                        *pLastInputIdx = fp->fInputIdx;
5094
0
                    }
5095
5096
                    // Loop Continuation: we will fall into the pattern following the loop
5097
                    //   (non-greedy, don't execute loop body first), but first do
5098
                    //   a state save to the top of the loop, so that a match failure
5099
                    //   in the following pattern will try another iteration of the loop.
5100
0
                    fp = StateSave(fp, opValue + 4, status);
5101
0
                }
5102
0
            }
5103
0
            break;
5104
5105
0
        case URX_STO_SP:
5106
0
            U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5107
0
            fData[opValue] = fStack->size();
5108
0
            break;
5109
5110
0
        case URX_LD_SP:
5111
0
            {
5112
0
                U_ASSERT(opValue >= 0 && opValue < fPattern->fDataSize);
5113
0
                int32_t newStackSize = (int32_t)fData[opValue];
5114
0
                U_ASSERT(newStackSize <= fStack->size());
5115
0
                int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5116
0
                if (newFP == (int64_t *)fp) {
5117
0
                    break;
5118
0
                }
5119
0
                int32_t j;
5120
0
                for (j=0; j<fFrameSize; j++) {
5121
0
                    newFP[j] = ((int64_t *)fp)[j];
5122
0
                }
5123
0
                fp = (REStackFrame *)newFP;
5124
0
                fStack->setSize(newStackSize);
5125
0
            }
5126
0
            break;
5127
5128
0
        case URX_BACKREF:
5129
0
            {
5130
0
                U_ASSERT(opValue < fFrameSize);
5131
0
                int64_t groupStartIdx = fp->fExtra[opValue];
5132
0
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5133
0
                U_ASSERT(groupStartIdx <= groupEndIdx);
5134
0
                int64_t inputIndex = fp->fInputIdx;
5135
0
                if (groupStartIdx < 0) {
5136
                    // This capture group has not participated in the match thus far,
5137
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5138
0
                    break;
5139
0
                }
5140
0
                UBool success = true;
5141
0
                for (int64_t groupIndex = groupStartIdx; groupIndex < groupEndIdx; ++groupIndex,++inputIndex) {
5142
0
                    if (inputIndex >= fActiveLimit) {
5143
0
                        success = false;
5144
0
                        fHitEnd = true;
5145
0
                        break;
5146
0
                    }
5147
0
                    if (inputBuf[groupIndex] != inputBuf[inputIndex]) {
5148
0
                        success = false;
5149
0
                        break;
5150
0
                    }
5151
0
                }
5152
0
                if (success && groupStartIdx < groupEndIdx && U16_IS_LEAD(inputBuf[groupEndIdx-1]) &&
5153
0
                        inputIndex < fActiveLimit && U16_IS_TRAIL(inputBuf[inputIndex])) {
5154
                    // Capture group ended with an unpaired lead surrogate.
5155
                    // Back reference is not permitted to match only the lead of a surrogate pair.
5156
0
                    success = false;
5157
0
                }
5158
0
                if (success) {
5159
0
                    fp->fInputIdx = inputIndex;
5160
0
                } else {
5161
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5162
0
                }
5163
0
            }
5164
0
            break;
5165
5166
0
        case URX_BACKREF_I:
5167
0
            {
5168
0
                U_ASSERT(opValue < fFrameSize);
5169
0
                int64_t groupStartIdx = fp->fExtra[opValue];
5170
0
                int64_t groupEndIdx   = fp->fExtra[opValue+1];
5171
0
                U_ASSERT(groupStartIdx <= groupEndIdx);
5172
0
                if (groupStartIdx < 0) {
5173
                    // This capture group has not participated in the match thus far,
5174
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no match.
5175
0
                    break;
5176
0
                }
5177
0
                CaseFoldingUCharIterator captureGroupItr(inputBuf, groupStartIdx, groupEndIdx);
5178
0
                CaseFoldingUCharIterator inputItr(inputBuf, fp->fInputIdx, fActiveLimit);
5179
5180
                //   Note: if the capture group match was of an empty string the backref
5181
                //         match succeeds.  Verified by testing:  Perl matches succeed
5182
                //         in this case, so we do too.
5183
5184
0
                UBool success = true;
5185
0
                for (;;) {
5186
0
                    UChar32 captureGroupChar = captureGroupItr.next();
5187
0
                    if (captureGroupChar == U_SENTINEL) {
5188
0
                        success = true;
5189
0
                        break;
5190
0
                    }
5191
0
                    UChar32 inputChar = inputItr.next();
5192
0
                    if (inputChar == U_SENTINEL) {
5193
0
                        success = false;
5194
0
                        fHitEnd = true;
5195
0
                        break;
5196
0
                    }
5197
0
                    if (inputChar != captureGroupChar) {
5198
0
                        success = false;
5199
0
                        break;
5200
0
                    }
5201
0
                }
5202
5203
0
                if (success && inputItr.inExpansion()) {
5204
                    // We obtained a match by consuming part of a string obtained from
5205
                    // case-folding a single code point of the input text.
5206
                    // This does not count as an overall match.
5207
0
                    success = false;
5208
0
                }
5209
5210
0
                if (success) {
5211
0
                    fp->fInputIdx = inputItr.getIndex();
5212
0
                } else {
5213
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5214
0
                }
5215
0
            }
5216
0
            break;
5217
5218
0
        case URX_STO_INP_LOC:
5219
0
            {
5220
0
                U_ASSERT(opValue >= 0 && opValue < fFrameSize);
5221
0
                fp->fExtra[opValue] = fp->fInputIdx;
5222
0
            }
5223
0
            break;
5224
5225
0
        case URX_JMPX:
5226
0
            {
5227
0
                int32_t instrOperandLoc = (int32_t)fp->fPatIdx;
5228
0
                fp->fPatIdx += 1;
5229
0
                int32_t dataLoc  = URX_VAL(pat[instrOperandLoc]);
5230
0
                U_ASSERT(dataLoc >= 0 && dataLoc < fFrameSize);
5231
0
                int32_t savedInputIdx = (int32_t)fp->fExtra[dataLoc];
5232
0
                U_ASSERT(savedInputIdx <= fp->fInputIdx);
5233
0
                if (savedInputIdx < fp->fInputIdx) {
5234
0
                    fp->fPatIdx = opValue;                               // JMP
5235
0
                } else {
5236
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);   // FAIL, no progress in loop.
5237
0
                }
5238
0
            }
5239
0
            break;
5240
5241
0
        case URX_LA_START:
5242
0
            {
5243
                // Entering a look around block.
5244
                // Save Stack Ptr, Input Pos.
5245
0
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5246
0
                fData[opValue]   = fStack->size();
5247
0
                fData[opValue+1] = fp->fInputIdx;
5248
0
                fData[opValue+2] = fActiveStart;
5249
0
                fData[opValue+3] = fActiveLimit;
5250
0
                fActiveStart     = fLookStart;          // Set the match region change for
5251
0
                fActiveLimit     = fLookLimit;          //   transparent bounds.
5252
0
            }
5253
0
            break;
5254
5255
0
        case URX_LA_END:
5256
0
            {
5257
                // Leaving a look around block.
5258
                //  restore Stack Ptr, Input Pos to positions they had on entry to block.
5259
0
                U_ASSERT(opValue>=0 && opValue+3<fPattern->fDataSize);
5260
0
                int32_t stackSize = fStack->size();
5261
0
                int32_t newStackSize = (int32_t)fData[opValue];
5262
0
                U_ASSERT(stackSize >= newStackSize);
5263
0
                if (stackSize > newStackSize) {
5264
                    // Copy the current top frame back to the new (cut back) top frame.
5265
                    //   This makes the capture groups from within the look-ahead
5266
                    //   expression available.
5267
0
                    int64_t *newFP = fStack->getBuffer() + newStackSize - fFrameSize;
5268
0
                    int32_t j;
5269
0
                    for (j=0; j<fFrameSize; j++) {
5270
0
                        newFP[j] = ((int64_t *)fp)[j];
5271
0
                    }
5272
0
                    fp = (REStackFrame *)newFP;
5273
0
                    fStack->setSize(newStackSize);
5274
0
                }
5275
0
                fp->fInputIdx = fData[opValue+1];
5276
5277
                // Restore the active region bounds in the input string; they may have
5278
                //    been changed because of transparent bounds on a Region.
5279
0
                fActiveStart = fData[opValue+2];
5280
0
                fActiveLimit = fData[opValue+3];
5281
0
                U_ASSERT(fActiveStart >= 0);
5282
0
                U_ASSERT(fActiveLimit <= fInputLength);
5283
0
            }
5284
0
            break;
5285
5286
0
        case URX_ONECHAR_I:
5287
0
            if (fp->fInputIdx < fActiveLimit) {
5288
0
                UChar32 c;
5289
0
                U16_NEXT(inputBuf, fp->fInputIdx, fActiveLimit, c);
5290
0
                if (u_foldCase(c, U_FOLD_CASE_DEFAULT) == opValue) {
5291
0
                    break;
5292
0
                }
5293
0
            } else {
5294
0
                fHitEnd = true;
5295
0
            }
5296
0
            fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5297
0
            break;
5298
5299
0
        case URX_STRING_I:
5300
            // Case-insensitive test input against a literal string.
5301
            // Strings require two slots in the compiled pattern, one for the
5302
            //   offset to the string text, and one for the length.
5303
            //   The compiled string has already been case folded.
5304
0
            {
5305
0
                const UChar *patternString = litText + opValue;
5306
5307
0
                op      = (int32_t)pat[fp->fPatIdx];
5308
0
                fp->fPatIdx++;
5309
0
                opType  = URX_TYPE(op);
5310
0
                opValue = URX_VAL(op);
5311
0
                U_ASSERT(opType == URX_STRING_LEN);
5312
0
                int32_t patternStringLen = opValue;  // Length of the string from the pattern.
5313
5314
0
                UChar32      cText;
5315
0
                UChar32      cPattern;
5316
0
                UBool        success = true;
5317
0
                int32_t      patternStringIdx  = 0;
5318
0
                CaseFoldingUCharIterator inputIterator(inputBuf, fp->fInputIdx, fActiveLimit);
5319
0
                while (patternStringIdx < patternStringLen) {
5320
0
                    U16_NEXT(patternString, patternStringIdx, patternStringLen, cPattern);
5321
0
                    cText = inputIterator.next();
5322
0
                    if (cText != cPattern) {
5323
0
                        success = false;
5324
0
                        if (cText == U_SENTINEL) {
5325
0
                            fHitEnd = true;
5326
0
                        }
5327
0
                        break;
5328
0
                    }
5329
0
                }
5330
0
                if (inputIterator.inExpansion()) {
5331
0
                    success = false;
5332
0
                }
5333
5334
0
                if (success) {
5335
0
                    fp->fInputIdx = inputIterator.getIndex();
5336
0
                } else {
5337
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5338
0
                }
5339
0
            }
5340
0
            break;
5341
5342
0
        case URX_LB_START:
5343
0
            {
5344
                // Entering a look-behind block.
5345
                // Save Stack Ptr, Input Pos and active input region.
5346
                //   TODO:  implement transparent bounds.  Ticket #6067
5347
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5348
0
                fData[opValue]   = fStack->size();
5349
0
                fData[opValue+1] = fp->fInputIdx;
5350
                // Save input string length, then reset to pin any matches to end at
5351
                //   the current position.
5352
0
                fData[opValue+2] = fActiveStart;
5353
0
                fData[opValue+3] = fActiveLimit;
5354
0
                fActiveStart     = fRegionStart;
5355
0
                fActiveLimit     = fp->fInputIdx;
5356
                // Init the variable containing the start index for attempted matches.
5357
0
                fData[opValue+4] = -1;
5358
0
            }
5359
0
            break;
5360
5361
5362
0
        case URX_LB_CONT:
5363
0
            {
5364
                // Positive Look-Behind, at top of loop checking for matches of LB expression
5365
                //    at all possible input starting positions.
5366
5367
                // Fetch the min and max possible match lengths.  They are the operands
5368
                //   of this op in the pattern.
5369
0
                int32_t minML = (int32_t)pat[fp->fPatIdx++];
5370
0
                int32_t maxML = (int32_t)pat[fp->fPatIdx++];
5371
0
                U_ASSERT(minML <= maxML);
5372
0
                U_ASSERT(minML >= 0);
5373
5374
                // Fetch (from data) the last input index where a match was attempted.
5375
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5376
0
                int64_t  &lbStartIdx = fData[opValue+4];
5377
0
                if (lbStartIdx < 0) {
5378
                    // First time through loop.
5379
0
                    lbStartIdx = fp->fInputIdx - minML;
5380
0
                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5381
0
                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5382
0
                    }
5383
0
                } else {
5384
                    // 2nd through nth time through the loop.
5385
                    // Back up start position for match by one.
5386
0
                    if (lbStartIdx == 0) {
5387
0
                        lbStartIdx--;
5388
0
                    } else {
5389
0
                        U16_BACK_1(inputBuf, 0, lbStartIdx);
5390
0
                    }
5391
0
                }
5392
5393
0
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5394
                    // We have tried all potential match starting points without
5395
                    //  getting a match.  Backtrack out, and out of the
5396
                    //   Look Behind altogether.
5397
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5398
0
                    fActiveStart = fData[opValue+2];
5399
0
                    fActiveLimit = fData[opValue+3];
5400
0
                    U_ASSERT(fActiveStart >= 0);
5401
0
                    U_ASSERT(fActiveLimit <= fInputLength);
5402
0
                    break;
5403
0
                }
5404
5405
                //    Save state to this URX_LB_CONT op, so failure to match will repeat the loop.
5406
                //      (successful match will fall off the end of the loop.)
5407
0
                fp = StateSave(fp, fp->fPatIdx-3, status);
5408
0
                fp->fInputIdx =  lbStartIdx;
5409
0
            }
5410
0
            break;
5411
5412
0
        case URX_LB_END:
5413
            // End of a look-behind block, after a successful match.
5414
0
            {
5415
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5416
0
                if (fp->fInputIdx != fActiveLimit) {
5417
                    //  The look-behind expression matched, but the match did not
5418
                    //    extend all the way to the point that we are looking behind from.
5419
                    //  FAIL out of here, which will take us back to the LB_CONT, which
5420
                    //     will retry the match starting at another position or fail
5421
                    //     the look-behind altogether, whichever is appropriate.
5422
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5423
0
                    break;
5424
0
                }
5425
5426
                // Look-behind match is good.  Restore the original input string region,
5427
                //   which had been truncated to pin the end of the lookbehind match to the
5428
                //   position being looked-behind.
5429
0
                fActiveStart = fData[opValue+2];
5430
0
                fActiveLimit = fData[opValue+3];
5431
0
                U_ASSERT(fActiveStart >= 0);
5432
0
                U_ASSERT(fActiveLimit <= fInputLength);
5433
0
            }
5434
0
            break;
5435
5436
5437
0
        case URX_LBN_CONT:
5438
0
            {
5439
                // Negative Look-Behind, at top of loop checking for matches of LB expression
5440
                //    at all possible input starting positions.
5441
5442
                // Fetch the extra parameters of this op.
5443
0
                int32_t minML       = (int32_t)pat[fp->fPatIdx++];
5444
0
                int32_t maxML       = (int32_t)pat[fp->fPatIdx++];
5445
0
                int32_t continueLoc = (int32_t)pat[fp->fPatIdx++];
5446
0
                continueLoc = URX_VAL(continueLoc);
5447
0
                U_ASSERT(minML <= maxML);
5448
0
                U_ASSERT(minML >= 0);
5449
0
                U_ASSERT(continueLoc > fp->fPatIdx);
5450
5451
                // Fetch (from data) the last input index where a match was attempted.
5452
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5453
0
                int64_t  &lbStartIdx = fData[opValue+4];
5454
0
                if (lbStartIdx < 0) {
5455
                    // First time through loop.
5456
0
                    lbStartIdx = fp->fInputIdx - minML;
5457
0
                    if (lbStartIdx > 0 && lbStartIdx < fInputLength) {
5458
0
                        U16_SET_CP_START(inputBuf, 0, lbStartIdx);
5459
0
                    }
5460
0
                } else {
5461
                    // 2nd through nth time through the loop.
5462
                    // Back up start position for match by one.
5463
0
                    if (lbStartIdx == 0) {
5464
0
                        lbStartIdx--;   // Because U16_BACK is unsafe starting at 0.
5465
0
                    } else {
5466
0
                        U16_BACK_1(inputBuf, 0, lbStartIdx);
5467
0
                    }
5468
0
                }
5469
5470
0
                if (lbStartIdx < 0 || lbStartIdx < fp->fInputIdx - maxML) {
5471
                    // We have tried all potential match starting points without
5472
                    //  getting a match, which means that the negative lookbehind as
5473
                    //  a whole has succeeded.  Jump forward to the continue location
5474
0
                    fActiveStart = fData[opValue+2];
5475
0
                    fActiveLimit = fData[opValue+3];
5476
0
                    U_ASSERT(fActiveStart >= 0);
5477
0
                    U_ASSERT(fActiveLimit <= fInputLength);
5478
0
                    fp->fPatIdx = continueLoc;
5479
0
                    break;
5480
0
                }
5481
5482
                //    Save state to this URX_LBN_CONT op, so failure to match will repeat the loop.
5483
                //      (successful match will cause a FAIL out of the loop altogether.)
5484
0
                fp = StateSave(fp, fp->fPatIdx-4, status);
5485
0
                fp->fInputIdx =  lbStartIdx;
5486
0
            }
5487
0
            break;
5488
5489
0
        case URX_LBN_END:
5490
            // End of a negative look-behind block, after a successful match.
5491
0
            {
5492
0
                U_ASSERT(opValue>=0 && opValue+4<fPattern->fDataSize);
5493
0
                if (fp->fInputIdx != fActiveLimit) {
5494
                    //  The look-behind expression matched, but the match did not
5495
                    //    extend all the way to the point that we are looking behind from.
5496
                    //  FAIL out of here, which will take us back to the LB_CONT, which
5497
                    //     will retry the match starting at another position or succeed
5498
                    //     the look-behind altogether, whichever is appropriate.
5499
0
                    fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5500
0
                    break;
5501
0
                }
5502
5503
                // Look-behind expression matched, which means look-behind test as
5504
                //   a whole Fails
5505
5506
                //   Restore the original input string length, which had been truncated
5507
                //   in order to pin the end of the lookbehind match
5508
                //   to the position being looked-behind.
5509
0
                fActiveStart = fData[opValue+2];
5510
0
                fActiveLimit = fData[opValue+3];
5511
0
                U_ASSERT(fActiveStart >= 0);
5512
0
                U_ASSERT(fActiveLimit <= fInputLength);
5513
5514
                // Restore original stack position, discarding any state saved
5515
                //   by the successful pattern match.
5516
0
                U_ASSERT(opValue>=0 && opValue+1<fPattern->fDataSize);
5517
0
                int32_t newStackSize = (int32_t)fData[opValue];
5518
0
                U_ASSERT(fStack->size() > newStackSize);
5519
0
                fStack->setSize(newStackSize);
5520
5521
                //  FAIL, which will take control back to someplace
5522
                //  prior to entering the look-behind test.
5523
0
                fp = (REStackFrame *)fStack->popFrame(fFrameSize);
5524
0
            }
5525
0
            break;
5526
5527
5528
0
        case URX_LOOP_SR_I:
5529
            // Loop Initialization for the optimized implementation of
5530
            //     [some character set]*
5531
            //   This op scans through all matching input.
5532
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5533
0
            {
5534
0
                U_ASSERT(opValue > 0 && opValue < fSets->size());
5535
0
                Regex8BitSet *s8 = &fPattern->fSets8[opValue];
5536
0
                UnicodeSet   *s  = (UnicodeSet *)fSets->elementAt(opValue);
5537
5538
                // Loop through input, until either the input is exhausted or
5539
                //   we reach a character that is not a member of the set.
5540
0
                int32_t ix = (int32_t)fp->fInputIdx;
5541
0
                for (;;) {
5542
0
                    if (ix >= fActiveLimit) {
5543
0
                        fHitEnd = true;
5544
0
                        break;
5545
0
                    }
5546
0
                    UChar32   c;
5547
0
                    U16_NEXT(inputBuf, ix, fActiveLimit, c);
5548
0
                    if (c<256) {
5549
0
                        if (s8->contains(c) == false) {
5550
0
                            U16_BACK_1(inputBuf, 0, ix);
5551
0
                            break;
5552
0
                        }
5553
0
                    } else {
5554
0
                        if (s->contains(c) == false) {
5555
0
                            U16_BACK_1(inputBuf, 0, ix);
5556
0
                            break;
5557
0
                        }
5558
0
                    }
5559
0
                }
5560
5561
                // If there were no matching characters, skip over the loop altogether.
5562
                //   The loop doesn't run at all, a * op always succeeds.
5563
0
                if (ix == fp->fInputIdx) {
5564
0
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5565
0
                    break;
5566
0
                }
5567
5568
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5569
                //   must follow.  Its operand is the stack location
5570
                //   that holds the starting input index for the match of this [set]*
5571
0
                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5572
0
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5573
0
                int32_t stackLoc = URX_VAL(loopcOp);
5574
0
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5575
0
                fp->fExtra[stackLoc] = fp->fInputIdx;
5576
0
                fp->fInputIdx = ix;
5577
5578
                // Save State to the URX_LOOP_C op that follows this one,
5579
                //   so that match failures in the following code will return to there.
5580
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5581
0
                fp = StateSave(fp, fp->fPatIdx, status);
5582
0
                fp->fPatIdx++;
5583
0
            }
5584
0
            break;
5585
5586
5587
0
        case URX_LOOP_DOT_I:
5588
            // Loop Initialization for the optimized implementation of .*
5589
            //   This op scans through all remaining input.
5590
            //   The following LOOP_C op emulates stack unwinding if the following pattern fails.
5591
0
            {
5592
                // Loop through input until the input is exhausted (we reach an end-of-line)
5593
                // In DOTALL mode, we can just go straight to the end of the input.
5594
0
                int32_t ix;
5595
0
                if ((opValue & 1) == 1) {
5596
                    // Dot-matches-All mode.  Jump straight to the end of the string.
5597
0
                    ix = (int32_t)fActiveLimit;
5598
0
                    fHitEnd = true;
5599
0
                } else {
5600
                    // NOT DOT ALL mode.  Line endings do not match '.'
5601
                    // Scan forward until a line ending or end of input.
5602
0
                    ix = (int32_t)fp->fInputIdx;
5603
0
                    for (;;) {
5604
0
                        if (ix >= fActiveLimit) {
5605
0
                            fHitEnd = true;
5606
0
                            break;
5607
0
                        }
5608
0
                        UChar32   c;
5609
0
                        U16_NEXT(inputBuf, ix, fActiveLimit, c);   // c = inputBuf[ix++]
5610
0
                        if ((c & 0x7f) <= 0x29) {          // Fast filter of non-new-line-s
5611
0
                            if ((c == 0x0a) ||             //  0x0a is newline in both modes.
5612
0
                                (((opValue & 2) == 0) &&    // IF not UNIX_LINES mode
5613
0
                                   isLineTerminator(c))) {
5614
                                //  char is a line ending.  Put the input pos back to the
5615
                                //    line ending char, and exit the scanning loop.
5616
0
                                U16_BACK_1(inputBuf, 0, ix);
5617
0
                                break;
5618
0
                            }
5619
0
                        }
5620
0
                    }
5621
0
                }
5622
5623
                // If there were no matching characters, skip over the loop altogether.
5624
                //   The loop doesn't run at all, a * op always succeeds.
5625
0
                if (ix == fp->fInputIdx) {
5626
0
                    fp->fPatIdx++;   // skip the URX_LOOP_C op.
5627
0
                    break;
5628
0
                }
5629
5630
                // Peek ahead in the compiled pattern, to the URX_LOOP_C that
5631
                //   must follow.  Its operand is the stack location
5632
                //   that holds the starting input index for the match of this .*
5633
0
                int32_t loopcOp = (int32_t)pat[fp->fPatIdx];
5634
0
                U_ASSERT(URX_TYPE(loopcOp) == URX_LOOP_C);
5635
0
                int32_t stackLoc = URX_VAL(loopcOp);
5636
0
                U_ASSERT(stackLoc >= 0 && stackLoc < fFrameSize);
5637
0
                fp->fExtra[stackLoc] = fp->fInputIdx;
5638
0
                fp->fInputIdx = ix;
5639
5640
                // Save State to the URX_LOOP_C op that follows this one,
5641
                //   so that match failures in the following code will return to there.
5642
                //   Then bump the pattern idx so the LOOP_C is skipped on the way out of here.
5643
0
                fp = StateSave(fp, fp->fPatIdx, status);
5644
0
                fp->fPatIdx++;
5645
0
            }
5646
0
            break;
5647
5648
5649
0
        case URX_LOOP_C:
5650
0
            {
5651
0
                U_ASSERT(opValue>=0 && opValue<fFrameSize);
5652
0
                backSearchIndex = (int32_t)fp->fExtra[opValue];
5653
0
                U_ASSERT(backSearchIndex <= fp->fInputIdx);
5654
0
                if (backSearchIndex == fp->fInputIdx) {
5655
                    // We've backed up the input idx to the point that the loop started.
5656
                    // The loop is done.  Leave here without saving state.
5657
                    //  Subsequent failures won't come back here.
5658
0
                    break;
5659
0
                }
5660
                // Set up for the next iteration of the loop, with input index
5661
                //   backed up by one from the last time through,
5662
                //   and a state save to this instruction in case the following code fails again.
5663
                //   (We're going backwards because this loop emulates stack unwinding, not
5664
                //    the initial scan forward.)
5665
0
                U_ASSERT(fp->fInputIdx > 0);
5666
0
                UChar32 prevC;
5667
0
                U16_PREV(inputBuf, 0, fp->fInputIdx, prevC); // !!!: should this 0 be one of f*Limit?
5668
5669
0
                if (prevC == 0x0a &&
5670
0
                    fp->fInputIdx > backSearchIndex &&
5671
0
                    inputBuf[fp->fInputIdx-1] == 0x0d) {
5672
0
                    int32_t prevOp = (int32_t)pat[fp->fPatIdx-2];
5673
0
                    if (URX_TYPE(prevOp) == URX_LOOP_DOT_I) {
5674
                        // .*, stepping back over CRLF pair.
5675
0
                        U16_BACK_1(inputBuf, 0, fp->fInputIdx);
5676
0
                    }
5677
0
                }
5678
5679
5680
0
                fp = StateSave(fp, fp->fPatIdx-1, status);
5681
0
            }
5682
0
            break;
5683
5684
5685
5686
0
        default:
5687
            // Trouble.  The compiled pattern contains an entry with an
5688
            //           unrecognized type tag.
5689
0
            UPRV_UNREACHABLE_ASSERT;
5690
            // Unknown opcode type in opType = URX_TYPE(pat[fp->fPatIdx]). But we have
5691
            // reports of this in production code, don't use UPRV_UNREACHABLE_EXIT.
5692
            // See ICU-21669.
5693
0
            status = U_INTERNAL_PROGRAM_ERROR;
5694
0
        }
5695
5696
0
        if (U_FAILURE(status)) {
5697
0
            isMatch = false;
5698
0
            break;
5699
0
        }
5700
0
    }
5701
5702
0
breakFromLoop:
5703
0
    fMatch = isMatch;
5704
0
    if (isMatch) {
5705
0
        fLastMatchEnd = fMatchEnd;
5706
0
        fMatchStart   = startIdx;
5707
0
        fMatchEnd     = fp->fInputIdx;
5708
0
    }
5709
5710
#ifdef REGEX_RUN_DEBUG
5711
    if (fTraceDebug) {
5712
        if (isMatch) {
5713
            printf("Match.  start=%ld   end=%ld\n\n", fMatchStart, fMatchEnd);
5714
        } else {
5715
            printf("No match\n\n");
5716
        }
5717
    }
5718
#endif
5719
5720
0
    fFrame = fp;                // The active stack frame when the engine stopped.
5721
                                //   Contains the capture group results that we need to
5722
                                //    access later.
5723
5724
0
    return;
5725
0
}
5726
5727
5728
// Expands ICU's RTTI boilerplate for class RegexMatcher.
// NOTE(review): the macro is defined outside this file; presumably it supplies
// the getStaticClassID()/getDynamicClassID() definitions -- confirm against the
// macro definition in ICU's uobject.h.
UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexMatcher)
5729
5730
U_NAMESPACE_END
5731
5732
#endif  // !UCONFIG_NO_REGULAR_EXPRESSIONS
5733