Coverage Report

Created: 2025-06-24 06:54

/src/icu/icu4c/source/i18n/unicode/regex.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  regex.h
9
*   encoding:   UTF-8
10
*   indentation:4
11
*
12
*   created on: 2002oct22
13
*   created by: Andy Heninger
14
*
15
*   ICU Regular Expressions, API for C++
16
*/
17
18
#ifndef REGEX_H
19
#define REGEX_H
20
21
//#define REGEX_DEBUG
22
23
/**
24
 * \file
25
 * \brief C++ API: Regular Expressions
26
 *
27
 * The ICU API for processing regular expressions consists of two classes,
28
 *  `RegexPattern` and `RegexMatcher`.
29
 *  `RegexPattern` objects represent a pre-processed, or compiled
30
 *  regular expression.  They are created from a regular expression pattern string,
31
 *  and can be used to create `RegexMatcher` objects for the pattern.
32
 *
33
 * Class `RegexMatcher` bundles together a regular expression
34
 *  pattern and a target string to which the search pattern will be applied.
35
 *  `RegexMatcher` includes API for doing plain find or search
36
 *  operations, for search and replace operations, and for obtaining detailed
37
 *  information about bounds of a match.
38
 *
39
 * Note that by constructing `RegexMatcher` objects directly from regular
40
 * expression pattern strings application code can be simplified and the explicit
41
 * need for `RegexPattern` objects can usually be eliminated.
42
 *
43
 */
44
45
#include "unicode/utypes.h"
46
47
#if U_SHOW_CPLUSPLUS_API
48
49
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
50
51
#include "unicode/uobject.h"
52
#include "unicode/unistr.h"
53
#include "unicode/utext.h"
54
#include "unicode/parseerr.h"
55
56
#include "unicode/uregex.h"
57
58
// Forward Declarations
59
60
struct UHashtable;
61
62
U_NAMESPACE_BEGIN
63
64
struct Regex8BitSet;
65
class  RegexCImpl;
66
class  RegexMatcher;
67
class  RegexPattern;
68
struct REStackFrame;
69
class  BreakIterator;
70
class  UnicodeSet;
71
class  UVector;
72
class  UVector32;
73
class  UVector64;
74
75
76
/**
77
  * Class `RegexPattern` represents a compiled regular expression.  It includes
78
  * factory methods for creating a RegexPattern object from the source (string) form
79
  * of a regular expression, methods for creating RegexMatchers that allow the pattern
80
  * to be applied to input text, and a few convenience methods for simple common
81
  * uses of regular expressions.
82
  *
83
  * Class RegexPattern is not intended to be subclassed.
84
  *
85
  * @stable ICU 2.4
86
  */
87
class U_I18N_API RegexPattern final : public UObject {
88
public:
89
90
    /**
91
     * default constructor.  Create a RegexPattern object that refers to no actual
92
     *   pattern.  Not normally needed; RegexPattern objects are usually
93
     *   created using the factory method `compile()`.
94
     *
95
     * @stable ICU 2.4
96
     */
97
    RegexPattern();
98
99
    /**
100
     * Copy Constructor.  Create a new RegexPattern object that is equivalent
101
     *                    to the source object.
102
     * @param source the pattern object to be copied.
103
     * @stable ICU 2.4
104
     */
105
    RegexPattern(const RegexPattern &source);
106
107
    /**
108
     * Destructor.  Note that a RegexPattern object must persist so long as any
109
     *  RegexMatcher objects that were created from the RegexPattern are active.
110
     * @stable ICU 2.4
111
     */
112
    virtual ~RegexPattern();
113
114
    /**
115
     * Comparison operator.  Two RegexPattern objects are considered equal if they
116
     * were constructed from identical source patterns using the same #URegexpFlag
117
     * settings.
118
     * @param that a RegexPattern object to compare with "this".
119
     * @return true if the objects are equivalent.
120
     * @stable ICU 2.4
121
     */
122
    bool            operator==(const RegexPattern& that) const;
123
124
    /**
125
     * Comparison operator.  Two RegexPattern objects are considered equal if they
126
     * were constructed from identical source patterns using the same #URegexpFlag
127
     * settings.
128
     * @param that a RegexPattern object to compare with "this".
129
     * @return true if the objects are different.
130
     * @stable ICU 2.4
131
     */
132
0
    // Inequality is the exact logical negation of operator==().
    inline bool     operator!=(const RegexPattern& that) const {
        const bool isSame = this->operator==(that);
        return !isSame;
    }
133
134
    /**
135
     * Assignment operator.  After assignment, this RegexPattern will behave identically
136
     *     to the source object.
137
     * @stable ICU 2.4
138
     */
139
    RegexPattern  &operator =(const RegexPattern &source);
140
141
    /**
142
     * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
143
     * intended to be subclassed, <code>clone()</code> and the copy construction are
144
     * equivalent operations.
145
     * @return the copy of this RegexPattern
146
     * @stable ICU 2.4
147
     */
148
    RegexPattern  *clone() const;
149
150
151
   /**
152
    * Compiles the regular expression in string form into a RegexPattern
153
    * object.  These compile methods, rather than the constructors, are the usual
154
    * way that RegexPattern objects are created.
155
    *
156
    * Note that RegexPattern objects must not be deleted while RegexMatcher
157
    * objects created from the pattern are active.  RegexMatchers keep a pointer
158
    * back to their pattern, so premature deletion of the pattern is a
159
    * catastrophic error.
160
    *
161
    * All #URegexpFlag pattern match mode flags are set to their default values.
162
    *
163
    * Note that it is often more convenient to construct a RegexMatcher directly
164
    *    from a pattern string rather than separately compiling the pattern and
165
    *    then creating a RegexMatcher object from the pattern.
166
    *
167
    * @param regex The regular expression to be compiled.
168
    * @param pe    Receives the position (line and column numbers) of any error
169
    *              within the regular expression.
170
    * @param status A reference to a UErrorCode to receive any errors.
171
    * @return      A regexPattern object for the compiled pattern.
172
    *
173
    * @stable ICU 2.4
174
    */
175
    static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
176
        UParseError          &pe,
177
        UErrorCode           &status);
178
179
   /**
180
    * Compiles the regular expression in string form into a RegexPattern
181
    * object.  These compile methods, rather than the constructors, are the usual
182
    * way that RegexPattern objects are created.
183
    *
184
    * Note that RegexPattern objects must not be deleted while RegexMatcher
185
    * objects created from the pattern are active.  RegexMatchers keep a pointer
186
    * back to their pattern, so premature deletion of the pattern is a
187
    * catastrophic error.
188
    *
189
    * All #URegexpFlag pattern match mode flags are set to their default values.
190
    *
191
    * Note that it is often more convenient to construct a RegexMatcher directly
192
    *    from a pattern string rather than separately compiling the pattern and
193
    *    then creating a RegexMatcher object from the pattern.
194
    *
195
    * @param regex The regular expression to be compiled. Note, the text referred
196
    *              to by this UText must not be deleted during the lifetime of the
197
    *              RegexPattern object or any RegexMatcher object created from it.
198
    * @param pe    Receives the position (line and column numbers) of any error
199
    *              within the regular expression.
200
    * @param status A reference to a UErrorCode to receive any errors.
201
    * @return      A regexPattern object for the compiled pattern.
202
    *
203
    * @stable ICU 4.6
204
    */
205
    static RegexPattern * U_EXPORT2 compile( UText *regex,
206
        UParseError          &pe,
207
        UErrorCode           &status);
208
209
   /**
210
    * Compiles the regular expression in string form into a RegexPattern
211
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
212
    * rather than the constructors, are the usual way that RegexPattern objects
213
    * are created.
214
    *
215
    * Note that RegexPattern objects must not be deleted while RegexMatcher
216
    * objects created from the pattern are active.  RegexMatchers keep a pointer
217
    * back to their pattern, so premature deletion of the pattern is a
218
    * catastrophic error.
219
    *
220
    * Note that it is often more convenient to construct a RegexMatcher directly
221
    *    from a pattern string rather than separately compiling the pattern and
222
    *    then creating a RegexMatcher object from the pattern.
223
    *
224
    * @param regex The regular expression to be compiled.
225
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
226
    * @param pe    Receives the position (line and column numbers) of any error
227
    *              within the regular expression.
228
    * @param status   A reference to a UErrorCode to receive any errors.
229
    * @return      A regexPattern object for the compiled pattern.
230
    *
231
    * @stable ICU 2.4
232
    */
233
    static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
234
        uint32_t             flags,
235
        UParseError          &pe,
236
        UErrorCode           &status);
237
238
   /**
239
    * Compiles the regular expression in string form into a RegexPattern
240
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
241
    * rather than the constructors, are the usual way that RegexPattern objects
242
    * are created.
243
    *
244
    * Note that RegexPattern objects must not be deleted while RegexMatcher
245
    * objects created from the pattern are active.  RegexMatchers keep a pointer
246
    * back to their pattern, so premature deletion of the pattern is a
247
    * catastrophic error.
248
    *
249
    * Note that it is often more convenient to construct a RegexMatcher directly
250
    *    from a pattern string rather than separately compiling the pattern and
251
    *    then creating a RegexMatcher object from the pattern.
252
    *
253
    * @param regex The regular expression to be compiled. Note, the text referred
254
    *              to by this UText must not be deleted during the lifetime of the
255
    *              RegexPattern object or any RegexMatcher object created from it.
256
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
257
    * @param pe    Receives the position (line and column numbers) of any error
258
    *              within the regular expression.
259
    * @param status   A reference to a UErrorCode to receive any errors.
260
    * @return      A regexPattern object for the compiled pattern.
261
    *
262
    * @stable ICU 4.6
263
    */
264
    static RegexPattern * U_EXPORT2 compile( UText *regex,
265
        uint32_t             flags,
266
        UParseError          &pe,
267
        UErrorCode           &status);
268
269
   /**
270
    * Compiles the regular expression in string form into a RegexPattern
271
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
272
    * rather than the constructors, are the usual way that RegexPattern objects
273
    * are created.
274
    *
275
    * Note that RegexPattern objects must not be deleted while RegexMatcher
276
    * objects created from the pattern are active.  RegexMatchers keep a pointer
277
    * back to their pattern, so premature deletion of the pattern is a
278
    * catastrophic error.
279
    *
280
    * Note that it is often more convenient to construct a RegexMatcher directly
281
    *    from a pattern string rather than separately compiling the pattern and
282
    *    then creating a RegexMatcher object from the pattern.
283
    *
284
    * @param regex The regular expression to be compiled.
285
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
286
    * @param status   A reference to a UErrorCode to receive any errors.
287
    * @return      A regexPattern object for the compiled pattern.
288
    *
289
    * @stable ICU 2.6
290
    */
291
    static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
292
        uint32_t             flags,
293
        UErrorCode           &status);
294
295
   /**
296
    * Compiles the regular expression in string form into a RegexPattern
297
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
298
    * rather than the constructors, are the usual way that RegexPattern objects
299
    * are created.
300
    *
301
    * Note that RegexPattern objects must not be deleted while RegexMatcher
302
    * objects created from the pattern are active.  RegexMatchers keep a pointer
303
    * back to their pattern, so premature deletion of the pattern is a
304
    * catastrophic error.
305
    *
306
    * Note that it is often more convenient to construct a RegexMatcher directly
307
    *    from a pattern string rather than separately compiling the pattern and
308
    *    then creating a RegexMatcher object from the pattern.
309
    *
310
    * @param regex The regular expression to be compiled. Note, the text referred
311
    *              to by this UText must not be deleted during the lifetime of the
312
    *              RegexPattern object or any RegexMatcher object created from it.
313
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
314
    * @param status   A reference to a UErrorCode to receive any errors.
315
    * @return      A regexPattern object for the compiled pattern.
316
    *
317
    * @stable ICU 4.6
318
    */
319
    static RegexPattern * U_EXPORT2 compile( UText *regex,
320
        uint32_t             flags,
321
        UErrorCode           &status);
322
323
   /**
324
    * Get the #URegexpFlag match mode flags that were used when compiling this pattern.
325
    * @return  the #URegexpFlag match mode flags
326
    * @stable ICU 2.4
327
    */
328
    uint32_t flags() const;
329
330
   /**
331
    * Creates a RegexMatcher that will match the given input against this pattern.  The
332
    * RegexMatcher can then be used to perform match, find or replace operations
333
    * on the input.  Note that a RegexPattern object must not be deleted while
334
    * RegexMatchers created from it still exist and might possibly be used again.
335
    *
336
    * The matcher will retain a reference to the supplied input string, and all regexp
337
    * pattern matching operations happen directly on this original string.  It is
338
    * critical that the string not be altered or deleted before use by the regular
339
    * expression operations is complete.
340
    *
341
    * @param input    The input string to which the regular expression will be applied.
342
    * @param status   A reference to a UErrorCode to receive any errors.
343
    * @return         A RegexMatcher object for this pattern and input.
344
    *
345
    * @stable ICU 2.4
346
    */
347
    RegexMatcher *matcher(const UnicodeString &input,
348
        UErrorCode          &status) const;
349
        
350
private:
351
    /**
352
     * Cause a compilation error if an application accidentally attempts to
353
     *   create a matcher with a (char16_t *) string as input rather than
354
     *   a UnicodeString.  Avoids a dangling reference to a temporary string.
355
     *
356
     * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
357
     * using one of the aliasing constructors, such as
358
     * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
359
     * or in a UText, using
360
     * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
361
     *
362
     */
363
    RegexMatcher *matcher(const char16_t *input,
364
        UErrorCode          &status) const = delete;
365
public:
366
367
368
   /**
369
    * Creates a RegexMatcher that will match against this pattern.  The
370
    * RegexMatcher can be used to perform match, find or replace operations.
371
    * Note that a RegexPattern object must not be deleted while
372
    * RegexMatchers created from it still exist and might possibly be used again.
373
    *
374
    * @param status   A reference to a UErrorCode to receive any errors.
375
    * @return      A RegexMatcher object for this pattern.
376
    *
377
    * @stable ICU 2.6
378
    */
379
    RegexMatcher *matcher(UErrorCode  &status) const;
380
381
382
   /**
383
    * Test whether a string matches a regular expression.  This convenience function
384
    * both compiles the regular expression and applies it in a single operation.
385
    * Note that if the same pattern needs to be applied repeatedly, this method will be
386
    * less efficient than creating and reusing a RegexMatcher object.
387
    *
388
    * @param regex The regular expression
389
    * @param input The string data to be matched
390
    * @param pe Receives the position of any syntax errors within the regular expression
391
    * @param status A reference to a UErrorCode to receive any errors.
392
    * @return True if the regular expression exactly matches the full input string.
393
    *
394
    * @stable ICU 2.4
395
    */
396
    static UBool U_EXPORT2 matches(const UnicodeString   &regex,
397
        const UnicodeString   &input,
398
              UParseError     &pe,
399
              UErrorCode      &status);
400
401
   /**
402
    * Test whether a string matches a regular expression.  This convenience function
403
    * both compiles the regular expression and applies it in a single operation.
404
    * Note that if the same pattern needs to be applied repeatedly, this method will be
405
    * less efficient than creating and reusing a RegexMatcher object.
406
    *
407
    * @param regex The regular expression
408
    * @param input The string data to be matched
409
    * @param pe Receives the position of any syntax errors within the regular expression
410
    * @param status A reference to a UErrorCode to receive any errors.
411
    * @return True if the regular expression exactly matches the full input string.
412
    *
413
    * @stable ICU 4.6
414
    */
415
    static UBool U_EXPORT2 matches(UText *regex,
416
        UText           *input,
417
        UParseError     &pe,
418
        UErrorCode      &status);
419
420
   /**
421
    * Returns the regular expression from which this pattern was compiled. This method will work
422
    * even if the pattern was compiled from a UText.
423
    *
424
    * Note: If the pattern was originally compiled from a UText, and that UText was modified,
425
    * the returned string may no longer reflect the RegexPattern object.
426
    * @stable ICU 2.4
427
    */
428
    UnicodeString pattern() const;
429
    
430
    
431
   /**
432
    * Returns the regular expression from which this pattern was compiled. This method will work
433
    * even if the pattern was compiled from a UnicodeString.
434
    *
435
    * Note: This is the original input, not a clone. If the pattern was originally compiled from a
436
    * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
437
    * object.
438
    *
439
    * @stable ICU 4.6
440
    */
441
    UText *patternText(UErrorCode      &status) const;
442
443
444
    /**
445
     * Get the group number corresponding to a named capture group.
446
     * The returned number can be used with any function that accesses
447
     * capture groups by number.
448
     *
449
     * The function returns an error status if the specified name does not
450
     * appear in the pattern.
451
     *
452
     * @param  groupName   The capture group name.
453
     * @param  status      A UErrorCode to receive any errors.
454
     *
455
     * @stable ICU 55
456
     */
457
    int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
458
459
460
    /**
461
     * Get the group number corresponding to a named capture group.
462
     * The returned number can be used with any function that accesses
463
     * capture groups by number.
464
     *
465
     * The function returns an error status if the specified name does not
466
     * appear in the pattern.
467
     *
468
     * @param  groupName   The capture group name,
469
     *                     platform invariant characters only.
470
     * @param  nameLength  The length of the name, or -1 if the name is
471
     *                     nul-terminated.
472
     * @param  status      A UErrorCode to receive any errors.
473
     *
474
     * @stable ICU 55
475
     */
476
    int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
477
478
479
    /**
480
     * Split a string into fields.  Somewhat like split() from Perl or Java.
481
     * Pattern matches identify delimiters that separate the input
482
     * into fields.  The input data between the delimiters becomes the
483
     * fields themselves.
484
     *
485
     * If the delimiter pattern includes capture groups, the captured text will
486
     * also appear in the destination array of output strings, interspersed
487
     * with the fields.  This is similar to Perl, but differs from Java, 
488
     * which ignores the presence of capture groups in the pattern.
489
     * 
490
     * Trailing empty fields will always be returned, assuming sufficient
491
     * destination capacity.  This differs from the default behavior for Java
492
     * and Perl where trailing empty fields are not returned.
493
     *
494
     * The number of strings produced by the split operation is returned.
495
     * This count includes the strings from capture groups in the delimiter pattern.
496
     * This behavior differs from Java, which ignores capture groups.
497
     *
498
     * For the best performance on split() operations,
499
     * <code>RegexMatcher::split</code> is preferable to this function
500
     *
501
     * @param input   The string to be split into fields.  The field delimiters
502
     *                match the pattern (in the "this" object)
503
     * @param dest    An array of UnicodeStrings to receive the results of the split.
504
     *                This is an array of actual UnicodeString objects, not an
505
     *                array of pointers to strings.  Local (stack based) arrays can
506
     *                work well here.
507
     * @param destCapacity  The number of elements in the destination array.
508
     *                If the number of fields found is less than destCapacity, the
509
     *                extra strings in the destination array are not altered.
510
     *                If the number of destination strings is less than the number
511
     *                of fields, the trailing part of the input string, including any
512
     *                field delimiters, is placed in the last destination string.
513
     * @param status  A reference to a UErrorCode to receive any errors.
514
     * @return        The number of fields into which the input string was split.
515
     * @stable ICU 2.4
516
     */
517
    int32_t  split(const UnicodeString &input,
518
        UnicodeString    dest[],
519
        int32_t          destCapacity,
520
        UErrorCode       &status) const;
521
522
523
    /**
524
     * Split a string into fields.  Somewhat like %split() from Perl or Java.
525
     * Pattern matches identify delimiters that separate the input
526
     * into fields.  The input data between the delimiters becomes the
527
     * fields themselves.
528
     *
529
     * If the delimiter pattern includes capture groups, the captured text will
530
     * also appear in the destination array of output strings, interspersed
531
     * with the fields.  This is similar to Perl, but differs from Java, 
532
     * which ignores the presence of capture groups in the pattern.
533
     * 
534
     * Trailing empty fields will always be returned, assuming sufficient
535
     * destination capacity.  This differs from the default behavior for Java
536
     * and Perl where trailing empty fields are not returned.
537
     *
538
     * The number of strings produced by the split operation is returned.
539
     * This count includes the strings from capture groups in the delimiter pattern.
540
     * This behavior differs from Java, which ignores capture groups.
541
     *
542
     *  For the best performance on split() operations,
543
     *  `RegexMatcher::split()` is preferable to this function
544
     *
545
     * @param input   The string to be split into fields.  The field delimiters
546
     *                match the pattern (in the "this" object)
547
     * @param dest    An array of mutable UText structs to receive the results of the split.
548
     *                If a field is nullptr, a new UText is allocated to contain the results for
549
     *                that field. This new UText is not guaranteed to be mutable.
550
     * @param destCapacity  The number of elements in the destination array.
551
     *                If the number of fields found is less than destCapacity, the
552
     *                extra strings in the destination array are not altered.
553
     *                If the number of destination strings is less than the number
554
     *                of fields, the trailing part of the input string, including any
555
     *                field delimiters, is placed in the last destination string.
556
     * @param status  A reference to a UErrorCode to receive any errors.
557
     * @return        The number of destination strings used.  
558
     *
559
     * @stable ICU 4.6
560
     */
561
    int32_t  split(UText *input,
562
        UText            *dest[],
563
        int32_t          destCapacity,
564
        UErrorCode       &status) const;
565
566
567
    /**
568
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
569
     *
570
     * @stable ICU 2.4
571
     */
572
    virtual UClassID getDynamicClassID() const override;
573
574
    /**
575
     * ICU "poor man's RTTI", returns a UClassID for this class.
576
     *
577
     * @stable ICU 2.4
578
     */
579
    static UClassID U_EXPORT2 getStaticClassID();
580
581
private:
582
    //
583
    //  Implementation Data
584
    //
585
    UText          *fPattern;      // The original pattern string.
586
    UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
587
    uint32_t        fFlags;        // The flags used when compiling the pattern.
588
                                   //
589
    UVector64       *fCompiledPat; // The compiled pattern p-code.
590
    UnicodeString   fLiteralText;  // Any literal string data from the pattern,
591
                                   //   after un-escaping, for use during the match.
592
593
    UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
594
    Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
595
596
597
    UErrorCode      fDeferredStatus; // status if some prior error has left this
598
                                   //  RegexPattern in an unusable state.
599
600
    int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
601
                                   //   >= this value.  For some patterns, this calculated
602
                                   //   value may be less than the true shortest
603
                                   //   possible match.
604
    
605
    int32_t         fFrameSize;    // Size of a state stack frame in the
606
                                   //   execution engine.
607
608
    int32_t         fDataSize;     // The size of the data needed by the pattern that
609
                                   //   does not go on the state stack, but has just
610
                                   //   a single copy per matcher.
611
612
    UVector32       *fGroupMap;    // Map from capture group number to position of
613
                                   //   the group's variables in the matcher stack frame.
614
615
    int32_t         fStartType;    // Info on how a match must start.
616
    int32_t         fInitialStringIdx;     //
617
    int32_t         fInitialStringLen;
618
    UnicodeSet     *fInitialChars;
619
    UChar32         fInitialChar;
620
    Regex8BitSet   *fInitialChars8;
621
    UBool           fNeedsAltInput;
622
623
    UHashtable     *fNamedCaptureMap;  // Map from capture group names to numbers.
624
625
    friend class RegexCompile;
626
    friend class RegexMatcher;
627
    friend class RegexCImpl;
628
629
    //
630
    //  Implementation Methods
631
    //
632
    void        init();                 // Common initialization, for use by constructors.
633
    bool        initNamedCaptureMap();  // Lazy init for fNamedCaptureMap.
634
    void        zap();                  // Common cleanup
635
636
    void        dumpOp(int32_t index) const;
637
638
  public:
639
#ifndef U_HIDE_INTERNAL_API
640
    /**
641
      * Dump a compiled pattern. Internal debug function.
642
      * @internal
643
      */
644
    void        dumpPattern() const;
645
#endif  /* U_HIDE_INTERNAL_API */
646
};
647
648
649
650
/**
651
 *  class RegexMatcher bundles together a regular expression pattern and
652
 *  input text to which the expression can be applied.  It includes methods
653
 *  for testing for matches, and for find and replace operations.
654
 *
655
 * <p>Class RegexMatcher is not intended to be subclassed.</p>
656
 *
657
 * @stable ICU 2.4
658
 */
659
class U_I18N_API RegexMatcher final : public UObject {
660
public:
661
662
    /**
663
      * Construct a RegexMatcher for a regular expression.
664
      * This is a convenience method that avoids the need to explicitly create
665
      * a RegexPattern object.  Note that if several RegexMatchers need to be
666
      * created for the same expression, it will be more efficient to
667
      * separately create and cache a RegexPattern object, and use
668
      * its matcher() method to create the RegexMatcher objects.
669
      *
670
      *  @param regexp The Regular Expression to be compiled.
671
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
672
      *  @param status Any errors are reported by setting this UErrorCode variable.
673
      *  @stable ICU 2.6
674
      */
675
    RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
676
677
    /**
678
      * Construct a RegexMatcher for a regular expression.
679
      * This is a convenience method that avoids the need to explicitly create
680
      * a RegexPattern object.  Note that if several RegexMatchers need to be
681
      * created for the same expression, it will be more efficient to
682
      * separately create and cache a RegexPattern object, and use
683
      * its matcher() method to create the RegexMatcher objects.
684
      *
685
      *  @param regexp The regular expression to be compiled.
686
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
687
      *  @param status Any errors are reported by setting this UErrorCode variable.
688
      *
689
      *  @stable ICU 4.6
690
      */
691
    RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
692
693
    /**
694
      * Construct a RegexMatcher for a regular expression.
695
      * This is a convenience method that avoids the need to explicitly create
696
      * a RegexPattern object.  Note that if several RegexMatchers need to be
697
      * created for the same expression, it will be more efficient to
698
      * separately create and cache a RegexPattern object, and use
699
      * its matcher() method to create the RegexMatcher objects.
700
      *
701
      * The matcher will retain a reference to the supplied input string, and all regexp
702
      * pattern matching operations happen directly on the original string.  It is
703
      * critical that the string not be altered or deleted before use by the regular
704
      * expression operations is complete.
705
      *
706
      *  @param regexp The Regular Expression to be compiled.
707
      *  @param input  The string to match.  The matcher retains a reference to the
708
      *                caller's string; no copy is made.
709
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
710
      *  @param status Any errors are reported by setting this UErrorCode variable.
711
      *  @stable ICU 2.6
712
      */
713
    RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
714
        uint32_t flags, UErrorCode &status);
715
716
    /**
717
      * Construct a RegexMatcher for a regular expression.
718
      * This is a convenience method that avoids the need to explicitly create
719
      * a RegexPattern object.  Note that if several RegexMatchers need to be
720
      * created for the same expression, it will be more efficient to
721
      * separately create and cache a RegexPattern object, and use
722
      * its matcher() method to create the RegexMatcher objects.
723
      *
724
      * The matcher will make a shallow clone of the supplied input text, and all regexp
725
      * pattern matching operations happen on this clone.  While read-only operations on
726
      * the supplied text are permitted, it is critical that the underlying string not be
727
      * altered or deleted before use by the regular expression operations is complete.
728
      *
729
      *  @param regexp The Regular Expression to be compiled.
730
      *  @param input  The string to match.  The matcher retains a shallow clone of the text.
731
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
732
      *  @param status Any errors are reported by setting this UErrorCode variable.
733
      *
734
      *  @stable ICU 4.6
735
      */
736
    RegexMatcher(UText *regexp, UText *input,
737
        uint32_t flags, UErrorCode &status);
738
739
private:
740
    /**
741
     * Cause a compilation error if an application accidentally attempts to
742
     *   create a matcher with a (char16_t *) string as input rather than
743
     *   a UnicodeString.    Avoids a dangling reference to a temporary string.
744
     *
745
     * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
746
     * using one of the aliasing constructors, such as
747
     * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
748
     * or in a UText, using
749
     * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
750
     */
751
    RegexMatcher(const UnicodeString &regexp, const char16_t *input,
752
        uint32_t flags, UErrorCode &status) = delete;
753
public:
754
755
756
   /**
757
    *   Destructor.
758
    *
759
    *  @stable ICU 2.4
760
    */
761
    virtual ~RegexMatcher();
762
763
764
   /**
765
    *   Attempts to match the entire input region against the pattern.
766
    *    @param   status     A reference to a UErrorCode to receive any errors.
767
    *    @return true if there is a match
768
    *    @stable ICU 2.4
769
    */
770
    UBool matches(UErrorCode &status);
771
772
773
   /**
774
    *   Resets the matcher, then attempts to match the input beginning 
775
    *   at the specified startIndex, and extending to the end of the input.
776
    *   The input region is reset to include the entire input string.
777
    *   A successful match must extend to the end of the input.
778
    *    @param   startIndex The input string (native) index at which to begin matching.
779
    *    @param   status     A reference to a UErrorCode to receive any errors.
780
    *    @return true if there is a match
781
    *    @stable ICU 2.8
782
    */
783
    UBool matches(int64_t startIndex, UErrorCode &status);
784
785
786
   /**
787
    *   Attempts to match the input string, starting from the beginning of the region,
788
    *   against the pattern.  Like the matches() method, this function 
789
    *   always starts at the beginning of the input region;
790
    *   unlike that function, it does not require that the entire region be matched.
791
    *
792
    *   If the match succeeds then more information can be obtained via the start(),
793
    *   end(), and group() functions.
794
    *
795
    *    @param   status     A reference to a UErrorCode to receive any errors.
796
    *    @return  true if there is a match at the start of the input string.
797
    *    @stable ICU 2.4
798
    */
799
    UBool lookingAt(UErrorCode &status);
800
801
802
  /**
803
    *   Attempts to match the input string, starting from the specified index, against the pattern.
804
    *   The match may be of any length, and is not required to extend to the end
805
    *   of the input string.  Contrast with matches().
806
    *
807
    *   If the match succeeds then more information can be obtained via the start(),
808
    *   end(), and group() functions.
809
    *
810
    *    @param   startIndex The input string (native) index at which to begin matching.
811
    *    @param   status     A reference to a UErrorCode to receive any errors.
812
    *    @return  true if there is a match.
813
    *    @stable ICU 2.8
814
    */
815
    UBool lookingAt(int64_t startIndex, UErrorCode &status);
816
817
818
   /**
819
    *  Find the next pattern match in the input string.
820
    *  The find begins searching the input at the location following the end of
821
    *  the previous match, or at the start of the string if there is no previous match.
822
    *  If a match is found, `start()`, `end()` and `group()`
823
    *  will provide more information regarding the match.
824
    *  Note that if the input string is changed by the application,
825
    *     use find(startPos, status) instead of find(), because the saved starting
826
    *     position may not be valid with the altered input string.
827
    *  @return  true if a match is found.
828
    *  @stable ICU 2.4
829
    */
830
    UBool find();
831
832
833
   /**
834
    *  Find the next pattern match in the input string.
835
    *  The find begins searching the input at the location following the end of
836
    *  the previous match, or at the start of the string if there is no previous match.
837
    *  If a match is found, `start()`, `end()` and `group()`
838
    *  will provide more information regarding the match.
839
    *
840
    *  Note that if the input string is changed by the application,
841
    *  use find(startPos, status) instead of find(), because the saved starting
842
    *  position may not be valid with the altered input string.
843
    *  @param   status  A reference to a UErrorCode to receive any errors.
844
    *  @return  true if a match is found.
845
    * @stable ICU 55
846
    */
847
    UBool find(UErrorCode &status);
848
849
   /**
850
    *   Resets this RegexMatcher and then attempts to find the next substring of the
851
    *   input string that matches the pattern, starting at the specified index.
852
    *
853
    *   @param   start     The (native) index in the input string to begin the search.
854
    *   @param   status    A reference to a UErrorCode to receive any errors.
855
    *   @return  true if a match is found.
856
    *   @stable ICU 2.4
857
    */
858
    UBool find(int64_t start, UErrorCode &status);
859
860
861
   /**
862
    *   Returns a string containing the text matched by the previous match.
863
    *   If the pattern can match an empty string, an empty string may be returned.
864
    *   @param   status      A reference to a UErrorCode to receive any errors.
865
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
866
    *                        has been attempted or the last match failed.
867
    *   @return  a string containing the matched input text.
868
    *   @stable ICU 2.4
869
    */
870
    UnicodeString group(UErrorCode &status) const;
871
872
873
   /**
874
    *    Returns a string containing the text captured by the given group
875
    *    during the previous match operation.  Group(0) is the entire match.
876
    *
877
    *    A zero length string is returned both for capture groups that did not
878
    *    participate in the match and for actual zero length matches.
879
    *    To distinguish between these two cases use the function start(),
880
    *    which returns -1 for non-participating groups.
881
    *
882
    *    @param groupNum the capture group number
883
    *    @param   status     A reference to a UErrorCode to receive any errors.
884
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
885
    *                        has been attempted or the last match failed and
886
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
887
    *    @return the captured text
888
    *    @stable ICU 2.4
889
    */
890
    UnicodeString group(int32_t groupNum, UErrorCode &status) const;
891
892
   /**
893
    *   Returns the number of capturing groups in this matcher's pattern.
894
    *   @return the number of capture groups
895
    *   @stable ICU 2.4
896
    */
897
    int32_t groupCount() const;
898
899
900
   /**
901
    *   Returns a shallow clone of the entire live input string with the UText current native index
902
    *   set to the beginning of the requested group.
903
    *
904
    *   @param   dest        The UText into which the input should be cloned, or nullptr to create a new UText
905
    *   @param   group_len   A reference to receive the length of the desired capture group
906
    *   @param   status      A reference to a UErrorCode to receive any errors.
907
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
908
    *                        has been attempted or the last match failed and
909
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
910
    *   @return dest if non-nullptr, a shallow copy of the input text otherwise
911
    *
912
    *   @stable ICU 4.6
913
    */
914
    UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
915
916
   /**
917
    *   Returns a shallow clone of the entire live input string with the UText current native index
918
    *   set to the beginning of the requested group.
919
    *
920
    *   A group length of zero is returned both for capture groups that did not
921
    *   participate in the match and for actual zero length matches.
922
    *   To distinguish between these two cases use the function start(),
923
    *   which returns -1 for non-participating groups.
924
    *
925
    *   @param   groupNum   The capture group number.
926
    *   @param   dest        The UText into which the input should be cloned, or nullptr to create a new UText.
927
    *   @param   group_len   A reference to receive the length of the desired capture group
928
    *   @param   status      A reference to a UErrorCode to receive any errors.
929
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
930
    *                        has been attempted or the last match failed and
931
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
932
    *   @return dest if non-nullptr, a shallow copy of the input text otherwise
933
    *
934
    *   @stable ICU 4.6
935
    */
936
    UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
937
938
   /**
939
    *   Returns the index in the input string of the start of the text matched
940
    *   during the previous match operation.
941
    *    @param   status      a reference to a UErrorCode to receive any errors.
942
    *    @return              The (native) position in the input string of the start of the last match.
943
    *    @stable ICU 2.4
944
    */
945
    int32_t start(UErrorCode &status) const;
946
947
   /**
948
    *   Returns the index in the input string of the start of the text matched
949
    *   during the previous match operation.
950
    *    @param   status      a reference to a UErrorCode to receive any errors.
951
    *    @return              The (native) position in the input string of the start of the last match.
952
    *   @stable ICU 4.6
953
    */
954
    int64_t start64(UErrorCode &status) const;
955
956
957
   /**
958
    *   Returns the index in the input string of the start of the text matched by the
959
    *    specified capture group during the previous match operation.  Return -1 if
960
    *    the capture group exists in the pattern, but was not part of the last match.
961
    *
962
    *    @param  group       the capture group number
963
    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
964
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
965
    *                        attempted or the last match failed, and
966
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
967
    *    @return the (native) start position of substring matched by the specified group.
968
    *    @stable ICU 2.4
969
    */
970
    int32_t start(int32_t group, UErrorCode &status) const;
971
972
   /**
973
    *   Returns the index in the input string of the start of the text matched by the
974
    *    specified capture group during the previous match operation.  Return -1 if
975
    *    the capture group exists in the pattern, but was not part of the last match.
976
    *
977
    *    @param  group       the capture group number.
978
    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
979
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
980
    *                        attempted or the last match failed, and
981
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
982
    *    @return the (native) start position of substring matched by the specified group.
983
    *    @stable ICU 4.6
984
    */
985
    int64_t start64(int32_t group, UErrorCode &status) const;
986
987
   /**
988
    *    Returns the index in the input string of the first character following the
989
    *    text matched during the previous match operation.
990
    *
991
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
992
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
993
    *                        attempted or the last match failed.
994
    *    @return the index of the last character matched, plus one.
995
    *                        The index value returned is a native index, corresponding to
996
    *                        code units for the underlying encoding type, for example,
997
    *                        a byte index for UTF-8.
998
    *   @stable ICU 2.4
999
    */
1000
    int32_t end(UErrorCode &status) const;
1001
1002
   /**
1003
    *    Returns the index in the input string of the first character following the
1004
    *    text matched during the previous match operation.
1005
    *
1006
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1007
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1008
    *                        attempted or the last match failed.
1009
    *    @return the index of the last character matched, plus one.
1010
    *                        The index value returned is a native index, corresponding to
1011
    *                        code units for the underlying encoding type, for example,
1012
    *                        a byte index for UTF-8.
1013
    *   @stable ICU 4.6
1014
    */
1015
    int64_t end64(UErrorCode &status) const;
1016
1017
1018
   /**
1019
    *    Returns the index in the input string of the character following the
1020
    *    text matched by the specified capture group during the previous match operation.
1021
    *
1022
    *    @param group  the capture group number
1023
    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
1024
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1025
    *                        attempted or the last match failed and
1026
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
1027
    *    @return  the index of the first character following the text
1028
    *              captured by the specified group during the previous match operation.
1029
    *              Return -1 if the capture group exists in the pattern but was not part of the match.
1030
    *              The index value returned is a native index, corresponding to
1031
    *              code units for the underlying encoding type, for example,
1032
    *              a byte index for UTF-8.
1033
    *    @stable ICU 2.4
1034
    */
1035
    int32_t end(int32_t group, UErrorCode &status) const;
1036
1037
   /**
1038
    *    Returns the index in the input string of the character following the
1039
    *    text matched by the specified capture group during the previous match operation.
1040
    *
1041
    *    @param group  the capture group number
1042
    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
1043
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1044
    *                        attempted or the last match failed and
1045
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
1046
    *    @return  the index of the first character following the text
1047
    *              captured by the specified group during the previous match operation.
1048
    *              Return -1 if the capture group exists in the pattern but was not part of the match.
1049
    *              The index value returned is a native index, corresponding to
1050
    *              code units for the underlying encoding type, for example,
1051
    *              a byte index for UTF-8.
1052
    *   @stable ICU 4.6
1053
    */
1054
    int64_t end64(int32_t group, UErrorCode &status) const;
1055
1056
   /**
1057
    *   Resets this matcher.  The effect is to remove any memory of previous matches,
1058
    *       and to cause subsequent find() operations to begin at the beginning of
1059
    *       the input string.
1060
    *
1061
    *   @return this RegexMatcher.
1062
    *   @stable ICU 2.4
1063
    */
1064
    RegexMatcher &reset();
1065
1066
1067
   /**
1068
    *   Resets this matcher, and sets the current input position.
1069
    *   The effect is to remove any memory of previous matches,
1070
    *       and to cause subsequent find() operations to begin at
1071
    *       the specified (native) position in the input string.
1072
    *
1073
    *   The matcher's region is reset to its default, which is the entire
1074
    *   input string.
1075
    *
1076
    *   An alternative to this function is to set a match region
1077
    *   beginning at the desired index.
1078
    *
1079
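    *   @param   index   The (native) position in the input string at which
    *                    subsequent find() operations will begin.
    *   @param   status  A reference to a UErrorCode to receive any errors.
    *   @return this RegexMatcher.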
1080
    *   @stable ICU 2.8
1081
    */
1082
    RegexMatcher &reset(int64_t index, UErrorCode &status);
1083
1084
1085
   /**
1086
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
1087
    *     to be reused, which is more efficient than creating a new RegexMatcher for
1088
    *     each input string to be processed.
1089
    *   @param input The new string on which subsequent pattern matches will operate.
1090
    *                The matcher retains a reference to the caller's string, and operates
1091
    *                directly on that.  Ownership of the string remains with the caller.
1092
    *                Because no copy of the string is made, it is essential that the
1093
    *                caller not delete the string until after regexp operations on it
1094
    *                are done.
1095
    *                Note that while a reset on the matcher with an input string that is then
1096
    *                modified across/during matcher operations may be supported currently for UnicodeString,
1097
    *                this was not originally intended behavior, and support for this is not guaranteed
1098
    *                in upcoming versions of ICU.
1099
    *   @return this RegexMatcher.
1100
    *   @stable ICU 2.4
1101
    */
1102
    RegexMatcher &reset(const UnicodeString &input);
1103
1104
1105
   /**
1106
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
1107
    *     to be reused, which is more efficient than creating a new RegexMatcher for
1108
    *     each input string to be processed.
1109
    *   @param input The new string on which subsequent pattern matches will operate.
1110
    *                The matcher makes a shallow clone of the given text; ownership of the
1111
    *                original string remains with the caller. Because no deep copy of the
1112
    *                text is made, it is essential that the caller not modify the string
1113
    *                until after regexp operations on it are done.
1114
    *   @return this RegexMatcher.
1115
    *
1116
    *   @stable ICU 4.6
1117
    */
1118
    RegexMatcher &reset(UText *input);
1119
1120
1121
  /**
1122
    *  Set the subject text string upon which the regular expression is looking for matches
1123
    *  without changing any other aspect of the matching state.
1124
    *  The new and previous text strings must have the same content.
1125
    *
1126
    *  This function is intended for use in environments where ICU is operating on 
1127
    *  strings that may move around in memory.  It provides a mechanism for notifying
1128
    *  ICU that the string has been relocated, and providing a new UText to access the
1129
    *  string in its new position.
1130
    *
1131
    *  Note that the regular expression implementation never copies the underlying text
1132
    *  of a string being matched, but always operates directly on the original text 
1133
    *  provided by the user. Refreshing simply drops the references to the old text 
1134
    *  and replaces them with references to the new.
1135
    *
1136
    *  Caution:  this function is normally used only by very specialized,
1137
    *  system-level code.  One example use case is with garbage collection that moves
1138
    *  the text in memory.
1139
    *
1140
    * @param input      The new (moved) text string.
1141
    * @param status     Receives errors detected by this function.
1142
    *
1143
    * @stable ICU 4.8 
1144
    */
1145
    RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1146
1147
private:
1148
    /**
1149
     * Cause a compilation error if an application accidentally attempts to
1150
     *   reset a matcher with a (char16_t *) string as input rather than
1151
     *   a UnicodeString.    Avoids a dangling reference to a temporary string.
1152
     *
1153
     * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
1154
     * using one of the aliasing constructors, such as
1155
     * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
1156
     * or in a UText, using
1157
     * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
1158
     *
1159
     */
1160
    RegexMatcher &reset(const char16_t *input) = delete;
1161
public:
1162
1163
   /**
1164
    *   Returns the input string being matched.  Ownership of the string belongs to
1165
    *   the matcher; it should not be altered or deleted. This method will work even if the input
1166
    *   was originally supplied as a UText.
1167
    *   @return the input string
1168
    *   @stable ICU 2.4
1169
    */
1170
    const UnicodeString &input() const;
1171
    
1172
   /**
1173
    *   Returns the input string being matched.  This is the live input text; it should not be
1174
    *   altered or deleted. This method will work even if the input was originally supplied as
1175
    *   a UnicodeString.
1176
    *   @return the input text
1177
    *
1178
    *   @stable ICU 4.6
1179
    */
1180
    UText *inputText() const;
1181
    
1182
   /**
1183
    *   Returns the input string being matched, either by copying it into the provided
1184
    *   UText parameter or by returning a shallow clone of the live input. Note that copying
1185
    *   the entire input may cause significant performance and memory issues.
1186
    *   @param dest The UText into which the input should be copied, or nullptr to create a new UText
1187
    *   @param status error code
1188
    *   @return dest if non-nullptr, a shallow copy of the input text otherwise
1189
    *
1190
    *   @stable ICU 4.6
1191
    */
1192
    UText *getInput(UText *dest, UErrorCode &status) const;
1193
    
1194
1195
   /** Sets the limits of this matcher's region.
1196
     * The region is the part of the input string that will be searched to find a match.
1197
     * Invoking this method resets the matcher, and then sets the region to start
1198
     * at the index specified by the start parameter and end at the index specified
1199
     * by the end parameter.
1200
     *
1201
     * Depending on the transparency and anchoring being used (see useTransparentBounds
1202
     * and useAnchoringBounds), certain constructs such as anchors may behave differently
1203
     * at or around the boundaries of the region.
1204
     *
1205
     * The function will fail if start is greater than limit, or if either index
1206
     *  is less than zero or greater than the length of the string being matched.
1207
     *
1208
     * @param start  The (native) index to begin searches at.
1209
     * @param limit  The index to end searches at (exclusive).
1210
     * @param status A reference to a UErrorCode to receive any errors.
1211
     * @stable ICU 4.0
1212
     */
1213
     RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1214
1215
   /** 
1216
     * Identical to region(start, limit, status) but also allows a start position without
1217
     *  resetting the region state.
1218
     * @param regionStart The region start
1219
     * @param regionLimit the limit of the region
1220
     * @param startIndex  The (native) index within the region bounds at which to begin searches.
1221
     * @param status A reference to a UErrorCode to receive any errors.
1222
     *                If startIndex is not within the specified region bounds, 
1223
     *                U_INDEX_OUTOFBOUNDS_ERROR is returned.
1224
     * @stable ICU 4.6
1225
     */
1226
     RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1227
1228
   /**
1229
     * Reports the start index of this matcher's region. The searches this matcher
1230
     * conducts are limited to finding matches within regionStart (inclusive) and
1231
     * regionEnd (exclusive).
1232
     *
1233
     * @return The starting (native) index of this matcher's region.
1234
     * @stable ICU 4.0
1235
     */
1236
     int32_t regionStart() const;
1237
1238
   /**
1239
     * Reports the start index of this matcher's region. The searches this matcher
1240
     * conducts are limited to finding matches within regionStart (inclusive) and
1241
     * regionEnd (exclusive).
1242
     *
1243
     * @return The starting (native) index of this matcher's region.
1244
     * @stable ICU 4.6
1245
     */
1246
     int64_t regionStart64() const;
1247
1248
1249
    /**
1250
      * Reports the end (limit) index (exclusive) of this matcher's region. The searches
1251
      * this matcher conducts are limited to finding matches within regionStart
1252
      * (inclusive) and regionEnd (exclusive).
1253
      *
1254
      * @return The ending point (native) of this matcher's region.
1255
      * @stable ICU 4.0
1256
      */
1257
      int32_t regionEnd() const;
1258
1259
   /**
1260
     * Reports the end (limit) index (exclusive) of this matcher's region. The searches
1261
     * this matcher conducts are limited to finding matches within regionStart
1262
     * (inclusive) and regionEnd (exclusive).
1263
     *
1264
     * @return The ending point (native) of this matcher's region.
1265
     * @stable ICU 4.6
1266
     */
1267
      int64_t regionEnd64() const;
1268
1269
    /**
1270
      * Queries the transparency of region bounds for this matcher.
1271
      * See useTransparentBounds for a description of transparent and opaque bounds.
1272
      * By default, a matcher uses opaque region boundaries.
1273
      *
1274
      * @return true if this matcher is using transparent bounds, false if it is using opaque bounds.
1275
      * @stable ICU 4.0
1276
      */
1277
      UBool hasTransparentBounds() const;
1278
1279
    /**
1280
      * Sets the transparency of region bounds for this matcher.
1281
      * Invoking this function with an argument of true will set this matcher to use transparent bounds.
1282
      * If the boolean argument is false, then opaque bounds will be used.
1283
      *
1284
      * Using transparent bounds, the boundaries of this matcher's region are transparent
1285
      * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
1286
      * see text beyond the boundaries of the region while checking for a match.
1287
      *
1288
      * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
1289
      * lookbehind, and boundary matching constructs.
1290
      *
1291
      * By default, a matcher uses opaque bounds.
1292
      *
1293
      * @param   b true for transparent bounds; false for opaque bounds
1294
      * @return  This Matcher;
1295
      * @stable ICU 4.0
1296
      **/
1297
      RegexMatcher &useTransparentBounds(UBool b);
1298
1299
     
1300
    /**
1301
      * Return true if this matcher is using anchoring bounds.
1302
      * By default, matchers use anchoring region bounds.
1303
      *
1304
      * @return true if this matcher is using anchoring bounds.
1305
      * @stable ICU 4.0
1306
      */    
1307
      UBool hasAnchoringBounds() const;
1308
1309
1310
    /**
1311
      * Set whether this matcher is using Anchoring Bounds for its region.
1312
      * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
1313
      * and end of the region.  Without Anchoring Bounds, anchors will only match at
1314
      * the positions they would in the complete text.
1315
      *
1316
      * Anchoring Bounds are the default for regions.
1317
      *
1318
      * @param b true to enable anchoring bounds; false to disable them.
1319
      * @return  This Matcher
1320
      * @stable ICU 4.0
1321
      */
1322
      RegexMatcher &useAnchoringBounds(UBool b);
1323
1324
1325
    /**
1326
      * Return true if the most recent matching operation attempted to access
1327
      *  additional input beyond the available input text.
1328
      *  In this case, additional input text could change the results of the match.
1329
      *
1330
      *  hitEnd() is defined for both successful and unsuccessful matches.
1331
      *  In either case hitEnd() will return true if the end of the text was
1332
      *  reached at any point during the matching process.
1333
      *
1334
      *  @return  true if the most recent match hit the end of input
1335
      *  @stable ICU 4.0
1336
      */
1337
      UBool hitEnd() const;
1338
1339
    /**
1340
      * Return true if the most recent match succeeded and additional input could cause
1341
      * it to fail. If this method returns false and a match was found, then more input
1342
      * might change the match but the match won't be lost. If a match was not found,
1343
      * then requireEnd has no meaning.
1344
      *
1345
      * @return true if more input could cause the most recent match to no longer match.
1346
      * @stable ICU 4.0
1347
      */
1348
      UBool requireEnd() const;
1349
1350
1351
   /**
1352
    *    Returns the pattern that is interpreted by this matcher.
1353
    *    @return  the RegexPattern for this RegexMatcher
1354
    *    @stable ICU 2.4
1355
    */
1356
    const RegexPattern &pattern() const;
1357
1358
1359
   /**
1360
    *    Replaces every substring of the input that matches the pattern
1361
    *    with the given replacement string.  This is a convenience function that
1362
    *    provides a complete find-and-replace-all operation.
1363
    *
1364
    *    This method first resets this matcher. It then scans the input string
1365
    *    looking for matches of the pattern. Input that is not part of any
1366
    *    match is left unchanged; each match is replaced in the result by the
1367
    *    replacement string. The replacement string may contain references to
1368
    *    capture groups.
1369
    *
1370
    *    @param   replacement a string containing the replacement text.
1371
    *    @param   status      a reference to a UErrorCode to receive any errors.
1372
    *    @return              a string containing the results of the find and replace.
1373
    *    @stable ICU 2.4
1374
    */
1375
    UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1376
1377
1378
   /**
1379
    *    Replaces every substring of the input that matches the pattern
1380
    *    with the given replacement string.  This is a convenience function that
1381
    *    provides a complete find-and-replace-all operation.
1382
    *
1383
    *    This method first resets this matcher. It then scans the input string
1384
    *    looking for matches of the pattern. Input that is not part of any
1385
    *    match is left unchanged; each match is replaced in the result by the
1386
    *    replacement string. The replacement string may contain references to
1387
    *    capture groups.
1388
    *
1389
    *    @param   replacement a string containing the replacement text.
1390
    *    @param   dest        a mutable UText in which the results are placed.
1391
    *                          If nullptr, a new UText will be created (which may not be mutable).
1392
    *    @param   status      a reference to a UErrorCode to receive any errors.
1393
    *    @return              a UText containing the results of the find and replace.
1394
    *                          If a pre-allocated UText was provided, it will always be used and returned.
1395
    *
1396
    *    @stable ICU 4.6
1397
    */
1398
    UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1399
    
1400
1401
   /**
1402
    * Replaces the first substring of the input that matches
1403
    * the pattern with the replacement string.   This is a convenience
1404
    * function that provides a complete find-and-replace operation.
1405
    *
1406
    * This function first resets this RegexMatcher. It then scans the input string
1407
    * looking for a match of the pattern. Input that is not part
1408
    * of the match is appended directly to the result string; the match is replaced
1409
    * in the result by the replacement string. The replacement string may contain
1410
    * references to captured groups.
1411
    *
1412
    * The state of the matcher (the position at which a subsequent find()
1413
    *    would begin) after completing a replaceFirst() is not specified.  The
1414
    *    RegexMatcher should be reset before doing additional find() operations.
1415
    *
1416
    *    @param   replacement a string containing the replacement text.
1417
    *    @param   status      a reference to a UErrorCode to receive any errors.
1418
    *    @return              a string containing the results of the find and replace.
1419
    *    @stable ICU 2.4
1420
    */
1421
    UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1422
    
1423
1424
   /**
1425
    * Replaces the first substring of the input that matches
1426
    * the pattern with the replacement string.   This is a convenience
1427
    * function that provides a complete find-and-replace operation.
1428
    *
1429
    * This function first resets this RegexMatcher. It then scans the input string
1430
    * looking for a match of the pattern. Input that is not part
1431
    * of the match is appended directly to the result string; the match is replaced
1432
    * in the result by the replacement string. The replacement string may contain
1433
    * references to captured groups.
1434
    *
1435
    * The state of the matcher (the position at which a subsequent find()
1436
    *    would begin) after completing a replaceFirst() is not specified.  The
1437
    *    RegexMatcher should be reset before doing additional find() operations.
1438
    *
1439
    *    @param   replacement a string containing the replacement text.
1440
    *    @param   dest        a mutable UText in which the results are placed.
1441
    *                          If nullptr, a new UText will be created (which may not be mutable).
1442
    *    @param   status      a reference to a UErrorCode to receive any errors.
1443
    *    @return              a UText containing the results of the find and replace.
1444
    *                          If a pre-allocated UText was provided, it will always be used and returned.
1445
    *
1446
    *    @stable ICU 4.6
1447
    */
1448
    UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1449
    
1450
    
1451
   /**
1452
    *   Implements a replace operation intended to be used as part of an
1453
    *   incremental find-and-replace.
1454
    *
1455
    *   The input string, starting from the end of the previous replacement and ending at
1456
    *   the start of the current match, is appended to the destination string.  Then the
1457
    *   replacement string is appended to the output string,
1458
    *   including handling any substitutions of captured text.
1459
    *
1460
    *   For simple, prepackaged, non-incremental find-and-replace
1461
    *   operations, see replaceFirst() or replaceAll().
1462
    *
1463
    *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
1464
    *   @param   replacement A UnicodeString that provides the text to be substituted for
1465
    *                        the input text that matched the regexp pattern.  The replacement
1466
    *                        text may contain references to captured text from the
1467
    *                        input.
1468
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1469
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1470
    *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
1471
    *                        if the replacement text specifies a capture group that
1472
    *                        does not exist in the pattern.
1473
    *
1474
    *   @return  this  RegexMatcher
1475
    *   @stable ICU 2.4
1476
    *
1477
    */
1478
    RegexMatcher &appendReplacement(UnicodeString &dest,
1479
        const UnicodeString &replacement, UErrorCode &status);
1480
    
1481
    
1482
   /**
1483
    *   Implements a replace operation intended to be used as part of an
1484
    *   incremental find-and-replace.
1485
    *
1486
    *   The input string, starting from the end of the previous replacement and ending at
1487
    *   the start of the current match, is appended to the destination string.  Then the
1488
    *   replacement string is appended to the output string,
1489
    *   including handling any substitutions of captured text.
1490
    *
1491
    *   For simple, prepackaged, non-incremental find-and-replace
1492
    *   operations, see replaceFirst() or replaceAll().
1493
    *
1494
    *   @param   dest        A mutable UText to which the results of the find-and-replace are appended.
1495
    *                         Must not be nullptr.
1496
    *   @param   replacement A UText that provides the text to be substituted for
1497
    *                        the input text that matched the regexp pattern.  The replacement
1498
    *                        text may contain references to captured text from the input.
1499
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1500
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1501
    *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
1502
    *                        if the replacement text specifies a capture group that
1503
    *                        does not exist in the pattern.
1504
    *
1505
    *   @return  this  RegexMatcher
1506
    *
1507
    *   @stable ICU 4.6
1508
    */
1509
    RegexMatcher &appendReplacement(UText *dest,
1510
        UText *replacement, UErrorCode &status);
1511
1512
1513
   /**
1514
    * As the final step in a find-and-replace operation, append the remainder
1515
    * of the input string, starting at the position following the last appendReplacement(),
1516
    * to the destination string. `appendTail()` is intended to be invoked after one
1517
    * or more invocations of the `RegexMatcher::appendReplacement()`.
1518
    *
1519
    *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
1520
    *  @return  the destination string.
1521
    *  @stable ICU 2.4
1522
    */
1523
    UnicodeString &appendTail(UnicodeString &dest);
1524
1525
1526
   /**
1527
    * As the final step in a find-and-replace operation, append the remainder
1528
    * of the input string, starting at the position following the last appendReplacement(),
1529
    * to the destination string. `appendTail()` is intended to be invoked after one
1530
    * or more invocations of the `RegexMatcher::appendReplacement()`.
1531
    *
1532
    *  @param dest A mutable UText to which the results of the find-and-replace are appended.
1533
    *               Must not be nullptr.
1534
    *  @param status A reference to a UErrorCode to receive any errors.
1535
    *  @return  the destination string.
1536
    *
1537
    *  @stable ICU 4.6
1538
    */
1539
    UText *appendTail(UText *dest, UErrorCode &status);
1540
1541
1542
    /**
1543
     * Split a string into fields.  Somewhat like %split() from Perl.
1544
     * The pattern matches identify delimiters that separate the input
1545
     *  into fields.  The input data between the matches becomes the
1546
     *  fields themselves.
1547
     *
1548
     * @param input   The string to be split into fields.  The field delimiters
1549
     *                match the pattern (in the "this" object).  This matcher
1550
     *                will be reset to this input string.
1551
     * @param dest    An array of UnicodeStrings to receive the results of the split.
1552
     *                This is an array of actual UnicodeString objects, not an
1553
     *                array of pointers to strings.  Local (stack based) arrays can
1554
     *                work well here.
1555
     * @param destCapacity  The number of elements in the destination array.
1556
     *                If the number of fields found is less than destCapacity, the
1557
     *                extra strings in the destination array are not altered.
1558
     *                If the number of destination strings is less than the number
1559
     *                of fields, the trailing part of the input string, including any
1560
     *                field delimiters, is placed in the last destination string.
1561
     * @param status  A reference to a UErrorCode to receive any errors.
1562
     * @return        The number of fields into which the input string was split.
1563
     * @stable ICU 2.6
1564
     */
1565
     int32_t  split(const UnicodeString &input,
1566
        UnicodeString    dest[],
1567
        int32_t          destCapacity,
1568
        UErrorCode       &status);
1569
1570
1571
    /**
1572
     * Split a string into fields.  Somewhat like %split() from Perl.
1573
     * The pattern matches identify delimiters that separate the input
1574
     *  into fields.  The input data between the matches becomes the
1575
     *  fields themselves.
1576
     *
1577
     * @param input   The string to be split into fields.  The field delimiters
1578
     *                match the pattern (in the "this" object).  This matcher
1579
     *                will be reset to this input string.
1580
     * @param dest    An array of mutable UText structs to receive the results of the split.
1581
     *                If a field is nullptr, a new UText is allocated to contain the results for
1582
     *                that field. This new UText is not guaranteed to be mutable.
1583
     * @param destCapacity  The number of elements in the destination array.
1584
     *                If the number of fields found is less than destCapacity, the
1585
     *                extra strings in the destination array are not altered.
1586
     *                If the number of destination strings is less than the number
1587
     *                of fields, the trailing part of the input string, including any
1588
     *                field delimiters, is placed in the last destination string.
1589
     * @param status  A reference to a UErrorCode to receive any errors.
1590
     * @return        The number of fields into which the input string was split.
1591
     *
1592
     * @stable ICU 4.6
1593
     */
1594
     int32_t  split(UText *input,
1595
        UText           *dest[],
1596
        int32_t          destCapacity,
1597
        UErrorCode       &status);
1598
    
1599
  /**
1600
    *   Set a processing time limit for match operations with this Matcher.
1601
    *  
1602
    *   Some patterns, when matching certain strings, can run in exponential time.
1603
    *   For practical purposes, the match operation may appear to be in an
1604
    *   infinite loop.
1605
    *   When a limit is set a match operation will fail with an error if the
1606
    *   limit is exceeded.
1607
    *
1608
    *   The units of the limit are steps of the match engine.
1609
    *   Correspondence with actual processor time will depend on the speed
1610
    *   of the processor and the details of the specific pattern, but will
1611
    *   typically be on the order of milliseconds.
1612
    *
1613
    *   By default, the matching time is not limited.
1614
    *
1615
    *
1616
    *   @param   limit       The limit value, or 0 for no limit.
1617
    *   @param   status      A reference to a UErrorCode to receive any errors.
1618
    *   @stable ICU 4.0
1619
    */
1620
    void setTimeLimit(int32_t limit, UErrorCode &status);
1621
1622
  /**
1623
    * Get the time limit, if any, for match operations made with this Matcher.
1624
    *
1625
    *   @return the maximum allowed time for a match, in units of processing steps.
1626
    *   @stable ICU 4.0
1627
    */
1628
    int32_t getTimeLimit() const;
1629
1630
  /**
1631
    *  Set the amount of heap storage available for use by the match backtracking stack.
1632
    *  The matcher is also reset, discarding any results from previous matches.
1633
    *
1634
    *  ICU uses a backtracking regular expression engine, with the backtrack stack
1635
    *  maintained on the heap.  This function sets the limit to the amount of memory
1636
    *  that can be used for this purpose.  A backtracking stack overflow will
1637
    *  result in an error from the match operation that caused it.
1638
    *
1639
    *  A limit is desirable because a malicious or poorly designed pattern can use
1640
    *  excessive memory, potentially crashing the process.  A limit is enabled
1641
    *  by default.
1642
    *
1643
    *  @param limit  The maximum size, in bytes, of the matching backtrack stack.
1644
    *                A value of zero means no limit.
1645
    *                The limit must be greater than or equal to zero.
1646
    *
1647
    *  @param status   A reference to a UErrorCode to receive any errors.
1648
    *
1649
    *  @stable ICU 4.0
1650
    */
1651
    void setStackLimit(int32_t  limit, UErrorCode &status);
1652
    
1653
  /**
1654
    *  Get the size of the heap storage available for use by the back tracking stack.
1655
    *
1656
    *  @return  the maximum backtracking stack size, in bytes, or zero if the
1657
    *           stack size is unlimited.
1658
    *  @stable ICU 4.0
1659
    */
1660
    int32_t  getStackLimit() const;
1661
1662
1663
  /**
1664
    * Set a callback function for use with this Matcher.
1665
    * During matching operations the function will be called periodically,
1666
    * giving the application the opportunity to terminate a long-running
1667
    * match.
1668
    *
1669
    *    @param   callback    A pointer to the user-supplied callback function.
1670
    *    @param   context     User context pointer.  The value supplied at the
1671
    *                         time the callback function is set will be saved
1672
    *                         and passed to the callback each time that it is called.
1673
    *    @param   status      A reference to a UErrorCode to receive any errors.
1674
    *  @stable ICU 4.0
1675
    */
1676
    void setMatchCallback(URegexMatchCallback     *callback,
1677
                          const void              *context,
1678
                          UErrorCode              &status);
1679
1680
1681
  /**
1682
    *  Get the callback function for this Matcher.
1683
    *
1684
    *    @param   callback    Out parameter, receives a pointer to the user-supplied 
1685
    *                         callback function.
1686
    *    @param   context     Out parameter, receives the user context pointer that
1687
    *                         was set when setMatchCallback() was called.
1688
    *    @param   status      A reference to a UErrorCode to receive any errors.
1689
    *    @stable ICU 4.0
1690
    */
1691
    void getMatchCallback(URegexMatchCallback     *&callback,
1692
                          const void              *&context,
1693
                          UErrorCode              &status);
1694
1695
1696
  /**
1697
    * Set a progress callback function for use with find operations on this Matcher.
1698
    * During find operations, the callback will be invoked after each return from a
1699
    * match attempt, giving the application the opportunity to terminate a long-running
1700
    * find operation.
1701
    *
1702
    *    @param   callback    A pointer to the user-supplied callback function.
1703
    *    @param   context     User context pointer.  The value supplied at the
1704
    *                         time the callback function is set will be saved
1705
    *                         and passed to the callback each time that it is called.
1706
    *    @param   status      A reference to a UErrorCode to receive any errors.
1707
    *    @stable ICU 4.6
1708
    */
1709
    void setFindProgressCallback(URegexFindProgressCallback      *callback,
1710
                                 const void                              *context,
1711
                                 UErrorCode                              &status);
1712
1713
1714
  /**
1715
    *  Get the find progress callback function for this Matcher.
1716
    *
1717
    *    @param   callback    Out parameter, receives a pointer to the user-supplied 
1718
    *                         callback function.
1719
    *    @param   context     Out parameter, receives the user context pointer that
1720
    *                         was set when setFindProgressCallback() was called.
1721
    *    @param   status      A reference to a UErrorCode to receive any errors.
1722
    *    @stable ICU 4.6
1723
    */
1724
    void getFindProgressCallback(URegexFindProgressCallback      *&callback,
1725
                                 const void                      *&context,
1726
                                 UErrorCode                      &status);
1727
1728
#ifndef U_HIDE_INTERNAL_API
1729
   /**
1730
     *   setTrace   Debug function, enable/disable tracing of the matching engine.
1731
     *              For internal ICU development use only.  DO NOT USE!!!!
1732
     *   @internal
1733
     */
1734
    void setTrace(UBool state);
1735
#endif  /* U_HIDE_INTERNAL_API */
1736
1737
    /**
1738
    * ICU "poor man's RTTI", returns a UClassID for this class.
1739
    *
1740
    * @stable ICU 2.2
1741
    */
1742
    static UClassID U_EXPORT2 getStaticClassID();
1743
1744
    /**
1745
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
1746
     *
1747
     * @stable ICU 2.2
1748
     */
1749
    virtual UClassID getDynamicClassID() const override;
1750
1751
private:
1752
    // Constructors and other object boilerplate are private.
1753
    // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1754
    RegexMatcher() = delete;                  // default constructor not implemented
1755
    RegexMatcher(const RegexPattern *pat);
1756
    RegexMatcher(const RegexMatcher &other) = delete;
1757
    RegexMatcher &operator =(const RegexMatcher &rhs) = delete;
1758
    void init(UErrorCode &status);                      // Common initialization
1759
    void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
1760
1761
    friend class RegexPattern;
1762
    friend class RegexCImpl;
1763
public:
1764
#ifndef U_HIDE_INTERNAL_API
1765
    /** @internal  */
1766
    void resetPreserveRegion();  // Reset matcher state, but preserve any region.
1767
#endif  /* U_HIDE_INTERNAL_API */
1768
private:
1769
1770
    //
1771
    //  MatchAt   This is the internal interface to the match engine itself.
1772
    //            Match status comes back in matcher member variables.
1773
    //
1774
    void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1775
    inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
1776
    UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
1777
    UBool                isUWordBoundary(int64_t pos, UErrorCode &status);   // perform RBBI based \b test
1778
    // Find a grapheme cluster boundary using a break iterator. For handling \X in regexes.
1779
    int64_t              followingGCBoundary(int64_t pos, UErrorCode &status);
1780
    REStackFrame        *resetStack();
1781
    inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1782
    void                 IncrementTime(UErrorCode &status);
1783
1784
    // Call user find callback function, if set. Return true if operation should be interrupted.
1785
    inline UBool         findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1786
    
1787
    int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1788
    
1789
    UBool                findUsingChunk(UErrorCode &status);
1790
    void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1791
    UBool                isChunkWordBoundary(int32_t pos);
1792
1793
    const RegexPattern  *fPattern;
1794
    RegexPattern        *fPatternOwned;    // Non-nullptr if this matcher owns the pattern, and
1795
                                           //   should delete it when through.
1796
1797
    const UnicodeString *fInput;           // The string being matched. Only used for input()
1798
    UText               *fInputText;       // The text being matched. Is never nullptr.
1799
    UText               *fAltInputText;    // A shallow copy of the text being matched.
1800
                                           //   Only created if the pattern contains backreferences.
1801
    int64_t              fInputLength;     // Full length of the input text.
1802
    int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
1803
    
1804
    int64_t              fRegionStart;     // Start of the input region, default = 0.
1805
    int64_t              fRegionLimit;     // End of input region, default to input.length.
1806
    
1807
    int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
1808
    int64_t              fAnchorLimit;     //   See useAnchoringBounds
1809
    
1810
    int64_t              fLookStart;       // Region bounds for look-ahead/behind and
1811
    int64_t              fLookLimit;       //   and other boundary tests.  See
1812
                                           //   useTransparentBounds
1813
1814
    int64_t              fActiveStart;     // Currently active bounds for matching.
1815
    int64_t              fActiveLimit;     //   Usually is the same as region, but
1816
                                           //   is changed to fLookStart/Limit when
1817
                                           //   entering look around regions.
1818
1819
    UBool                fTransparentBounds;  // True if using transparent bounds.
1820
    UBool                fAnchoringBounds; // True if using anchoring bounds.
1821
1822
    UBool                fMatch;           // True if the last attempted match was successful.
1823
    int64_t              fMatchStart;      // Position of the start of the most recent match
1824
    int64_t              fMatchEnd;        // First position after the end of the most recent match
1825
                                           //   Zero if no previous match, even when a region
1826
                                           //   is active.
1827
    int64_t              fLastMatchEnd;    // First position after the end of the previous match,
1828
                                           //   or -1 if there was no previous match.
1829
    int64_t              fAppendPosition;  // First position after the end of the previous
1830
                                           //   appendReplacement().  As described by the
1831
                                           //   JavaDoc for Java Matcher, where it is called 
1832
                                           //   "append position"
1833
    UBool                fHitEnd;          // True if the last match touched the end of input.
1834
    UBool                fRequireEnd;      // True if the last match required end-of-input
1835
                                           //    (matched $ or Z)
1836
1837
    UVector64           *fStack;
1838
    REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
1839
                                           //   which will contain the capture group results.
1840
                                           //   NOT valid while match engine is running.
1841
1842
    int64_t             *fData;            // Data area for use by the compiled pattern.
1843
    int64_t             fSmallData[8];     //   Use this for data if it's enough.
1844
1845
    int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
1846
                                           //   match engine run.  Zero for unlimited.
1847
    
1848
    int32_t             fTime;             // Match time, accumulates while matching.
1849
    int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
1850
                                           //   Kept separately from fTime to keep as much
1851
                                           //   code as possible out of the inline
1852
                                           //   StateSave function.
1853
1854
    int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
1855
                                           //   stack, in bytes.  Zero for unlimited.
1856
1857
    URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback funct.
1858
                                           //   nullptr if there is no callback.
1859
    const void         *fCallbackContext;  // User Context ptr for callback function.
1860
1861
    URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to match progress callback funct.
1862
                                                           //   nullptr if there is no callback.
1863
    const void         *fFindProgressCallbackContext;      // User Context ptr for callback function.
1864
1865
1866
    UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1867
1868
    UBool               fTraceDebug;       // Set true for debug tracing of match engine.
1869
1870
    UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
1871
                                           //   reported, or that permanently disables this matcher.
1872
1873
    BreakIterator       *fWordBreakItr;
1874
    BreakIterator       *fGCBreakItr;
1875
};
1876
1877
U_NAMESPACE_END
1878
#endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
1879
1880
#endif /* U_SHOW_CPLUSPLUS_API */
1881
1882
#endif