Coverage Report

Created: 2025-06-13 06:29

/src/icu/source/i18n/unicode/regex.h
Line
Count
Source (jump to first uncovered line)
1
// © 2016 and later: Unicode, Inc. and others.
2
// License & terms of use: http://www.unicode.org/copyright.html
3
/*
4
**********************************************************************
5
*   Copyright (C) 2002-2016, International Business Machines
6
*   Corporation and others.  All Rights Reserved.
7
**********************************************************************
8
*   file name:  regex.h
9
*   encoding:   UTF-8
10
*   indentation:4
11
*
12
*   created on: 2002oct22
13
*   created by: Andy Heninger
14
*
15
*   ICU Regular Expressions, API for C++
16
*/
17
18
#ifndef REGEX_H
19
#define REGEX_H
20
21
//#define REGEX_DEBUG
22
23
/**
24
 * \file
25
 * \brief  C++ API:  Regular Expressions
26
 *
27
 * The ICU API for processing regular expressions consists of two classes,
28
 *  `RegexPattern` and `RegexMatcher`.
29
 *  `RegexPattern` objects represent a pre-processed, or compiled
30
 *  regular expression.  They are created from a regular expression pattern string,
31
 *  and can be used to create `RegexMatcher` objects for the pattern.
32
 *
33
 * Class `RegexMatcher` bundles together a regular expression
34
 *  pattern and a target string to which the search pattern will be applied.
35
 *  `RegexMatcher` includes API for doing plain find or search
36
 *  operations, for search and replace operations, and for obtaining detailed
37
 *  information about bounds of a match.
38
 *
39
 * Note that by constructing `RegexMatcher` objects directly from regular
40
 * expression pattern strings application code can be simplified and the explicit
41
 * need for `RegexPattern` objects can usually be eliminated.
42
 *
43
 */
44
45
#include "unicode/utypes.h"
46
47
#if U_SHOW_CPLUSPLUS_API
48
49
#if !UCONFIG_NO_REGULAR_EXPRESSIONS
50
51
#include "unicode/uobject.h"
52
#include "unicode/unistr.h"
53
#include "unicode/utext.h"
54
#include "unicode/parseerr.h"
55
56
#include "unicode/uregex.h"
57
58
// Forward Declarations
59
60
struct UHashtable;
61
62
U_NAMESPACE_BEGIN
63
64
struct Regex8BitSet;
65
class  RegexCImpl;
66
class  RegexMatcher;
67
class  RegexPattern;
68
struct REStackFrame;
69
class  RuleBasedBreakIterator;
70
class  UnicodeSet;
71
class  UVector;
72
class  UVector32;
73
class  UVector64;
74
75
76
/**
77
  * Class `RegexPattern` represents a compiled regular expression.  It includes
78
  * factory methods for creating a RegexPattern object from the source (string) form
79
  * of a regular expression, methods for creating RegexMatchers that allow the pattern
80
  * to be applied to input text, and a few convenience methods for simple common
81
  * uses of regular expressions.
82
  *
83
  * Class RegexPattern is not intended to be subclassed.
84
  *
85
  * @stable ICU 2.4
86
  */
87
class U_I18N_API RegexPattern U_FINAL : public UObject {
88
public:
89
90
    /**
91
     * default constructor.  Create a RegexPattern object that refers to no actual
92
     *   pattern.  Not normally needed; RegexPattern objects are usually
93
     *   created using the factory method `compile()`.
94
     *
95
     * @stable ICU 2.4
96
     */
97
    RegexPattern();
98
99
    /**
100
     * Copy Constructor.  Create a new RegexPattern object that is equivalent
101
     *                    to the source object.
102
     * @param source the pattern object to be copied.
103
     * @stable ICU 2.4
104
     */
105
    RegexPattern(const RegexPattern &source);
106
107
    /**
108
     * Destructor.  Note that a RegexPattern object must persist so long as any
109
     *  RegexMatcher objects that were created from the RegexPattern are active.
110
     * @stable ICU 2.4
111
     */
112
    virtual ~RegexPattern();
113
114
    /**
115
     * Comparison operator.  Two RegexPattern objects are considered equal if they
116
     * were constructed from identical source patterns using the same #URegexpFlag
117
     * settings.
118
     * @param that a RegexPattern object to compare with "this".
119
     * @return TRUE if the objects are equivalent.
120
     * @stable ICU 2.4
121
     */
122
    UBool           operator==(const RegexPattern& that) const;
123
124
    /**
125
     * Comparison operator.  Two RegexPattern objects are considered equal if they
126
     * were constructed from identical source patterns using the same #URegexpFlag
127
     * settings.
128
     * @param that a RegexPattern object to compare with "this".
129
     * @return TRUE if the objects are different.
130
     * @stable ICU 2.4
131
     */
132
0
    inline UBool    operator!=(const RegexPattern& that) const {
        // Inequality is simply the logical negation of operator==.
        return !this->operator==(that);
    }
133
134
    /**
135
     * Assignment operator.  After assignment, this RegexPattern will behave identically
136
     *     to the source object.
137
     * @stable ICU 2.4
138
     */
139
    RegexPattern  &operator =(const RegexPattern &source);
140
141
    /**
142
     * Create an exact copy of this RegexPattern object.  Since RegexPattern is not
143
     * intended to be subclassed, <code>clone()</code> and the copy construction are
144
     * equivalent operations.
145
     * @return the copy of this RegexPattern
146
     * @stable ICU 2.4
147
     */
148
    virtual RegexPattern  *clone() const;
149
150
151
   /**
152
    * Compiles the regular expression in string form into a RegexPattern
153
    * object.  These compile methods, rather than the constructors, are the usual
154
    * way that RegexPattern objects are created.
155
    *
156
    * Note that RegexPattern objects must not be deleted while RegexMatcher
157
    * objects created from the pattern are active.  RegexMatchers keep a pointer
158
    * back to their pattern, so premature deletion of the pattern is a
159
    * catastrophic error.
160
    *
161
    * All #URegexpFlag pattern match mode flags are set to their default values.
162
    *
163
    * Note that it is often more convenient to construct a RegexMatcher directly
164
    *    from a pattern string rather than separately compiling the pattern and
165
    *    then creating a RegexMatcher object from the pattern.
166
    *
167
    * @param regex The regular expression to be compiled.
168
    * @param pe    Receives the position (line and column numbers) of any error
169
    *              within the regular expression.
170
    * @param status A reference to a UErrorCode to receive any errors.
171
    * @return      A regexPattern object for the compiled pattern.
172
    *
173
    * @stable ICU 2.4
174
    */
175
    static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
176
        UParseError          &pe,
177
        UErrorCode           &status);
178
179
   /**
180
    * Compiles the regular expression in string form into a RegexPattern
181
    * object.  These compile methods, rather than the constructors, are the usual
182
    * way that RegexPattern objects are created.
183
    *
184
    * Note that RegexPattern objects must not be deleted while RegexMatcher
185
    * objects created from the pattern are active.  RegexMatchers keep a pointer
186
    * back to their pattern, so premature deletion of the pattern is a
187
    * catastrophic error.
188
    *
189
    * All #URegexpFlag pattern match mode flags are set to their default values.
190
    *
191
    * Note that it is often more convenient to construct a RegexMatcher directly
192
    *    from a pattern string rather than separately compiling the pattern and
193
    *    then creating a RegexMatcher object from the pattern.
194
    *
195
    * @param regex The regular expression to be compiled. Note, the text referred
196
    *              to by this UText must not be deleted during the lifetime of the
197
    *              RegexPattern object or any RegexMatcher object created from it.
198
    * @param pe    Receives the position (line and column numbers) of any error
198
    *              within the regular expression.
200
    * @param status A reference to a UErrorCode to receive any errors.
201
    * @return      A regexPattern object for the compiled pattern.
202
    *
203
    * @stable ICU 4.6
204
    */
205
    static RegexPattern * U_EXPORT2 compile( UText *regex,
206
        UParseError          &pe,
207
        UErrorCode           &status);
208
209
   /**
210
    * Compiles the regular expression in string form into a RegexPattern
211
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
212
    * rather than the constructors, are the usual way that RegexPattern objects
213
    * are created.
214
    *
215
    * Note that RegexPattern objects must not be deleted while RegexMatcher
216
    * objects created from the pattern are active.  RegexMatchers keep a pointer
217
    * back to their pattern, so premature deletion of the pattern is a
218
    * catastrophic error.
219
    *
220
    * Note that it is often more convenient to construct a RegexMatcher directly
221
    *    from a pattern string rather than separately compiling the pattern and
222
    *    then creating a RegexMatcher object from the pattern.
223
    *
224
    * @param regex The regular expression to be compiled.
225
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
226
    * @param pe    Receives the position (line and column numbers) of any error
227
    *              within the regular expression.
228
    * @param status   A reference to a UErrorCode to receive any errors.
229
    * @return      A regexPattern object for the compiled pattern.
230
    *
231
    * @stable ICU 2.4
232
    */
233
    static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
234
        uint32_t             flags,
235
        UParseError          &pe,
236
        UErrorCode           &status);
237
238
   /**
239
    * Compiles the regular expression in string form into a RegexPattern
240
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
241
    * rather than the constructors, are the usual way that RegexPattern objects
242
    * are created.
243
    *
244
    * Note that RegexPattern objects must not be deleted while RegexMatcher
245
    * objects created from the pattern are active.  RegexMatchers keep a pointer
246
    * back to their pattern, so premature deletion of the pattern is a
247
    * catastrophic error.
248
    *
249
    * Note that it is often more convenient to construct a RegexMatcher directly
250
    *    from a pattern string rather than separately compiling the pattern and
251
    *    then creating a RegexMatcher object from the pattern.
252
    *
253
    * @param regex The regular expression to be compiled. Note, the text referred
254
    *              to by this UText must not be deleted during the lifetime of the
255
    *              RegexPattern object or any RegexMatcher object created from it.
256
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
257
    * @param pe    Receives the position (line and column numbers) of any error
258
    *              within the regular expression.
259
    * @param status   A reference to a UErrorCode to receive any errors.
260
    * @return      A regexPattern object for the compiled pattern.
261
    *
262
    * @stable ICU 4.6
263
    */
264
    static RegexPattern * U_EXPORT2 compile( UText *regex,
265
        uint32_t             flags,
266
        UParseError          &pe,
267
        UErrorCode           &status);
268
269
   /**
270
    * Compiles the regular expression in string form into a RegexPattern
271
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
272
    * rather than the constructors, are the usual way that RegexPattern objects
273
    * are created.
274
    *
275
    * Note that RegexPattern objects must not be deleted while RegexMatcher
276
    * objects created from the pattern are active.  RegexMatchers keep a pointer
277
    * back to their pattern, so premature deletion of the pattern is a
278
    * catastrophic error.
279
    *
280
    * Note that it is often more convenient to construct a RegexMatcher directly
281
    *    from a pattern string rather than separately compiling the pattern and
282
    *    then creating a RegexMatcher object from the pattern.
283
    *
284
    * @param regex The regular expression to be compiled.
285
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
286
    * @param status   A reference to a UErrorCode to receive any errors.
287
    * @return      A regexPattern object for the compiled pattern.
288
    *
289
    * @stable ICU 2.6
290
    */
291
    static RegexPattern * U_EXPORT2 compile( const UnicodeString &regex,
292
        uint32_t             flags,
293
        UErrorCode           &status);
294
295
   /**
296
    * Compiles the regular expression in string form into a RegexPattern
297
    * object using the specified #URegexpFlag match mode flags.  These compile methods,
298
    * rather than the constructors, are the usual way that RegexPattern objects
299
    * are created.
300
    *
301
    * Note that RegexPattern objects must not be deleted while RegexMatcher
302
    * objects created from the pattern are active.  RegexMatchers keep a pointer
303
    * back to their pattern, so premature deletion of the pattern is a
304
    * catastrophic error.
305
    *
306
    * Note that it is often more convenient to construct a RegexMatcher directly
307
    *    from a pattern string rather than separately compiling the pattern and
308
    *    then creating a RegexMatcher object from the pattern.
309
    *
310
    * @param regex The regular expression to be compiled. Note, the text referred
311
    *              to by this UText must not be deleted during the lifetime of the
312
    *              RegexPattern object or any RegexMatcher object created from it.
313
    * @param flags The #URegexpFlag match mode flags to be used, e.g. #UREGEX_CASE_INSENSITIVE.
314
    * @param status   A reference to a UErrorCode to receive any errors.
315
    * @return      A regexPattern object for the compiled pattern.
316
    *
317
    * @stable ICU 4.6
318
    */
319
    static RegexPattern * U_EXPORT2 compile( UText *regex,
320
        uint32_t             flags,
321
        UErrorCode           &status);
322
323
   /**
324
    * Get the #URegexpFlag match mode flags that were used when compiling this pattern.
325
    * @return  the #URegexpFlag match mode flags
326
    * @stable ICU 2.4
327
    */
328
    virtual uint32_t flags() const;
329
330
   /**
331
    * Creates a RegexMatcher that will match the given input against this pattern.  The
332
    * RegexMatcher can then be used to perform match, find or replace operations
333
    * on the input.  Note that a RegexPattern object must not be deleted while
334
    * RegexMatchers created from it still exist and might possibly be used again.
335
    *
336
    * The matcher will retain a reference to the supplied input string, and all regexp
337
    * pattern matching operations happen directly on this original string.  It is
338
    * critical that the string not be altered or deleted before use by the regular
339
    * expression operations is complete.
340
    *
341
    * @param input    The input string to which the regular expression will be applied.
342
    * @param status   A reference to a UErrorCode to receive any errors.
343
    * @return         A RegexMatcher object for this pattern and input.
344
    *
345
    * @stable ICU 2.4
346
    */
347
    virtual RegexMatcher *matcher(const UnicodeString &input,
348
        UErrorCode          &status) const;
349
        
350
private:
351
    /**
352
     * Cause a compilation error if an application accidentally attempts to
353
     *   create a matcher with a (char16_t *) string as input rather than
354
     *   a UnicodeString.  Avoids a dangling reference to a temporary string.
355
     *
356
     * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
357
     * using one of the aliasing constructors, such as
358
     * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
359
     * or in a UText, using
360
     * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
361
     *
362
     */
363
    RegexMatcher *matcher(const char16_t *input,
364
        UErrorCode          &status) const;
365
public:
366
367
368
   /**
369
    * Creates a RegexMatcher that will match against this pattern.  The
370
    * RegexMatcher can be used to perform match, find or replace operations.
371
    * Note that a RegexPattern object must not be deleted while
372
    * RegexMatchers created from it still exist and might possibly be used again.
373
    *
374
    * @param status   A reference to a UErrorCode to receive any errors.
375
    * @return      A RegexMatcher object for this pattern.
376
    *
377
    * @stable ICU 2.6
378
    */
379
    virtual RegexMatcher *matcher(UErrorCode  &status) const;
380
381
382
   /**
383
    * Test whether a string matches a regular expression.  This convenience function
384
    * both compiles the regular expression and applies it in a single operation.
385
    * Note that if the same pattern needs to be applied repeatedly, this method will be
386
    * less efficient than creating and reusing a RegexMatcher object.
387
    *
388
    * @param regex The regular expression
389
    * @param input The string data to be matched
390
    * @param pe Receives the position of any syntax errors within the regular expression
391
    * @param status A reference to a UErrorCode to receive any errors.
392
    * @return True if the regular expression exactly matches the full input string.
393
    *
394
    * @stable ICU 2.4
395
    */
396
    static UBool U_EXPORT2 matches(const UnicodeString   &regex,
397
        const UnicodeString   &input,
398
              UParseError     &pe,
399
              UErrorCode      &status);
400
401
   /**
402
    * Test whether a string matches a regular expression.  This convenience function
403
    * both compiles the regular expression and applies it in a single operation.
404
    * Note that if the same pattern needs to be applied repeatedly, this method will be
405
    * less efficient than creating and reusing a RegexMatcher object.
406
    *
407
    * @param regex The regular expression
408
    * @param input The string data to be matched
409
    * @param pe Receives the position of any syntax errors within the regular expression
410
    * @param status A reference to a UErrorCode to receive any errors.
411
    * @return True if the regular expression exactly matches the full input string.
412
    *
413
    * @stable ICU 4.6
414
    */
415
    static UBool U_EXPORT2 matches(UText *regex,
416
        UText           *input,
417
        UParseError     &pe,
418
        UErrorCode      &status);
419
420
   /**
421
    * Returns the regular expression from which this pattern was compiled. This method will work
422
    * even if the pattern was compiled from a UText.
423
    *
424
    * Note: If the pattern was originally compiled from a UText, and that UText was modified,
425
    * the returned string may no longer reflect the RegexPattern object.
426
    * @stable ICU 2.4
427
    */
428
    virtual UnicodeString pattern() const;
429
    
430
    
431
   /**
432
    * Returns the regular expression from which this pattern was compiled. This method will work
433
    * even if the pattern was compiled from a UnicodeString.
434
    *
435
    * Note: This is the original input, not a clone. If the pattern was originally compiled from a
436
    * UText, and that UText was modified, the returned UText may no longer reflect the RegexPattern
437
    * object.
438
    *
439
    * @stable ICU 4.6
440
    */
441
    virtual UText *patternText(UErrorCode      &status) const;
442
443
444
    /**
445
     * Get the group number corresponding to a named capture group.
446
     * The returned number can be used with any function that accesses
447
     * capture groups by number.
448
     *
449
     * The function returns an error status if the specified name does not
450
     * appear in the pattern.
451
     *
452
     * @param  groupName   The capture group name.
453
     * @param  status      A UErrorCode to receive any errors.
454
     *
455
     * @stable ICU 55
456
     */
457
    virtual int32_t groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const;
458
459
460
    /**
461
     * Get the group number corresponding to a named capture group.
462
     * The returned number can be used with any function that accesses
463
     * capture groups by number.
464
     *
465
     * The function returns an error status if the specified name does not
466
     * appear in the pattern.
467
     *
468
     * @param  groupName   The capture group name,
469
     *                     platform invariant characters only.
470
     * @param  nameLength  The length of the name, or -1 if the name is
471
     *                     nul-terminated.
472
     * @param  status      A UErrorCode to receive any errors.
473
     *
474
     * @stable ICU 55
475
     */
476
    virtual int32_t groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const;
477
478
479
    /**
480
     * Split a string into fields.  Somewhat like split() from Perl or Java.
481
     * Pattern matches identify delimiters that separate the input
482
     * into fields.  The input data between the delimiters becomes the
483
     * fields themselves.
484
     *
485
     * If the delimiter pattern includes capture groups, the captured text will
486
     * also appear in the destination array of output strings, interspersed
487
     * with the fields.  This is similar to Perl, but differs from Java, 
488
     * which ignores the presence of capture groups in the pattern.
489
     * 
490
     * Trailing empty fields will always be returned, assuming sufficient
491
     * destination capacity.  This differs from the default behavior for Java
492
     * and Perl where trailing empty fields are not returned.
493
     *
494
     * The number of strings produced by the split operation is returned.
495
     * This count includes the strings from capture groups in the delimiter pattern.
496
     * This behavior differs from Java, which ignores capture groups.
497
     *
498
     * For the best performance on split() operations,
499
     * <code>RegexMatcher::split</code> is preferable to this function
500
     *
501
     * @param input   The string to be split into fields.  The field delimiters
502
     *                match the pattern (in the "this" object)
503
     * @param dest    An array of UnicodeStrings to receive the results of the split.
504
     *                This is an array of actual UnicodeString objects, not an
505
     *                array of pointers to strings.  Local (stack based) arrays can
506
     *                work well here.
507
     * @param destCapacity  The number of elements in the destination array.
508
     *                If the number of fields found is less than destCapacity, the
509
     *                extra strings in the destination array are not altered.
510
     *                If the number of destination strings is less than the number
511
     *                of fields, the trailing part of the input string, including any
512
     *                field delimiters, is placed in the last destination string.
513
     * @param status  A reference to a UErrorCode to receive any errors.
514
     * @return        The number of fields into which the input string was split.
515
     * @stable ICU 2.4
516
     */
517
    virtual int32_t  split(const UnicodeString &input,
518
        UnicodeString    dest[],
519
        int32_t          destCapacity,
520
        UErrorCode       &status) const;
521
522
523
    /**
524
     * Split a string into fields.  Somewhat like %split() from Perl or Java.
525
     * Pattern matches identify delimiters that separate the input
526
     * into fields.  The input data between the delimiters becomes the
527
     * fields themselves.
528
     *
529
     * If the delimiter pattern includes capture groups, the captured text will
530
     * also appear in the destination array of output strings, interspersed
531
     * with the fields.  This is similar to Perl, but differs from Java, 
532
     * which ignores the presence of capture groups in the pattern.
533
     * 
534
     * Trailing empty fields will always be returned, assuming sufficient
535
     * destination capacity.  This differs from the default behavior for Java
536
     * and Perl where trailing empty fields are not returned.
537
     *
538
     * The number of strings produced by the split operation is returned.
539
     * This count includes the strings from capture groups in the delimiter pattern.
540
     * This behavior differs from Java, which ignores capture groups.
541
     *
542
     *  For the best performance on split() operations,
543
     *  `RegexMatcher::split()` is preferable to this function
544
     *
545
     * @param input   The string to be split into fields.  The field delimiters
546
     *                match the pattern (in the "this" object)
547
     * @param dest    An array of mutable UText structs to receive the results of the split.
548
     *                If a field is NULL, a new UText is allocated to contain the results for
549
     *                that field. This new UText is not guaranteed to be mutable.
550
     * @param destCapacity  The number of elements in the destination array.
551
     *                If the number of fields found is less than destCapacity, the
552
     *                extra strings in the destination array are not altered.
553
     *                If the number of destination strings is less than the number
554
     *                of fields, the trailing part of the input string, including any
555
     *                field delimiters, is placed in the last destination string.
556
     * @param status  A reference to a UErrorCode to receive any errors.
557
     * @return        The number of destination strings used.  
558
     *
559
     * @stable ICU 4.6
560
     */
561
    virtual int32_t  split(UText *input,
562
        UText            *dest[],
563
        int32_t          destCapacity,
564
        UErrorCode       &status) const;
565
566
567
    /**
568
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
569
     *
570
     * @stable ICU 2.4
571
     */
572
    virtual UClassID getDynamicClassID() const;
573
574
    /**
575
     * ICU "poor man's RTTI", returns a UClassID for this class.
576
     *
577
     * @stable ICU 2.4
578
     */
579
    static UClassID U_EXPORT2 getStaticClassID();
580
581
private:
582
    //
583
    //  Implementation Data
584
    //
585
    UText          *fPattern;      // The original pattern string.
586
    UnicodeString  *fPatternString; // The original pattern UnicodeString if relevant
587
    uint32_t        fFlags;        // The flags used when compiling the pattern.
588
                                   //
589
    UVector64       *fCompiledPat; // The compiled pattern p-code.
590
    UnicodeString   fLiteralText;  // Any literal string data from the pattern,
591
                                   //   after un-escaping, for use during the match.
592
593
    UVector         *fSets;        // Any UnicodeSets referenced from the pattern.
594
    Regex8BitSet    *fSets8;       //      (and fast sets for latin-1 range.)
595
596
597
    UErrorCode      fDeferredStatus; // status if some prior error has left this
598
                                   //  RegexPattern in an unusable state.
599
600
    int32_t         fMinMatchLen;  // Minimum Match Length.  All matches will have length
601
                                   //   >= this value.  For some patterns, this calculated
602
                                   //   value may be less than the true shortest
603
                                   //   possible match.
604
    
605
    int32_t         fFrameSize;    // Size of a state stack frame in the
606
                                   //   execution engine.
607
608
    int32_t         fDataSize;     // The size of the data needed by the pattern that
609
                                   //   does not go on the state stack, but has just
610
                                   //   a single copy per matcher.
611
612
    UVector32       *fGroupMap;    // Map from capture group number to position of
613
                                   //   the group's variables in the matcher stack frame.
614
615
    UnicodeSet     **fStaticSets;  // Ptr to static (shared) sets for predefined
616
                                   //   regex character classes, e.g. Word.
617
618
    Regex8BitSet   *fStaticSets8;  // Ptr to the static (shared) latin-1 only
619
                                   //  sets for predefined regex classes.
620
621
    int32_t         fStartType;    // Info on how a match must start.
622
    int32_t         fInitialStringIdx;     //
623
    int32_t         fInitialStringLen;
624
    UnicodeSet     *fInitialChars;
625
    UChar32         fInitialChar;
626
    Regex8BitSet   *fInitialChars8;
627
    UBool           fNeedsAltInput;
628
629
    UHashtable     *fNamedCaptureMap;  // Map from capture group names to numbers.
630
631
    friend class RegexCompile;
632
    friend class RegexMatcher;
633
    friend class RegexCImpl;
634
635
    //
636
    //  Implementation Methods
637
    //
638
    void        init();                 // Common initialization, for use by constructors.
639
    bool        initNamedCaptureMap();  // Lazy init for fNamedCaptureMap.
640
    void        zap();                  // Common cleanup
641
642
    void        dumpOp(int32_t index) const;
643
644
  public:
645
#ifndef U_HIDE_INTERNAL_API
646
    /**
647
      * Dump a compiled pattern. Internal debug function.
648
      * @internal
649
      */
650
    void        dumpPattern() const;
651
#endif  /* U_HIDE_INTERNAL_API */
652
};
653
654
655
656
/**
657
 *  class RegexMatcher bundles together a regular expression pattern and
658
 *  input text to which the expression can be applied.  It includes methods
659
 *  for testing for matches, and for find and replace operations.
660
 *
661
 * <p>Class RegexMatcher is not intended to be subclassed.</p>
662
 *
663
 * @stable ICU 2.4
664
 */
665
class U_I18N_API RegexMatcher U_FINAL : public UObject {
666
public:
667
668
    /**
669
      * Construct a RegexMatcher for a regular expression.
670
      * This is a convenience method that avoids the need to explicitly create
671
      * a RegexPattern object.  Note that if several RegexMatchers need to be
672
      * created for the same expression, it will be more efficient to
673
      * separately create and cache a RegexPattern object, and use
674
      * its matcher() method to create the RegexMatcher objects.
675
      *
676
      *  @param regexp The Regular Expression to be compiled.
677
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
678
      *  @param status Any errors are reported by setting this UErrorCode variable.
679
      *  @stable ICU 2.6
680
      */
681
    RegexMatcher(const UnicodeString &regexp, uint32_t flags, UErrorCode &status);
682
683
    /**
684
      * Construct a RegexMatcher for a regular expression.
685
      * This is a convenience method that avoids the need to explicitly create
686
      * a RegexPattern object.  Note that if several RegexMatchers need to be
687
      * created for the same expression, it will be more efficient to
688
      * separately create and cache a RegexPattern object, and use
689
      * its matcher() method to create the RegexMatcher objects.
690
      *
691
      *  @param regexp The regular expression to be compiled.
692
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
693
      *  @param status Any errors are reported by setting this UErrorCode variable.
694
      *
695
      *  @stable ICU 4.6
696
      */
697
    RegexMatcher(UText *regexp, uint32_t flags, UErrorCode &status);
698
699
    /**
700
      * Construct a RegexMatcher for a regular expression.
701
      * This is a convenience method that avoids the need to explicitly create
702
      * a RegexPattern object.  Note that if several RegexMatchers need to be
703
      * created for the same expression, it will be more efficient to
704
      * separately create and cache a RegexPattern object, and use
705
      * its matcher() method to create the RegexMatcher objects.
706
      *
707
      * The matcher will retain a reference to the supplied input string, and all regexp
708
      * pattern matching operations happen directly on the original string.  It is
709
      * critical that the string not be altered or deleted before use by the regular
710
      * expression operations is complete.
711
      *
712
      *  @param regexp The Regular Expression to be compiled.
713
      *  @param input  The string to match.  The matcher retains a reference to the
714
      *                caller's string; no copy is made.
715
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
716
      *  @param status Any errors are reported by setting this UErrorCode variable.
717
      *  @stable ICU 2.6
718
      */
719
    RegexMatcher(const UnicodeString &regexp, const UnicodeString &input,
720
        uint32_t flags, UErrorCode &status);
721
722
    /**
723
      * Construct a RegexMatcher for a regular expression.
724
      * This is a convenience method that avoids the need to explicitly create
725
      * a RegexPattern object.  Note that if several RegexMatchers need to be
726
      * created for the same expression, it will be more efficient to
727
      * separately create and cache a RegexPattern object, and use
728
      * its matcher() method to create the RegexMatcher objects.
729
      *
730
      * The matcher will make a shallow clone of the supplied input text, and all regexp
731
      * pattern matching operations happen on this clone.  While read-only operations on
732
      * the supplied text are permitted, it is critical that the underlying string not be
733
      * altered or deleted before use by the regular expression operations is complete.
734
      *
735
      *  @param regexp The Regular Expression to be compiled.
736
      *  @param input  The string to match.  The matcher retains a shallow clone of the text.
737
      *  @param flags  #URegexpFlag options, such as #UREGEX_CASE_INSENSITIVE.
738
      *  @param status Any errors are reported by setting this UErrorCode variable.
739
      *
740
      *  @stable ICU 4.6
741
      */
742
    RegexMatcher(UText *regexp, UText *input,
743
        uint32_t flags, UErrorCode &status);
744
745
private:
746
    /**
747
     * Cause a compilation error if an application accidentally attempts to
748
     *   create a matcher with a (char16_t *) string as input rather than
749
     *   a UnicodeString.    Avoids a dangling reference to a temporary string.
750
     *
751
     * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
752
     * using one of the aliasing constructors, such as
753
     * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
754
     * or in a UText, using
755
     * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
756
     */
757
    RegexMatcher(const UnicodeString &regexp, const char16_t *input,
758
        uint32_t flags, UErrorCode &status);
759
public:
760
761
762
   /**
763
    *   Destructor.
764
    *
765
    *  @stable ICU 2.4
766
    */
767
    virtual ~RegexMatcher();
768
769
770
   /**
771
    *   Attempts to match the entire input region against the pattern.
772
    *    @param   status     A reference to a UErrorCode to receive any errors.
773
    *    @return TRUE if there is a match
774
    *    @stable ICU 2.4
775
    */
776
    virtual UBool matches(UErrorCode &status);
777
778
779
   /**
780
    *   Resets the matcher, then attempts to match the input beginning 
781
    *   at the specified startIndex, and extending to the end of the input.
782
    *   The input region is reset to include the entire input string.
783
    *   A successful match must extend to the end of the input.
784
    *    @param   startIndex The input string (native) index at which to begin matching.
785
    *    @param   status     A reference to a UErrorCode to receive any errors.
786
    *    @return TRUE if there is a match
787
    *    @stable ICU 2.8
788
    */
789
    virtual UBool matches(int64_t startIndex, UErrorCode &status);
790
791
792
   /**
793
    *   Attempts to match the input string, starting from the beginning of the region,
794
    *   against the pattern.  Like the matches() method, this function 
795
    *   always starts at the beginning of the input region;
796
    *   unlike that function, it does not require that the entire region be matched.
797
    *
798
    *   If the match succeeds then more information can be obtained via the start(),
799
    *   end(), and group() functions.
800
    *
801
    *    @param   status     A reference to a UErrorCode to receive any errors.
802
    *    @return  TRUE if there is a match at the start of the input string.
803
    *    @stable ICU 2.4
804
    */
805
    virtual UBool lookingAt(UErrorCode &status);
806
807
808
  /**
809
    *   Attempts to match the input string, starting from the specified index, against the pattern.
810
    *   The match may be of any length, and is not required to extend to the end
811
    *   of the input string.  Contrast with matches().
812
    *
813
    *   If the match succeeds then more information can be obtained via the start(),
814
    *   end(), and group() functions.
815
    *
816
    *    @param   startIndex The input string (native) index at which to begin matching.
817
    *    @param   status     A reference to a UErrorCode to receive any errors.
818
    *    @return  TRUE if there is a match.
819
    *    @stable ICU 2.8
820
    */
821
    virtual UBool lookingAt(int64_t startIndex, UErrorCode &status);
822
823
824
   /**
825
    *  Find the next pattern match in the input string.
826
    *  The find begins searching the input at the location following the end of
827
    *  the previous match, or at the start of the string if there is no previous match.
828
    *  If a match is found, `start()`, `end()` and `group()`
829
    *  will provide more information regarding the match.
830
    *  Note that if the input string is changed by the application,
831
    *     use find(startPos, status) instead of find(), because the saved starting
832
    *     position may not be valid with the altered input string.
833
    *  @return  TRUE if a match is found.
834
    *  @stable ICU 2.4
835
    */
836
    virtual UBool find();
837
838
839
   /**
840
    *  Find the next pattern match in the input string.
841
    *  The find begins searching the input at the location following the end of
842
    *  the previous match, or at the start of the string if there is no previous match.
843
    *  If a match is found, `start()`, `end()` and `group()`
844
    *  will provide more information regarding the match.
845
    *
846
    *  Note that if the input string is changed by the application,
847
    *  use find(startPos, status) instead of find(), because the saved starting
848
    *  position may not be valid with the altered input string.
849
    *  @param   status  A reference to a UErrorCode to receive any errors.
850
    *  @return  TRUE if a match is found.
851
    * @stable ICU 55
852
    */
853
    virtual UBool find(UErrorCode &status);
854
855
   /**
856
    *   Resets this RegexMatcher and then attempts to find the next substring of the
857
    *   input string that matches the pattern, starting at the specified index.
858
    *
859
    *   @param   start     The (native) index in the input string to begin the search.
860
    *   @param   status    A reference to a UErrorCode to receive any errors.
861
    *   @return  TRUE if a match is found.
862
    *   @stable ICU 2.4
863
    */
864
    virtual UBool find(int64_t start, UErrorCode &status);
865
866
867
   /**
868
    *   Returns a string containing the text matched by the previous match.
869
    *   If the pattern can match an empty string, an empty string may be returned.
870
    *   @param   status      A reference to a UErrorCode to receive any errors.
871
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
872
    *                        has been attempted or the last match failed.
873
    *   @return  a string containing the matched input text.
874
    *   @stable ICU 2.4
875
    */
876
    virtual UnicodeString group(UErrorCode &status) const;
877
878
879
   /**
880
    *    Returns a string containing the text captured by the given group
881
    *    during the previous match operation.  Group(0) is the entire match.
882
    *
883
    *    A zero length string is returned both for capture groups that did not
884
    *    participate in the match and for actual zero length matches.
885
    *    To distinguish between these two cases use the function start(),
886
    *    which returns -1 for non-participating groups.
887
    *
888
    *    @param groupNum the capture group number
889
    *    @param   status     A reference to a UErrorCode to receive any errors.
890
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
891
    *                        has been attempted or the last match failed and
892
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
893
    *    @return the captured text
894
    *    @stable ICU 2.4
895
    */
896
    virtual UnicodeString group(int32_t groupNum, UErrorCode &status) const;
897
898
   /**
899
    *   Returns the number of capturing groups in this matcher's pattern.
900
    *   @return the number of capture groups
901
    *   @stable ICU 2.4
902
    */
903
    virtual int32_t groupCount() const;
904
905
906
   /**
907
    *   Returns a shallow clone of the entire live input string with the UText current native index
908
    *   set to the beginning of the requested group.
909
    *
910
    *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText
911
    *   @param   group_len   A reference to receive the length of the desired capture group
912
    *   @param   status      A reference to a UErrorCode to receive any errors.
913
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
914
    *                        has been attempted or the last match failed and
915
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
916
    *   @return dest if non-NULL, a shallow copy of the input text otherwise
917
    *
918
    *   @stable ICU 4.6
919
    */
920
    virtual UText *group(UText *dest, int64_t &group_len, UErrorCode &status) const; 
921
922
   /**
923
    *   Returns a shallow clone of the entire live input string with the UText current native index
924
    *   set to the beginning of the requested group.
925
    *
926
    *   A group length of zero is returned both for capture groups that did not
927
    *   participate in the match and for actual zero length matches.
928
    *   To distinguish between these two cases use the function start(),
929
    *   which returns -1 for non-participating groups.
930
    *
931
    *   @param   groupNum   The capture group number.
932
    *   @param   dest        The UText into which the input should be cloned, or NULL to create a new UText.
933
    *   @param   group_len   A reference to receive the length of the desired capture group
934
    *   @param   status      A reference to a UErrorCode to receive any errors.
935
    *                        Possible errors are  U_REGEX_INVALID_STATE if no match
936
    *                        has been attempted or the last match failed and
937
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
938
    *   @return dest if non-NULL, a shallow copy of the input text otherwise
939
    *
940
    *   @stable ICU 4.6
941
    */
942
    virtual UText *group(int32_t groupNum, UText *dest, int64_t &group_len, UErrorCode &status) const;
943
944
   /**
945
    *   Returns the index in the input string of the start of the text matched
946
    *   during the previous match operation.
947
    *    @param   status      a reference to a UErrorCode to receive any errors.
948
    *    @return              The (native) position in the input string of the start of the last match.
949
    *    @stable ICU 2.4
950
    */
951
    virtual int32_t start(UErrorCode &status) const;
952
953
   /**
954
    *   Returns the index in the input string of the start of the text matched
955
    *   during the previous match operation.
956
    *    @param   status      a reference to a UErrorCode to receive any errors.
957
    *    @return              The (native) position in the input string of the start of the last match.
958
    *   @stable ICU 4.6
959
    */
960
    virtual int64_t start64(UErrorCode &status) const;
961
962
963
   /**
964
    *   Returns the index in the input string of the start of the text matched by the
965
    *    specified capture group during the previous match operation.  Return -1 if
966
    *    the capture group exists in the pattern, but was not part of the last match.
967
    *
968
    *    @param  group       the capture group number
969
    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
970
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
971
    *                        attempted or the last match failed, and
972
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
973
    *    @return the (native) start position of substring matched by the specified group.
974
    *    @stable ICU 2.4
975
    */
976
    virtual int32_t start(int32_t group, UErrorCode &status) const;
977
978
   /**
979
    *   Returns the index in the input string of the start of the text matched by the
980
    *    specified capture group during the previous match operation.  Return -1 if
981
    *    the capture group exists in the pattern, but was not part of the last match.
982
    *
983
    *    @param  group       the capture group number.
984
    *    @param  status      A reference to a UErrorCode to receive any errors.  Possible
985
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
986
    *                        attempted or the last match failed, and
987
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number.
988
    *    @return the (native) start position of substring matched by the specified group.
989
    *    @stable ICU 4.6
990
    */
991
    virtual int64_t start64(int32_t group, UErrorCode &status) const;
992
993
   /**
994
    *    Returns the index in the input string of the first character following the
995
    *    text matched during the previous match operation.
996
    *
997
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
998
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
999
    *                        attempted or the last match failed.
1000
    *    @return the index of the last character matched, plus one.
1001
    *                        The index value returned is a native index, corresponding to
1002
    *                        code units for the underlying encoding type, for example,
1003
    *                        a byte index for UTF-8.
1004
    *   @stable ICU 2.4
1005
    */
1006
    virtual int32_t end(UErrorCode &status) const;
1007
1008
   /**
1009
    *    Returns the index in the input string of the first character following the
1010
    *    text matched during the previous match operation.
1011
    *
1012
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1013
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1014
    *                        attempted or the last match failed.
1015
    *    @return the index of the last character matched, plus one.
1016
    *                        The index value returned is a native index, corresponding to
1017
    *                        code units for the underlying encoding type, for example,
1018
    *                        a byte index for UTF-8.
1019
    *   @stable ICU 4.6
1020
    */
1021
    virtual int64_t end64(UErrorCode &status) const;
1022
1023
1024
   /**
1025
    *    Returns the index in the input string of the character following the
1026
    *    text matched by the specified capture group during the previous match operation.
1027
    *
1028
    *    @param group  the capture group number
1029
    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
1030
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1031
    *                        attempted or the last match failed and
1032
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
1033
    *    @return  the index of the first character following the text
1034
    *              captured by the specified group during the previous match operation.
1035
    *              Return -1 if the capture group exists in the pattern but was not part of the match.
1036
    *              The index value returned is a native index, corresponding to
1037
    *              code units for the underlying encoding type, for example,
1038
    *              a byte index for UTF-8.
1039
    *    @stable ICU 2.4
1040
    */
1041
    virtual int32_t end(int32_t group, UErrorCode &status) const;
1042
1043
   /**
1044
    *    Returns the index in the input string of the character following the
1045
    *    text matched by the specified capture group during the previous match operation.
1046
    *
1047
    *    @param group  the capture group number
1048
    *    @param   status      A reference to a UErrorCode to receive any errors.  Possible
1049
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1050
    *                        attempted or the last match failed and
1051
    *                        U_INDEX_OUTOFBOUNDS_ERROR for a bad capture group number
1052
    *    @return  the index of the first character following the text
1053
    *              captured by the specified group during the previous match operation.
1054
    *              Return -1 if the capture group exists in the pattern but was not part of the match.
1055
    *              The index value returned is a native index, corresponding to
1056
    *              code units for the underlying encoding type, for example,
1057
    *              a byte index for UTF-8.
1058
    *   @stable ICU 4.6
1059
    */
1060
    virtual int64_t end64(int32_t group, UErrorCode &status) const;
1061
1062
   /**
1063
    *   Resets this matcher.  The effect is to remove any memory of previous matches,
1064
    *       and to cause subsequent find() operations to begin at the beginning of
1065
    *       the input string.
1066
    *
1067
    *   @return this RegexMatcher.
1068
    *   @stable ICU 2.4
1069
    */
1070
    virtual RegexMatcher &reset();
1071
1072
1073
   /**
1074
    *   Resets this matcher, and sets the current input position.
1075
    *   The effect is to remove any memory of previous matches,
1076
    *       and to cause subsequent find() operations to begin at
1077
    *       the specified (native) position in the input string.
1078
    *
1079
    *   The matcher's region is reset to its default, which is the entire
1080
    *   input string.
1081
    *
1082
    *   An alternative to this function is to set a match region
1083
    *   beginning at the desired index.
1084
    *
1085
    *   @return this RegexMatcher.
1086
    *   @stable ICU 2.8
1087
    */
1088
    virtual RegexMatcher &reset(int64_t index, UErrorCode &status);
1089
1090
1091
   /**
1092
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
1093
    *     to be reused, which is more efficient than creating a new RegexMatcher for
1094
    *     each input string to be processed.
1095
    *   @param input The new string on which subsequent pattern matches will operate.
1096
    *                The matcher retains a reference to the caller's string, and operates
1097
    *                directly on that.  Ownership of the string remains with the caller.
1098
    *                Because no copy of the string is made, it is essential that the
1099
    *                caller not delete the string until after regexp operations on it
1100
    *                are done.
1101
    *                Note that while a reset on the matcher with an input string that is then
1102
    *                modified across/during matcher operations may be supported currently for UnicodeString,
1103
    *                this was not originally intended behavior, and support for this is not guaranteed
1104
    *                in upcoming versions of ICU.
1105
    *   @return this RegexMatcher.
1106
    *   @stable ICU 2.4
1107
    */
1108
    virtual RegexMatcher &reset(const UnicodeString &input);
1109
1110
1111
   /**
1112
    *   Resets this matcher with a new input string.  This allows instances of RegexMatcher
1113
    *     to be reused, which is more efficient than creating a new RegexMatcher for
1114
    *     each input string to be processed.
1115
    *   @param input The new string on which subsequent pattern matches will operate.
1116
    *                The matcher makes a shallow clone of the given text; ownership of the
1117
    *                original string remains with the caller. Because no deep copy of the
1118
    *                text is made, it is essential that the caller not modify the string
1119
    *                until after regexp operations on it are done.
1120
    *   @return this RegexMatcher.
1121
    *
1122
    *   @stable ICU 4.6
1123
    */
1124
    virtual RegexMatcher &reset(UText *input);
1125
1126
1127
  /**
1128
    *  Set the subject text string upon which the regular expression is looking for matches
1129
    *  without changing any other aspect of the matching state.
1130
    *  The new and previous text strings must have the same content.
1131
    *
1132
    *  This function is intended for use in environments where ICU is operating on 
1133
    *  strings that may move around in memory.  It provides a mechanism for notifying
1134
    *  ICU that the string has been relocated, and providing a new UText to access the
1135
    *  string in its new position.
1136
    *
1137
    *  Note that the regular expression implementation never copies the underlying text
1138
    *  of a string being matched, but always operates directly on the original text 
1139
    *  provided by the user. Refreshing simply drops the references to the old text 
1140
    *  and replaces them with references to the new.
1141
    *
1142
    *  Caution:  this function is normally used only by very specialized,
1143
    *  system-level code.  One example use case is with garbage collection that moves
1144
    *  the text in memory.
1145
    *
1146
    * @param input      The new (moved) text string.
1147
    * @param status     Receives errors detected by this function.
1148
    *
1149
    * @stable ICU 4.8 
1150
    */
1151
    virtual RegexMatcher &refreshInputText(UText *input, UErrorCode &status);
1152
1153
private:
1154
    /**
1155
     * Cause a compilation error if an application accidentally attempts to
1156
     *   reset a matcher with a (char16_t *) string as input rather than
1157
     *   a UnicodeString.    Avoids a dangling reference to a temporary string.
1158
     *
1159
     * To efficiently work with char16_t *strings, wrap the data in a UnicodeString
1160
     * using one of the aliasing constructors, such as
1161
     * `UnicodeString(UBool isTerminated, const char16_t *text, int32_t textLength);`
1162
     * or in a UText, using
1163
     * `utext_openUChars(UText *ut, const char16_t *text, int64_t textLength, UErrorCode *status);`
1164
     *
1165
     */
1166
    RegexMatcher &reset(const char16_t *input);
1167
public:
1168
1169
   /**
1170
    *   Returns the input string being matched.  Ownership of the string belongs to
1171
    *   the matcher; it should not be altered or deleted. This method will work even if the input
1172
    *   was originally supplied as a UText.
1173
    *   @return the input string
1174
    *   @stable ICU 2.4
1175
    */
1176
    virtual const UnicodeString &input() const;
1177
    
1178
   /**
1179
    *   Returns the input string being matched.  This is the live input text; it should not be
1180
    *   altered or deleted. This method will work even if the input was originally supplied as
1181
    *   a UnicodeString.
1182
    *   @return the input text
1183
    *
1184
    *   @stable ICU 4.6
1185
    */
1186
    virtual UText *inputText() const;
1187
    
1188
   /**
1189
    *   Returns the input string being matched, either by copying it into the provided
1190
    *   UText parameter or by returning a shallow clone of the live input. Note that copying
1191
    *   the entire input may cause significant performance and memory issues.
1192
    *   @param dest The UText into which the input should be copied, or NULL to create a new UText
1193
    *   @param status error code
1194
    *   @return dest if non-NULL, a shallow copy of the input text otherwise
1195
    *
1196
    *   @stable ICU 4.6
1197
    */
1198
    virtual UText *getInput(UText *dest, UErrorCode &status) const;
1199
    
1200
1201
   /** Sets the limits of this matcher's region.
1202
     * The region is the part of the input string that will be searched to find a match.
1203
     * Invoking this method resets the matcher, and then sets the region to start
1204
     * at the index specified by the start parameter and end at the index specified
1205
     * by the end parameter.
1206
     *
1207
     * Depending on the transparency and anchoring being used (see useTransparentBounds
1208
     * and useAnchoringBounds), certain constructs such as anchors may behave differently
1209
     * at or around the boundaries of the region.
1210
     *
1211
     * The function will fail if start is greater than limit, or if either index
1212
     *  is less than zero or greater than the length of the string being matched.
1213
     *
1214
     * @param start  The (native) index to begin searches at.
1215
     * @param limit  The index to end searches at (exclusive).
1216
     * @param status A reference to a UErrorCode to receive any errors.
1217
     * @stable ICU 4.0
1218
     */
1219
     virtual RegexMatcher &region(int64_t start, int64_t limit, UErrorCode &status);
1220
1221
   /** 
1222
     * Identical to region(start, limit, status) but also allows a start position without
1223
     *  resetting the region state.
1224
     * @param regionStart The region start
1225
     * @param regionLimit the limit of the region
1226
     * @param startIndex  The (native) index within the region bounds at which to begin searches.
1227
     * @param status A reference to a UErrorCode to receive any errors.
1228
     *                If startIndex is not within the specified region bounds, 
1229
     *                U_INDEX_OUTOFBOUNDS_ERROR is returned.
1230
     * @stable ICU 4.6
1231
     */
1232
     virtual RegexMatcher &region(int64_t regionStart, int64_t regionLimit, int64_t startIndex, UErrorCode &status);
1233
1234
   /**
1235
     * Reports the start index of this matcher's region. The searches this matcher
1236
     * conducts are limited to finding matches within regionStart (inclusive) and
1237
     * regionEnd (exclusive).
1238
     *
1239
     * @return The starting (native) index of this matcher's region.
1240
     * @stable ICU 4.0
1241
     */
1242
     virtual int32_t regionStart() const;
1243
1244
   /**
1245
     * Reports the start index of this matcher's region. The searches this matcher
1246
     * conducts are limited to finding matches within regionStart (inclusive) and
1247
     * regionEnd (exclusive).
1248
     *
1249
     * @return The starting (native) index of this matcher's region.
1250
     * @stable ICU 4.6
1251
     */
1252
     virtual int64_t regionStart64() const;
1253
1254
1255
    /**
1256
      * Reports the end (limit) index (exclusive) of this matcher's region. The searches
1257
      * this matcher conducts are limited to finding matches within regionStart
1258
      * (inclusive) and regionEnd (exclusive).
1259
      *
1260
      * @return The ending point (native) of this matcher's region.
1261
      * @stable ICU 4.0
1262
      */
1263
      virtual int32_t regionEnd() const;
1264
1265
   /**
1266
     * Reports the end (limit) index (exclusive) of this matcher's region. The searches
1267
     * this matcher conducts are limited to finding matches within regionStart
1268
     * (inclusive) and regionEnd (exclusive).
1269
     *
1270
     * @return The ending point (native) of this matcher's region.
1271
     * @stable ICU 4.6
1272
     */
1273
      virtual int64_t regionEnd64() const;
1274
1275
    /**
1276
      * Queries the transparency of region bounds for this matcher.
1277
      * See useTransparentBounds for a description of transparent and opaque bounds.
1278
      * By default, a matcher uses opaque region boundaries.
1279
      *
1280
      * @return TRUE if this matcher is using transparent bounds, FALSE if it is using opaque bounds.
1281
      * @stable ICU 4.0
1282
      */
1283
      virtual UBool hasTransparentBounds() const;
1284
1285
    /**
1286
      * Sets the transparency of region bounds for this matcher.
1287
      * Invoking this function with an argument of true will set this matcher to use transparent bounds.
1288
      * If the boolean argument is false, then opaque bounds will be used.
1289
      *
1290
      * Using transparent bounds, the boundaries of this matcher's region are transparent
1291
      * to lookahead, lookbehind, and boundary matching constructs. Those constructs can
1292
      * see text beyond the boundaries of the region while checking for a match.
1293
      *
1294
      * With opaque bounds, no text outside of the matcher's region is visible to lookahead,
1295
      * lookbehind, and boundary matching constructs.
1296
      *
1297
      * By default, a matcher uses opaque bounds.
1298
      *
1299
      * @param   b TRUE for transparent bounds; FALSE for opaque bounds
1300
      * @return  This Matcher;
1301
      * @stable ICU 4.0
1302
      **/
1303
      virtual RegexMatcher &useTransparentBounds(UBool b);
1304
1305
     
1306
    /**
1307
      * Return true if this matcher is using anchoring bounds.
1308
      * By default, matchers use anchoring region bounds.
1309
      *
1310
      * @return TRUE if this matcher is using anchoring bounds.
1311
      * @stable ICU 4.0
1312
      */    
1313
      virtual UBool hasAnchoringBounds() const;
1314
1315
1316
    /**
1317
      * Set whether this matcher is using Anchoring Bounds for its region.
1318
      * With anchoring bounds, pattern anchors such as ^ and $ will match at the start
1319
      * and end of the region.  Without Anchoring Bounds, anchors will only match at
1320
      * the positions they would in the complete text.
1321
      *
1322
      * Anchoring Bounds are the default for regions.
1323
      *
1324
      * @param b TRUE if to enable anchoring bounds; FALSE to disable them.
1325
      * @return  This Matcher
1326
      * @stable ICU 4.0
1327
      */
1328
      virtual RegexMatcher &useAnchoringBounds(UBool b);
1329
1330
1331
    /**
1332
      * Return TRUE if the most recent matching operation attempted to access
1333
      *  additional input beyond the available input text.
1334
      *  In this case, additional input text could change the results of the match.
1335
      *
1336
      *  hitEnd() is defined for both successful and unsuccessful matches.
1337
      *  In either case hitEnd() will return TRUE if the end of the text was
1338
      *  reached at any point during the matching process.
1339
      *
1340
      *  @return  TRUE if the most recent match hit the end of input
1341
      *  @stable ICU 4.0
1342
      */
1343
      virtual UBool hitEnd() const;
1344
1345
    /**
1346
      * Return TRUE if the most recent match succeeded and additional input could cause
1347
      * it to fail. If this method returns false and a match was found, then more input
1348
      * might change the match but the match won't be lost. If a match was not found,
1349
      * then requireEnd has no meaning.
1350
      *
1351
      * @return TRUE if more input could cause the most recent match to no longer match.
1352
      * @stable ICU 4.0
1353
      */
1354
      virtual UBool requireEnd() const;
1355
1356
1357
   /**
1358
    *    Returns the pattern that is interpreted by this matcher.
1359
    *    @return  the RegexPattern for this RegexMatcher
1360
    *    @stable ICU 2.4
1361
    */
1362
    virtual const RegexPattern &pattern() const;
1363
1364
1365
   /**
1366
    *    Replaces every substring of the input that matches the pattern
1367
    *    with the given replacement string.  This is a convenience function that
1368
    *    provides a complete find-and-replace-all operation.
1369
    *
1370
    *    This method first resets this matcher. It then scans the input string
1371
    *    looking for matches of the pattern. Input that is not part of any
1372
    *    match is left unchanged; each match is replaced in the result by the
1373
    *    replacement string. The replacement string may contain references to
1374
    *    capture groups.
1375
    *
1376
    *    @param   replacement a string containing the replacement text.
1377
    *    @param   status      a reference to a UErrorCode to receive any errors.
1378
    *    @return              a string containing the results of the find and replace.
1379
    *    @stable ICU 2.4
1380
    */
1381
    virtual UnicodeString replaceAll(const UnicodeString &replacement, UErrorCode &status);
1382
1383
1384
   /**
1385
    *    Replaces every substring of the input that matches the pattern
1386
    *    with the given replacement string.  This is a convenience function that
1387
    *    provides a complete find-and-replace-all operation.
1388
    *
1389
    *    This method first resets this matcher. It then scans the input string
1390
    *    looking for matches of the pattern. Input that is not part of any
1391
    *    match is left unchanged; each match is replaced in the result by the
1392
    *    replacement string. The replacement string may contain references to
1393
    *    capture groups.
1394
    *
1395
    *    @param   replacement a string containing the replacement text.
1396
    *    @param   dest        a mutable UText in which the results are placed.
1397
    *                          If NULL, a new UText will be created (which may not be mutable).
1398
    *    @param   status      a reference to a UErrorCode to receive any errors.
1399
    *    @return              a UText containing the results of the find and replace.
1400
    *                          If a pre-allocated UText was provided, it will always be used and returned.
1401
    *
1402
    *    @stable ICU 4.6
1403
    */
1404
    virtual UText *replaceAll(UText *replacement, UText *dest, UErrorCode &status);
1405
    
1406
1407
   /**
1408
    * Replaces the first substring of the input that matches
1409
    * the pattern with the replacement string.   This is a convenience
1410
    * function that provides a complete find-and-replace operation.
1411
    *
1412
    * This function first resets this RegexMatcher. It then scans the input string
1413
    * looking for a match of the pattern. Input that is not part
1414
    * of the match is appended directly to the result string; the match is replaced
1415
    * in the result by the replacement string. The replacement string may contain
1416
    * references to captured groups.
1417
    *
1418
    * The state of the matcher (the position at which a subsequent find()
1419
    *    would begin) after completing a replaceFirst() is not specified.  The
1420
    *    RegexMatcher should be reset before doing additional find() operations.
1421
    *
1422
    *    @param   replacement a string containing the replacement text.
1423
    *    @param   status      a reference to a UErrorCode to receive any errors.
1424
    *    @return              a string containing the results of the find and replace.
1425
    *    @stable ICU 2.4
1426
    */
1427
    virtual UnicodeString replaceFirst(const UnicodeString &replacement, UErrorCode &status);
1428
    
1429
1430
   /**
1431
    * Replaces the first substring of the input that matches
1432
    * the pattern with the replacement string.   This is a convenience
1433
    * function that provides a complete find-and-replace operation.
1434
    *
1435
    * This function first resets this RegexMatcher. It then scans the input string
1436
    * looking for a match of the pattern. Input that is not part
1437
    * of the match is appended directly to the result string; the match is replaced
1438
    * in the result by the replacement string. The replacement string may contain
1439
    * references to captured groups.
1440
    *
1441
    * The state of the matcher (the position at which a subsequent find()
1442
    *    would begin) after completing a replaceFirst() is not specified.  The
1443
    *    RegexMatcher should be reset before doing additional find() operations.
1444
    *
1445
    *    @param   replacement a string containing the replacement text.
1446
    *    @param   dest        a mutable UText in which the results are placed.
1447
    *                          If NULL, a new UText will be created (which may not be mutable).
1448
    *    @param   status      a reference to a UErrorCode to receive any errors.
1449
    *    @return              a UText containing the results of the find and replace.
1450
    *                          If a pre-allocated UText was provided, it will always be used and returned.
1451
    *
1452
    *    @stable ICU 4.6
1453
    */
1454
    virtual UText *replaceFirst(UText *replacement, UText *dest, UErrorCode &status);
1455
    
1456
    
1457
   /**
1458
    *   Implements a replace operation intended to be used as part of an
1459
    *   incremental find-and-replace.
1460
    *
1461
    *   The input string, starting from the end of the previous replacement and ending at
1462
    *   the start of the current match, is appended to the destination string.  Then the
1463
    *   replacement string is appended to the output string,
1464
    *   including handling any substitutions of captured text.
1465
    *
1466
    *   For simple, prepackaged, non-incremental find-and-replace
1467
    *   operations, see replaceFirst() or replaceAll().
1468
    *
1469
    *   @param   dest        A UnicodeString to which the results of the find-and-replace are appended.
1470
    *   @param   replacement A UnicodeString that provides the text to be substituted for
1471
    *                        the input text that matched the regexp pattern.  The replacement
1472
    *                        text may contain references to captured text from the
1473
    *                        input.
1474
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1475
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1476
    *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
1477
    *                        if the replacement text specifies a capture group that
1478
    *                        does not exist in the pattern.
1479
    *
1480
    *   @return  this  RegexMatcher
1481
    *   @stable ICU 2.4
1482
    *
1483
    */
1484
    virtual RegexMatcher &appendReplacement(UnicodeString &dest,
1485
        const UnicodeString &replacement, UErrorCode &status);
1486
    
1487
    
1488
   /**
1489
    *   Implements a replace operation intended to be used as part of an
1490
    *   incremental find-and-replace.
1491
    *
1492
    *   The input string, starting from the end of the previous replacement and ending at
1493
    *   the start of the current match, is appended to the destination string.  Then the
1494
    *   replacement string is appended to the output string,
1495
    *   including handling any substitutions of captured text.
1496
    *
1497
    *   For simple, prepackaged, non-incremental find-and-replace
1498
    *   operations, see replaceFirst() or replaceAll().
1499
    *
1500
    *   @param   dest        A mutable UText to which the results of the find-and-replace are appended.
1501
    *                         Must not be NULL.
1502
    *   @param   replacement A UText that provides the text to be substituted for
1503
    *                        the input text that matched the regexp pattern.  The replacement
1504
    *                        text may contain references to captured text from the input.
1505
    *   @param   status      A reference to a UErrorCode to receive any errors.  Possible
1506
    *                        errors are  U_REGEX_INVALID_STATE if no match has been
1507
    *                        attempted or the last match failed, and U_INDEX_OUTOFBOUNDS_ERROR
1508
    *                        if the replacement text specifies a capture group that
1509
    *                        does not exist in the pattern.
1510
    *
1511
    *   @return  this  RegexMatcher
1512
    *
1513
    *   @stable ICU 4.6
1514
    */
1515
    virtual RegexMatcher &appendReplacement(UText *dest,
1516
        UText *replacement, UErrorCode &status);
1517
1518
1519
   /**
1520
    * As the final step in a find-and-replace operation, append the remainder
1521
    * of the input string, starting at the position following the last appendReplacement(),
1522
    * to the destination string. `appendTail()` is intended to be invoked after one
1523
    * or more invocations of the `RegexMatcher::appendReplacement()`.
1524
    *
1525
    *  @param dest A UnicodeString to which the results of the find-and-replace are appended.
1526
    *  @return  the destination string.
1527
    *  @stable ICU 2.4
1528
    */
1529
    virtual UnicodeString &appendTail(UnicodeString &dest);
1530
1531
1532
   /**
1533
    * As the final step in a find-and-replace operation, append the remainder
1534
    * of the input string, starting at the position following the last appendReplacement(),
1535
    * to the destination string. `appendTail()` is intended to be invoked after one
1536
    * or more invocations of the `RegexMatcher::appendReplacement()`.
1537
    *
1538
    *  @param dest A mutable UText to which the results of the find-and-replace are appended.
1539
    *               Must not be NULL.
1540
    *  @param status A reference to a UErrorCode to receive any errors.
1541
    *  @return  the destination string.
1542
    *
1543
    *  @stable ICU 4.6
1544
    */
1545
    virtual UText *appendTail(UText *dest, UErrorCode &status);
1546
1547
1548
    /**
1549
     * Split a string into fields.  Somewhat like %split() from Perl.
1550
     * The pattern matches identify delimiters that separate the input
1551
     *  into fields.  The input data between the matches becomes the
1552
     *  fields themselves.
1553
     *
1554
     * @param input   The string to be split into fields.  The field delimiters
1555
     *                match the pattern (in the "this" object).  This matcher
1556
     *                will be reset to this input string.
1557
     * @param dest    An array of UnicodeStrings to receive the results of the split.
1558
     *                This is an array of actual UnicodeString objects, not an
1559
     *                array of pointers to strings.  Local (stack based) arrays can
1560
     *                work well here.
1561
     * @param destCapacity  The number of elements in the destination array.
1562
     *                If the number of fields found is less than destCapacity, the
1563
     *                extra strings in the destination array are not altered.
1564
     *                If the number of destination strings is less than the number
1565
     *                of fields, the trailing part of the input string, including any
1566
     *                field delimiters, is placed in the last destination string.
1567
     * @param status  A reference to a UErrorCode to receive any errors.
1568
     * @return        The number of fields into which the input string was split.
1569
     * @stable ICU 2.6
1570
     */
1571
    virtual int32_t  split(const UnicodeString &input,
1572
        UnicodeString    dest[],
1573
        int32_t          destCapacity,
1574
        UErrorCode       &status);
1575
1576
1577
    /**
1578
     * Split a string into fields.  Somewhat like %split() from Perl.
1579
     * The pattern matches identify delimiters that separate the input
1580
     *  into fields.  The input data between the matches becomes the
1581
     *  fields themselves.
1582
     *
1583
     * @param input   The string to be split into fields.  The field delimiters
1584
     *                match the pattern (in the "this" object).  This matcher
1585
     *                will be reset to this input string.
1586
     * @param dest    An array of mutable UText structs to receive the results of the split.
1587
     *                If a field is NULL, a new UText is allocated to contain the results for
1588
     *                that field. This new UText is not guaranteed to be mutable.
1589
     * @param destCapacity  The number of elements in the destination array.
1590
     *                If the number of fields found is less than destCapacity, the
1591
     *                extra strings in the destination array are not altered.
1592
     *                If the number of destination strings is less than the number
1593
     *                of fields, the trailing part of the input string, including any
1594
     *                field delimiters, is placed in the last destination string.
1595
     * @param status  A reference to a UErrorCode to receive any errors.
1596
     * @return        The number of fields into which the input string was split.
1597
     *
1598
     * @stable ICU 4.6
1599
     */
1600
    virtual int32_t  split(UText *input,
1601
        UText           *dest[],
1602
        int32_t          destCapacity,
1603
        UErrorCode       &status);
1604
    
1605
  /**
1606
    *   Set a processing time limit for match operations with this Matcher.
1607
    *  
1608
    *   Some patterns, when matching certain strings, can run in exponential time.
1609
    *   For practical purposes, the match operation may appear to be in an
1610
    *   infinite loop.
1611
    *   When a limit is set a match operation will fail with an error if the
1612
    *   limit is exceeded.
1613
    *
1614
    *   The units of the limit are steps of the match engine.
1615
    *   Correspondence with actual processor time will depend on the speed
1616
    *   of the processor and the details of the specific pattern, but will
1617
    *   typically be on the order of milliseconds.
1618
    *
1619
    *   By default, the matching time is not limited.
1620
    *
1621
    *
1622
    *   @param   limit       The limit value, or 0 for no limit.
1623
    *   @param   status      A reference to a UErrorCode to receive any errors.
1624
    *   @stable ICU 4.0
1625
    */
1626
    virtual void setTimeLimit(int32_t limit, UErrorCode &status);
1627
1628
  /**
1629
    * Get the time limit, if any, for match operations made with this Matcher.
1630
    *
1631
    *   @return the maximum allowed time for a match, in units of processing steps.
1632
    *   @stable ICU 4.0
1633
    */
1634
    virtual int32_t getTimeLimit() const;
1635
1636
  /**
1637
    *  Set the amount of heap storage available for use by the match backtracking stack.
1638
    *  The matcher is also reset, discarding any results from previous matches.
1639
    *
1640
    *  ICU uses a backtracking regular expression engine, with the backtrack stack
1641
    *  maintained on the heap.  This function sets the limit to the amount of memory
1642
    *  that can be used for this purpose.  A backtracking stack overflow will
1643
    *  result in an error from the match operation that caused it.
1644
    *
1645
    *  A limit is desirable because a malicious or poorly designed pattern can use
1646
    *  excessive memory, potentially crashing the process.  A limit is enabled
1647
    *  by default.
1648
    *
1649
    *  @param limit  The maximum size, in bytes, of the matching backtrack stack.
1650
    *                A value of zero means no limit.
1651
    *                The limit must be greater than or equal to zero.
1652
    *
1653
    *  @param status   A reference to a UErrorCode to receive any errors.
1654
    *
1655
    *  @stable ICU 4.0
1656
    */
1657
    virtual void setStackLimit(int32_t  limit, UErrorCode &status);
1658
    
1659
  /**
1660
    *  Get the size of the heap storage available for use by the back tracking stack.
1661
    *
1662
    *  @return  the maximum backtracking stack size, in bytes, or zero if the
1663
    *           stack size is unlimited.
1664
    *  @stable ICU 4.0
1665
    */
1666
    virtual int32_t  getStackLimit() const;
1667
1668
1669
  /**
1670
    * Set a callback function for use with this Matcher.
1671
    * During matching operations the function will be called periodically,
1672
    * giving the application the opportunity to terminate a long-running
1673
    * match.
1674
    *
1675
    *    @param   callback    A pointer to the user-supplied callback function.
1676
    *    @param   context     User context pointer.  The value supplied at the
1677
    *                         time the callback function is set will be saved
1678
    *                         and passed to the callback each time that it is called.
1679
    *    @param   status      A reference to a UErrorCode to receive any errors.
1680
    *  @stable ICU 4.0
1681
    */
1682
    virtual void setMatchCallback(URegexMatchCallback     *callback,
1683
                                  const void              *context,
1684
                                  UErrorCode              &status);
1685
1686
1687
  /**
1688
    *  Get the callback function for this Matcher.
1689
    *
1690
    *    @param   callback    Out parameter, receives a pointer to the user-supplied 
1691
    *                         callback function.
1692
    *    @param   context     Out parameter, receives the user context pointer that
1693
    *                         was set when setMatchCallback() was called.
1694
    *    @param   status      A reference to a UErrorCode to receive any errors.
1695
    *    @stable ICU 4.0
1696
    */
1697
    virtual void getMatchCallback(URegexMatchCallback     *&callback,
1698
                                  const void              *&context,
1699
                                  UErrorCode              &status);
1700
1701
1702
  /**
1703
    * Set a progress callback function for use with find operations on this Matcher.
1704
    * During find operations, the callback will be invoked after each return from a
1705
    * match attempt, giving the application the opportunity to terminate a long-running
1706
    * find operation.
1707
    *
1708
    *    @param   callback    A pointer to the user-supplied callback function.
1709
    *    @param   context     User context pointer.  The value supplied at the
1710
    *                         time the callback function is set will be saved
1711
    *                         and passed to the callback each time that it is called.
1712
    *    @param   status      A reference to a UErrorCode to receive any errors.
1713
    *    @stable ICU 4.6
1714
    */
1715
    virtual void setFindProgressCallback(URegexFindProgressCallback      *callback,
1716
                                              const void                              *context,
1717
                                              UErrorCode                              &status);
1718
1719
1720
  /**
1721
    *  Get the find progress callback function for this Matcher.
1722
    *
1723
    *    @param   callback    Out parameter, receives a pointer to the user-supplied 
1724
    *                         callback function.
1725
    *    @param   context     Out parameter, receives the user context pointer that
1726
    *                         was set when setFindProgressCallback() was called.
1727
    *    @param   status      A reference to a UErrorCode to receive any errors.
1728
    *    @stable ICU 4.6
1729
    */
1730
    virtual void getFindProgressCallback(URegexFindProgressCallback      *&callback,
1731
                                              const void                      *&context,
1732
                                              UErrorCode                      &status);
1733
1734
#ifndef U_HIDE_INTERNAL_API
1735
   /**
1736
     *   setTrace   Debug function, enable/disable tracing of the matching engine.
1737
     *              For internal ICU development use only.  DO NOT USE!!!!
1738
     *   @internal
1739
     */
1740
    void setTrace(UBool state);
1741
#endif  /* U_HIDE_INTERNAL_API */
1742
1743
    /**
1744
    * ICU "poor man's RTTI", returns a UClassID for this class.
1745
    *
1746
    * @stable ICU 2.2
1747
    */
1748
    static UClassID U_EXPORT2 getStaticClassID();
1749
1750
    /**
1751
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
1752
     *
1753
     * @stable ICU 2.2
1754
     */
1755
    virtual UClassID getDynamicClassID() const;
1756
1757
private:
1758
    // Constructors and other object boilerplate are private.
1759
    // Instances of RegexMatcher can not be assigned, copied, cloned, etc.
1760
    RegexMatcher();                  // default constructor not implemented
1761
    RegexMatcher(const RegexPattern *pat);
1762
    RegexMatcher(const RegexMatcher &other);
1763
    RegexMatcher &operator =(const RegexMatcher &rhs);
1764
    void init(UErrorCode &status);                      // Common initialization
1765
    void init2(UText *t, UErrorCode &e);  // Common initialization, part 2.
1766
1767
    friend class RegexPattern;
1768
    friend class RegexCImpl;
1769
public:
1770
#ifndef U_HIDE_INTERNAL_API
1771
    /** @internal  */
1772
    void resetPreserveRegion();  // Reset matcher state, but preserve any region.
1773
#endif  /* U_HIDE_INTERNAL_API */
1774
private:
1775
1776
    //
1777
    //  MatchAt   This is the internal interface to the match engine itself.
1778
    //            Match status comes back in matcher member variables.
1779
    //
1780
    void                 MatchAt(int64_t startIdx, UBool toEnd, UErrorCode &status);
1781
    inline void          backTrack(int64_t &inputIdx, int32_t &patIdx);
1782
    UBool                isWordBoundary(int64_t pos);         // perform Perl-like  \b test
1783
    UBool                isUWordBoundary(int64_t pos);        // perform RBBI based \b test
1784
    REStackFrame        *resetStack();
1785
    inline REStackFrame *StateSave(REStackFrame *fp, int64_t savePatIdx, UErrorCode &status);
1786
    void                 IncrementTime(UErrorCode &status);
1787
1788
    // Call user find callback function, if set. Return TRUE if operation should be interrupted.
1789
    inline UBool         findProgressInterrupt(int64_t matchIndex, UErrorCode &status);
1790
    
1791
    int64_t              appendGroup(int32_t groupNum, UText *dest, UErrorCode &status) const;
1792
    
1793
    UBool                findUsingChunk(UErrorCode &status);
1794
    void                 MatchChunkAt(int32_t startIdx, UBool toEnd, UErrorCode &status);
1795
    UBool                isChunkWordBoundary(int32_t pos);
1796
1797
    const RegexPattern  *fPattern;
1798
    RegexPattern        *fPatternOwned;    // Non-NULL if this matcher owns the pattern, and
1799
                                           //   should delete it when through.
1800
1801
    const UnicodeString *fInput;           // The string being matched. Only used for input()
1802
    UText               *fInputText;       // The text being matched. Is never NULL.
1803
    UText               *fAltInputText;    // A shallow copy of the text being matched.
1804
                                           //   Only created if the pattern contains backreferences.
1805
    int64_t              fInputLength;     // Full length of the input text.
1806
    int32_t              fFrameSize;       // The size of a frame in the backtrack stack.
1807
    
1808
    int64_t              fRegionStart;     // Start of the input region, default = 0.
1809
    int64_t              fRegionLimit;     // End of input region, default to input.length.
1810
    
1811
    int64_t              fAnchorStart;     // Region bounds for anchoring operations (^ or $).
1812
    int64_t              fAnchorLimit;     //   See useAnchoringBounds
1813
    
1814
    int64_t              fLookStart;       // Region bounds for look-ahead/behind and
1815
    int64_t              fLookLimit;       //   and other boundary tests.  See
1816
                                           //   useTransparentBounds
1817
1818
    int64_t              fActiveStart;     // Currently active bounds for matching.
1819
    int64_t              fActiveLimit;     //   Usually is the same as region, but
1820
                                           //   is changed to fLookStart/Limit when
1821
                                           //   entering look around regions.
1822
1823
    UBool                fTransparentBounds;  // True if using transparent bounds.
1824
    UBool                fAnchoringBounds; // True if using anchoring bounds.
1825
1826
    UBool                fMatch;           // True if the last attempted match was successful.
1827
    int64_t              fMatchStart;      // Position of the start of the most recent match
1828
    int64_t              fMatchEnd;        // First position after the end of the most recent match
1829
                                           //   Zero if no previous match, even when a region
1830
                                           //   is active.
1831
    int64_t              fLastMatchEnd;    // First position after the end of the previous match,
1832
                                           //   or -1 if there was no previous match.
1833
    int64_t              fAppendPosition;  // First position after the end of the previous
1834
                                           //   appendReplacement().  As described by the
1835
                                           //   JavaDoc for Java Matcher, where it is called 
1836
                                           //   "append position"
1837
    UBool                fHitEnd;          // True if the last match touched the end of input.
1838
    UBool                fRequireEnd;      // True if the last match required end-of-input
1839
                                           //    (matched $ or Z)
1840
1841
    UVector64           *fStack;
1842
    REStackFrame        *fFrame;           // After finding a match, the last active stack frame,
1843
                                           //   which will contain the capture group results.
1844
                                           //   NOT valid while match engine is running.
1845
1846
    int64_t             *fData;            // Data area for use by the compiled pattern.
1847
    int64_t             fSmallData[8];     //   Use this for data if it's enough.
1848
1849
    int32_t             fTimeLimit;        // Max time (in arbitrary steps) to let the
1850
                                           //   match engine run.  Zero for unlimited.
1851
    
1852
    int32_t             fTime;             // Match time, accumulates while matching.
1853
    int32_t             fTickCounter;      // Low bits counter for time.  Counts down StateSaves.
1854
                                           //   Kept separately from fTime to keep as much
1855
                                           //   code as possible out of the inline
1856
                                           //   StateSave function.
1857
1858
    int32_t             fStackLimit;       // Maximum memory size to use for the backtrack
1859
                                           //   stack, in bytes.  Zero for unlimited.
1860
1861
    URegexMatchCallback *fCallbackFn;       // Pointer to match progress callback funct.
1862
                                           //   NULL if there is no callback.
1863
    const void         *fCallbackContext;  // User Context ptr for callback function.
1864
1865
    URegexFindProgressCallback  *fFindProgressCallbackFn;  // Pointer to find progress callback funct.
1866
                                                           //   NULL if there is no callback.
1867
    const void         *fFindProgressCallbackContext;      // User Context ptr for callback function.
1868
1869
1870
    UBool               fInputUniStrMaybeMutable;  // Set when fInputText wraps a UnicodeString that may be mutable - compatibility.
1871
1872
    UBool               fTraceDebug;       // Set true for debug tracing of match engine.
1873
1874
    UErrorCode          fDeferredStatus;   // Save error state that cannot be immediately
1875
                                           //   reported, or that permanently disables this matcher.
1876
1877
    RuleBasedBreakIterator  *fWordBreakItr;
1878
};
1879
1880
U_NAMESPACE_END
1881
#endif  // UCONFIG_NO_REGULAR_EXPRESSIONS
1882
1883
#endif /* U_SHOW_CPLUSPLUS_API */
1884
1885
#endif