/src/icu/icu4c/source/i18n/repattrn.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | // © 2016 and later: Unicode, Inc. and others. |
2 | | // License & terms of use: http://www.unicode.org/copyright.html |
3 | | // |
4 | | // file: repattrn.cpp |
5 | | // |
6 | | /* |
7 | | *************************************************************************** |
8 | | * Copyright (C) 2002-2016 International Business Machines Corporation |
9 | | * and others. All rights reserved. |
10 | | *************************************************************************** |
11 | | */ |
12 | | |
13 | | #include "unicode/utypes.h" |
14 | | |
15 | | #if !UCONFIG_NO_REGULAR_EXPRESSIONS |
16 | | |
17 | | #include "unicode/regex.h" |
18 | | #include "unicode/uclean.h" |
19 | | #include "cmemory.h" |
20 | | #include "cstr.h" |
21 | | #include "uassert.h" |
22 | | #include "uhash.h" |
23 | | #include "uvector.h" |
24 | | #include "uvectr32.h" |
25 | | #include "uvectr64.h" |
26 | | #include "regexcmp.h" |
27 | | #include "regeximp.h" |
28 | | #include "regexst.h" |
29 | | |
30 | | U_NAMESPACE_BEGIN |
31 | | |
32 | | //-------------------------------------------------------------------------- |
33 | | // |
34 | | // RegexPattern Default Constructor |
35 | | // |
36 | | //-------------------------------------------------------------------------- |
37 | 25.1k | RegexPattern::RegexPattern() { |
38 | | // Init all of this instances data. |
39 | 25.1k | init(); |
40 | 25.1k | } |
41 | | |
42 | | |
43 | | //-------------------------------------------------------------------------- |
44 | | // |
45 | | // Copy Constructor Note: This is a rather inefficient implementation, |
46 | | // but it probably doesn't matter. |
47 | | // |
48 | | //-------------------------------------------------------------------------- |
49 | 0 | RegexPattern::RegexPattern(const RegexPattern &other) : UObject(other) { |
50 | 0 | init(); |
51 | 0 | *this = other; |
52 | 0 | } |
53 | | |
54 | | |
55 | | |
56 | | //-------------------------------------------------------------------------- |
57 | | // |
58 | | // Assignment Operator |
59 | | // |
60 | | //-------------------------------------------------------------------------- |
61 | 0 | RegexPattern &RegexPattern::operator = (const RegexPattern &other) { |
62 | 0 | if (this == &other) { |
63 | | // Source and destination are the same. Don't do anything. |
64 | 0 | return *this; |
65 | 0 | } |
66 | | |
67 | | // Clean out any previous contents of object being assigned to. |
68 | 0 | zap(); |
69 | | |
70 | | // Give target object a default initialization |
71 | 0 | init(); |
72 | | |
73 | | // Copy simple fields |
74 | 0 | fDeferredStatus = other.fDeferredStatus; |
75 | |
|
76 | 0 | if (U_FAILURE(fDeferredStatus)) { |
77 | 0 | return *this; |
78 | 0 | } |
79 | | |
80 | 0 | if (other.fPatternString == nullptr) { |
81 | 0 | fPatternString = nullptr; |
82 | 0 | fPattern = utext_clone(fPattern, other.fPattern, false, true, &fDeferredStatus); |
83 | 0 | } else { |
84 | 0 | fPatternString = new UnicodeString(*(other.fPatternString)); |
85 | 0 | if (fPatternString == nullptr) { |
86 | 0 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
87 | 0 | } else { |
88 | 0 | fPattern = utext_openConstUnicodeString(nullptr, fPatternString, &fDeferredStatus); |
89 | 0 | } |
90 | 0 | } |
91 | 0 | if (U_FAILURE(fDeferredStatus)) { |
92 | 0 | return *this; |
93 | 0 | } |
94 | | |
95 | 0 | fFlags = other.fFlags; |
96 | 0 | fLiteralText = other.fLiteralText; |
97 | 0 | fMinMatchLen = other.fMinMatchLen; |
98 | 0 | fFrameSize = other.fFrameSize; |
99 | 0 | fDataSize = other.fDataSize; |
100 | |
|
101 | 0 | fStartType = other.fStartType; |
102 | 0 | fInitialStringIdx = other.fInitialStringIdx; |
103 | 0 | fInitialStringLen = other.fInitialStringLen; |
104 | 0 | *fInitialChars = *other.fInitialChars; |
105 | 0 | fInitialChar = other.fInitialChar; |
106 | 0 | *fInitialChars8 = *other.fInitialChars8; |
107 | 0 | fNeedsAltInput = other.fNeedsAltInput; |
108 | | |
109 | | // Copy the pattern. It's just values, nothing deep to copy. |
110 | 0 | fCompiledPat->assign(*other.fCompiledPat, fDeferredStatus); |
111 | 0 | fGroupMap->assign(*other.fGroupMap, fDeferredStatus); |
112 | | |
113 | | // Copy the Unicode Sets. |
114 | | // Could be made more efficient if the sets were reference counted and shared, |
115 | | // but I doubt that pattern copying will be particularly common. |
116 | | // Note: init() already added an empty element zero to fSets |
117 | 0 | int32_t i; |
118 | 0 | int32_t numSets = other.fSets->size(); |
119 | 0 | fSets8 = new Regex8BitSet[numSets]; |
120 | 0 | if (fSets8 == nullptr) { |
121 | 0 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
122 | 0 | return *this; |
123 | 0 | } |
124 | 0 | for (i=1; i<numSets; i++) { |
125 | 0 | if (U_FAILURE(fDeferredStatus)) { |
126 | 0 | return *this; |
127 | 0 | } |
128 | 0 | UnicodeSet* sourceSet = static_cast<UnicodeSet*>(other.fSets->elementAt(i)); |
129 | 0 | UnicodeSet *newSet = new UnicodeSet(*sourceSet); |
130 | 0 | if (newSet == nullptr) { |
131 | 0 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
132 | 0 | break; |
133 | 0 | } |
134 | 0 | fSets->addElement(newSet, fDeferredStatus); |
135 | 0 | fSets8[i] = other.fSets8[i]; |
136 | 0 | } |
137 | | |
138 | | // Copy the named capture group hash map. |
139 | 0 | if (other.fNamedCaptureMap != nullptr && initNamedCaptureMap()) { |
140 | 0 | int32_t hashPos = UHASH_FIRST; |
141 | 0 | while (const UHashElement *hashEl = uhash_nextElement(other.fNamedCaptureMap, &hashPos)) { |
142 | 0 | if (U_FAILURE(fDeferredStatus)) { |
143 | 0 | break; |
144 | 0 | } |
145 | 0 | const UnicodeString* name = static_cast<const UnicodeString*>(hashEl->key.pointer); |
146 | 0 | UnicodeString *key = new UnicodeString(*name); |
147 | 0 | int32_t val = hashEl->value.integer; |
148 | 0 | if (key == nullptr) { |
149 | 0 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
150 | 0 | } else { |
151 | 0 | uhash_puti(fNamedCaptureMap, key, val, &fDeferredStatus); |
152 | 0 | } |
153 | 0 | } |
154 | 0 | } |
155 | 0 | return *this; |
156 | 0 | } |
157 | | |
158 | | |
159 | | //-------------------------------------------------------------------------- |
160 | | // |
161 | | // init Shared initialization for use by constructors. |
162 | | // Bring an uninitialized RegexPattern up to a default state. |
163 | | // |
164 | | //-------------------------------------------------------------------------- |
165 | 25.1k | void RegexPattern::init() { |
166 | 25.1k | fFlags = 0; |
167 | 25.1k | fCompiledPat = nullptr; |
168 | 25.1k | fLiteralText.remove(); |
169 | 25.1k | fSets = nullptr; |
170 | 25.1k | fSets8 = nullptr; |
171 | 25.1k | fDeferredStatus = U_ZERO_ERROR; |
172 | 25.1k | fMinMatchLen = 0; |
173 | 25.1k | fFrameSize = 0; |
174 | 25.1k | fDataSize = 0; |
175 | 25.1k | fGroupMap = nullptr; |
176 | 25.1k | fStartType = START_NO_INFO; |
177 | 25.1k | fInitialStringIdx = 0; |
178 | 25.1k | fInitialStringLen = 0; |
179 | 25.1k | fInitialChars = nullptr; |
180 | 25.1k | fInitialChar = 0; |
181 | 25.1k | fInitialChars8 = nullptr; |
182 | 25.1k | fNeedsAltInput = false; |
183 | 25.1k | fNamedCaptureMap = nullptr; |
184 | | |
185 | 25.1k | fPattern = nullptr; // will be set later |
186 | 25.1k | fPatternString = nullptr; // may be set later |
187 | 25.1k | fCompiledPat = new UVector64(fDeferredStatus); |
188 | 25.1k | fGroupMap = new UVector32(fDeferredStatus); |
189 | 25.1k | fSets = new UVector(fDeferredStatus); |
190 | 25.1k | fInitialChars = new UnicodeSet; |
191 | 25.1k | fInitialChars8 = new Regex8BitSet; |
192 | 25.1k | if (U_FAILURE(fDeferredStatus)) { |
193 | 0 | return; |
194 | 0 | } |
195 | 25.1k | if (fCompiledPat == nullptr || fGroupMap == nullptr || fSets == nullptr || |
196 | 25.1k | fInitialChars == nullptr || fInitialChars8 == nullptr) { |
197 | 0 | fDeferredStatus = U_MEMORY_ALLOCATION_ERROR; |
198 | 0 | return; |
199 | 0 | } |
200 | | |
201 | | // Slot zero of the vector of sets is reserved. Fill it here. |
202 | 25.1k | fSets->addElement(static_cast<int32_t>(0), fDeferredStatus); |
203 | 25.1k | } |
204 | | |
205 | | |
206 | 803 | bool RegexPattern::initNamedCaptureMap() { |
207 | 803 | if (fNamedCaptureMap) { |
208 | 471 | return true; |
209 | 471 | } |
210 | 332 | fNamedCaptureMap = uhash_openSize(uhash_hashUnicodeString, // Key hash function |
211 | 332 | uhash_compareUnicodeString, // Key comparator function |
212 | 332 | uhash_compareLong, // Value comparator function |
213 | 332 | 7, // Initial table capacity |
214 | 332 | &fDeferredStatus); |
215 | 332 | if (U_FAILURE(fDeferredStatus)) { |
216 | 0 | return false; |
217 | 0 | } |
218 | | |
219 | | // fNamedCaptureMap owns its key strings, type (UnicodeString *) |
220 | 332 | uhash_setKeyDeleter(fNamedCaptureMap, uprv_deleteUObject); |
221 | 332 | return true; |
222 | 332 | } |
223 | | |
224 | | //-------------------------------------------------------------------------- |
225 | | // |
226 | | // zap Delete everything owned by this RegexPattern. |
227 | | // |
228 | | //-------------------------------------------------------------------------- |
229 | 25.1k | void RegexPattern::zap() { |
230 | 25.1k | delete fCompiledPat; |
231 | 25.1k | fCompiledPat = nullptr; |
232 | 25.1k | int i; |
233 | 404k | for (i=1; i<fSets->size(); i++) { |
234 | 379k | UnicodeSet *s; |
235 | 379k | s = static_cast<UnicodeSet*>(fSets->elementAt(i)); |
236 | 379k | delete s; |
237 | 379k | } |
238 | 25.1k | delete fSets; |
239 | 25.1k | fSets = nullptr; |
240 | 25.1k | delete[] fSets8; |
241 | 25.1k | fSets8 = nullptr; |
242 | 25.1k | delete fGroupMap; |
243 | 25.1k | fGroupMap = nullptr; |
244 | 25.1k | delete fInitialChars; |
245 | 25.1k | fInitialChars = nullptr; |
246 | 25.1k | delete fInitialChars8; |
247 | 25.1k | fInitialChars8 = nullptr; |
248 | 25.1k | if (fPattern != nullptr) { |
249 | 25.1k | utext_close(fPattern); |
250 | 25.1k | fPattern = nullptr; |
251 | 25.1k | } |
252 | 25.1k | if (fPatternString != nullptr) { |
253 | 12.8k | delete fPatternString; |
254 | 12.8k | fPatternString = nullptr; |
255 | 12.8k | } |
256 | 25.1k | if (fNamedCaptureMap != nullptr) { |
257 | 332 | uhash_close(fNamedCaptureMap); |
258 | 332 | fNamedCaptureMap = nullptr; |
259 | 332 | } |
260 | 25.1k | } |
261 | | |
262 | | |
263 | | //-------------------------------------------------------------------------- |
264 | | // |
265 | | // Destructor |
266 | | // |
267 | | //-------------------------------------------------------------------------- |
268 | 25.1k | RegexPattern::~RegexPattern() { |
269 | 25.1k | zap(); |
270 | 25.1k | } |
271 | | |
272 | | |
273 | | //-------------------------------------------------------------------------- |
274 | | // |
275 | | // Clone |
276 | | // |
277 | | //-------------------------------------------------------------------------- |
278 | 0 | RegexPattern *RegexPattern::clone() const { |
279 | 0 | RegexPattern *copy = new RegexPattern(*this); |
280 | 0 | return copy; |
281 | 0 | } |
282 | | |
283 | | |
284 | | //-------------------------------------------------------------------------- |
285 | | // |
286 | | // operator == (comparison) Consider two patterns to be == if the |
287 | | // pattern strings and the flags are the same. |
288 | | // Note that pattern strings with the same |
289 | | // characters can still be considered different. |
290 | | // |
291 | | //-------------------------------------------------------------------------- |
292 | 0 | bool RegexPattern::operator ==(const RegexPattern &other) const { |
293 | 0 | if (this->fFlags == other.fFlags && this->fDeferredStatus == other.fDeferredStatus) { |
294 | 0 | if (this->fPatternString != nullptr && other.fPatternString != nullptr) { |
295 | 0 | return *(this->fPatternString) == *(other.fPatternString); |
296 | 0 | } else if (this->fPattern == nullptr) { |
297 | 0 | if (other.fPattern == nullptr) { |
298 | 0 | return true; |
299 | 0 | } |
300 | 0 | } else if (other.fPattern != nullptr) { |
301 | 0 | UTEXT_SETNATIVEINDEX(this->fPattern, 0); |
302 | 0 | UTEXT_SETNATIVEINDEX(other.fPattern, 0); |
303 | 0 | return utext_equals(this->fPattern, other.fPattern); |
304 | 0 | } |
305 | 0 | } |
306 | 0 | return false; |
307 | 0 | } |
308 | | |
309 | | //--------------------------------------------------------------------- |
310 | | // |
311 | | // compile |
312 | | // |
313 | | //--------------------------------------------------------------------- |
314 | | RegexPattern * U_EXPORT2 |
315 | | RegexPattern::compile(const UnicodeString ®ex, |
316 | | uint32_t flags, |
317 | | UParseError &pe, |
318 | | UErrorCode &status) |
319 | 12.8k | { |
320 | 12.8k | if (U_FAILURE(status)) { |
321 | 0 | return nullptr; |
322 | 0 | } |
323 | | |
324 | 12.8k | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
325 | 12.8k | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
326 | 12.8k | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
327 | | |
328 | 12.8k | if ((flags & ~allFlags) != 0) { |
329 | 0 | status = U_REGEX_INVALID_FLAG; |
330 | 0 | return nullptr; |
331 | 0 | } |
332 | | |
333 | 12.8k | if ((flags & UREGEX_CANON_EQ) != 0) { |
334 | 0 | status = U_REGEX_UNIMPLEMENTED; |
335 | 0 | return nullptr; |
336 | 0 | } |
337 | | |
338 | 12.8k | RegexPattern *This = new RegexPattern; |
339 | 12.8k | if (This == nullptr) { |
340 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
341 | 0 | return nullptr; |
342 | 0 | } |
343 | 12.8k | if (U_FAILURE(This->fDeferredStatus)) { |
344 | 0 | status = This->fDeferredStatus; |
345 | 0 | delete This; |
346 | 0 | return nullptr; |
347 | 0 | } |
348 | 12.8k | This->fFlags = flags; |
349 | | |
350 | 12.8k | RegexCompile compiler(This, status); |
351 | 12.8k | compiler.compile(regex, pe, status); |
352 | | |
353 | 12.8k | if (U_FAILURE(status)) { |
354 | 1.72k | delete This; |
355 | 1.72k | This = nullptr; |
356 | 1.72k | } |
357 | | |
358 | 12.8k | return This; |
359 | 12.8k | } |
360 | | |
361 | | |
362 | | // |
363 | | // compile, UText mode |
364 | | // |
365 | | RegexPattern * U_EXPORT2 |
366 | | RegexPattern::compile(UText *regex, |
367 | | uint32_t flags, |
368 | | UParseError &pe, |
369 | | UErrorCode &status) |
370 | 12.3k | { |
371 | 12.3k | if (U_FAILURE(status)) { |
372 | 0 | return nullptr; |
373 | 0 | } |
374 | | |
375 | 12.3k | const uint32_t allFlags = UREGEX_CANON_EQ | UREGEX_CASE_INSENSITIVE | UREGEX_COMMENTS | |
376 | 12.3k | UREGEX_DOTALL | UREGEX_MULTILINE | UREGEX_UWORD | |
377 | 12.3k | UREGEX_ERROR_ON_UNKNOWN_ESCAPES | UREGEX_UNIX_LINES | UREGEX_LITERAL; |
378 | | |
379 | 12.3k | if ((flags & ~allFlags) != 0) { |
380 | 0 | status = U_REGEX_INVALID_FLAG; |
381 | 0 | return nullptr; |
382 | 0 | } |
383 | | |
384 | 12.3k | if ((flags & UREGEX_CANON_EQ) != 0) { |
385 | 0 | status = U_REGEX_UNIMPLEMENTED; |
386 | 0 | return nullptr; |
387 | 0 | } |
388 | | |
389 | 12.3k | RegexPattern *This = new RegexPattern; |
390 | 12.3k | if (This == nullptr) { |
391 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
392 | 0 | return nullptr; |
393 | 0 | } |
394 | 12.3k | if (U_FAILURE(This->fDeferredStatus)) { |
395 | 0 | status = This->fDeferredStatus; |
396 | 0 | delete This; |
397 | 0 | return nullptr; |
398 | 0 | } |
399 | 12.3k | This->fFlags = flags; |
400 | | |
401 | 12.3k | RegexCompile compiler(This, status); |
402 | 12.3k | compiler.compile(regex, pe, status); |
403 | | |
404 | 12.3k | if (U_FAILURE(status)) { |
405 | 7.42k | delete This; |
406 | 7.42k | This = nullptr; |
407 | 7.42k | } |
408 | | |
409 | 12.3k | return This; |
410 | 12.3k | } |
411 | | |
412 | | // |
413 | | // compile with default flags. |
414 | | // |
415 | | RegexPattern * U_EXPORT2 |
416 | | RegexPattern::compile(const UnicodeString ®ex, |
417 | | UParseError &pe, |
418 | | UErrorCode &err) |
419 | 0 | { |
420 | 0 | return compile(regex, 0, pe, err); |
421 | 0 | } |
422 | | |
423 | | |
424 | | // |
425 | | // compile with default flags, UText mode |
426 | | // |
427 | | RegexPattern * U_EXPORT2 |
428 | | RegexPattern::compile(UText *regex, |
429 | | UParseError &pe, |
430 | | UErrorCode &err) |
431 | 0 | { |
432 | 0 | return compile(regex, 0, pe, err); |
433 | 0 | } |
434 | | |
435 | | |
436 | | // |
437 | | // compile with no UParseError parameter. |
438 | | // |
439 | | RegexPattern * U_EXPORT2 |
440 | | RegexPattern::compile(const UnicodeString ®ex, |
441 | | uint32_t flags, |
442 | | UErrorCode &err) |
443 | 12.8k | { |
444 | 12.8k | UParseError pe; |
445 | 12.8k | return compile(regex, flags, pe, err); |
446 | 12.8k | } |
447 | | |
448 | | |
449 | | // |
450 | | // compile with no UParseError parameter, UText mode |
451 | | // |
452 | | RegexPattern * U_EXPORT2 |
453 | | RegexPattern::compile(UText *regex, |
454 | | uint32_t flags, |
455 | | UErrorCode &err) |
456 | 0 | { |
457 | 0 | UParseError pe; |
458 | 0 | return compile(regex, flags, pe, err); |
459 | 0 | } |
460 | | |
461 | | |
462 | | //--------------------------------------------------------------------- |
463 | | // |
464 | | // flags |
465 | | // |
466 | | //--------------------------------------------------------------------- |
467 | 0 | uint32_t RegexPattern::flags() const { |
468 | 0 | return fFlags; |
469 | 0 | } |
470 | | |
471 | | |
472 | | //--------------------------------------------------------------------- |
473 | | // |
474 | | // matcher(UnicodeString, err) |
475 | | // |
476 | | //--------------------------------------------------------------------- |
477 | | RegexMatcher *RegexPattern::matcher(const UnicodeString &input, |
478 | 11.1k | UErrorCode &status) const { |
479 | 11.1k | RegexMatcher *retMatcher = matcher(status); |
480 | 11.1k | if (retMatcher != nullptr) { |
481 | 11.1k | retMatcher->fDeferredStatus = status; |
482 | 11.1k | retMatcher->reset(input); |
483 | 11.1k | } |
484 | 11.1k | return retMatcher; |
485 | 11.1k | } |
486 | | |
487 | | |
488 | | //--------------------------------------------------------------------- |
489 | | // |
490 | | // matcher(status) |
491 | | // |
492 | | //--------------------------------------------------------------------- |
493 | 16.0k | RegexMatcher *RegexPattern::matcher(UErrorCode &status) const { |
494 | 16.0k | RegexMatcher *retMatcher = nullptr; |
495 | | |
496 | 16.0k | if (U_FAILURE(status)) { |
497 | 0 | return nullptr; |
498 | 0 | } |
499 | 16.0k | if (U_FAILURE(fDeferredStatus)) { |
500 | 0 | status = fDeferredStatus; |
501 | 0 | return nullptr; |
502 | 0 | } |
503 | | |
504 | 16.0k | retMatcher = new RegexMatcher(this); |
505 | 16.0k | if (retMatcher == nullptr) { |
506 | 0 | status = U_MEMORY_ALLOCATION_ERROR; |
507 | 0 | return nullptr; |
508 | 0 | } |
509 | 16.0k | return retMatcher; |
510 | 16.0k | } |
511 | | |
512 | | |
513 | | |
514 | | //--------------------------------------------------------------------- |
515 | | // |
516 | | // matches Convenience function to test for a match, starting |
517 | | // with a pattern string and a data string. |
518 | | // |
519 | | //--------------------------------------------------------------------- |
520 | | UBool U_EXPORT2 RegexPattern::matches(const UnicodeString ®ex, |
521 | | const UnicodeString &input, |
522 | | UParseError &pe, |
523 | 0 | UErrorCode &status) { |
524 | |
|
525 | 0 | if (U_FAILURE(status)) {return false;} |
526 | | |
527 | 0 | UBool retVal; |
528 | 0 | RegexPattern *pat = nullptr; |
529 | 0 | RegexMatcher *matcher = nullptr; |
530 | |
|
531 | 0 | pat = RegexPattern::compile(regex, 0, pe, status); |
532 | 0 | matcher = pat->matcher(input, status); |
533 | 0 | retVal = matcher->matches(status); |
534 | |
|
535 | 0 | delete matcher; |
536 | 0 | delete pat; |
537 | 0 | return retVal; |
538 | 0 | } |
539 | | |
540 | | |
541 | | // |
542 | | // matches, UText mode |
543 | | // |
544 | | UBool U_EXPORT2 RegexPattern::matches(UText *regex, |
545 | | UText *input, |
546 | | UParseError &pe, |
547 | 0 | UErrorCode &status) { |
548 | |
|
549 | 0 | if (U_FAILURE(status)) {return false;} |
550 | | |
551 | 0 | UBool retVal = false; |
552 | 0 | RegexPattern *pat = nullptr; |
553 | 0 | RegexMatcher *matcher = nullptr; |
554 | |
|
555 | 0 | pat = RegexPattern::compile(regex, 0, pe, status); |
556 | 0 | matcher = pat->matcher(status); |
557 | 0 | if (U_SUCCESS(status)) { |
558 | 0 | matcher->reset(input); |
559 | 0 | retVal = matcher->matches(status); |
560 | 0 | } |
561 | |
|
562 | 0 | delete matcher; |
563 | 0 | delete pat; |
564 | 0 | return retVal; |
565 | 0 | } |
566 | | |
567 | | |
568 | | |
569 | | |
570 | | |
571 | | //--------------------------------------------------------------------- |
572 | | // |
573 | | // pattern |
574 | | // |
575 | | //--------------------------------------------------------------------- |
576 | 0 | UnicodeString RegexPattern::pattern() const { |
577 | 0 | if (fPatternString != nullptr) { |
578 | 0 | return *fPatternString; |
579 | 0 | } else if (fPattern == nullptr) { |
580 | 0 | return {}; |
581 | 0 | } else { |
582 | 0 | UErrorCode status = U_ZERO_ERROR; |
583 | 0 | int64_t nativeLen = utext_nativeLength(fPattern); |
584 | 0 | int32_t len16 = utext_extract(fPattern, 0, nativeLen, nullptr, 0, &status); // buffer overflow error |
585 | 0 | UnicodeString result; |
586 | |
|
587 | 0 | status = U_ZERO_ERROR; |
588 | 0 | char16_t *resultChars = result.getBuffer(len16); |
589 | 0 | utext_extract(fPattern, 0, nativeLen, resultChars, len16, &status); // unterminated warning |
590 | 0 | result.releaseBuffer(len16); |
591 | |
|
592 | 0 | return result; |
593 | 0 | } |
594 | 0 | } |
595 | | |
596 | | |
597 | | |
598 | | |
599 | | //--------------------------------------------------------------------- |
600 | | // |
601 | | // patternText |
602 | | // |
603 | | //--------------------------------------------------------------------- |
604 | 0 | UText *RegexPattern::patternText(UErrorCode &status) const { |
605 | 0 | if (U_FAILURE(status)) {return nullptr;} |
606 | 0 | status = U_ZERO_ERROR; |
607 | |
|
608 | 0 | if (fPattern != nullptr) { |
609 | 0 | return fPattern; |
610 | 0 | } else { |
611 | 0 | RegexStaticSets::initGlobals(&status); |
612 | 0 | return RegexStaticSets::gStaticSets->fEmptyText; |
613 | 0 | } |
614 | 0 | } |
615 | | |
616 | | |
617 | | //-------------------------------------------------------------------------------- |
618 | | // |
619 | | // groupNumberFromName() |
620 | | // |
621 | | //-------------------------------------------------------------------------------- |
622 | 0 | int32_t RegexPattern::groupNumberFromName(const UnicodeString &groupName, UErrorCode &status) const { |
623 | 0 | if (U_FAILURE(status)) { |
624 | 0 | return 0; |
625 | 0 | } |
626 | | |
627 | | // No need to explicitly check for syntactically valid names. |
628 | | // Invalid ones will never be in the map, and the lookup will fail. |
629 | | |
630 | 0 | int32_t number = fNamedCaptureMap ? uhash_geti(fNamedCaptureMap, &groupName) : 0; |
631 | 0 | if (number == 0) { |
632 | 0 | status = U_REGEX_INVALID_CAPTURE_GROUP_NAME; |
633 | 0 | } |
634 | 0 | return number; |
635 | 0 | } |
636 | | |
637 | 0 | int32_t RegexPattern::groupNumberFromName(const char *groupName, int32_t nameLength, UErrorCode &status) const { |
638 | 0 | if (U_FAILURE(status)) { |
639 | 0 | return 0; |
640 | 0 | } |
641 | 0 | UnicodeString name(groupName, nameLength, US_INV); |
642 | 0 | return groupNumberFromName(name, status); |
643 | 0 | } |
644 | | |
645 | | |
646 | | //--------------------------------------------------------------------- |
647 | | // |
648 | | // split |
649 | | // |
650 | | //--------------------------------------------------------------------- |
651 | | int32_t RegexPattern::split(const UnicodeString &input, |
652 | | UnicodeString dest[], |
653 | | int32_t destCapacity, |
654 | | UErrorCode &status) const |
655 | 0 | { |
656 | 0 | if (U_FAILURE(status)) { |
657 | 0 | return 0; |
658 | 0 | } |
659 | | |
660 | 0 | RegexMatcher m(this); |
661 | 0 | int32_t r = 0; |
662 | | // Check m's status to make sure all is ok. |
663 | 0 | if (U_SUCCESS(m.fDeferredStatus)) { |
664 | 0 | r = m.split(input, dest, destCapacity, status); |
665 | 0 | } |
666 | 0 | return r; |
667 | 0 | } |
668 | | |
669 | | // |
670 | | // split, UText mode |
671 | | // |
672 | | int32_t RegexPattern::split(UText *input, |
673 | | UText *dest[], |
674 | | int32_t destCapacity, |
675 | | UErrorCode &status) const |
676 | 0 | { |
677 | 0 | if (U_FAILURE(status)) { |
678 | 0 | return 0; |
679 | 0 | } |
680 | | |
681 | 0 | RegexMatcher m(this); |
682 | 0 | int32_t r = 0; |
683 | | // Check m's status to make sure all is ok. |
684 | 0 | if (U_SUCCESS(m.fDeferredStatus)) { |
685 | 0 | r = m.split(input, dest, destCapacity, status); |
686 | 0 | } |
687 | 0 | return r; |
688 | 0 | } |
689 | | |
690 | | |
691 | | //--------------------------------------------------------------------- |
692 | | // |
693 | | // dump Output the compiled form of the pattern. |
694 | | // Debugging function only. |
695 | | // |
696 | | //--------------------------------------------------------------------- |
697 | 0 | void RegexPattern::dumpOp(int32_t index) const { |
698 | 0 | (void)index; // Suppress warnings in non-debug build. |
699 | | #if defined(REGEX_DEBUG) |
700 | | static const char * const opNames[] = {URX_OPCODE_NAMES}; |
701 | | int32_t op = fCompiledPat->elementAti(index); |
702 | | int32_t val = URX_VAL(op); |
703 | | int32_t type = URX_TYPE(op); |
704 | | int32_t pinnedType = type; |
705 | | if ((uint32_t)pinnedType >= UPRV_LENGTHOF(opNames)) { |
706 | | pinnedType = 0; |
707 | | } |
708 | | |
709 | | printf("%4d %08x %-15s ", index, op, opNames[pinnedType]); |
710 | | switch (type) { |
711 | | case URX_NOP: |
712 | | case URX_DOTANY: |
713 | | case URX_DOTANY_ALL: |
714 | | case URX_FAIL: |
715 | | case URX_CARET: |
716 | | case URX_DOLLAR: |
717 | | case URX_BACKSLASH_G: |
718 | | case URX_BACKSLASH_X: |
719 | | case URX_END: |
720 | | case URX_DOLLAR_M: |
721 | | case URX_CARET_M: |
722 | | // Types with no operand field of interest. |
723 | | break; |
724 | | |
725 | | case URX_RESERVED_OP: |
726 | | case URX_START_CAPTURE: |
727 | | case URX_END_CAPTURE: |
728 | | case URX_STATE_SAVE: |
729 | | case URX_JMP: |
730 | | case URX_JMP_SAV: |
731 | | case URX_JMP_SAV_X: |
732 | | case URX_BACKSLASH_B: |
733 | | case URX_BACKSLASH_BU: |
734 | | case URX_BACKSLASH_D: |
735 | | case URX_BACKSLASH_Z: |
736 | | case URX_STRING_LEN: |
737 | | case URX_CTR_INIT: |
738 | | case URX_CTR_INIT_NG: |
739 | | case URX_CTR_LOOP: |
740 | | case URX_CTR_LOOP_NG: |
741 | | case URX_RELOC_OPRND: |
742 | | case URX_STO_SP: |
743 | | case URX_LD_SP: |
744 | | case URX_BACKREF: |
745 | | case URX_STO_INP_LOC: |
746 | | case URX_JMPX: |
747 | | case URX_LA_START: |
748 | | case URX_LA_END: |
749 | | case URX_BACKREF_I: |
750 | | case URX_LB_START: |
751 | | case URX_LB_CONT: |
752 | | case URX_LB_END: |
753 | | case URX_LBN_CONT: |
754 | | case URX_LBN_END: |
755 | | case URX_LOOP_C: |
756 | | case URX_LOOP_DOT_I: |
757 | | case URX_BACKSLASH_H: |
758 | | case URX_BACKSLASH_R: |
759 | | case URX_BACKSLASH_V: |
760 | | // types with an integer operand field. |
761 | | printf("%d", val); |
762 | | break; |
763 | | |
764 | | case URX_ONECHAR: |
765 | | case URX_ONECHAR_I: |
766 | | if (val < 0x20) { |
767 | | printf("%#x", val); |
768 | | } else { |
769 | | printf("'%s'", CStr(UnicodeString(val))()); |
770 | | } |
771 | | break; |
772 | | |
773 | | case URX_STRING: |
774 | | case URX_STRING_I: |
775 | | { |
776 | | int32_t lengthOp = fCompiledPat->elementAti(index+1); |
777 | | U_ASSERT(URX_TYPE(lengthOp) == URX_STRING_LEN); |
778 | | int32_t length = URX_VAL(lengthOp); |
779 | | UnicodeString str(fLiteralText, val, length); |
780 | | printf("%s", CStr(str)()); |
781 | | } |
782 | | break; |
783 | | |
784 | | case URX_SETREF: |
785 | | case URX_LOOP_SR_I: |
786 | | { |
787 | | UnicodeString s; |
788 | | UnicodeSet *set = (UnicodeSet *)fSets->elementAt(val); |
789 | | set->toPattern(s, true); |
790 | | printf("%s", CStr(s)()); |
791 | | } |
792 | | break; |
793 | | |
794 | | case URX_STATIC_SETREF: |
795 | | case URX_STAT_SETREF_N: |
796 | | { |
797 | | UnicodeString s; |
798 | | if (val & URX_NEG_SET) { |
799 | | printf("NOT "); |
800 | | val &= ~URX_NEG_SET; |
801 | | } |
802 | | UnicodeSet &set = RegexStaticSets::gStaticSets->fPropSets[val]; |
803 | | set.toPattern(s, true); |
804 | | printf("%s", CStr(s)()); |
805 | | } |
806 | | break; |
807 | | |
808 | | |
809 | | default: |
810 | | printf("??????"); |
811 | | break; |
812 | | } |
813 | | printf("\n"); |
814 | | #endif |
815 | 0 | } |
816 | | |
817 | | |
818 | 0 | void RegexPattern::dumpPattern() const { |
819 | | #if defined(REGEX_DEBUG) |
820 | | int index; |
821 | | |
822 | | UnicodeString patStr; |
823 | | for (UChar32 c = utext_next32From(fPattern, 0); c != U_SENTINEL; c = utext_next32(fPattern)) { |
824 | | patStr.append(c); |
825 | | } |
826 | | printf("Original Pattern: \"%s\"\n", CStr(patStr)()); |
827 | | printf(" Min Match Length: %d\n", fMinMatchLen); |
828 | | printf(" Match Start Type: %s\n", START_OF_MATCH_STR(fStartType)); |
829 | | if (fStartType == START_STRING) { |
830 | | UnicodeString initialString(fLiteralText,fInitialStringIdx, fInitialStringLen); |
831 | | printf(" Initial match string: \"%s\"\n", CStr(initialString)()); |
832 | | } else if (fStartType == START_SET) { |
833 | | UnicodeString s; |
834 | | fInitialChars->toPattern(s, true); |
835 | | printf(" Match First Chars: %s\n", CStr(s)()); |
836 | | |
837 | | } else if (fStartType == START_CHAR) { |
838 | | printf(" First char of Match: "); |
839 | | if (fInitialChar > 0x20) { |
840 | | printf("'%s'\n", CStr(UnicodeString(fInitialChar))()); |
841 | | } else { |
842 | | printf("%#x\n", fInitialChar); |
843 | | } |
844 | | } |
845 | | |
846 | | printf("Named Capture Groups:\n"); |
847 | | if (!fNamedCaptureMap || uhash_count(fNamedCaptureMap) == 0) { |
848 | | printf(" None\n"); |
849 | | } else { |
850 | | int32_t pos = UHASH_FIRST; |
851 | | const UHashElement *el = nullptr; |
852 | | while ((el = uhash_nextElement(fNamedCaptureMap, &pos))) { |
853 | | const UnicodeString *name = (const UnicodeString *)el->key.pointer; |
854 | | int32_t number = el->value.integer; |
855 | | printf(" %d\t%s\n", number, CStr(*name)()); |
856 | | } |
857 | | } |
858 | | |
859 | | printf("\nIndex Binary Type Operand\n" \ |
860 | | "-------------------------------------------\n"); |
861 | | for (index = 0; index<fCompiledPat->size(); index++) { |
862 | | dumpOp(index); |
863 | | } |
864 | | printf("\n\n"); |
865 | | #endif |
866 | 0 | } |
867 | | |
868 | | |
869 | | |
// Macro invocation for RegexPattern. Judging by its name it supplies the
// UObject RTTI definitions for the class.
// NOTE(review): the macro is defined outside this file; confirm what it expands to.
870 | | UOBJECT_DEFINE_RTTI_IMPLEMENTATION(RegexPattern) |
871 | | |
872 | | U_NAMESPACE_END |
873 | | #endif // !UCONFIG_NO_REGULAR_EXPRESSIONS |