/proc/self/cwd/external/re2~/re2/regexp.cc
Line | Count | Source |
1 | | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Regular expression representation. |
6 | | // Tested by parse_test.cc |
7 | | |
8 | | #include "re2/regexp.h" |
9 | | |
10 | | #include <stddef.h> |
11 | | #include <stdint.h> |
12 | | #include <string.h> |
13 | | |
14 | | #include <algorithm> |
15 | | #include <map> |
16 | | #include <string> |
17 | | #include <vector> |
18 | | |
19 | | #include "absl/base/call_once.h" |
20 | | #include "absl/base/macros.h" |
21 | | #include "absl/container/flat_hash_map.h" |
22 | | #include "absl/log/absl_check.h" |
23 | | #include "absl/log/absl_log.h" |
24 | | #include "absl/synchronization/mutex.h" |
25 | | #include "re2/pod_array.h" |
26 | | #include "re2/walker-inl.h" |
27 | | #include "util/utf.h" |
28 | | |
29 | | namespace re2 { |
30 | | |
31 | | // Constructor. Allocates vectors as appropriate for operator. |
32 | | Regexp::Regexp(RegexpOp op, ParseFlags parse_flags) |
33 | 0 | : op_(static_cast<uint8_t>(op)), |
34 | 0 | simple_(false), |
35 | 0 | parse_flags_(static_cast<uint16_t>(parse_flags)), |
36 | 0 | ref_(1), |
37 | 0 | nsub_(0), |
38 | 0 | down_(NULL) { |
39 | 0 | subone_ = NULL; |
40 | 0 | memset(the_union_, 0, sizeof the_union_); |
41 | 0 | } |
42 | | |
43 | | // Destructor. Assumes already cleaned up children. |
44 | | // Private: use Decref() instead of delete to destroy Regexps. |
45 | | // Can't call Decref on the sub-Regexps here because |
46 | | // that could cause arbitrarily deep recursion, so |
47 | | // required Decref() to have handled them for us. |
48 | 0 | Regexp::~Regexp() { |
49 | 0 | if (nsub_ > 0) |
50 | 0 | ABSL_LOG(DFATAL) << "Regexp not destroyed."; |
51 | |
|
52 | 0 | switch (op_) { |
53 | 0 | default: |
54 | 0 | break; |
55 | 0 | case kRegexpCapture: |
56 | 0 | delete name_; |
57 | 0 | break; |
58 | 0 | case kRegexpLiteralString: |
59 | 0 | delete[] runes_; |
60 | 0 | break; |
61 | 0 | case kRegexpCharClass: |
62 | 0 | if (cc_) |
63 | 0 | cc_->Delete(); |
64 | 0 | delete ccb_; |
65 | 0 | break; |
66 | 0 | } |
67 | 0 | } |
68 | | |
69 | | // If it's possible to destroy this regexp without recurring, |
70 | | // do so and return true. Else return false. |
71 | 0 | bool Regexp::QuickDestroy() { |
72 | 0 | if (nsub_ == 0) { |
73 | 0 | delete this; |
74 | 0 | return true; |
75 | 0 | } |
76 | 0 | return false; |
77 | 0 | } |
78 | | |
79 | | // Similar to EmptyStorage in re2.cc. |
80 | | struct RefStorage { |
81 | | absl::Mutex ref_mutex; |
82 | | absl::flat_hash_map<Regexp*, int> ref_map; |
83 | | }; |
84 | | alignas(RefStorage) static char ref_storage[sizeof(RefStorage)]; |
85 | | |
86 | 0 | static inline absl::Mutex* ref_mutex() { |
87 | 0 | return &reinterpret_cast<RefStorage*>(ref_storage)->ref_mutex; |
88 | 0 | } |
89 | | |
90 | 0 | static inline absl::flat_hash_map<Regexp*, int>* ref_map() { |
91 | 0 | return &reinterpret_cast<RefStorage*>(ref_storage)->ref_map; |
92 | 0 | } |
93 | | |
94 | 0 | int Regexp::Ref() { |
95 | 0 | if (ref_ < kMaxRef) |
96 | 0 | return ref_; |
97 | | |
98 | 0 | absl::MutexLock l(ref_mutex()); |
99 | 0 | return (*ref_map())[this]; |
100 | 0 | } |
101 | | |
102 | | // Increments reference count, returns object as convenience. |
103 | 0 | Regexp* Regexp::Incref() { |
104 | 0 | if (ref_ >= kMaxRef-1) { |
105 | 0 | static absl::once_flag ref_once; |
106 | 0 | absl::call_once(ref_once, []() { |
107 | 0 | (void) new (ref_storage) RefStorage; |
108 | 0 | }); |
109 | | |
110 | | // Store ref count in overflow map. |
111 | 0 | absl::MutexLock l(ref_mutex()); |
112 | 0 | if (ref_ == kMaxRef) { |
113 | | // already overflowed |
114 | 0 | (*ref_map())[this]++; |
115 | 0 | } else { |
116 | | // overflowing now |
117 | 0 | (*ref_map())[this] = kMaxRef; |
118 | 0 | ref_ = kMaxRef; |
119 | 0 | } |
120 | 0 | return this; |
121 | 0 | } |
122 | | |
123 | 0 | ref_++; |
124 | 0 | return this; |
125 | 0 | } |
126 | | |
127 | | // Decrements reference count and deletes this object if count reaches 0. |
128 | 0 | void Regexp::Decref() { |
129 | 0 | if (ref_ == kMaxRef) { |
130 | | // Ref count is stored in overflow map. |
131 | 0 | absl::MutexLock l(ref_mutex()); |
132 | 0 | int r = (*ref_map())[this] - 1; |
133 | 0 | if (r < kMaxRef) { |
134 | 0 | ref_ = static_cast<uint16_t>(r); |
135 | 0 | ref_map()->erase(this); |
136 | 0 | } else { |
137 | 0 | (*ref_map())[this] = r; |
138 | 0 | } |
139 | 0 | return; |
140 | 0 | } |
141 | 0 | ref_--; |
142 | 0 | if (ref_ == 0) |
143 | 0 | Destroy(); |
144 | 0 | } |
145 | | |
146 | | // Deletes this object; ref count has count reached 0. |
147 | 0 | void Regexp::Destroy() { |
148 | 0 | if (QuickDestroy()) |
149 | 0 | return; |
150 | | |
151 | | // Handle recursive Destroy with explicit stack |
152 | | // to avoid arbitrarily deep recursion on process stack [sigh]. |
153 | 0 | down_ = NULL; |
154 | 0 | Regexp* stack = this; |
155 | 0 | while (stack != NULL) { |
156 | 0 | Regexp* re = stack; |
157 | 0 | stack = re->down_; |
158 | 0 | if (re->ref_ != 0) |
159 | 0 | ABSL_LOG(DFATAL) << "Bad reference count " << re->ref_; |
160 | 0 | if (re->nsub_ > 0) { |
161 | 0 | Regexp** subs = re->sub(); |
162 | 0 | for (int i = 0; i < re->nsub_; i++) { |
163 | 0 | Regexp* sub = subs[i]; |
164 | 0 | if (sub == NULL) |
165 | 0 | continue; |
166 | 0 | if (sub->ref_ == kMaxRef) |
167 | 0 | sub->Decref(); |
168 | 0 | else |
169 | 0 | --sub->ref_; |
170 | 0 | if (sub->ref_ == 0 && !sub->QuickDestroy()) { |
171 | 0 | sub->down_ = stack; |
172 | 0 | stack = sub; |
173 | 0 | } |
174 | 0 | } |
175 | 0 | if (re->nsub_ > 1) |
176 | 0 | delete[] subs; |
177 | 0 | re->nsub_ = 0; |
178 | 0 | } |
179 | 0 | delete re; |
180 | 0 | } |
181 | 0 | } |
182 | | |
183 | 0 | void Regexp::AddRuneToString(Rune r) { |
184 | 0 | ABSL_DCHECK(op_ == kRegexpLiteralString); |
185 | 0 | if (nrunes_ == 0) { |
186 | | // start with 8 |
187 | 0 | runes_ = new Rune[8]; |
188 | 0 | } else if (nrunes_ >= 8 && (nrunes_ & (nrunes_ - 1)) == 0) { |
189 | | // double on powers of two |
190 | 0 | Rune *old = runes_; |
191 | 0 | runes_ = new Rune[nrunes_ * 2]; |
192 | 0 | for (int i = 0; i < nrunes_; i++) |
193 | 0 | runes_[i] = old[i]; |
194 | 0 | delete[] old; |
195 | 0 | } |
196 | |
|
197 | 0 | runes_[nrunes_++] = r; |
198 | 0 | } |
199 | | |
200 | 0 | Regexp* Regexp::HaveMatch(int match_id, ParseFlags flags) { |
201 | 0 | Regexp* re = new Regexp(kRegexpHaveMatch, flags); |
202 | 0 | re->match_id_ = match_id; |
203 | 0 | return re; |
204 | 0 | } |
205 | | |
206 | 0 | Regexp* Regexp::StarPlusOrQuest(RegexpOp op, Regexp* sub, ParseFlags flags) { |
207 | | // Squash **, ++ and ??. |
208 | 0 | if (op == sub->op() && flags == sub->parse_flags()) |
209 | 0 | return sub; |
210 | | |
211 | | // Squash *+, *?, +*, +?, ?* and ?+. They all squash to *, so because |
212 | | // op is Star/Plus/Quest, we just have to check that sub->op() is too. |
213 | 0 | if ((sub->op() == kRegexpStar || |
214 | 0 | sub->op() == kRegexpPlus || |
215 | 0 | sub->op() == kRegexpQuest) && |
216 | 0 | flags == sub->parse_flags()) { |
217 | | // If sub is Star, no need to rewrite it. |
218 | 0 | if (sub->op() == kRegexpStar) |
219 | 0 | return sub; |
220 | | |
221 | | // Rewrite sub to Star. |
222 | 0 | Regexp* re = new Regexp(kRegexpStar, flags); |
223 | 0 | re->AllocSub(1); |
224 | 0 | re->sub()[0] = sub->sub()[0]->Incref(); |
225 | 0 | sub->Decref(); // We didn't consume the reference after all. |
226 | 0 | return re; |
227 | 0 | } |
228 | | |
229 | 0 | Regexp* re = new Regexp(op, flags); |
230 | 0 | re->AllocSub(1); |
231 | 0 | re->sub()[0] = sub; |
232 | 0 | return re; |
233 | 0 | } |
234 | | |
235 | 0 | Regexp* Regexp::Plus(Regexp* sub, ParseFlags flags) { |
236 | 0 | return StarPlusOrQuest(kRegexpPlus, sub, flags); |
237 | 0 | } |
238 | | |
239 | 0 | Regexp* Regexp::Star(Regexp* sub, ParseFlags flags) { |
240 | 0 | return StarPlusOrQuest(kRegexpStar, sub, flags); |
241 | 0 | } |
242 | | |
243 | 0 | Regexp* Regexp::Quest(Regexp* sub, ParseFlags flags) { |
244 | 0 | return StarPlusOrQuest(kRegexpQuest, sub, flags); |
245 | 0 | } |
246 | | |
247 | | Regexp* Regexp::ConcatOrAlternate(RegexpOp op, Regexp** sub, int nsub, |
248 | 0 | ParseFlags flags, bool can_factor) { |
249 | 0 | if (nsub == 1) |
250 | 0 | return sub[0]; |
251 | | |
252 | 0 | if (nsub == 0) { |
253 | 0 | if (op == kRegexpAlternate) |
254 | 0 | return new Regexp(kRegexpNoMatch, flags); |
255 | 0 | else |
256 | 0 | return new Regexp(kRegexpEmptyMatch, flags); |
257 | 0 | } |
258 | | |
259 | 0 | PODArray<Regexp*> subcopy; |
260 | 0 | if (op == kRegexpAlternate && can_factor) { |
261 | | // Going to edit sub; make a copy so we don't step on caller. |
262 | 0 | subcopy = PODArray<Regexp*>(nsub); |
263 | 0 | memmove(subcopy.data(), sub, nsub * sizeof sub[0]); |
264 | 0 | sub = subcopy.data(); |
265 | 0 | nsub = FactorAlternation(sub, nsub, flags); |
266 | 0 | if (nsub == 1) { |
267 | 0 | Regexp* re = sub[0]; |
268 | 0 | return re; |
269 | 0 | } |
270 | 0 | } |
271 | | |
272 | 0 | if (nsub > kMaxNsub) { |
273 | | // Too many subexpressions to fit in a single Regexp. |
274 | | // Make a two-level tree. Two levels gets us to 65535^2. |
275 | 0 | int nbigsub = (nsub+kMaxNsub-1)/kMaxNsub; |
276 | 0 | Regexp* re = new Regexp(op, flags); |
277 | 0 | re->AllocSub(nbigsub); |
278 | 0 | Regexp** subs = re->sub(); |
279 | 0 | for (int i = 0; i < nbigsub - 1; i++) |
280 | 0 | subs[i] = ConcatOrAlternate(op, sub+i*kMaxNsub, kMaxNsub, flags, false); |
281 | 0 | subs[nbigsub - 1] = ConcatOrAlternate(op, sub+(nbigsub-1)*kMaxNsub, |
282 | 0 | nsub - (nbigsub-1)*kMaxNsub, flags, |
283 | 0 | false); |
284 | 0 | return re; |
285 | 0 | } |
286 | | |
287 | 0 | Regexp* re = new Regexp(op, flags); |
288 | 0 | re->AllocSub(nsub); |
289 | 0 | Regexp** subs = re->sub(); |
290 | 0 | for (int i = 0; i < nsub; i++) |
291 | 0 | subs[i] = sub[i]; |
292 | 0 | return re; |
293 | 0 | } |
294 | | |
295 | 0 | Regexp* Regexp::Concat(Regexp** sub, int nsub, ParseFlags flags) { |
296 | 0 | return ConcatOrAlternate(kRegexpConcat, sub, nsub, flags, false); |
297 | 0 | } |
298 | | |
299 | 0 | Regexp* Regexp::Alternate(Regexp** sub, int nsub, ParseFlags flags) { |
300 | 0 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, true); |
301 | 0 | } |
302 | | |
303 | 0 | Regexp* Regexp::AlternateNoFactor(Regexp** sub, int nsub, ParseFlags flags) { |
304 | 0 | return ConcatOrAlternate(kRegexpAlternate, sub, nsub, flags, false); |
305 | 0 | } |
306 | | |
307 | 0 | Regexp* Regexp::Capture(Regexp* sub, ParseFlags flags, int cap) { |
308 | 0 | Regexp* re = new Regexp(kRegexpCapture, flags); |
309 | 0 | re->AllocSub(1); |
310 | 0 | re->sub()[0] = sub; |
311 | 0 | re->cap_ = cap; |
312 | 0 | return re; |
313 | 0 | } |
314 | | |
315 | 0 | Regexp* Regexp::Repeat(Regexp* sub, ParseFlags flags, int min, int max) { |
316 | 0 | Regexp* re = new Regexp(kRegexpRepeat, flags); |
317 | 0 | re->AllocSub(1); |
318 | 0 | re->sub()[0] = sub; |
319 | 0 | re->min_ = min; |
320 | 0 | re->max_ = max; |
321 | 0 | return re; |
322 | 0 | } |
323 | | |
324 | 0 | Regexp* Regexp::NewLiteral(Rune rune, ParseFlags flags) { |
325 | 0 | Regexp* re = new Regexp(kRegexpLiteral, flags); |
326 | 0 | re->rune_ = rune; |
327 | 0 | return re; |
328 | 0 | } |
329 | | |
330 | 0 | Regexp* Regexp::LiteralString(Rune* runes, int nrunes, ParseFlags flags) { |
331 | 0 | if (nrunes <= 0) |
332 | 0 | return new Regexp(kRegexpEmptyMatch, flags); |
333 | 0 | if (nrunes == 1) |
334 | 0 | return NewLiteral(runes[0], flags); |
335 | 0 | Regexp* re = new Regexp(kRegexpLiteralString, flags); |
336 | 0 | for (int i = 0; i < nrunes; i++) |
337 | 0 | re->AddRuneToString(runes[i]); |
338 | 0 | return re; |
339 | 0 | } |
340 | | |
341 | 0 | Regexp* Regexp::NewCharClass(CharClass* cc, ParseFlags flags) { |
342 | 0 | Regexp* re = new Regexp(kRegexpCharClass, flags); |
343 | 0 | re->cc_ = cc; |
344 | 0 | return re; |
345 | 0 | } |
346 | | |
347 | 0 | void Regexp::Swap(Regexp* that) { |
348 | | // Regexp is not trivially copyable, so we cannot freely copy it with |
349 | | // memmove(3), but swapping objects like so is safe for our purposes. |
350 | 0 | char tmp[sizeof *this]; |
351 | 0 | void* vthis = reinterpret_cast<void*>(this); |
352 | 0 | void* vthat = reinterpret_cast<void*>(that); |
353 | 0 | memmove(tmp, vthis, sizeof *this); |
354 | 0 | memmove(vthis, vthat, sizeof *this); |
355 | 0 | memmove(vthat, tmp, sizeof *this); |
356 | 0 | } |
357 | | |
358 | | // Tests equality of all top-level structure but not subregexps. |
359 | 0 | static bool TopEqual(Regexp* a, Regexp* b) { |
360 | 0 | if (a->op() != b->op()) |
361 | 0 | return false; |
362 | | |
363 | 0 | switch (a->op()) { |
364 | 0 | case kRegexpNoMatch: |
365 | 0 | case kRegexpEmptyMatch: |
366 | 0 | case kRegexpAnyChar: |
367 | 0 | case kRegexpAnyByte: |
368 | 0 | case kRegexpBeginLine: |
369 | 0 | case kRegexpEndLine: |
370 | 0 | case kRegexpWordBoundary: |
371 | 0 | case kRegexpNoWordBoundary: |
372 | 0 | case kRegexpBeginText: |
373 | 0 | return true; |
374 | | |
375 | 0 | case kRegexpEndText: |
376 | | // The parse flags remember whether it's \z or (?-m:$), |
377 | | // which matters when testing against PCRE. |
378 | 0 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::WasDollar) == 0; |
379 | | |
380 | 0 | case kRegexpLiteral: |
381 | 0 | return a->rune() == b->rune() && |
382 | 0 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0; |
383 | | |
384 | 0 | case kRegexpLiteralString: |
385 | 0 | return a->nrunes() == b->nrunes() && |
386 | 0 | ((a->parse_flags() ^ b->parse_flags()) & Regexp::FoldCase) == 0 && |
387 | 0 | memcmp(a->runes(), b->runes(), |
388 | 0 | a->nrunes() * sizeof a->runes()[0]) == 0; |
389 | | |
390 | 0 | case kRegexpAlternate: |
391 | 0 | case kRegexpConcat: |
392 | 0 | return a->nsub() == b->nsub(); |
393 | | |
394 | 0 | case kRegexpStar: |
395 | 0 | case kRegexpPlus: |
396 | 0 | case kRegexpQuest: |
397 | 0 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0; |
398 | | |
399 | 0 | case kRegexpRepeat: |
400 | 0 | return ((a->parse_flags() ^ b->parse_flags()) & Regexp::NonGreedy) == 0 && |
401 | 0 | a->min() == b->min() && |
402 | 0 | a->max() == b->max(); |
403 | | |
404 | 0 | case kRegexpCapture: |
405 | 0 | if (a->name() == NULL || b->name() == NULL) { |
406 | | // One pointer is null, so the other pointer should also be null. |
407 | 0 | return a->cap() == b->cap() && a->name() == b->name(); |
408 | 0 | } else { |
409 | | // Neither pointer is null, so compare the pointees for equality. |
410 | 0 | return a->cap() == b->cap() && *a->name() == *b->name(); |
411 | 0 | } |
412 | | |
413 | 0 | case kRegexpHaveMatch: |
414 | 0 | return a->match_id() == b->match_id(); |
415 | | |
416 | 0 | case kRegexpCharClass: { |
417 | 0 | CharClass* acc = a->cc(); |
418 | 0 | CharClass* bcc = b->cc(); |
419 | 0 | return acc->size() == bcc->size() && |
420 | 0 | acc->end() - acc->begin() == bcc->end() - bcc->begin() && |
421 | 0 | memcmp(acc->begin(), bcc->begin(), |
422 | 0 | (acc->end() - acc->begin()) * sizeof acc->begin()[0]) == 0; |
423 | 0 | } |
424 | 0 | } |
425 | | |
426 | 0 | ABSL_LOG(DFATAL) << "Unexpected op in Regexp::Equal: " << a->op(); |
427 | 0 | return 0; |
428 | 0 | } |
429 | | |
430 | | bool Regexp::Equal(Regexp* a, Regexp* b) { |
431 | | if (a == NULL || b == NULL) |
432 | | return a == b; |
433 | | |
434 | | if (!TopEqual(a, b)) |
435 | | return false; |
436 | | |
437 | | // Fast path: |
438 | | // return without allocating vector if there are no subregexps. |
439 | | switch (a->op()) { |
440 | | case kRegexpAlternate: |
441 | | case kRegexpConcat: |
442 | | case kRegexpStar: |
443 | | case kRegexpPlus: |
444 | | case kRegexpQuest: |
445 | | case kRegexpRepeat: |
446 | | case kRegexpCapture: |
447 | | break; |
448 | | |
449 | | default: |
450 | | return true; |
451 | | } |
452 | | |
453 | | // Committed to doing real work. |
454 | | // The stack (vector) has pairs of regexps waiting to |
455 | | // be compared. The regexps are only equal if |
456 | | // all the pairs end up being equal. |
457 | | std::vector<Regexp*> stk; |
458 | | |
459 | | for (;;) { |
460 | | // Invariant: TopEqual(a, b) == true. |
461 | | Regexp* a2; |
462 | | Regexp* b2; |
463 | | switch (a->op()) { |
464 | | default: |
465 | | break; |
466 | | case kRegexpAlternate: |
467 | | case kRegexpConcat: |
468 | | for (int i = 0; i < a->nsub(); i++) { |
469 | | a2 = a->sub()[i]; |
470 | | b2 = b->sub()[i]; |
471 | | if (!TopEqual(a2, b2)) |
472 | | return false; |
473 | | stk.push_back(a2); |
474 | | stk.push_back(b2); |
475 | | } |
476 | | break; |
477 | | |
478 | | case kRegexpStar: |
479 | | case kRegexpPlus: |
480 | | case kRegexpQuest: |
481 | | case kRegexpRepeat: |
482 | | case kRegexpCapture: |
483 | | a2 = a->sub()[0]; |
484 | | b2 = b->sub()[0]; |
485 | | if (!TopEqual(a2, b2)) |
486 | | return false; |
487 | | // Really: |
488 | | // stk.push_back(a2); |
489 | | // stk.push_back(b2); |
490 | | // break; |
491 | | // but faster to assign directly and loop. |
492 | | a = a2; |
493 | | b = b2; |
494 | | continue; |
495 | | } |
496 | | |
497 | | size_t n = stk.size(); |
498 | | if (n == 0) |
499 | | break; |
500 | | |
501 | | ABSL_DCHECK_GE(n, size_t{2}); |
502 | | a = stk[n-2]; |
503 | | b = stk[n-1]; |
504 | | stk.resize(n-2); |
505 | | } |
506 | | |
507 | | return true; |
508 | | } |
509 | | |
510 | | // Keep in sync with enum RegexpStatusCode in regexp.h |
511 | | static const char *kErrorStrings[] = { |
512 | | "no error", |
513 | | "unexpected error", |
514 | | "invalid escape sequence", |
515 | | "invalid character class", |
516 | | "invalid character class range", |
517 | | "missing ]", |
518 | | "missing )", |
519 | | "unexpected )", |
520 | | "trailing \\", |
521 | | "no argument for repetition operator", |
522 | | "invalid repetition size", |
523 | | "bad repetition operator", |
524 | | "invalid perl operator", |
525 | | "invalid UTF-8", |
526 | | "invalid named capture group", |
527 | | }; |
528 | | |
529 | 0 | std::string RegexpStatus::CodeText(enum RegexpStatusCode code) { |
530 | 0 | if (code < 0 || code >= ABSL_ARRAYSIZE(kErrorStrings)) |
531 | 0 | code = kRegexpInternalError; |
532 | 0 | return kErrorStrings[code]; |
533 | 0 | } |
534 | | |
535 | 0 | std::string RegexpStatus::Text() const { |
536 | 0 | if (error_arg_.empty()) |
537 | 0 | return CodeText(code_); |
538 | 0 | std::string s; |
539 | 0 | s.append(CodeText(code_)); |
540 | 0 | s.append(": "); |
541 | 0 | s.append(error_arg_.data(), error_arg_.size()); |
542 | 0 | return s; |
543 | 0 | } |
544 | | |
545 | 0 | void RegexpStatus::Copy(const RegexpStatus& status) { |
546 | 0 | code_ = status.code_; |
547 | 0 | error_arg_ = status.error_arg_; |
548 | 0 | } |
549 | | |
550 | | typedef int Ignored; // Walker<void> doesn't exist |
551 | | |
552 | | // Walker subclass to count capturing parens in regexp. |
553 | | class NumCapturesWalker : public Regexp::Walker<Ignored> { |
554 | | public: |
555 | 0 | NumCapturesWalker() : ncapture_(0) {} |
556 | 0 | int ncapture() { return ncapture_; } |
557 | | |
558 | 0 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
559 | 0 | if (re->op() == kRegexpCapture) |
560 | 0 | ncapture_++; |
561 | 0 | return ignored; |
562 | 0 | } |
563 | | |
564 | 0 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
565 | | // Should never be called: we use Walk(), not WalkExponential(). |
566 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
567 | | ABSL_LOG(DFATAL) << "NumCapturesWalker::ShortVisit called"; |
568 | | #endif |
569 | 0 | return ignored; |
570 | 0 | } |
571 | | |
572 | | private: |
573 | | int ncapture_; |
574 | | |
575 | | NumCapturesWalker(const NumCapturesWalker&) = delete; |
576 | | NumCapturesWalker& operator=(const NumCapturesWalker&) = delete; |
577 | | }; |
578 | | |
579 | 0 | int Regexp::NumCaptures() { |
580 | 0 | NumCapturesWalker w; |
581 | 0 | w.Walk(this, 0); |
582 | 0 | return w.ncapture(); |
583 | 0 | } |
584 | | |
585 | | // Walker class to build map of named capture groups and their indices. |
586 | | class NamedCapturesWalker : public Regexp::Walker<Ignored> { |
587 | | public: |
588 | 0 | NamedCapturesWalker() : map_(NULL) {} |
589 | 0 | ~NamedCapturesWalker() { delete map_; } |
590 | | |
591 | 0 | std::map<std::string, int>* TakeMap() { |
592 | 0 | std::map<std::string, int>* m = map_; |
593 | 0 | map_ = NULL; |
594 | 0 | return m; |
595 | 0 | } |
596 | | |
597 | 0 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
598 | 0 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
599 | | // Allocate map once we find a name. |
600 | 0 | if (map_ == NULL) |
601 | 0 | map_ = new std::map<std::string, int>; |
602 | | |
603 | | // Record first occurrence of each name. |
604 | | // (The rule is that if you have the same name |
605 | | // multiple times, only the leftmost one counts.) |
606 | 0 | map_->insert({*re->name(), re->cap()}); |
607 | 0 | } |
608 | 0 | return ignored; |
609 | 0 | } |
610 | | |
611 | 0 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
612 | | // Should never be called: we use Walk(), not WalkExponential(). |
613 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
614 | | ABSL_LOG(DFATAL) << "NamedCapturesWalker::ShortVisit called"; |
615 | | #endif |
616 | 0 | return ignored; |
617 | 0 | } |
618 | | |
619 | | private: |
620 | | std::map<std::string, int>* map_; |
621 | | |
622 | | NamedCapturesWalker(const NamedCapturesWalker&) = delete; |
623 | | NamedCapturesWalker& operator=(const NamedCapturesWalker&) = delete; |
624 | | }; |
625 | | |
626 | 0 | std::map<std::string, int>* Regexp::NamedCaptures() { |
627 | 0 | NamedCapturesWalker w; |
628 | 0 | w.Walk(this, 0); |
629 | 0 | return w.TakeMap(); |
630 | 0 | } |
631 | | |
632 | | // Walker class to build map from capture group indices to their names. |
633 | | class CaptureNamesWalker : public Regexp::Walker<Ignored> { |
634 | | public: |
635 | 0 | CaptureNamesWalker() : map_(NULL) {} |
636 | 0 | ~CaptureNamesWalker() { delete map_; } |
637 | | |
638 | 0 | std::map<int, std::string>* TakeMap() { |
639 | 0 | std::map<int, std::string>* m = map_; |
640 | 0 | map_ = NULL; |
641 | 0 | return m; |
642 | 0 | } |
643 | | |
644 | 0 | virtual Ignored PreVisit(Regexp* re, Ignored ignored, bool* stop) { |
645 | 0 | if (re->op() == kRegexpCapture && re->name() != NULL) { |
646 | | // Allocate map once we find a name. |
647 | 0 | if (map_ == NULL) |
648 | 0 | map_ = new std::map<int, std::string>; |
649 | |
|
650 | 0 | (*map_)[re->cap()] = *re->name(); |
651 | 0 | } |
652 | 0 | return ignored; |
653 | 0 | } |
654 | | |
655 | 0 | virtual Ignored ShortVisit(Regexp* re, Ignored ignored) { |
656 | | // Should never be called: we use Walk(), not WalkExponential(). |
657 | | #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
658 | | ABSL_LOG(DFATAL) << "CaptureNamesWalker::ShortVisit called"; |
659 | | #endif |
660 | 0 | return ignored; |
661 | 0 | } |
662 | | |
663 | | private: |
664 | | std::map<int, std::string>* map_; |
665 | | |
666 | | CaptureNamesWalker(const CaptureNamesWalker&) = delete; |
667 | | CaptureNamesWalker& operator=(const CaptureNamesWalker&) = delete; |
668 | | }; |
669 | | |
670 | 0 | std::map<int, std::string>* Regexp::CaptureNames() { |
671 | 0 | CaptureNamesWalker w; |
672 | 0 | w.Walk(this, 0); |
673 | 0 | return w.TakeMap(); |
674 | 0 | } |
675 | | |
676 | | void ConvertRunesToBytes(bool latin1, Rune* runes, int nrunes, |
677 | 0 | std::string* bytes) { |
678 | 0 | if (latin1) { |
679 | 0 | bytes->resize(nrunes); |
680 | 0 | for (int i = 0; i < nrunes; i++) |
681 | 0 | (*bytes)[i] = static_cast<char>(runes[i]); |
682 | 0 | } else { |
683 | 0 | bytes->resize(nrunes * UTFmax); // worst case |
684 | 0 | char* p = &(*bytes)[0]; |
685 | 0 | for (int i = 0; i < nrunes; i++) |
686 | 0 | p += runetochar(p, &runes[i]); |
687 | 0 | bytes->resize(p - &(*bytes)[0]); |
688 | 0 | bytes->shrink_to_fit(); |
689 | 0 | } |
690 | 0 | } |
691 | | |
692 | | // Determines whether regexp matches must be anchored |
693 | | // with a fixed string prefix. If so, returns the prefix and |
694 | | // the regexp that remains after the prefix. The prefix might |
695 | | // be ASCII case-insensitive. |
696 | | bool Regexp::RequiredPrefix(std::string* prefix, bool* foldcase, |
697 | 0 | Regexp** suffix) { |
698 | 0 | prefix->clear(); |
699 | 0 | *foldcase = false; |
700 | 0 | *suffix = NULL; |
701 | | |
702 | | // No need for a walker: the regexp must be of the form |
703 | | // 1. some number of ^ anchors |
704 | | // 2. a literal char or string |
705 | | // 3. the rest |
706 | 0 | if (op_ != kRegexpConcat) |
707 | 0 | return false; |
708 | 0 | int i = 0; |
709 | 0 | while (i < nsub_ && sub()[i]->op_ == kRegexpBeginText) |
710 | 0 | i++; |
711 | 0 | if (i == 0 || i >= nsub_) |
712 | 0 | return false; |
713 | 0 | Regexp* re = sub()[i]; |
714 | 0 | if (re->op_ != kRegexpLiteral && |
715 | 0 | re->op_ != kRegexpLiteralString) |
716 | 0 | return false; |
717 | 0 | i++; |
718 | 0 | if (i < nsub_) { |
719 | 0 | for (int j = i; j < nsub_; j++) |
720 | 0 | sub()[j]->Incref(); |
721 | 0 | *suffix = Concat(sub() + i, nsub_ - i, parse_flags()); |
722 | 0 | } else { |
723 | 0 | *suffix = new Regexp(kRegexpEmptyMatch, parse_flags()); |
724 | 0 | } |
725 | |
|
726 | 0 | bool latin1 = (re->parse_flags() & Latin1) != 0; |
727 | 0 | Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; |
728 | 0 | int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; |
729 | 0 | ConvertRunesToBytes(latin1, runes, nrunes, prefix); |
730 | 0 | *foldcase = (re->parse_flags() & FoldCase) != 0; |
731 | 0 | return true; |
732 | 0 | } |
733 | | |
734 | | // Determines whether regexp matches must be unanchored |
735 | | // with a fixed string prefix. If so, returns the prefix. |
736 | | // The prefix might be ASCII case-insensitive. |
737 | 0 | bool Regexp::RequiredPrefixForAccel(std::string* prefix, bool* foldcase) { |
738 | 0 | prefix->clear(); |
739 | 0 | *foldcase = false; |
740 | | |
741 | | // No need for a walker: the regexp must either begin with or be |
742 | | // a literal char or string. We "see through" capturing groups, |
743 | | // but make no effort to glue multiple prefix fragments together. |
744 | 0 | Regexp* re = op_ == kRegexpConcat && nsub_ > 0 ? sub()[0] : this; |
745 | 0 | while (re->op_ == kRegexpCapture) { |
746 | 0 | re = re->sub()[0]; |
747 | 0 | if (re->op_ == kRegexpConcat && re->nsub_ > 0) |
748 | 0 | re = re->sub()[0]; |
749 | 0 | } |
750 | 0 | if (re->op_ != kRegexpLiteral && |
751 | 0 | re->op_ != kRegexpLiteralString) |
752 | 0 | return false; |
753 | | |
754 | 0 | bool latin1 = (re->parse_flags() & Latin1) != 0; |
755 | 0 | Rune* runes = re->op_ == kRegexpLiteral ? &re->rune_ : re->runes_; |
756 | 0 | int nrunes = re->op_ == kRegexpLiteral ? 1 : re->nrunes_; |
757 | 0 | ConvertRunesToBytes(latin1, runes, nrunes, prefix); |
758 | 0 | *foldcase = (re->parse_flags() & FoldCase) != 0; |
759 | 0 | return true; |
760 | 0 | } |
761 | | |
762 | | // Character class builder is a balanced binary tree (STL set) |
763 | | // containing non-overlapping, non-abutting RuneRanges. |
764 | | // The less-than operator used in the tree treats two |
765 | | // ranges as equal if they overlap at all, so that |
766 | | // lookups for a particular Rune are possible. |
767 | | |
768 | 0 | CharClassBuilder::CharClassBuilder() { |
769 | 0 | nrunes_ = 0; |
770 | 0 | upper_ = 0; |
771 | 0 | lower_ = 0; |
772 | 0 | } |
773 | | |
774 | | // Add lo-hi to the class; return whether class got bigger. |
775 | 0 | bool CharClassBuilder::AddRange(Rune lo, Rune hi) { |
776 | 0 | if (hi < lo) |
777 | 0 | return false; |
778 | | |
779 | 0 | if (lo <= 'z' && hi >= 'A') { |
780 | | // Overlaps some alpha, maybe not all. |
781 | | // Update bitmaps telling which ASCII letters are in the set. |
782 | 0 | Rune lo1 = std::max<Rune>(lo, 'A'); |
783 | 0 | Rune hi1 = std::min<Rune>(hi, 'Z'); |
784 | 0 | if (lo1 <= hi1) |
785 | 0 | upper_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'A'); |
786 | |
|
787 | 0 | lo1 = std::max<Rune>(lo, 'a'); |
788 | 0 | hi1 = std::min<Rune>(hi, 'z'); |
789 | 0 | if (lo1 <= hi1) |
790 | 0 | lower_ |= ((1 << (hi1 - lo1 + 1)) - 1) << (lo1 - 'a'); |
791 | 0 | } |
792 | |
|
793 | 0 | { // Check whether lo, hi is already in the class. |
794 | 0 | iterator it = ranges_.find(RuneRange(lo, lo)); |
795 | 0 | if (it != end() && it->lo <= lo && hi <= it->hi) |
796 | 0 | return false; |
797 | 0 | } |
798 | | |
799 | | // Look for a range abutting lo on the left. |
800 | | // If it exists, take it out and increase our range. |
801 | 0 | if (lo > 0) { |
802 | 0 | iterator it = ranges_.find(RuneRange(lo-1, lo-1)); |
803 | 0 | if (it != end()) { |
804 | 0 | lo = it->lo; |
805 | 0 | if (it->hi > hi) |
806 | 0 | hi = it->hi; |
807 | 0 | nrunes_ -= it->hi - it->lo + 1; |
808 | 0 | ranges_.erase(it); |
809 | 0 | } |
810 | 0 | } |
811 | | |
812 | | // Look for a range abutting hi on the right. |
813 | | // If it exists, take it out and increase our range. |
814 | 0 | if (hi < Runemax) { |
815 | 0 | iterator it = ranges_.find(RuneRange(hi+1, hi+1)); |
816 | 0 | if (it != end()) { |
817 | 0 | hi = it->hi; |
818 | 0 | nrunes_ -= it->hi - it->lo + 1; |
819 | 0 | ranges_.erase(it); |
820 | 0 | } |
821 | 0 | } |
822 | | |
823 | | // Look for ranges between lo and hi. Take them out. |
824 | | // This is only safe because the set has no overlapping ranges. |
825 | | // We've already removed any ranges abutting lo and hi, so |
826 | | // any that overlap [lo, hi] must be contained within it. |
827 | 0 | for (;;) { |
828 | 0 | iterator it = ranges_.find(RuneRange(lo, hi)); |
829 | 0 | if (it == end()) |
830 | 0 | break; |
831 | 0 | nrunes_ -= it->hi - it->lo + 1; |
832 | 0 | ranges_.erase(it); |
833 | 0 | } |
834 | | |
835 | | // Finally, add [lo, hi]. |
836 | 0 | nrunes_ += hi - lo + 1; |
837 | 0 | ranges_.insert(RuneRange(lo, hi)); |
838 | 0 | return true; |
839 | 0 | } |
840 | | |
841 | 0 | void CharClassBuilder::AddCharClass(CharClassBuilder *cc) { |
842 | 0 | for (iterator it = cc->begin(); it != cc->end(); ++it) |
843 | 0 | AddRange(it->lo, it->hi); |
844 | 0 | } |
845 | | |
846 | 0 | bool CharClassBuilder::Contains(Rune r) { |
847 | 0 | return ranges_.find(RuneRange(r, r)) != end(); |
848 | 0 | } |
849 | | |
850 | | // Does the character class behave the same on A-Z as on a-z? |
851 | 0 | bool CharClassBuilder::FoldsASCII() { |
852 | 0 | return ((upper_ ^ lower_) & AlphaMask) == 0; |
853 | 0 | } |
854 | | |
855 | 0 | CharClassBuilder* CharClassBuilder::Copy() { |
856 | 0 | CharClassBuilder* cc = new CharClassBuilder; |
857 | 0 | for (iterator it = begin(); it != end(); ++it) |
858 | 0 | cc->ranges_.insert(RuneRange(it->lo, it->hi)); |
859 | 0 | cc->upper_ = upper_; |
860 | 0 | cc->lower_ = lower_; |
861 | 0 | cc->nrunes_ = nrunes_; |
862 | 0 | return cc; |
863 | 0 | } |
864 | | |
865 | | |
866 | | |
867 | 0 | void CharClassBuilder::RemoveAbove(Rune r) { |
868 | 0 | if (r >= Runemax) |
869 | 0 | return; |
870 | | |
871 | 0 | if (r < 'z') { |
872 | 0 | if (r < 'a') |
873 | 0 | lower_ = 0; |
874 | 0 | else |
875 | 0 | lower_ &= AlphaMask >> ('z' - r); |
876 | 0 | } |
877 | |
|
878 | 0 | if (r < 'Z') { |
879 | 0 | if (r < 'A') |
880 | 0 | upper_ = 0; |
881 | 0 | else |
882 | 0 | upper_ &= AlphaMask >> ('Z' - r); |
883 | 0 | } |
884 | |
|
885 | 0 | for (;;) { |
886 | |
|
887 | 0 | iterator it = ranges_.find(RuneRange(r + 1, Runemax)); |
888 | 0 | if (it == end()) |
889 | 0 | break; |
890 | 0 | RuneRange rr = *it; |
891 | 0 | ranges_.erase(it); |
892 | 0 | nrunes_ -= rr.hi - rr.lo + 1; |
893 | 0 | if (rr.lo <= r) { |
894 | 0 | rr.hi = r; |
895 | 0 | ranges_.insert(rr); |
896 | 0 | nrunes_ += rr.hi - rr.lo + 1; |
897 | 0 | } |
898 | 0 | } |
899 | 0 | } |
900 | | |
901 | 0 | void CharClassBuilder::Negate() { |
902 | | // Build up negation and then copy in. |
903 | | // Could edit ranges in place, but C++ won't let me. |
904 | 0 | std::vector<RuneRange> v; |
905 | 0 | v.reserve(ranges_.size() + 1); |
906 | | |
907 | | // In negation, first range begins at 0, unless |
908 | | // the current class begins at 0. |
909 | 0 | iterator it = begin(); |
910 | 0 | if (it == end()) { |
911 | 0 | v.push_back(RuneRange(0, Runemax)); |
912 | 0 | } else { |
913 | 0 | int nextlo = 0; |
914 | 0 | if (it->lo == 0) { |
915 | 0 | nextlo = it->hi + 1; |
916 | 0 | ++it; |
917 | 0 | } |
918 | 0 | for (; it != end(); ++it) { |
919 | 0 | v.push_back(RuneRange(nextlo, it->lo - 1)); |
920 | 0 | nextlo = it->hi + 1; |
921 | 0 | } |
922 | 0 | if (nextlo <= Runemax) |
923 | 0 | v.push_back(RuneRange(nextlo, Runemax)); |
924 | 0 | } |
925 | |
|
926 | 0 | ranges_.clear(); |
927 | 0 | for (size_t i = 0; i < v.size(); i++) |
928 | 0 | ranges_.insert(v[i]); |
929 | |
|
930 | 0 | upper_ = AlphaMask & ~upper_; |
931 | 0 | lower_ = AlphaMask & ~lower_; |
932 | 0 | nrunes_ = Runemax+1 - nrunes_; |
933 | 0 | } |
934 | | |
935 | | // Character class is a sorted list of ranges. |
936 | | // The ranges are allocated in the same block as the header, |
937 | | // necessitating a special allocator and Delete method. |
938 | | |
939 | 0 | CharClass* CharClass::New(size_t maxranges) { |
940 | 0 | CharClass* cc; |
941 | 0 | uint8_t* data = new uint8_t[sizeof *cc + maxranges*sizeof cc->ranges_[0]]; |
942 | 0 | cc = reinterpret_cast<CharClass*>(data); |
943 | 0 | cc->ranges_ = reinterpret_cast<RuneRange*>(data + sizeof *cc); |
944 | 0 | cc->nranges_ = 0; |
945 | 0 | cc->folds_ascii_ = false; |
946 | 0 | cc->nrunes_ = 0; |
947 | 0 | return cc; |
948 | 0 | } |
949 | | |
950 | 0 | void CharClass::Delete() { |
951 | 0 | uint8_t* data = reinterpret_cast<uint8_t*>(this); |
952 | 0 | delete[] data; |
953 | 0 | } |
954 | | |
955 | 0 | CharClass* CharClass::Negate() { |
956 | 0 | CharClass* cc = CharClass::New(static_cast<size_t>(nranges_+1)); |
957 | 0 | cc->folds_ascii_ = folds_ascii_; |
958 | 0 | cc->nrunes_ = Runemax + 1 - nrunes_; |
959 | 0 | int n = 0; |
960 | 0 | int nextlo = 0; |
961 | 0 | for (CharClass::iterator it = begin(); it != end(); ++it) { |
962 | 0 | if (it->lo == nextlo) { |
963 | 0 | nextlo = it->hi + 1; |
964 | 0 | } else { |
965 | 0 | cc->ranges_[n++] = RuneRange(nextlo, it->lo - 1); |
966 | 0 | nextlo = it->hi + 1; |
967 | 0 | } |
968 | 0 | } |
969 | 0 | if (nextlo <= Runemax) |
970 | 0 | cc->ranges_[n++] = RuneRange(nextlo, Runemax); |
971 | 0 | cc->nranges_ = n; |
972 | 0 | return cc; |
973 | 0 | } |
974 | | |
975 | 0 | bool CharClass::Contains(Rune r) const { |
976 | 0 | RuneRange* rr = ranges_; |
977 | 0 | int n = nranges_; |
978 | 0 | while (n > 0) { |
979 | 0 | int m = n/2; |
980 | 0 | if (rr[m].hi < r) { |
981 | 0 | rr += m+1; |
982 | 0 | n -= m+1; |
983 | 0 | } else if (r < rr[m].lo) { |
984 | 0 | n = m; |
985 | 0 | } else { // rr[m].lo <= r && r <= rr[m].hi |
986 | 0 | return true; |
987 | 0 | } |
988 | 0 | } |
989 | 0 | return false; |
990 | 0 | } |
991 | | |
992 | 0 | CharClass* CharClassBuilder::GetCharClass() { |
993 | 0 | CharClass* cc = CharClass::New(ranges_.size()); |
994 | 0 | int n = 0; |
995 | 0 | for (iterator it = begin(); it != end(); ++it) |
996 | 0 | cc->ranges_[n++] = *it; |
997 | 0 | cc->nranges_ = n; |
998 | 0 | ABSL_DCHECK_LE(n, static_cast<int>(ranges_.size())); |
999 | 0 | cc->nrunes_ = nrunes_; |
1000 | 0 | cc->folds_ascii_ = FoldsASCII(); |
1001 | 0 | return cc; |
1002 | 0 | } |
1003 | | |
1004 | | } // namespace re2 |