Line | Count | Source |
1 | | // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Regular expression interface RE2. |
6 | | // |
7 | | // Originally the PCRE C++ wrapper, but adapted to use |
8 | | // the new automata-based regular expression engines. |
9 | | |
10 | | #include "re2/re2.h" |
11 | | |
12 | | #include <errno.h> |
13 | | #include <stddef.h> |
14 | | #include <stdint.h> |
15 | | #include <stdlib.h> |
16 | | #include <string.h> |
17 | | |
18 | | #include <algorithm> |
19 | | #include <atomic> |
20 | | #include <map> |
21 | | #include <string> |
22 | | #include <utility> |
23 | | #include <vector> |
24 | | |
25 | | #include "absl/base/call_once.h" |
26 | | #include "absl/base/macros.h" |
27 | | #include "absl/container/fixed_array.h" |
28 | | #include "absl/log/absl_check.h" |
29 | | #include "absl/log/absl_log.h" |
30 | | #include "absl/strings/ascii.h" |
31 | | #include "absl/strings/str_format.h" |
32 | | #include "absl/strings/string_view.h" |
33 | | #include "re2/prog.h" |
34 | | #include "re2/regexp.h" |
35 | | #include "re2/sparse_array.h" |
36 | | #include "util/strutil.h" |
37 | | #include "util/utf.h" |
38 | | |
39 | | #ifdef _MSC_VER |
40 | | #include <intrin.h> |
41 | | #endif |
42 | | |
43 | | namespace re2 { |
44 | | |
45 | | // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited. |
46 | | static int maximum_global_replace_count = -1; |
47 | | |
48 | 12.2k | void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) { |
49 | 12.2k | maximum_global_replace_count = i; |
50 | 12.2k | } |
51 | | |
52 | | // Maximum number of args we can set |
53 | | static const int kMaxArgs = 16; |
54 | | static const int kVecSize = 1+kMaxArgs; |
55 | | |
56 | | const int RE2::Options::kDefaultMaxMem; // initialized in re2.h |
57 | | |
58 | | RE2::Options::Options(RE2::CannedOptions opt) |
59 | 0 | : max_mem_(kDefaultMaxMem), |
60 | 0 | encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
61 | 0 | posix_syntax_(opt == RE2::POSIX), |
62 | 0 | longest_match_(opt == RE2::POSIX), |
63 | 0 | log_errors_(opt != RE2::Quiet), |
64 | 0 | literal_(false), |
65 | 0 | never_nl_(false), |
66 | 0 | dot_nl_(false), |
67 | 0 | never_capture_(false), |
68 | 0 | case_sensitive_(true), |
69 | 0 | perl_classes_(false), |
70 | 0 | word_boundary_(false), |
71 | 0 | one_line_(false) { |
72 | 0 | } |
73 | | |
74 | | // Empty objects for use as const references. |
75 | | // Statically allocating the storage and then |
76 | | // lazily constructing the objects (in a once |
77 | | // in RE2::Init()) avoids global constructors |
78 | | // and the false positives (thanks, Valgrind) |
79 | | // about memory leaks at program termination. |
80 | | struct EmptyStorage { |
81 | | std::string empty_string; |
82 | | std::map<std::string, int> empty_named_groups; |
83 | | std::map<int, std::string> empty_group_names; |
84 | | }; |
85 | | alignas(EmptyStorage) static char empty_storage[sizeof(EmptyStorage)]; |
86 | | |
87 | 87.8k | static inline std::string* empty_string() { |
88 | 87.8k | return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_string; |
89 | 87.8k | } |
90 | | |
91 | 31.3k | static inline std::map<std::string, int>* empty_named_groups() { |
92 | 31.3k | return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_named_groups; |
93 | 31.3k | } |
94 | | |
95 | 31.3k | static inline std::map<int, std::string>* empty_group_names() { |
96 | 31.3k | return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_group_names; |
97 | 31.3k | } |
98 | | |
99 | | // Converts from Regexp error code to RE2 error code. |
100 | | // Maybe some day they will diverge. In any event, this |
101 | | // hides the existence of Regexp from RE2 users. |
102 | 1.57k | static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { |
103 | 1.57k | switch (code) { |
104 | 0 | case re2::kRegexpSuccess: |
105 | 0 | return RE2::NoError; |
106 | 0 | case re2::kRegexpInternalError: |
107 | 0 | return RE2::ErrorInternal; |
108 | 229 | case re2::kRegexpBadEscape: |
109 | 229 | return RE2::ErrorBadEscape; |
110 | 0 | case re2::kRegexpBadCharClass: |
111 | 0 | return RE2::ErrorBadCharClass; |
112 | 182 | case re2::kRegexpBadCharRange: |
113 | 182 | return RE2::ErrorBadCharRange; |
114 | 314 | case re2::kRegexpMissingBracket: |
115 | 314 | return RE2::ErrorMissingBracket; |
116 | 275 | case re2::kRegexpMissingParen: |
117 | 275 | return RE2::ErrorMissingParen; |
118 | 53 | case re2::kRegexpUnexpectedParen: |
119 | 53 | return RE2::ErrorUnexpectedParen; |
120 | 46 | case re2::kRegexpTrailingBackslash: |
121 | 46 | return RE2::ErrorTrailingBackslash; |
122 | 26 | case re2::kRegexpRepeatArgument: |
123 | 26 | return RE2::ErrorRepeatArgument; |
124 | 98 | case re2::kRegexpRepeatSize: |
125 | 98 | return RE2::ErrorRepeatSize; |
126 | 8 | case re2::kRegexpRepeatOp: |
127 | 8 | return RE2::ErrorRepeatOp; |
128 | 51 | case re2::kRegexpBadPerlOp: |
129 | 51 | return RE2::ErrorBadPerlOp; |
130 | 211 | case re2::kRegexpBadUTF8: |
131 | 211 | return RE2::ErrorBadUTF8; |
132 | 84 | case re2::kRegexpBadNamedCapture: |
133 | 84 | return RE2::ErrorBadNamedCapture; |
134 | 1.57k | } |
135 | 0 | return RE2::ErrorInternal; |
136 | 1.57k | } |
137 | | |
138 | 0 | static std::string trunc(absl::string_view pattern) { |
139 | 0 | if (pattern.size() < 100) |
140 | 0 | return std::string(pattern); |
141 | 0 | return std::string(pattern.substr(0, 100)) + "..."; |
142 | 0 | } |
143 | | |
144 | | |
145 | 0 | RE2::RE2(const char* pattern) { |
146 | | // If absl::string_view becomes an alias for std::string_view, |
147 | | // it will stop allowing NULL to be converted. |
148 | | // Handle NULL explicitly to keep callers working no matter what. |
149 | 0 | if (pattern == NULL) |
150 | 0 | pattern = ""; |
151 | 0 | Init(pattern, DefaultOptions); |
152 | 0 | } |
153 | | |
154 | 0 | RE2::RE2(const std::string& pattern) { |
155 | 0 | Init(pattern, DefaultOptions); |
156 | 0 | } |
157 | | |
158 | 0 | RE2::RE2(absl::string_view pattern) { |
159 | 0 | Init(pattern, DefaultOptions); |
160 | 0 | } |
161 | | |
162 | 21.9k | RE2::RE2(absl::string_view pattern, const Options& options) { |
163 | 21.9k | Init(pattern, options); |
164 | 21.9k | } |
165 | | |
166 | 41.2k | int RE2::Options::ParseFlags() const { |
167 | 41.2k | int flags = Regexp::ClassNL; |
168 | 41.2k | switch (encoding()) { |
169 | 0 | default: |
170 | 0 | if (log_errors()) |
171 | 0 | ABSL_LOG(ERROR) << "Unknown encoding " << encoding(); |
172 | 0 | break; |
173 | 20.5k | case RE2::Options::EncodingUTF8: |
174 | 20.5k | break; |
175 | 20.7k | case RE2::Options::EncodingLatin1: |
176 | 20.7k | flags |= Regexp::Latin1; |
177 | 20.7k | break; |
178 | 41.2k | } |
179 | | |
180 | 41.2k | if (!posix_syntax()) |
181 | 26.9k | flags |= Regexp::LikePerl; |
182 | | |
183 | 41.2k | if (literal()) |
184 | 1.81k | flags |= Regexp::Literal; |
185 | | |
186 | 41.2k | if (never_nl()) |
187 | 15.7k | flags |= Regexp::NeverNL; |
188 | | |
189 | 41.2k | if (dot_nl()) |
190 | 16.2k | flags |= Regexp::DotNL; |
191 | | |
192 | 41.2k | if (never_capture()) |
193 | 27.2k | flags |= Regexp::NeverCapture; |
194 | | |
195 | 41.2k | if (!case_sensitive()) |
196 | 19.0k | flags |= Regexp::FoldCase; |
197 | | |
198 | 41.2k | if (perl_classes()) |
199 | 17.2k | flags |= Regexp::PerlClasses; |
200 | | |
201 | 41.2k | if (word_boundary()) |
202 | 17.0k | flags |= Regexp::PerlB; |
203 | | |
204 | 41.2k | if (one_line()) |
205 | 17.5k | flags |= Regexp::OneLine; |
206 | | |
207 | 41.2k | return flags; |
208 | 41.2k | } |
209 | | |
210 | 21.9k | void RE2::Init(absl::string_view pattern, const Options& options) { |
211 | 21.9k | static absl::once_flag empty_once; |
212 | 21.9k | absl::call_once(empty_once, []() { |
213 | 1 | (void) new (empty_storage) EmptyStorage; |
214 | 1 | }); |
215 | | |
216 | 21.9k | pattern_ = new std::string(pattern); |
217 | 21.9k | options_.Copy(options); |
218 | 21.9k | entire_regexp_ = NULL; |
219 | 21.9k | suffix_regexp_ = NULL; |
220 | 21.9k | error_ = empty_string(); |
221 | 21.9k | error_arg_ = empty_string(); |
222 | | |
223 | 21.9k | num_captures_ = -1; |
224 | 21.9k | error_code_ = NoError; |
225 | 21.9k | longest_match_ = options_.longest_match(); |
226 | 21.9k | is_one_pass_ = false; |
227 | 21.9k | prefix_foldcase_ = false; |
228 | 21.9k | prefix_.clear(); |
229 | 21.9k | prog_ = NULL; |
230 | | |
231 | 21.9k | rprog_ = NULL; |
232 | 21.9k | named_groups_ = NULL; |
233 | 21.9k | group_names_ = NULL; |
234 | | |
235 | 21.9k | RegexpStatus status; |
236 | 21.9k | entire_regexp_ = Regexp::Parse( |
237 | 21.9k | *pattern_, |
238 | 21.9k | static_cast<Regexp::ParseFlags>(options_.ParseFlags()), |
239 | 21.9k | &status); |
240 | 21.9k | if (entire_regexp_ == NULL) { |
241 | 1.57k | if (options_.log_errors()) { |
242 | 0 | ABSL_LOG(ERROR) << "Error parsing '" << trunc(*pattern_) << "': " |
243 | 0 | << status.Text(); |
244 | 0 | } |
245 | 1.57k | error_ = new std::string(status.Text()); |
246 | 1.57k | error_code_ = RegexpErrorToRE2(status.code()); |
247 | 1.57k | error_arg_ = new std::string(status.error_arg()); |
248 | 1.57k | return; |
249 | 1.57k | } |
250 | | |
251 | 20.3k | bool foldcase; |
252 | 20.3k | re2::Regexp* suffix; |
253 | 20.3k | if (entire_regexp_->RequiredPrefix(&prefix_, &foldcase, &suffix)) { |
254 | 450 | prefix_foldcase_ = foldcase; |
255 | 450 | suffix_regexp_ = suffix; |
256 | 450 | } |
257 | 19.9k | else { |
258 | 19.9k | suffix_regexp_ = entire_regexp_->Incref(); |
259 | 19.9k | } |
260 | | |
261 | | // Two thirds of the memory goes to the forward Prog, |
262 | | // one third to the reverse prog, because the forward |
263 | | // Prog has two DFAs but the reverse prog has one. |
264 | 20.3k | prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); |
265 | 20.3k | if (prog_ == NULL) { |
266 | 0 | if (options_.log_errors()) |
267 | 0 | ABSL_LOG(ERROR) << "Error compiling '" << trunc(*pattern_) << "'"; |
268 | 0 | error_ = new std::string("pattern too large - compile failed"); |
269 | 0 | error_code_ = RE2::ErrorPatternTooLarge; |
270 | 0 | return; |
271 | 0 | } |
272 | | |
273 | | // We used to compute this lazily, but it's used during the |
274 | | // typical control flow for a match call, so we now compute |
275 | | // it eagerly, which avoids the overhead of absl::once_flag. |
276 | 20.3k | num_captures_ = suffix_regexp_->NumCaptures(); |
277 | | |
278 | | // Could delay this until the first match call that |
279 | | // cares about submatch information, but the one-pass |
280 | | // machine's memory gets cut from the DFA memory budget, |
281 | | // and that is harder to do if the DFA has already |
282 | | // been built. |
283 | 20.3k | is_one_pass_ = prog_->IsOnePass(); |
284 | 20.3k | } |
285 | | |
286 | | // Returns rprog_, computing it if needed. |
287 | 32.8k | re2::Prog* RE2::ReverseProg() const { |
288 | 32.8k | absl::call_once(rprog_once_, [](const RE2* re) { |
289 | 10.3k | re->rprog_ = |
290 | 10.3k | re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); |
291 | 10.3k | if (re->rprog_ == NULL) { |
292 | 0 | if (re->options_.log_errors()) |
293 | 0 | ABSL_LOG(ERROR) << "Error reverse compiling '" << trunc(*re->pattern_) |
294 | 0 | << "'"; |
295 | | // We no longer touch error_ and error_code_ because failing to compile |
296 | | // the reverse Prog is not a showstopper: falling back to NFA execution |
297 | | // is fine. More importantly, an RE2 object is supposed to be logically |
298 | | // immutable: whatever ok() would have returned after Init() completed, |
299 | | // it should continue to return that no matter what ReverseProg() does. |
300 | 0 | } |
301 | 10.3k | }, this); |
302 | 32.8k | return rprog_; |
303 | 32.8k | } |
304 | | |
305 | 21.9k | RE2::~RE2() { |
306 | 21.9k | if (group_names_ != empty_group_names()) |
307 | 12.5k | delete group_names_; |
308 | 21.9k | if (named_groups_ != empty_named_groups()) |
309 | 12.5k | delete named_groups_; |
310 | 21.9k | delete rprog_; |
311 | 21.9k | delete prog_; |
312 | 21.9k | if (error_arg_ != empty_string()) |
313 | 1.57k | delete error_arg_; |
314 | 21.9k | if (error_ != empty_string()) |
315 | 1.57k | delete error_; |
316 | 21.9k | if (suffix_regexp_) |
317 | 20.3k | suffix_regexp_->Decref(); |
318 | 21.9k | if (entire_regexp_) |
319 | 20.3k | entire_regexp_->Decref(); |
320 | 21.9k | delete pattern_; |
321 | 21.9k | } |
322 | | |
323 | 9.94k | int RE2::ProgramSize() const { |
324 | 9.94k | if (prog_ == NULL) |
325 | 0 | return -1; |
326 | 9.94k | return prog_->size(); |
327 | 9.94k | } |
328 | | |
329 | 9.77k | int RE2::ReverseProgramSize() const { |
330 | 9.77k | if (prog_ == NULL) |
331 | 0 | return -1; |
332 | 9.77k | Prog* prog = ReverseProg(); |
333 | 9.77k | if (prog == NULL) |
334 | 0 | return -1; |
335 | 9.77k | return prog->size(); |
336 | 9.77k | } |
337 | | |
338 | | // Finds the most significant non-zero bit in n. |
339 | 3.70M | static int FindMSBSet(uint32_t n) { |
340 | 3.70M | ABSL_DCHECK_NE(n, uint32_t{0}); |
341 | 3.70M | #if defined(__GNUC__) |
342 | 3.70M | return 31 ^ __builtin_clz(n); |
343 | | #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) |
344 | | unsigned long c; |
345 | | _BitScanReverse(&c, n); |
346 | | return static_cast<int>(c); |
347 | | #else |
348 | | int c = 0; |
349 | | for (int shift = 1 << 4; shift != 0; shift >>= 1) { |
350 | | uint32_t word = n >> shift; |
351 | | if (word != 0) { |
352 | | n = word; |
353 | | c += shift; |
354 | | } |
355 | | } |
356 | | return c; |
357 | | #endif |
358 | 3.70M | } |
359 | | |
360 | 19.4k | static int Fanout(Prog* prog, std::vector<int>* histogram) { |
361 | 19.4k | SparseArray<int> fanout(prog->size()); |
362 | 19.4k | prog->Fanout(&fanout); |
363 | 19.4k | int data[32] = {}; |
364 | 19.4k | int size = 0; |
365 | 3.73M | for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
366 | 3.72M | if (i->value() == 0) |
367 | 17.5k | continue; |
368 | 3.70M | uint32_t value = i->value(); |
369 | 3.70M | int bucket = FindMSBSet(value); |
370 | 3.70M | bucket += value & (value-1) ? 1 : 0; |
371 | 3.70M | ++data[bucket]; |
372 | 3.70M | size = std::max(size, bucket+1); |
373 | 3.70M | } |
374 | 19.4k | if (histogram != NULL) |
375 | 19.4k | histogram->assign(data, data+size); |
376 | 19.4k | return size-1; |
377 | 19.4k | } |
378 | | |
379 | 9.73k | int RE2::ProgramFanout(std::vector<int>* histogram) const { |
380 | 9.73k | if (prog_ == NULL) |
381 | 0 | return -1; |
382 | 9.73k | return Fanout(prog_, histogram); |
383 | 9.73k | } |
384 | | |
385 | 9.72k | int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { |
386 | 9.72k | if (prog_ == NULL) |
387 | 0 | return -1; |
388 | 9.72k | Prog* prog = ReverseProg(); |
389 | 9.72k | if (prog == NULL) |
390 | 0 | return -1; |
391 | 9.72k | return Fanout(prog, histogram); |
392 | 9.72k | } |
393 | | |
394 | | // Returns named_groups_, computing it if needed. |
395 | 9.66k | const std::map<std::string, int>& RE2::NamedCapturingGroups() const { |
396 | 9.66k | absl::call_once(named_groups_once_, [](const RE2* re) { |
397 | 9.66k | if (re->suffix_regexp_ != NULL) |
398 | 9.66k | re->named_groups_ = re->suffix_regexp_->NamedCaptures(); |
399 | 9.66k | if (re->named_groups_ == NULL) |
400 | 9.39k | re->named_groups_ = empty_named_groups(); |
401 | 9.66k | }, this); |
402 | 9.66k | return *named_groups_; |
403 | 9.66k | } |
404 | | |
405 | | // Returns group_names_, computing it if needed. |
406 | 9.66k | const std::map<int, std::string>& RE2::CapturingGroupNames() const { |
407 | 9.66k | absl::call_once(group_names_once_, [](const RE2* re) { |
408 | 9.66k | if (re->suffix_regexp_ != NULL) |
409 | 9.66k | re->group_names_ = re->suffix_regexp_->CaptureNames(); |
410 | 9.66k | if (re->group_names_ == NULL) |
411 | 9.39k | re->group_names_ = empty_group_names(); |
412 | 9.66k | }, this); |
413 | 9.66k | return *group_names_; |
414 | 9.66k | } |
415 | | |
416 | | /***** Convenience interfaces *****/ |
417 | | |
418 | | bool RE2::FullMatchN(absl::string_view text, const RE2& re, |
419 | 9.66k | const Arg* const args[], int n) { |
420 | 9.66k | return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); |
421 | 9.66k | } |
422 | | |
423 | | bool RE2::PartialMatchN(absl::string_view text, const RE2& re, |
424 | 19.3k | const Arg* const args[], int n) { |
425 | 19.3k | return re.DoMatch(text, UNANCHORED, NULL, args, n); |
426 | 19.3k | } |
427 | | |
428 | | bool RE2::ConsumeN(absl::string_view* input, const RE2& re, |
429 | 9.66k | const Arg* const args[], int n) { |
430 | 9.66k | size_t consumed; |
431 | 9.66k | if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { |
432 | 1.95k | input->remove_prefix(consumed); |
433 | 1.95k | return true; |
434 | 7.71k | } else { |
435 | 7.71k | return false; |
436 | 7.71k | } |
437 | 9.66k | } |
438 | | |
439 | | bool RE2::FindAndConsumeN(absl::string_view* input, const RE2& re, |
440 | 9.66k | const Arg* const args[], int n) { |
441 | 9.66k | size_t consumed; |
442 | 9.66k | if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { |
443 | 2.06k | input->remove_prefix(consumed); |
444 | 2.06k | return true; |
445 | 7.59k | } else { |
446 | 7.59k | return false; |
447 | 7.59k | } |
448 | 9.66k | } |
449 | | |
450 | | bool RE2::Replace(std::string* str, |
451 | | const RE2& re, |
452 | 9.66k | absl::string_view rewrite) { |
453 | 9.66k | absl::string_view vec[kVecSize]; |
454 | 9.66k | int nvec = 1 + MaxSubmatch(rewrite); |
455 | 9.66k | if (nvec > 1 + re.NumberOfCapturingGroups()) |
456 | 0 | return false; |
457 | 9.66k | if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) |
458 | 0 | return false; |
459 | 9.66k | if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) |
460 | 6.08k | return false; |
461 | | |
462 | 3.58k | std::string s; |
463 | 3.58k | if (!re.Rewrite(&s, rewrite, vec, nvec)) |
464 | 0 | return false; |
465 | | |
466 | 3.58k | ABSL_DCHECK_GE(vec[0].data(), str->data()); |
467 | 3.58k | ABSL_DCHECK_LE(vec[0].data() + vec[0].size(), str->data() + str->size()); |
468 | 3.58k | str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
469 | 3.58k | return true; |
470 | 3.58k | } |
471 | | |
472 | | int RE2::GlobalReplace(std::string* str, |
473 | | const RE2& re, |
474 | 9.66k | absl::string_view rewrite) { |
475 | 9.66k | absl::string_view vec[kVecSize]; |
476 | 9.66k | int nvec = 1 + MaxSubmatch(rewrite); |
477 | 9.66k | if (nvec > 1 + re.NumberOfCapturingGroups()) |
478 | 0 | return false; |
479 | 9.66k | if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) |
480 | 0 | return false; |
481 | | |
482 | 9.66k | const char* p = str->data(); |
483 | 9.66k | const char* ep = p + str->size(); |
484 | 9.66k | const char* lastend = NULL; |
485 | 9.66k | std::string out; |
486 | 9.66k | int count = 0; |
487 | 13.2k | while (p <= ep) { |
488 | 13.2k | if (maximum_global_replace_count != -1 && |
489 | 13.2k | count >= maximum_global_replace_count) |
490 | 3.58k | break; |
491 | 9.66k | if (!re.Match(*str, static_cast<size_t>(p - str->data()), |
492 | 9.66k | str->size(), UNANCHORED, vec, nvec)) |
493 | 6.08k | break; |
494 | 3.58k | if (p < vec[0].data()) |
495 | 179 | out.append(p, vec[0].data() - p); |
496 | 3.58k | if (vec[0].data() == lastend && vec[0].empty()) { |
497 | | // Disallow empty match at end of last match: skip ahead. |
498 | | // |
499 | | // fullrune() takes int, not ptrdiff_t. However, it just looks |
500 | | // at the leading byte and treats any length >= 4 the same. |
501 | 0 | if (re.options().encoding() == RE2::Options::EncodingUTF8 && |
502 | 0 | fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { |
503 | | // re is in UTF-8 mode and there is enough left of str |
504 | | // to allow us to advance by up to UTFmax bytes. |
505 | 0 | Rune r; |
506 | 0 | int n = chartorune(&r, p); |
507 | | // Some copies of chartorune have a bug that accepts |
508 | | // encodings of values in (10FFFF, 1FFFFF] as valid. |
509 | 0 | if (r > Runemax) { |
510 | 0 | n = 1; |
511 | 0 | r = Runeerror; |
512 | 0 | } |
513 | 0 | if (!(n == 1 && r == Runeerror)) { // no decoding error |
514 | 0 | out.append(p, n); |
515 | 0 | p += n; |
516 | 0 | continue; |
517 | 0 | } |
518 | 0 | } |
519 | | // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, |
520 | | // we fell through from above and the GIGO principle applies. |
521 | 0 | if (p < ep) |
522 | 0 | out.append(p, 1); |
523 | 0 | p++; |
524 | 0 | continue; |
525 | 0 | } |
526 | 3.58k | re.Rewrite(&out, rewrite, vec, nvec); |
527 | 3.58k | p = vec[0].data() + vec[0].size(); |
528 | 3.58k | lastend = p; |
529 | 3.58k | count++; |
530 | 3.58k | } |
531 | | |
532 | 9.66k | if (count == 0) |
533 | 6.08k | return 0; |
534 | | |
535 | 3.58k | if (p < ep) |
536 | 547 | out.append(p, ep - p); |
537 | 3.58k | using std::swap; |
538 | 3.58k | swap(out, *str); |
539 | 3.58k | return count; |
540 | 9.66k | } |
541 | | |
542 | | bool RE2::Extract(absl::string_view text, |
543 | | const RE2& re, |
544 | | absl::string_view rewrite, |
545 | 0 | std::string* out) { |
546 | 0 | absl::string_view vec[kVecSize]; |
547 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
548 | 0 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
549 | 0 | return false; |
550 | 0 | if (nvec > static_cast<int>(ABSL_ARRAYSIZE(vec))) |
551 | 0 | return false; |
552 | 0 | if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) |
553 | 0 | return false; |
554 | | |
555 | 0 | out->clear(); |
556 | 0 | return re.Rewrite(out, rewrite, vec, nvec); |
557 | 0 | } |
558 | | |
559 | 9.66k | std::string RE2::QuoteMeta(absl::string_view unquoted) { |
560 | 9.66k | std::string result; |
561 | 9.66k | result.reserve(unquoted.size() << 1); |
562 | | |
563 | | // Escape any ascii character not in [A-Za-z_0-9]. |
564 | | // |
565 | | // Note that it's legal to escape a character even if it has no |
566 | | // special meaning in a regular expression -- so this function does |
567 | | // that. (This also makes it identical to the perl function of the |
568 | | // same name except for the null-character special case; |
569 | | // see `perldoc -f quotemeta`.) |
570 | 444k | for (size_t ii = 0; ii < unquoted.size(); ++ii) { |
571 | | // Note that using 'isalnum' here raises the benchmark time from |
572 | | // 32ns to 58ns: |
573 | 434k | if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
574 | 420k | (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
575 | 407k | (unquoted[ii] < '0' || unquoted[ii] > '9') && |
576 | 377k | unquoted[ii] != '_' && |
577 | | // If this is the part of a UTF8 or Latin1 character, we need |
578 | | // to copy this byte without escaping. Experimentally this is |
579 | | // what works correctly with the regexp library. |
580 | 376k | !(unquoted[ii] & 128)) { |
581 | 298k | if (unquoted[ii] == '\0') { // Special handling for null chars. |
582 | | // Note that this special handling is not strictly required for RE2, |
583 | | // but this quoting is required for other regexp libraries such as |
584 | | // PCRE. |
585 | | // Can't use "\\0" since the next character might be a digit. |
586 | 18.7k | result += "\\x00"; |
587 | 18.7k | continue; |
588 | 18.7k | } |
589 | 279k | result += '\\'; |
590 | 279k | } |
591 | 415k | result += unquoted[ii]; |
592 | 415k | } |
593 | | |
594 | 9.66k | return result; |
595 | 9.66k | } |
596 | | |
597 | | bool RE2::PossibleMatchRange(std::string* min, std::string* max, |
598 | 9.66k | int maxlen) const { |
599 | 9.66k | if (prog_ == NULL) |
600 | 0 | return false; |
601 | | |
602 | 9.66k | int n = static_cast<int>(prefix_.size()); |
603 | 9.66k | if (n > maxlen) |
604 | 13 | n = maxlen; |
605 | | |
606 | | // Determine initial min max from prefix_ literal. |
607 | 9.66k | *min = prefix_.substr(0, n); |
608 | 9.66k | *max = prefix_.substr(0, n); |
609 | 9.66k | if (prefix_foldcase_) { |
610 | | // prefix is ASCII lowercase; change *min to uppercase. |
611 | 394 | for (int i = 0; i < n; i++) { |
612 | 276 | char& c = (*min)[i]; |
613 | 276 | if ('a' <= c && c <= 'z') |
614 | 60 | c += 'A' - 'a'; |
615 | 276 | } |
616 | 118 | } |
617 | | |
618 | | // Add to prefix min max using PossibleMatchRange on regexp. |
619 | 9.66k | std::string dmin, dmax; |
620 | 9.66k | maxlen -= n; |
621 | 9.66k | if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { |
622 | 8.89k | min->append(dmin); |
623 | 8.89k | max->append(dmax); |
624 | 8.89k | } else if (!max->empty()) { |
625 | | // prog_->PossibleMatchRange has failed us, |
626 | | // but we still have useful information from prefix_. |
627 | | // Round up *max to allow any possible suffix. |
628 | 21 | PrefixSuccessor(max); |
629 | 749 | } else { |
630 | | // Nothing useful. |
631 | 749 | *min = ""; |
632 | 749 | *max = ""; |
633 | 749 | return false; |
634 | 749 | } |
635 | | |
636 | 8.91k | return true; |
637 | 9.66k | } |
638 | | |
639 | | // Avoid possible locale nonsense in standard strcasecmp. |
640 | | // The string a is known to be all lowercase. |
641 | 343 | static int ascii_strcasecmp(const char* a, const char* b, size_t len) { |
642 | 343 | const char* ae = a + len; |
643 | | |
644 | 896 | for (; a < ae; a++, b++) { |
645 | 707 | uint8_t x = *a; |
646 | 707 | uint8_t y = *b; |
647 | 707 | if ('A' <= y && y <= 'Z') |
648 | 112 | y += 'a' - 'A'; |
649 | 707 | if (x != y) |
650 | 154 | return x - y; |
651 | 707 | } |
652 | 189 | return 0; |
653 | 343 | } |
654 | | |
655 | | |
656 | | /***** Actual matching and rewriting code *****/ |
657 | | |
658 | | bool RE2::Match(absl::string_view text, |
659 | | size_t startpos, |
660 | | size_t endpos, |
661 | | Anchor re_anchor, |
662 | | absl::string_view* submatch, |
663 | 67.6k | int nsubmatch) const { |
664 | 67.6k | if (!ok()) { |
665 | 0 | if (options_.log_errors()) |
666 | 0 | ABSL_LOG(ERROR) << "Invalid RE2: " << *error_; |
667 | 0 | return false; |
668 | 0 | } |
669 | | |
670 | 67.6k | if (startpos > endpos || endpos > text.size()) { |
671 | 0 | if (options_.log_errors()) |
672 | 0 | ABSL_LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" |
673 | 0 | << "startpos: " << startpos << ", " |
674 | 0 | << "endpos: " << endpos << ", " |
675 | 0 | << "text size: " << text.size() << "]"; |
676 | 0 | return false; |
677 | 0 | } |
678 | | |
679 | 67.6k | absl::string_view subtext = text; |
680 | 67.6k | subtext.remove_prefix(startpos); |
681 | 67.6k | subtext.remove_suffix(text.size() - endpos); |
682 | | |
683 | | // Use DFAs to find exact location of match, filter out non-matches. |
684 | | |
685 | | // Don't ask for the location if we won't use it. |
686 | | // SearchDFA can do extra optimizations in that case. |
687 | 67.6k | absl::string_view match; |
688 | 67.6k | absl::string_view* matchp = &match; |
689 | 67.6k | if (nsubmatch == 0) |
690 | 22.5k | matchp = NULL; |
691 | | |
692 | 67.6k | int ncap = 1 + NumberOfCapturingGroups(); |
693 | 67.6k | if (ncap > nsubmatch) |
694 | 32.6k | ncap = nsubmatch; |
695 | | |
696 | | // If the regexp is anchored explicitly, must not be in middle of text. |
697 | 67.6k | if (prog_->anchor_start() && startpos != 0) |
698 | 0 | return false; |
699 | 67.6k | if (prog_->anchor_end() && endpos != text.size()) |
700 | 0 | return false; |
701 | | |
702 | | // If the regexp is anchored explicitly, update re_anchor |
703 | | // so that we can potentially fall into a faster case below. |
704 | 67.6k | if (prog_->anchor_start() && prog_->anchor_end()) |
705 | 298 | re_anchor = ANCHOR_BOTH; |
706 | 67.3k | else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) |
707 | 2.29k | re_anchor = ANCHOR_START; |
708 | | |
709 | | // Check for the required prefix, if any. |
710 | 67.6k | size_t prefixlen = 0; |
711 | 67.6k | if (!prefix_.empty()) { |
712 | 1.44k | if (startpos != 0) |
713 | 0 | return false; |
714 | 1.44k | prefixlen = prefix_.size(); |
715 | 1.44k | if (prefixlen > subtext.size()) |
716 | 826 | return false; |
717 | 620 | if (prefix_foldcase_) { |
718 | 343 | if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
719 | 154 | return false; |
720 | 343 | } else { |
721 | 277 | if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
722 | 147 | return false; |
723 | 277 | } |
724 | 319 | subtext.remove_prefix(prefixlen); |
725 | | // If there is a required prefix, the anchor must be at least ANCHOR_START. |
726 | 319 | if (re_anchor != ANCHOR_BOTH) |
727 | 262 | re_anchor = ANCHOR_START; |
728 | 319 | } |
729 | | |
730 | 66.4k | Prog::Anchor anchor = Prog::kUnanchored; |
731 | 66.4k | Prog::MatchKind kind = |
732 | 66.4k | longest_match_ ? Prog::kLongestMatch : Prog::kFirstMatch; |
733 | | |
734 | 66.4k | bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; |
735 | 66.4k | bool can_bit_state = prog_->CanBitState(); |
736 | 66.4k | size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); |
737 | | |
738 | 66.4k | #ifdef RE2_HAVE_THREAD_LOCAL |
739 | 66.4k | hooks::context = this; |
740 | 66.4k | #endif |
741 | 66.4k | bool dfa_failed = false; |
742 | 66.4k | bool skipped_test = false; |
743 | 66.4k | switch (re_anchor) { |
744 | 0 | default: |
745 | 0 | ABSL_LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; |
746 | 0 | return false; |
747 | | |
748 | 45.2k | case UNANCHORED: { |
749 | 45.2k | if (prog_->anchor_end()) { |
750 | | // This is a very special case: we don't need the forward DFA because |
751 | | // we already know where the match must end! Instead, the reverse DFA |
752 | | // can say whether there is a match and (optionally) where it starts. |
753 | 2.73k | Prog* prog = ReverseProg(); |
754 | 2.73k | if (prog == NULL) { |
755 | | // Fall back to NFA below. |
756 | 0 | skipped_test = true; |
757 | 0 | break; |
758 | 0 | } |
759 | 2.73k | if (!prog->SearchDFA(subtext, text, Prog::kAnchored, |
760 | 2.73k | Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { |
761 | 1.75k | if (dfa_failed) { |
762 | 0 | if (options_.log_errors()) |
763 | 0 | ABSL_LOG(ERROR) << "DFA out of memory: " |
764 | 0 | << "pattern length " << pattern_->size() << ", " |
765 | 0 | << "program size " << prog->size() << ", " |
766 | 0 | << "list count " << prog->list_count() << ", " |
767 | 0 | << "bytemap range " << prog->bytemap_range(); |
768 | | // Fall back to NFA below. |
769 | 0 | skipped_test = true; |
770 | 0 | break; |
771 | 0 | } |
772 | 1.75k | return false; |
773 | 1.75k | } |
774 | 985 | if (matchp == NULL) // Matched. Don't care where. |
775 | 286 | return true; |
776 | 699 | break; |
777 | 985 | } |
778 | | |
779 | 42.4k | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
780 | 42.4k | matchp, &dfa_failed, NULL)) { |
781 | 27.1k | if (dfa_failed) { |
782 | 0 | if (options_.log_errors()) |
783 | 0 | ABSL_LOG(ERROR) << "DFA out of memory: " |
784 | 0 | << "pattern length " << pattern_->size() << ", " |
785 | 0 | << "program size " << prog_->size() << ", " |
786 | 0 | << "list count " << prog_->list_count() << ", " |
787 | 0 | << "bytemap range " << prog_->bytemap_range(); |
788 | | // Fall back to NFA below. |
789 | 0 | skipped_test = true; |
790 | 0 | break; |
791 | 0 | } |
792 | 27.1k | return false; |
793 | 27.1k | } |
794 | 15.3k | if (matchp == NULL) // Matched. Don't care where. |
795 | 4.74k | return true; |
796 | | // SearchDFA set match.end() but didn't know where the |
797 | | // match started. Run the regexp backward from match.end() |
798 | | // to find the longest possible match -- that's where it started. |
799 | 10.6k | Prog* prog = ReverseProg(); |
800 | 10.6k | if (prog == NULL) { |
801 | | // Fall back to NFA below. |
802 | 0 | skipped_test = true; |
803 | 0 | break; |
804 | 0 | } |
805 | 10.6k | if (!prog->SearchDFA(match, text, Prog::kAnchored, |
806 | 10.6k | Prog::kLongestMatch, &match, &dfa_failed, NULL)) { |
807 | 0 | if (dfa_failed) { |
808 | 0 | if (options_.log_errors()) |
809 | 0 | ABSL_LOG(ERROR) << "DFA out of memory: " |
810 | 0 | << "pattern length " << pattern_->size() << ", " |
811 | 0 | << "program size " << prog->size() << ", " |
812 | 0 | << "list count " << prog->list_count() << ", " |
813 | 0 | << "bytemap range " << prog->bytemap_range(); |
814 | | // Fall back to NFA below. |
815 | 0 | skipped_test = true; |
816 | 0 | break; |
817 | 0 | } |
818 | 0 | if (options_.log_errors()) |
819 | 0 | ABSL_LOG(ERROR) << "SearchDFA inconsistency"; |
820 | 0 | return false; |
821 | 0 | } |
822 | 10.6k | break; |
823 | 10.6k | } |
824 | | |
825 | 10.6k | case ANCHOR_BOTH: |
826 | 21.2k | case ANCHOR_START: |
827 | 21.2k | if (re_anchor == ANCHOR_BOTH) |
828 | 9.74k | kind = Prog::kFullMatch; |
829 | 21.2k | anchor = Prog::kAnchored; |
830 | | |
831 | | // If only a small amount of text and need submatch |
832 | | // information anyway and we're going to use OnePass or BitState |
833 | | // to get it, we might as well not even bother with the DFA: |
834 | | // OnePass or BitState will be fast enough. |
835 | | // On tiny texts, OnePass outruns even the DFA, and |
836 | | // it doesn't have the shared state and occasional mutex that |
837 | | // the DFA does. |
838 | 21.2k | if (can_one_pass && text.size() <= 4096 && |
839 | 14.2k | (ncap > 1 || text.size() <= 16)) { |
840 | 13.8k | skipped_test = true; |
841 | 13.8k | break; |
842 | 13.8k | } |
843 | 7.39k | if (can_bit_state && text.size() <= bit_state_text_max_size && |
844 | 5.45k | ncap > 1) { |
845 | 2.33k | skipped_test = true; |
846 | 2.33k | break; |
847 | 2.33k | } |
848 | 5.05k | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
849 | 5.05k | &match, &dfa_failed, NULL)) { |
850 | 2.63k | if (dfa_failed) { |
851 | 0 | if (options_.log_errors()) |
852 | 0 | ABSL_LOG(ERROR) << "DFA out of memory: " |
853 | 0 | << "pattern length " << pattern_->size() << ", " |
854 | 0 | << "program size " << prog_->size() << ", " |
855 | 0 | << "list count " << prog_->list_count() << ", " |
856 | 0 | << "bytemap range " << prog_->bytemap_range(); |
857 | | // Fall back to NFA below. |
858 | 0 | skipped_test = true; |
859 | 0 | break; |
860 | 0 | } |
861 | 2.63k | return false; |
862 | 2.63k | } |
863 | 2.41k | break; |
864 | 66.4k | } |
865 | | |
866 | 29.9k | if (!skipped_test && ncap <= 1) { |
867 | | // We know exactly where it matches. That's enough. |
868 | 9.96k | if (ncap == 1) |
869 | 9.22k | submatch[0] = match; |
870 | 19.9k | } else { |
871 | 19.9k | absl::string_view subtext1; |
872 | 19.9k | if (skipped_test) { |
873 | | // DFA ran out of memory or was skipped: |
874 | | // need to search in entire original text. |
875 | 16.2k | subtext1 = subtext; |
876 | 16.2k | } else { |
877 | | // DFA found the exact match location: |
878 | | // let NFA run an anchored, full match search |
879 | | // to find submatch locations. |
880 | 3.77k | subtext1 = match; |
881 | 3.77k | anchor = Prog::kAnchored; |
882 | 3.77k | kind = Prog::kFullMatch; |
883 | 3.77k | } |
884 | | |
885 | 19.9k | if (can_one_pass && anchor != Prog::kUnanchored) { |
886 | 15.1k | if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { |
887 | 9.55k | if (!skipped_test && options_.log_errors()) |
888 | 9.55k | ABSL_LOG(ERROR) << "SearchOnePass inconsistency"; |
889 | 9.55k | return false; |
890 | 9.55k | } |
891 | 15.1k | } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) { |
892 | 3.41k | if (!prog_->SearchBitState(subtext1, text, anchor, |
893 | 3.41k | kind, submatch, ncap)) { |
894 | 1.12k | if (!skipped_test && options_.log_errors()) |
895 | 1.12k | ABSL_LOG(ERROR) << "SearchBitState inconsistency"; |
896 | 1.12k | return false; |
897 | 1.12k | } |
898 | 3.41k | } else { |
899 | 1.47k | if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { |
900 | 0 | if (!skipped_test && options_.log_errors()) |
901 | 0 | ABSL_LOG(ERROR) << "SearchNFA inconsistency"; |
902 | 0 | return false; |
903 | 0 | } |
904 | 1.47k | } |
905 | 19.9k | } |
906 | | |
907 | | // Adjust overall match for required prefix that we stripped off. |
908 | 19.2k | if (prefixlen > 0 && nsubmatch > 0) |
909 | 126 | submatch[0] = absl::string_view(submatch[0].data() - prefixlen, |
910 | 126 | submatch[0].size() + prefixlen); |
911 | | |
912 | | // Zero submatches that don't exist in the regexp. |
913 | 19.2k | for (int i = ncap; i < nsubmatch; i++) |
914 | 0 | submatch[i] = absl::string_view(); |
915 | 19.2k | return true; |
916 | 29.9k | } |
917 | | |
918 | | // Internal matcher - like Match() but takes Args not string_views. |
919 | | bool RE2::DoMatch(absl::string_view text, |
920 | | Anchor re_anchor, |
921 | | size_t* consumed, |
922 | | const Arg* const* args, |
923 | 48.2k | int n) const { |
924 | 48.2k | if (!ok()) { |
925 | 0 | if (options_.log_errors()) |
926 | 0 | ABSL_LOG(ERROR) << "Invalid RE2: " << *error_; |
927 | 0 | return false; |
928 | 0 | } |
929 | | |
930 | 48.2k | if (NumberOfCapturingGroups() < n) { |
931 | | // RE has fewer capturing groups than number of Arg pointers passed in. |
932 | 0 | return false; |
933 | 0 | } |
934 | | |
935 | | // Count number of capture groups needed. |
936 | 48.2k | int nvec; |
937 | 48.2k | if (n == 0 && consumed == NULL) |
938 | 22.5k | nvec = 0; |
939 | 25.7k | else |
940 | 25.7k | nvec = n+1; |
941 | | |
942 | 48.2k | absl::FixedArray<absl::string_view, kVecSize> vec_storage(nvec); |
943 | 48.2k | absl::string_view* vec = vec_storage.data(); |
944 | | |
945 | 48.2k | if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { |
946 | 31.1k | return false; |
947 | 31.1k | } |
948 | | |
949 | 17.1k | if (consumed != NULL) |
950 | 6.98k | *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text)); |
951 | | |
952 | 17.1k | if (n == 0 || args == NULL) { |
953 | | // We are not interested in results |
954 | 10.7k | return true; |
955 | 10.7k | } |
956 | | |
957 | | // If we got here, we must have matched the whole pattern. |
958 | 6.88k | for (int i = 0; i < n; i++) { |
959 | 6.37k | absl::string_view s = vec[i+1]; |
960 | 6.37k | if (!args[i]->Parse(s.data(), s.size())) { |
961 | | // TODO: Should we indicate what the error was? |
962 | 5.87k | return false; |
963 | 5.87k | } |
964 | 6.37k | } |
965 | | |
966 | 502 | return true; |
967 | 6.37k | } |
968 | | |
969 | | // Checks that the rewrite string is well-formed with respect to this |
970 | | // regular expression. |
971 | | bool RE2::CheckRewriteString(absl::string_view rewrite, |
972 | 0 | std::string* error) const { |
973 | 0 | int max_token = -1; |
974 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
975 | 0 | s < end; s++) { |
976 | 0 | int c = *s; |
977 | 0 | if (c != '\\') { |
978 | 0 | continue; |
979 | 0 | } |
980 | 0 | if (++s == end) { |
981 | 0 | *error = "Rewrite schema error: '\\' not allowed at end."; |
982 | 0 | return false; |
983 | 0 | } |
984 | 0 | c = *s; |
985 | 0 | if (c == '\\') { |
986 | 0 | continue; |
987 | 0 | } |
988 | 0 | if (!absl::ascii_isdigit(c)) { |
989 | 0 | *error = "Rewrite schema error: " |
990 | 0 | "'\\' must be followed by a digit or '\\'."; |
991 | 0 | return false; |
992 | 0 | } |
993 | 0 | int n = (c - '0'); |
994 | 0 | if (max_token < n) { |
995 | 0 | max_token = n; |
996 | 0 | } |
997 | 0 | } |
998 | | |
999 | 0 | if (max_token > NumberOfCapturingGroups()) { |
1000 | 0 | *error = absl::StrFormat( |
1001 | 0 | "Rewrite schema requests %d matches, but the regexp only has %d " |
1002 | 0 | "parenthesized subexpressions.", |
1003 | 0 | max_token, NumberOfCapturingGroups()); |
1004 | 0 | return false; |
1005 | 0 | } |
1006 | 0 | return true; |
1007 | 0 | } |
1008 | | |
1009 | | // Returns the maximum submatch needed for the rewrite to be done by Replace(). |
1010 | | // E.g. if rewrite == "foo \\2,\\1", returns 2. |
1011 | 19.3k | int RE2::MaxSubmatch(absl::string_view rewrite) { |
1012 | 19.3k | int max = 0; |
1013 | 19.3k | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
1014 | 19.3k | s < end; s++) { |
1015 | 0 | if (*s == '\\') { |
1016 | 0 | s++; |
1017 | 0 | int c = (s < end) ? *s : -1; |
1018 | 0 | if (absl::ascii_isdigit(c)) { |
1019 | 0 | int n = (c - '0'); |
1020 | 0 | if (n > max) |
1021 | 0 | max = n; |
1022 | 0 | } |
1023 | 0 | } |
1024 | 0 | } |
1025 | 19.3k | return max; |
1026 | 19.3k | } |
1027 | | |
1028 | | // Append the "rewrite" string, with backslash substitutions from "vec", |
1029 | | // to string "out". |
1030 | | bool RE2::Rewrite(std::string* out, |
1031 | | absl::string_view rewrite, |
1032 | | const absl::string_view* vec, |
1033 | 7.16k | int veclen) const { |
1034 | 7.16k | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
1035 | 7.16k | s < end; s++) { |
1036 | 0 | if (*s != '\\') { |
1037 | 0 | out->push_back(*s); |
1038 | 0 | continue; |
1039 | 0 | } |
1040 | 0 | s++; |
1041 | 0 | int c = (s < end) ? *s : -1; |
1042 | 0 | if (absl::ascii_isdigit(c)) { |
1043 | 0 | int n = (c - '0'); |
1044 | 0 | if (n >= veclen) { |
1045 | 0 | if (options_.log_errors()) { |
1046 | 0 | ABSL_LOG(ERROR) << "invalid substitution \\" << n |
1047 | 0 | << " from " << veclen << " groups"; |
1048 | 0 | } |
1049 | 0 | return false; |
1050 | 0 | } |
1051 | 0 | absl::string_view snip = vec[n]; |
1052 | 0 | if (!snip.empty()) |
1053 | 0 | out->append(snip.data(), snip.size()); |
1054 | 0 | } else if (c == '\\') { |
1055 | 0 | out->push_back('\\'); |
1056 | 0 | } else { |
1057 | 0 | if (options_.log_errors()) |
1058 | 0 | ABSL_LOG(ERROR) << "invalid rewrite pattern: " << rewrite; |
1059 | 0 | return false; |
1060 | 0 | } |
1061 | 0 | } |
1062 | 7.16k | return true; |
1063 | 7.16k | } |
1064 | | |
1065 | | /***** Parsers for various types *****/ |
1066 | | |
1067 | | namespace re2_internal { |
1068 | | |
1069 | | template <> |
1070 | 0 | bool Parse(const char* str, size_t n, void* dest) { |
1071 | | // We fail if somebody asked us to store into a non-NULL void* pointer |
1072 | 0 | return (dest == NULL); |
1073 | 0 | } |
1074 | | |
1075 | | template <> |
1076 | 0 | bool Parse(const char* str, size_t n, std::string* dest) { |
1077 | 0 | if (dest == NULL) return true; |
1078 | 0 | dest->assign(str, n); |
1079 | 0 | return true; |
1080 | 0 | } |
1081 | | |
1082 | | template <> |
1083 | 0 | bool Parse(const char* str, size_t n, absl::string_view* dest) { |
1084 | 0 | if (dest == NULL) return true; |
1085 | 0 | *dest = absl::string_view(str, n); |
1086 | 0 | return true; |
1087 | 0 | } |
1088 | | |
1089 | | template <> |
1090 | 0 | bool Parse(const char* str, size_t n, char* dest) { |
1091 | 0 | if (n != 1) return false; |
1092 | 0 | if (dest == NULL) return true; |
1093 | 0 | *dest = str[0]; |
1094 | 0 | return true; |
1095 | 0 | } |
1096 | | |
1097 | | template <> |
1098 | 0 | bool Parse(const char* str, size_t n, signed char* dest) { |
1099 | 0 | if (n != 1) return false; |
1100 | 0 | if (dest == NULL) return true; |
1101 | 0 | *dest = str[0]; |
1102 | 0 | return true; |
1103 | 0 | } |
1104 | | |
1105 | | template <> |
1106 | 0 | bool Parse(const char* str, size_t n, unsigned char* dest) { |
1107 | 0 | if (n != 1) return false; |
1108 | 0 | if (dest == NULL) return true; |
1109 | 0 | *dest = str[0]; |
1110 | 0 | return true; |
1111 | 0 | } |
1112 | | |
1113 | | // Largest number spec that we are willing to parse |
1114 | | static const int kMaxNumberLength = 32; |
1115 | | |
1116 | | // REQUIRES "buf" must have length at least nbuf. |
1117 | | // Copies "str" into "buf" and null-terminates. |
1118 | | // Overwrites *np with the new length. |
1119 | | static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, |
1120 | 1.61k | size_t* np, bool accept_spaces) { |
1121 | 1.61k | size_t n = *np; |
1122 | 1.61k | if (n == 0) return ""; |
1123 | 1.61k | if (n > 0 && absl::ascii_isspace(*str)) { |
1124 | | // We are less forgiving than the strtoxxx() routines and do not |
1125 | | // allow leading spaces. We do allow leading spaces for floats. |
1126 | 119 | if (!accept_spaces) { |
1127 | 58 | return ""; |
1128 | 58 | } |
1129 | 533 | while (n > 0 && absl::ascii_isspace(*str)) { |
1130 | 472 | n--; |
1131 | 472 | str++; |
1132 | 472 | } |
1133 | 61 | } |
1134 | | |
1135 | | // Although buf has a fixed maximum size, we can still handle |
1136 | | // arbitrarily large integers correctly by omitting leading zeros. |
1137 | | // (Numbers that are still too long will be out of range.) |
1138 | | // Before deciding whether str is too long, |
1139 | | // remove leading zeros with s/000+/00/. |
1140 | | // Leaving the leading two zeros in place means that |
1141 | | // we don't change 0000x123 (invalid) into 0x123 (valid). |
1142 | | // Skip over leading - before replacing. |
1143 | 1.55k | bool neg = false; |
1144 | 1.55k | if (n >= 1 && str[0] == '-') { |
1145 | 369 | neg = true; |
1146 | 369 | n--; |
1147 | 369 | str++; |
1148 | 369 | } |
1149 | | |
1150 | 1.55k | if (n >= 3 && str[0] == '0' && str[1] == '0') { |
1151 | 1.25k | while (n >= 3 && str[2] == '0') { |
1152 | 1.18k | n--; |
1153 | 1.18k | str++; |
1154 | 1.18k | } |
1155 | 73 | } |
1156 | | |
1157 | 1.55k | if (neg) { // make room in buf for - |
1158 | 369 | n++; |
1159 | 369 | str--; |
1160 | 369 | } |
1161 | | |
1162 | 1.55k | if (n > nbuf-1) return ""; |
1163 | | |
1164 | 1.48k | memmove(buf, str, n); |
1165 | 1.48k | if (neg) { |
1166 | 348 | buf[0] = '-'; |
1167 | 348 | } |
1168 | 1.48k | buf[n] = '\0'; |
1169 | 1.48k | *np = n; |
1170 | 1.48k | return buf; |
1171 | 1.55k | } |
1172 | | |
1173 | | template <> |
1174 | 1.60k | bool Parse(const char* str, size_t n, float* dest) { |
1175 | 1.60k | if (n == 0) return false; |
1176 | 384 | static const int kMaxLength = 200; |
1177 | 384 | char buf[kMaxLength+1]; |
1178 | 384 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1179 | 384 | char* end; |
1180 | 384 | errno = 0; |
1181 | 384 | float r = strtof(str, &end); |
1182 | 384 | if (end != str + n) return false; // Leftover junk |
1183 | 157 | if (errno) return false; |
1184 | 155 | if (dest == NULL) return true; |
1185 | 155 | *dest = r; |
1186 | 155 | return true; |
1187 | 155 | } |
1188 | | |
1189 | | template <> |
1190 | 1.67k | bool Parse(const char* str, size_t n, double* dest) { |
1191 | 1.67k | if (n == 0) return false; |
1192 | 418 | static const int kMaxLength = 200; |
1193 | 418 | char buf[kMaxLength+1]; |
1194 | 418 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1195 | 418 | char* end; |
1196 | 418 | errno = 0; |
1197 | 418 | double r = strtod(str, &end); |
1198 | 418 | if (end != str + n) return false; // Leftover junk |
1199 | 159 | if (errno) return false; |
1200 | 158 | if (dest == NULL) return true; |
1201 | 158 | *dest = r; |
1202 | 158 | return true; |
1203 | 158 | } |
1204 | | |
1205 | | template <> |
1206 | 3.10k | bool Parse(const char* str, size_t n, long* dest, int radix) { |
1207 | 3.10k | if (n == 0) return false; |
1208 | 812 | char buf[kMaxNumberLength+1]; |
1209 | 812 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1210 | 812 | char* end; |
1211 | 812 | errno = 0; |
1212 | 812 | long r = strtol(str, &end, radix); |
1213 | 812 | if (end != str + n) return false; // Leftover junk |
1214 | 275 | if (errno) return false; |
1215 | 272 | if (dest == NULL) return true; |
1216 | 272 | *dest = r; |
1217 | 272 | return true; |
1218 | 272 | } |
1219 | | |
1220 | | template <> |
1221 | 0 | bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { |
1222 | 0 | if (n == 0) return false; |
1223 | 0 | char buf[kMaxNumberLength+1]; |
1224 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1225 | 0 | if (str[0] == '-') { |
1226 | | // strtoul() will silently accept negative numbers and parse |
1227 | | // them. This module is more strict and treats them as errors. |
1228 | 0 | return false; |
1229 | 0 | } |
1230 | | |
1231 | 0 | char* end; |
1232 | 0 | errno = 0; |
1233 | 0 | unsigned long r = strtoul(str, &end, radix); |
1234 | 0 | if (end != str + n) return false; // Leftover junk |
1235 | 0 | if (errno) return false; |
1236 | 0 | if (dest == NULL) return true; |
1237 | 0 | *dest = r; |
1238 | 0 | return true; |
1239 | 0 | } |
1240 | | |
1241 | | template <> |
1242 | 1.42k | bool Parse(const char* str, size_t n, short* dest, int radix) { |
1243 | 1.42k | long r; |
1244 | 1.42k | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1245 | 136 | if ((short)r != r) return false; // Out of range |
1246 | 53 | if (dest == NULL) return true; |
1247 | 53 | *dest = (short)r; |
1248 | 53 | return true; |
1249 | 53 | } |
1250 | | |
1251 | | template <> |
1252 | 0 | bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { |
1253 | 0 | unsigned long r; |
1254 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1255 | 0 | if ((unsigned short)r != r) return false; // Out of range |
1256 | 0 | if (dest == NULL) return true; |
1257 | 0 | *dest = (unsigned short)r; |
1258 | 0 | return true; |
1259 | 0 | } |
1260 | | |
1261 | | template <> |
1262 | 0 | bool Parse(const char* str, size_t n, int* dest, int radix) { |
1263 | 0 | long r; |
1264 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1265 | 0 | if ((int)r != r) return false; // Out of range |
1266 | 0 | if (dest == NULL) return true; |
1267 | 0 | *dest = (int)r; |
1268 | 0 | return true; |
1269 | 0 | } |
1270 | | |
1271 | | template <> |
1272 | 0 | bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { |
1273 | 0 | unsigned long r; |
1274 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1275 | 0 | if ((unsigned int)r != r) return false; // Out of range |
1276 | 0 | if (dest == NULL) return true; |
1277 | 0 | *dest = (unsigned int)r; |
1278 | 0 | return true; |
1279 | 0 | } |
1280 | | |
1281 | | template <> |
1282 | 0 | bool Parse(const char* str, size_t n, long long* dest, int radix) { |
1283 | 0 | if (n == 0) return false; |
1284 | 0 | char buf[kMaxNumberLength+1]; |
1285 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1286 | 0 | char* end; |
1287 | 0 | errno = 0; |
1288 | 0 | long long r = strtoll(str, &end, radix); |
1289 | 0 | if (end != str + n) return false; // Leftover junk |
1290 | 0 | if (errno) return false; |
1291 | 0 | if (dest == NULL) return true; |
1292 | 0 | *dest = r; |
1293 | 0 | return true; |
1294 | 0 | } |
1295 | | |
1296 | | template <> |
1297 | 0 | bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { |
1298 | 0 | if (n == 0) return false; |
1299 | 0 | char buf[kMaxNumberLength+1]; |
1300 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1301 | 0 | if (str[0] == '-') { |
1302 | | // strtoull() will silently accept negative numbers and parse |
1303 | | // them. This module is more strict and treats them as errors. |
1304 | 0 | return false; |
1305 | 0 | } |
1306 | 0 | char* end; |
1307 | 0 | errno = 0; |
1308 | 0 | unsigned long long r = strtoull(str, &end, radix); |
1309 | 0 | if (end != str + n) return false; // Leftover junk |
1310 | 0 | if (errno) return false; |
1311 | 0 | if (dest == NULL) return true; |
1312 | 0 | *dest = r; |
1313 | 0 | return true; |
1314 | 0 | } |
1315 | | |
1316 | | } // namespace re2_internal |
1317 | | |
1318 | | namespace hooks { |
1319 | | |
1320 | | #ifdef RE2_HAVE_THREAD_LOCAL |
1321 | | thread_local const RE2* context = NULL; |
1322 | | #endif |
1323 | | |
1324 | | template <typename T> |
1325 | | union Hook { |
1326 | 0 | void Store(T* cb) { cb_.store(cb, std::memory_order_release); }Unexecuted instantiation: re2::hooks::Hook<void (re2::hooks::DFAStateCacheReset const&)>::Store(void (*)(re2::hooks::DFAStateCacheReset const&)) Unexecuted instantiation: re2::hooks::Hook<void (re2::hooks::DFASearchFailure const&)>::Store(void (*)(re2::hooks::DFASearchFailure const&)) |
1327 | 6 | T* Load() const { return cb_.load(std::memory_order_acquire); }re2::hooks::Hook<void (re2::hooks::DFAStateCacheReset const&)>::Load() const Line | Count | Source | 1327 | 6 | T* Load() const { return cb_.load(std::memory_order_acquire); } |
Unexecuted instantiation: re2::hooks::Hook<void (re2::hooks::DFASearchFailure const&)>::Load() const |
1328 | | |
1329 | | #if !defined(__clang__) && defined(_MSC_VER) |
1330 | | // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, |
1331 | | // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC. |
1332 | | static_assert(ATOMIC_POINTER_LOCK_FREE == 2, |
1333 | | "std::atomic<T*> must be always lock-free"); |
1334 | | T* cb_for_constinit_; |
1335 | | #endif |
1336 | | |
1337 | | std::atomic<T*> cb_; |
1338 | | }; |
1339 | | |
1340 | | template <typename T> |
1341 | 6 | static void DoNothing(const T&) {}re2.cc:void re2::hooks::DoNothing<re2::hooks::DFAStateCacheReset>(re2::hooks::DFAStateCacheReset const&) Line | Count | Source | 1341 | 6 | static void DoNothing(const T&) {} |
Unexecuted instantiation: re2.cc:void re2::hooks::DoNothing<re2::hooks::DFASearchFailure>(re2::hooks::DFASearchFailure const&) |
1342 | | |
1343 | | #define DEFINE_HOOK(type, name) \ |
1344 | | static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \ |
1345 | 0 | void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \Unexecuted instantiation: re2::hooks::SetDFAStateCacheResetHook(void (*)(re2::hooks::DFAStateCacheReset const&)) Unexecuted instantiation: re2::hooks::SetDFASearchFailureHook(void (*)(re2::hooks::DFASearchFailure const&)) |
1346 | 6 | type##Callback* Get##type##Hook() { return name##_hook.Load(); }re2::hooks::GetDFAStateCacheResetHook() Line | Count | Source | 1346 | 6 | type##Callback* Get##type##Hook() { return name##_hook.Load(); } |
Unexecuted instantiation: re2::hooks::GetDFASearchFailureHook() |
1347 | | |
1348 | | DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) |
1349 | | DEFINE_HOOK(DFASearchFailure, dfa_search_failure) |
1350 | | |
1351 | | #undef DEFINE_HOOK |
1352 | | |
1353 | | } // namespace hooks |
1354 | | |
1355 | | } // namespace re2 |