Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Regular expression interface RE2. |
6 | | // |
7 | | // Originally the PCRE C++ wrapper, but adapted to use |
8 | | // the new automata-based regular expression engines. |
9 | | |
10 | | #include "re2/re2.h" |
11 | | |
12 | | #include <assert.h> |
13 | | #include <ctype.h> |
14 | | #include <errno.h> |
15 | | #ifdef _MSC_VER |
16 | | #include <intrin.h> |
17 | | #endif |
18 | | #include <stdint.h> |
19 | | #include <stdlib.h> |
20 | | #include <string.h> |
21 | | #include <algorithm> |
22 | | #include <atomic> |
23 | | #include <iterator> |
24 | | #include <mutex> |
25 | | #include <string> |
26 | | #include <utility> |
27 | | #include <vector> |
28 | | |
29 | | #include "util/util.h" |
30 | | #include "util/logging.h" |
31 | | #include "util/strutil.h" |
32 | | #include "util/utf.h" |
33 | | #include "re2/prog.h" |
34 | | #include "re2/regexp.h" |
35 | | #include "re2/sparse_array.h" |
36 | | |
37 | | namespace re2 { |
38 | | |
39 | | // Controls the maximum count permitted by GlobalReplace(); -1 is unlimited. |
40 | | static int maximum_global_replace_count = -1; |
41 | | |
42 | 0 | void RE2::FUZZING_ONLY_set_maximum_global_replace_count(int i) { |
43 | 0 | maximum_global_replace_count = i; |
44 | 0 | } |
45 | | |
46 | | // Maximum number of args we can set |
47 | | static const int kMaxArgs = 16; |
48 | | static const int kVecSize = 1+kMaxArgs; |
49 | | |
50 | | const int RE2::Options::kDefaultMaxMem; // initialized in re2.h |
51 | | |
52 | | RE2::Options::Options(RE2::CannedOptions opt) |
53 | | : max_mem_(kDefaultMaxMem), |
54 | | encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
55 | | posix_syntax_(opt == RE2::POSIX), |
56 | | longest_match_(opt == RE2::POSIX), |
57 | | log_errors_(opt != RE2::Quiet), |
58 | | literal_(false), |
59 | | never_nl_(false), |
60 | | dot_nl_(false), |
61 | | never_capture_(false), |
62 | | case_sensitive_(true), |
63 | | perl_classes_(false), |
64 | | word_boundary_(false), |
65 | 0 | one_line_(false) { |
66 | 0 | } |
67 | | |
68 | | // Empty objects for use as const references. |
69 | | // Statically allocating the storage and then |
70 | | // lazily constructing the objects (in a once |
71 | | // in RE2::Init()) avoids global constructors |
72 | | // and the false positives (thanks, Valgrind) |
73 | | // about memory leaks at program termination. |
74 | | struct EmptyStorage { |
75 | | std::string empty_string; |
76 | | std::map<std::string, int> empty_named_groups; |
77 | | std::map<int, std::string> empty_group_names; |
78 | | }; |
79 | | alignas(EmptyStorage) static char empty_storage[sizeof(EmptyStorage)]; |
80 | | |
81 | 20.7k | static inline std::string* empty_string() { |
82 | 20.7k | return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_string; |
83 | 20.7k | } |
84 | | |
85 | 5.18k | static inline std::map<std::string, int>* empty_named_groups() { |
86 | 5.18k | return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_named_groups; |
87 | 5.18k | } |
88 | | |
89 | 5.18k | static inline std::map<int, std::string>* empty_group_names() { |
90 | 5.18k | return &reinterpret_cast<EmptyStorage*>(empty_storage)->empty_group_names; |
91 | 5.18k | } |
92 | | |
93 | | // Converts from Regexp error code to RE2 error code. |
94 | | // Maybe some day they will diverge. In any event, this |
95 | | // hides the existence of Regexp from RE2 users. |
96 | 1.11k | static RE2::ErrorCode RegexpErrorToRE2(re2::RegexpStatusCode code) { |
97 | 1.11k | switch (code) { |
98 | 0 | case re2::kRegexpSuccess: |
99 | 0 | return RE2::NoError; |
100 | 0 | case re2::kRegexpInternalError: |
101 | 0 | return RE2::ErrorInternal; |
102 | 113 | case re2::kRegexpBadEscape: |
103 | 113 | return RE2::ErrorBadEscape; |
104 | 0 | case re2::kRegexpBadCharClass: |
105 | 0 | return RE2::ErrorBadCharClass; |
106 | 208 | case re2::kRegexpBadCharRange: |
107 | 208 | return RE2::ErrorBadCharRange; |
108 | 243 | case re2::kRegexpMissingBracket: |
109 | 243 | return RE2::ErrorMissingBracket; |
110 | 134 | case re2::kRegexpMissingParen: |
111 | 134 | return RE2::ErrorMissingParen; |
112 | 60 | case re2::kRegexpUnexpectedParen: |
113 | 60 | return RE2::ErrorUnexpectedParen; |
114 | 11 | case re2::kRegexpTrailingBackslash: |
115 | 11 | return RE2::ErrorTrailingBackslash; |
116 | 14 | case re2::kRegexpRepeatArgument: |
117 | 14 | return RE2::ErrorRepeatArgument; |
118 | 12 | case re2::kRegexpRepeatSize: |
119 | 12 | return RE2::ErrorRepeatSize; |
120 | 14 | case re2::kRegexpRepeatOp: |
121 | 14 | return RE2::ErrorRepeatOp; |
122 | 47 | case re2::kRegexpBadPerlOp: |
123 | 47 | return RE2::ErrorBadPerlOp; |
124 | 259 | case re2::kRegexpBadUTF8: |
125 | 259 | return RE2::ErrorBadUTF8; |
126 | 0 | case re2::kRegexpBadNamedCapture: |
127 | 0 | return RE2::ErrorBadNamedCapture; |
128 | 1.11k | } |
129 | 0 | return RE2::ErrorInternal; |
130 | 1.11k | } |
131 | | |
132 | 0 | static std::string trunc(const StringPiece& pattern) { |
133 | 0 | if (pattern.size() < 100) |
134 | 0 | return std::string(pattern); |
135 | 0 | return std::string(pattern.substr(0, 100)) + "..."; |
136 | 0 | } |
137 | | |
138 | | |
139 | 0 | RE2::RE2(const char* pattern) { |
140 | 0 | Init(pattern, DefaultOptions); |
141 | 0 | } |
142 | | |
143 | 0 | RE2::RE2(const std::string& pattern) { |
144 | 0 | Init(pattern, DefaultOptions); |
145 | 0 | } |
146 | | |
147 | 0 | RE2::RE2(const StringPiece& pattern) { |
148 | 0 | Init(pattern, DefaultOptions); |
149 | 0 | } |
150 | | |
151 | 5.18k | RE2::RE2(const StringPiece& pattern, const Options& options) { |
152 | 5.18k | Init(pattern, options); |
153 | 5.18k | } |
154 | | |
155 | 5.18k | int RE2::Options::ParseFlags() const { |
156 | 5.18k | int flags = Regexp::ClassNL; |
157 | 5.18k | switch (encoding()) { |
158 | 0 | default: |
159 | 0 | if (log_errors()) |
160 | 0 | LOG(ERROR) << "Unknown encoding " << encoding(); |
161 | 0 | break; |
162 | 1.32k | case RE2::Options::EncodingUTF8: |
163 | 1.32k | break; |
164 | 3.85k | case RE2::Options::EncodingLatin1: |
165 | 3.85k | flags |= Regexp::Latin1; |
166 | 3.85k | break; |
167 | 5.18k | } |
168 | | |
169 | 5.18k | if (!posix_syntax()) |
170 | 4.15k | flags |= Regexp::LikePerl; |
171 | | |
172 | 5.18k | if (literal()) |
173 | 202 | flags |= Regexp::Literal; |
174 | | |
175 | 5.18k | if (never_nl()) |
176 | 1.39k | flags |= Regexp::NeverNL; |
177 | | |
178 | 5.18k | if (dot_nl()) |
179 | 1.73k | flags |= Regexp::DotNL; |
180 | | |
181 | 5.18k | if (never_capture()) |
182 | 1.51k | flags |= Regexp::NeverCapture; |
183 | | |
184 | 5.18k | if (!case_sensitive()) |
185 | 4.58k | flags |= Regexp::FoldCase; |
186 | | |
187 | 5.18k | if (perl_classes()) |
188 | 0 | flags |= Regexp::PerlClasses; |
189 | | |
190 | 5.18k | if (word_boundary()) |
191 | 0 | flags |= Regexp::PerlB; |
192 | | |
193 | 5.18k | if (one_line()) |
194 | 0 | flags |= Regexp::OneLine; |
195 | | |
196 | 5.18k | return flags; |
197 | 5.18k | } |
198 | | |
199 | 5.18k | void RE2::Init(const StringPiece& pattern, const Options& options) { |
200 | 5.18k | static std::once_flag empty_once; |
201 | 5.18k | std::call_once(empty_once, []() { |
202 | 4 | (void) new (empty_storage) EmptyStorage; |
203 | 4 | }); |
204 | | |
205 | 5.18k | pattern_ = new std::string(pattern); |
206 | 5.18k | options_.Copy(options); |
207 | 5.18k | entire_regexp_ = NULL; |
208 | 5.18k | suffix_regexp_ = NULL; |
209 | 5.18k | error_ = empty_string(); |
210 | 5.18k | error_arg_ = empty_string(); |
211 | | |
212 | 5.18k | num_captures_ = -1; |
213 | 5.18k | error_code_ = NoError; |
214 | 5.18k | longest_match_ = options_.longest_match(); |
215 | 5.18k | is_one_pass_ = false; |
216 | 5.18k | prefix_foldcase_ = false; |
217 | 5.18k | prefix_.clear(); |
218 | 5.18k | prog_ = NULL; |
219 | | |
220 | 5.18k | rprog_ = NULL; |
221 | 5.18k | named_groups_ = NULL; |
222 | 5.18k | group_names_ = NULL; |
223 | | |
224 | 5.18k | RegexpStatus status; |
225 | 5.18k | entire_regexp_ = Regexp::Parse( |
226 | 5.18k | *pattern_, |
227 | 5.18k | static_cast<Regexp::ParseFlags>(options_.ParseFlags()), |
228 | 5.18k | &status); |
229 | 5.18k | if (entire_regexp_ == NULL) { |
230 | 1.11k | if (options_.log_errors()) { |
231 | 0 | LOG(ERROR) << "Error parsing '" << trunc(*pattern_) << "': " |
232 | 0 | << status.Text(); |
233 | 0 | } |
234 | 1.11k | error_ = new std::string(status.Text()); |
235 | 1.11k | error_code_ = RegexpErrorToRE2(status.code()); |
236 | 1.11k | error_arg_ = new std::string(status.error_arg()); |
237 | 1.11k | return; |
238 | 1.11k | } |
239 | | |
240 | 4.07k | bool foldcase; |
241 | 4.07k | re2::Regexp* suffix; |
242 | 4.07k | if (entire_regexp_->RequiredPrefix(&prefix_, &foldcase, &suffix)) { |
243 | 85 | prefix_foldcase_ = foldcase; |
244 | 85 | suffix_regexp_ = suffix; |
245 | 85 | } |
246 | 3.98k | else { |
247 | 3.98k | suffix_regexp_ = entire_regexp_->Incref(); |
248 | 3.98k | } |
249 | | |
250 | | // Two thirds of the memory goes to the forward Prog, |
251 | | // one third to the reverse prog, because the forward |
252 | | // Prog has two DFAs but the reverse prog has one. |
253 | 4.07k | prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); |
254 | 4.07k | if (prog_ == NULL) { |
255 | 0 | if (options_.log_errors()) |
256 | 0 | LOG(ERROR) << "Error compiling '" << trunc(*pattern_) << "'"; |
257 | 0 | error_ = new std::string("pattern too large - compile failed"); |
258 | 0 | error_code_ = RE2::ErrorPatternTooLarge; |
259 | 0 | return; |
260 | 0 | } |
261 | | |
262 | | // We used to compute this lazily, but it's used during the |
263 | | // typical control flow for a match call, so we now compute |
264 | | // it eagerly, which avoids the overhead of std::once_flag. |
265 | 4.07k | num_captures_ = suffix_regexp_->NumCaptures(); |
266 | | |
267 | | // Could delay this until the first match call that |
268 | | // cares about submatch information, but the one-pass |
269 | | // machine's memory gets cut from the DFA memory budget, |
270 | | // and that is harder to do if the DFA has already |
271 | | // been built. |
272 | 4.07k | is_one_pass_ = prog_->IsOnePass(); |
273 | 4.07k | } |
274 | | |
275 | | // Returns rprog_, computing it if needed. |
276 | 0 | re2::Prog* RE2::ReverseProg() const { |
277 | 0 | std::call_once(rprog_once_, [](const RE2* re) { |
278 | 0 | re->rprog_ = |
279 | 0 | re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); |
280 | 0 | if (re->rprog_ == NULL) { |
281 | 0 | if (re->options_.log_errors()) |
282 | 0 | LOG(ERROR) << "Error reverse compiling '" << trunc(*re->pattern_) |
283 | 0 | << "'"; |
284 | | // We no longer touch error_ and error_code_ because failing to compile |
285 | | // the reverse Prog is not a showstopper: falling back to NFA execution |
286 | | // is fine. More importantly, an RE2 object is supposed to be logically |
287 | | // immutable: whatever ok() would have returned after Init() completed, |
288 | | // it should continue to return that no matter what ReverseProg() does. |
289 | 0 | } |
290 | 0 | }, this); |
291 | 0 | return rprog_; |
292 | 0 | } |
293 | | |
294 | 5.18k | RE2::~RE2() { |
295 | 5.18k | if (group_names_ != empty_group_names()) |
296 | 5.18k | delete group_names_; |
297 | 5.18k | if (named_groups_ != empty_named_groups()) |
298 | 5.18k | delete named_groups_; |
299 | 5.18k | delete rprog_; |
300 | 5.18k | delete prog_; |
301 | 5.18k | if (error_arg_ != empty_string()) |
302 | 1.11k | delete error_arg_; |
303 | 5.18k | if (error_ != empty_string()) |
304 | 1.11k | delete error_; |
305 | 5.18k | if (suffix_regexp_) |
306 | 4.07k | suffix_regexp_->Decref(); |
307 | 5.18k | if (entire_regexp_) |
308 | 4.07k | entire_regexp_->Decref(); |
309 | 5.18k | delete pattern_; |
310 | 5.18k | } |
311 | | |
312 | 0 | int RE2::ProgramSize() const { |
313 | 0 | if (prog_ == NULL) |
314 | 0 | return -1; |
315 | 0 | return prog_->size(); |
316 | 0 | } |
317 | | |
318 | 0 | int RE2::ReverseProgramSize() const { |
319 | 0 | if (prog_ == NULL) |
320 | 0 | return -1; |
321 | 0 | Prog* prog = ReverseProg(); |
322 | 0 | if (prog == NULL) |
323 | 0 | return -1; |
324 | 0 | return prog->size(); |
325 | 0 | } |
326 | | |
327 | | // Finds the most significant non-zero bit in n. |
328 | 0 | static int FindMSBSet(uint32_t n) { |
329 | 0 | DCHECK_NE(n, 0); |
330 | 0 | #if defined(__GNUC__) |
331 | 0 | return 31 ^ __builtin_clz(n); |
332 | | #elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) |
333 | | unsigned long c; |
334 | | _BitScanReverse(&c, n); |
335 | | return static_cast<int>(c); |
336 | | #else |
337 | | int c = 0; |
338 | | for (int shift = 1 << 4; shift != 0; shift >>= 1) { |
339 | | uint32_t word = n >> shift; |
340 | | if (word != 0) { |
341 | | n = word; |
342 | | c += shift; |
343 | | } |
344 | | } |
345 | | return c; |
346 | | #endif |
347 | 0 | } |
348 | | |
349 | 0 | static int Fanout(Prog* prog, std::vector<int>* histogram) { |
350 | 0 | SparseArray<int> fanout(prog->size()); |
351 | 0 | prog->Fanout(&fanout); |
352 | 0 | int data[32] = {}; |
353 | 0 | int size = 0; |
354 | 0 | for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
355 | 0 | if (i->value() == 0) |
356 | 0 | continue; |
357 | 0 | uint32_t value = i->value(); |
358 | 0 | int bucket = FindMSBSet(value); |
359 | 0 | bucket += value & (value-1) ? 1 : 0; |
360 | 0 | ++data[bucket]; |
361 | 0 | size = std::max(size, bucket+1); |
362 | 0 | } |
363 | 0 | if (histogram != NULL) |
364 | 0 | histogram->assign(data, data+size); |
365 | 0 | return size-1; |
366 | 0 | } |
367 | | |
368 | 0 | int RE2::ProgramFanout(std::vector<int>* histogram) const { |
369 | 0 | if (prog_ == NULL) |
370 | 0 | return -1; |
371 | 0 | return Fanout(prog_, histogram); |
372 | 0 | } |
373 | | |
374 | 0 | int RE2::ReverseProgramFanout(std::vector<int>* histogram) const { |
375 | 0 | if (prog_ == NULL) |
376 | 0 | return -1; |
377 | 0 | Prog* prog = ReverseProg(); |
378 | 0 | if (prog == NULL) |
379 | 0 | return -1; |
380 | 0 | return Fanout(prog, histogram); |
381 | 0 | } |
382 | | |
383 | | // Returns named_groups_, computing it if needed. |
384 | 0 | const std::map<std::string, int>& RE2::NamedCapturingGroups() const { |
385 | 0 | std::call_once(named_groups_once_, [](const RE2* re) { |
386 | 0 | if (re->suffix_regexp_ != NULL) |
387 | 0 | re->named_groups_ = re->suffix_regexp_->NamedCaptures(); |
388 | 0 | if (re->named_groups_ == NULL) |
389 | 0 | re->named_groups_ = empty_named_groups(); |
390 | 0 | }, this); |
391 | 0 | return *named_groups_; |
392 | 0 | } |
393 | | |
394 | | // Returns group_names_, computing it if needed. |
395 | 0 | const std::map<int, std::string>& RE2::CapturingGroupNames() const { |
396 | 0 | std::call_once(group_names_once_, [](const RE2* re) { |
397 | 0 | if (re->suffix_regexp_ != NULL) |
398 | 0 | re->group_names_ = re->suffix_regexp_->CaptureNames(); |
399 | 0 | if (re->group_names_ == NULL) |
400 | 0 | re->group_names_ = empty_group_names(); |
401 | 0 | }, this); |
402 | 0 | return *group_names_; |
403 | 0 | } |
404 | | |
405 | | /***** Convenience interfaces *****/ |
406 | | |
407 | | bool RE2::FullMatchN(const StringPiece& text, const RE2& re, |
408 | 4.07k | const Arg* const args[], int n) { |
409 | 4.07k | return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); |
410 | 4.07k | } |
411 | | |
412 | | bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, |
413 | 0 | const Arg* const args[], int n) { |
414 | 0 | return re.DoMatch(text, UNANCHORED, NULL, args, n); |
415 | 0 | } |
416 | | |
417 | | bool RE2::ConsumeN(StringPiece* input, const RE2& re, |
418 | 0 | const Arg* const args[], int n) { |
419 | 0 | size_t consumed; |
420 | 0 | if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { |
421 | 0 | input->remove_prefix(consumed); |
422 | 0 | return true; |
423 | 0 | } else { |
424 | 0 | return false; |
425 | 0 | } |
426 | 0 | } |
427 | | |
428 | | bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, |
429 | 0 | const Arg* const args[], int n) { |
430 | 0 | size_t consumed; |
431 | 0 | if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { |
432 | 0 | input->remove_prefix(consumed); |
433 | 0 | return true; |
434 | 0 | } else { |
435 | 0 | return false; |
436 | 0 | } |
437 | 0 | } |
438 | | |
439 | | bool RE2::Replace(std::string* str, |
440 | | const RE2& re, |
441 | 0 | const StringPiece& rewrite) { |
442 | 0 | StringPiece vec[kVecSize]; |
443 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
444 | 0 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
445 | 0 | return false; |
446 | 0 | if (nvec > static_cast<int>(arraysize(vec))) |
447 | 0 | return false; |
448 | 0 | if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) |
449 | 0 | return false; |
450 | | |
451 | 0 | std::string s; |
452 | 0 | if (!re.Rewrite(&s, rewrite, vec, nvec)) |
453 | 0 | return false; |
454 | | |
455 | 0 | assert(vec[0].data() >= str->data()); |
456 | 0 | assert(vec[0].data() + vec[0].size() <= str->data() + str->size()); |
457 | 0 | str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
458 | 0 | return true; |
459 | 0 | } |
460 | | |
461 | | int RE2::GlobalReplace(std::string* str, |
462 | | const RE2& re, |
463 | 0 | const StringPiece& rewrite) { |
464 | 0 | StringPiece vec[kVecSize]; |
465 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
466 | 0 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
467 | 0 | return false; |
468 | 0 | if (nvec > static_cast<int>(arraysize(vec))) |
469 | 0 | return false; |
470 | | |
471 | 0 | const char* p = str->data(); |
472 | 0 | const char* ep = p + str->size(); |
473 | 0 | const char* lastend = NULL; |
474 | 0 | std::string out; |
475 | 0 | int count = 0; |
476 | 0 | while (p <= ep) { |
477 | 0 | if (maximum_global_replace_count != -1 && |
478 | 0 | count >= maximum_global_replace_count) |
479 | 0 | break; |
480 | 0 | if (!re.Match(*str, static_cast<size_t>(p - str->data()), |
481 | 0 | str->size(), UNANCHORED, vec, nvec)) |
482 | 0 | break; |
483 | 0 | if (p < vec[0].data()) |
484 | 0 | out.append(p, vec[0].data() - p); |
485 | 0 | if (vec[0].data() == lastend && vec[0].empty()) { |
486 | | // Disallow empty match at end of last match: skip ahead. |
487 | | // |
488 | | // fullrune() takes int, not ptrdiff_t. However, it just looks |
489 | | // at the leading byte and treats any length >= 4 the same. |
490 | 0 | if (re.options().encoding() == RE2::Options::EncodingUTF8 && |
491 | 0 | fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { |
492 | | // re is in UTF-8 mode and there is enough left of str |
493 | | // to allow us to advance by up to UTFmax bytes. |
494 | 0 | Rune r; |
495 | 0 | int n = chartorune(&r, p); |
496 | | // Some copies of chartorune have a bug that accepts |
497 | | // encodings of values in (10FFFF, 1FFFFF] as valid. |
498 | 0 | if (r > Runemax) { |
499 | 0 | n = 1; |
500 | 0 | r = Runeerror; |
501 | 0 | } |
502 | 0 | if (!(n == 1 && r == Runeerror)) { // no decoding error |
503 | 0 | out.append(p, n); |
504 | 0 | p += n; |
505 | 0 | continue; |
506 | 0 | } |
507 | 0 | } |
508 | | // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, |
509 | | // we fell through from above and the GIGO principle applies. |
510 | 0 | if (p < ep) |
511 | 0 | out.append(p, 1); |
512 | 0 | p++; |
513 | 0 | continue; |
514 | 0 | } |
515 | 0 | re.Rewrite(&out, rewrite, vec, nvec); |
516 | 0 | p = vec[0].data() + vec[0].size(); |
517 | 0 | lastend = p; |
518 | 0 | count++; |
519 | 0 | } |
520 | |
|
521 | 0 | if (count == 0) |
522 | 0 | return 0; |
523 | | |
524 | 0 | if (p < ep) |
525 | 0 | out.append(p, ep - p); |
526 | 0 | using std::swap; |
527 | 0 | swap(out, *str); |
528 | 0 | return count; |
529 | 0 | } |
530 | | |
531 | | bool RE2::Extract(const StringPiece& text, |
532 | | const RE2& re, |
533 | | const StringPiece& rewrite, |
534 | 0 | std::string* out) { |
535 | 0 | StringPiece vec[kVecSize]; |
536 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
537 | 0 | if (nvec > 1 + re.NumberOfCapturingGroups()) |
538 | 0 | return false; |
539 | 0 | if (nvec > static_cast<int>(arraysize(vec))) |
540 | 0 | return false; |
541 | 0 | if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) |
542 | 0 | return false; |
543 | | |
544 | 0 | out->clear(); |
545 | 0 | return re.Rewrite(out, rewrite, vec, nvec); |
546 | 0 | } |
547 | | |
548 | 0 | std::string RE2::QuoteMeta(const StringPiece& unquoted) { |
549 | 0 | std::string result; |
550 | 0 | result.reserve(unquoted.size() << 1); |
551 | | |
552 | | // Escape any ascii character not in [A-Za-z_0-9]. |
553 | | // |
554 | | // Note that it's legal to escape a character even if it has no |
555 | | // special meaning in a regular expression -- so this function does |
556 | | // that. (This also makes it identical to the perl function of the |
557 | | // same name except for the null-character special case; |
558 | | // see `perldoc -f quotemeta`.) |
559 | 0 | for (size_t ii = 0; ii < unquoted.size(); ++ii) { |
560 | | // Note that using 'isalnum' here raises the benchmark time from |
561 | | // 32ns to 58ns: |
562 | 0 | if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
563 | 0 | (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
564 | 0 | (unquoted[ii] < '0' || unquoted[ii] > '9') && |
565 | 0 | unquoted[ii] != '_' && |
566 | | // If this is the part of a UTF8 or Latin1 character, we need |
567 | | // to copy this byte without escaping. Experimentally this is |
568 | | // what works correctly with the regexp library. |
569 | 0 | !(unquoted[ii] & 128)) { |
570 | 0 | if (unquoted[ii] == '\0') { // Special handling for null chars. |
571 | | // Note that this special handling is not strictly required for RE2, |
572 | | // but this quoting is required for other regexp libraries such as |
573 | | // PCRE. |
574 | | // Can't use "\\0" since the next character might be a digit. |
575 | 0 | result += "\\x00"; |
576 | 0 | continue; |
577 | 0 | } |
578 | 0 | result += '\\'; |
579 | 0 | } |
580 | 0 | result += unquoted[ii]; |
581 | 0 | } |
582 | |
|
583 | 0 | return result; |
584 | 0 | } |
585 | | |
586 | | bool RE2::PossibleMatchRange(std::string* min, std::string* max, |
587 | 0 | int maxlen) const { |
588 | 0 | if (prog_ == NULL) |
589 | 0 | return false; |
590 | | |
591 | 0 | int n = static_cast<int>(prefix_.size()); |
592 | 0 | if (n > maxlen) |
593 | 0 | n = maxlen; |
594 | | |
595 | | // Determine initial min max from prefix_ literal. |
596 | 0 | *min = prefix_.substr(0, n); |
597 | 0 | *max = prefix_.substr(0, n); |
598 | 0 | if (prefix_foldcase_) { |
599 | | // prefix is ASCII lowercase; change *min to uppercase. |
600 | 0 | for (int i = 0; i < n; i++) { |
601 | 0 | char& c = (*min)[i]; |
602 | 0 | if ('a' <= c && c <= 'z') |
603 | 0 | c += 'A' - 'a'; |
604 | 0 | } |
605 | 0 | } |
606 | | |
607 | | // Add to prefix min max using PossibleMatchRange on regexp. |
608 | 0 | std::string dmin, dmax; |
609 | 0 | maxlen -= n; |
610 | 0 | if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { |
611 | 0 | min->append(dmin); |
612 | 0 | max->append(dmax); |
613 | 0 | } else if (!max->empty()) { |
614 | | // prog_->PossibleMatchRange has failed us, |
615 | | // but we still have useful information from prefix_. |
616 | | // Round up *max to allow any possible suffix. |
617 | 0 | PrefixSuccessor(max); |
618 | 0 | } else { |
619 | | // Nothing useful. |
620 | 0 | *min = ""; |
621 | 0 | *max = ""; |
622 | 0 | return false; |
623 | 0 | } |
624 | | |
625 | 0 | return true; |
626 | 0 | } |
627 | | |
628 | | // Avoid possible locale nonsense in standard strcasecmp. |
629 | | // The string a is known to be all lowercase. |
630 | 42 | static int ascii_strcasecmp(const char* a, const char* b, size_t len) { |
631 | 42 | const char* ae = a + len; |
632 | | |
633 | 45 | for (; a < ae; a++, b++) { |
634 | 43 | uint8_t x = *a; |
635 | 43 | uint8_t y = *b; |
636 | 43 | if ('A' <= y && y <= 'Z') |
637 | 0 | y += 'a' - 'A'; |
638 | 43 | if (x != y) |
639 | 40 | return x - y; |
640 | 43 | } |
641 | 2 | return 0; |
642 | 42 | } |
643 | | |
644 | | |
645 | | /***** Actual matching and rewriting code *****/ |
646 | | |
647 | | bool RE2::Match(const StringPiece& text, |
648 | | size_t startpos, |
649 | | size_t endpos, |
650 | | Anchor re_anchor, |
651 | | StringPiece* submatch, |
652 | 4.07k | int nsubmatch) const { |
653 | 4.07k | if (!ok()) { |
654 | 0 | if (options_.log_errors()) |
655 | 0 | LOG(ERROR) << "Invalid RE2: " << *error_; |
656 | 0 | return false; |
657 | 0 | } |
658 | | |
659 | 4.07k | if (startpos > endpos || endpos > text.size()) { |
660 | 0 | if (options_.log_errors()) |
661 | 0 | LOG(ERROR) << "RE2: invalid startpos, endpos pair. [" |
662 | 0 | << "startpos: " << startpos << ", " |
663 | 0 | << "endpos: " << endpos << ", " |
664 | 0 | << "text size: " << text.size() << "]"; |
665 | 0 | return false; |
666 | 0 | } |
667 | | |
668 | 4.07k | StringPiece subtext = text; |
669 | 4.07k | subtext.remove_prefix(startpos); |
670 | 4.07k | subtext.remove_suffix(text.size() - endpos); |
671 | | |
672 | | // Use DFAs to find exact location of match, filter out non-matches. |
673 | | |
674 | | // Don't ask for the location if we won't use it. |
675 | | // SearchDFA can do extra optimizations in that case. |
676 | 4.07k | StringPiece match; |
677 | 4.07k | StringPiece* matchp = &match; |
678 | 4.07k | if (nsubmatch == 0) |
679 | 4.07k | matchp = NULL; |
680 | | |
681 | 4.07k | int ncap = 1 + NumberOfCapturingGroups(); |
682 | 4.07k | if (ncap > nsubmatch) |
683 | 4.07k | ncap = nsubmatch; |
684 | | |
685 | | // If the regexp is anchored explicitly, must not be in middle of text. |
686 | 4.07k | if (prog_->anchor_start() && startpos != 0) |
687 | 0 | return false; |
688 | 4.07k | if (prog_->anchor_end() && endpos != text.size()) |
689 | 0 | return false; |
690 | | |
691 | | // If the regexp is anchored explicitly, update re_anchor |
692 | | // so that we can potentially fall into a faster case below. |
693 | 4.07k | if (prog_->anchor_start() && prog_->anchor_end()) |
694 | 0 | re_anchor = ANCHOR_BOTH; |
695 | 4.07k | else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH) |
696 | 0 | re_anchor = ANCHOR_START; |
697 | | |
698 | | // Check for the required prefix, if any. |
699 | 4.07k | size_t prefixlen = 0; |
700 | 4.07k | if (!prefix_.empty()) { |
701 | 85 | if (startpos != 0) |
702 | 0 | return false; |
703 | 85 | prefixlen = prefix_.size(); |
704 | 85 | if (prefixlen > subtext.size()) |
705 | 0 | return false; |
706 | 85 | if (prefix_foldcase_) { |
707 | 42 | if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
708 | 40 | return false; |
709 | 43 | } else { |
710 | 43 | if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0) |
711 | 42 | return false; |
712 | 43 | } |
713 | 3 | subtext.remove_prefix(prefixlen); |
714 | | // If there is a required prefix, the anchor must be at least ANCHOR_START. |
715 | 3 | if (re_anchor != ANCHOR_BOTH) |
716 | 0 | re_anchor = ANCHOR_START; |
717 | 3 | } |
718 | | |
719 | 3.99k | Prog::Anchor anchor = Prog::kUnanchored; |
720 | 3.99k | Prog::MatchKind kind = |
721 | 3.99k | longest_match_ ? Prog::kLongestMatch : Prog::kFirstMatch; |
722 | | |
723 | 3.99k | bool can_one_pass = is_one_pass_ && ncap <= Prog::kMaxOnePassCapture; |
724 | 3.99k | bool can_bit_state = prog_->CanBitState(); |
725 | 3.99k | size_t bit_state_text_max_size = prog_->bit_state_text_max_size(); |
726 | | |
727 | 3.99k | #ifdef RE2_HAVE_THREAD_LOCAL |
728 | 3.99k | hooks::context = this; |
729 | 3.99k | #endif |
730 | 3.99k | bool dfa_failed = false; |
731 | 3.99k | bool skipped_test = false; |
732 | 3.99k | switch (re_anchor) { |
733 | 0 | default: |
734 | 0 | LOG(DFATAL) << "Unexpected re_anchor value: " << re_anchor; |
735 | 0 | return false; |
736 | | |
737 | 0 | case UNANCHORED: { |
738 | 0 | if (prog_->anchor_end()) { |
739 | | // This is a very special case: we don't need the forward DFA because |
740 | | // we already know where the match must end! Instead, the reverse DFA |
741 | | // can say whether there is a match and (optionally) where it starts. |
742 | 0 | Prog* prog = ReverseProg(); |
743 | 0 | if (prog == NULL) { |
744 | | // Fall back to NFA below. |
745 | 0 | skipped_test = true; |
746 | 0 | break; |
747 | 0 | } |
748 | 0 | if (!prog->SearchDFA(subtext, text, Prog::kAnchored, |
749 | 0 | Prog::kLongestMatch, matchp, &dfa_failed, NULL)) { |
750 | 0 | if (dfa_failed) { |
751 | 0 | if (options_.log_errors()) |
752 | 0 | LOG(ERROR) << "DFA out of memory: " |
753 | 0 | << "pattern length " << pattern_->size() << ", " |
754 | 0 | << "program size " << prog->size() << ", " |
755 | 0 | << "list count " << prog->list_count() << ", " |
756 | 0 | << "bytemap range " << prog->bytemap_range(); |
757 | | // Fall back to NFA below. |
758 | 0 | skipped_test = true; |
759 | 0 | break; |
760 | 0 | } |
761 | 0 | return false; |
762 | 0 | } |
763 | 0 | if (matchp == NULL) // Matched. Don't care where. |
764 | 0 | return true; |
765 | 0 | break; |
766 | 0 | } |
767 | | |
768 | 0 | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
769 | 0 | matchp, &dfa_failed, NULL)) { |
770 | 0 | if (dfa_failed) { |
771 | 0 | if (options_.log_errors()) |
772 | 0 | LOG(ERROR) << "DFA out of memory: " |
773 | 0 | << "pattern length " << pattern_->size() << ", " |
774 | 0 | << "program size " << prog_->size() << ", " |
775 | 0 | << "list count " << prog_->list_count() << ", " |
776 | 0 | << "bytemap range " << prog_->bytemap_range(); |
777 | | // Fall back to NFA below. |
778 | 0 | skipped_test = true; |
779 | 0 | break; |
780 | 0 | } |
781 | 0 | return false; |
782 | 0 | } |
783 | 0 | if (matchp == NULL) // Matched. Don't care where. |
784 | 0 | return true; |
785 | | // SearchDFA set match.end() but didn't know where the |
786 | | // match started. Run the regexp backward from match.end() |
787 | | // to find the longest possible match -- that's where it started. |
788 | 0 | Prog* prog = ReverseProg(); |
789 | 0 | if (prog == NULL) { |
790 | | // Fall back to NFA below. |
791 | 0 | skipped_test = true; |
792 | 0 | break; |
793 | 0 | } |
794 | 0 | if (!prog->SearchDFA(match, text, Prog::kAnchored, |
795 | 0 | Prog::kLongestMatch, &match, &dfa_failed, NULL)) { |
796 | 0 | if (dfa_failed) { |
797 | 0 | if (options_.log_errors()) |
798 | 0 | LOG(ERROR) << "DFA out of memory: " |
799 | 0 | << "pattern length " << pattern_->size() << ", " |
800 | 0 | << "program size " << prog->size() << ", " |
801 | 0 | << "list count " << prog->list_count() << ", " |
802 | 0 | << "bytemap range " << prog->bytemap_range(); |
803 | | // Fall back to NFA below. |
804 | 0 | skipped_test = true; |
805 | 0 | break; |
806 | 0 | } |
807 | 0 | if (options_.log_errors()) |
808 | 0 | LOG(ERROR) << "SearchDFA inconsistency"; |
809 | 0 | return false; |
810 | 0 | } |
811 | 0 | break; |
812 | 0 | } |
813 | | |
814 | 3.99k | case ANCHOR_BOTH: |
815 | 3.99k | case ANCHOR_START: |
816 | 3.99k | if (re_anchor == ANCHOR_BOTH) |
817 | 3.99k | kind = Prog::kFullMatch; |
818 | 3.99k | anchor = Prog::kAnchored; |
819 | | |
820 | | // If only a small amount of text and need submatch |
821 | | // information anyway and we're going to use OnePass or BitState |
822 | | // to get it, we might as well not even bother with the DFA: |
823 | | // OnePass or BitState will be fast enough. |
824 | | // On tiny texts, OnePass outruns even the DFA, and |
825 | | // it doesn't have the shared state and occasional mutex that |
826 | | // the DFA does. |
827 | 3.99k | if (can_one_pass && text.size() <= 4096 && |
828 | 3.99k | (ncap > 1 || text.size() <= 16)) { |
829 | 126 | skipped_test = true; |
830 | 126 | break; |
831 | 126 | } |
832 | 3.86k | if (can_bit_state && text.size() <= bit_state_text_max_size && |
833 | 3.86k | ncap > 1) { |
834 | 0 | skipped_test = true; |
835 | 0 | break; |
836 | 0 | } |
837 | 3.86k | if (!prog_->SearchDFA(subtext, text, anchor, kind, |
838 | 3.86k | &match, &dfa_failed, NULL)) { |
839 | 3.29k | if (dfa_failed) { |
840 | 181 | if (options_.log_errors()) |
841 | 0 | LOG(ERROR) << "DFA out of memory: " |
842 | 0 | << "pattern length " << pattern_->size() << ", " |
843 | 0 | << "program size " << prog_->size() << ", " |
844 | 0 | << "list count " << prog_->list_count() << ", " |
845 | 0 | << "bytemap range " << prog_->bytemap_range(); |
846 | | // Fall back to NFA below. |
847 | 181 | skipped_test = true; |
848 | 181 | break; |
849 | 181 | } |
850 | 3.11k | return false; |
851 | 3.29k | } |
852 | 573 | break; |
853 | 3.99k | } |
854 | | |
855 | 880 | if (!skipped_test && ncap <= 1) { |
856 | | // We know exactly where it matches. That's enough. |
857 | 573 | if (ncap == 1) |
858 | 0 | submatch[0] = match; |
859 | 573 | } else { |
860 | 307 | StringPiece subtext1; |
861 | 307 | if (skipped_test) { |
862 | | // DFA ran out of memory or was skipped: |
863 | | // need to search in entire original text. |
864 | 307 | subtext1 = subtext; |
865 | 307 | } else { |
866 | | // DFA found the exact match location: |
867 | | // let NFA run an anchored, full match search |
868 | | // to find submatch locations. |
869 | 0 | subtext1 = match; |
870 | 0 | anchor = Prog::kAnchored; |
871 | 0 | kind = Prog::kFullMatch; |
872 | 0 | } |
873 | | |
874 | 307 | if (can_one_pass && anchor != Prog::kUnanchored) { |
875 | 154 | if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) { |
876 | 87 | if (!skipped_test && options_.log_errors()) |
877 | 0 | LOG(ERROR) << "SearchOnePass inconsistency"; |
878 | 87 | return false; |
879 | 87 | } |
880 | 154 | } else if (can_bit_state && subtext1.size() <= bit_state_text_max_size) { |
881 | 0 | if (!prog_->SearchBitState(subtext1, text, anchor, |
882 | 0 | kind, submatch, ncap)) { |
883 | 0 | if (!skipped_test && options_.log_errors()) |
884 | 0 | LOG(ERROR) << "SearchBitState inconsistency"; |
885 | 0 | return false; |
886 | 0 | } |
887 | 153 | } else { |
888 | 153 | if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) { |
889 | 153 | if (!skipped_test && options_.log_errors()) |
890 | 0 | LOG(ERROR) << "SearchNFA inconsistency"; |
891 | 153 | return false; |
892 | 153 | } |
893 | 153 | } |
894 | 307 | } |
895 | | |
896 | | // Adjust overall match for required prefix that we stripped off. |
897 | 640 | if (prefixlen > 0 && nsubmatch > 0) |
898 | 0 | submatch[0] = StringPiece(submatch[0].data() - prefixlen, |
899 | 0 | submatch[0].size() + prefixlen); |
900 | | |
901 | | // Zero submatches that don't exist in the regexp. |
902 | 640 | for (int i = ncap; i < nsubmatch; i++) |
903 | 0 | submatch[i] = StringPiece(); |
904 | 640 | return true; |
905 | 880 | } |
906 | | |
907 | | // Internal matcher - like Match() but takes Args not StringPieces. |
908 | | bool RE2::DoMatch(const StringPiece& text, |
909 | | Anchor re_anchor, |
910 | | size_t* consumed, |
911 | | const Arg* const* args, |
912 | 4.07k | int n) const { |
913 | 4.07k | if (!ok()) { |
914 | 0 | if (options_.log_errors()) |
915 | 0 | LOG(ERROR) << "Invalid RE2: " << *error_; |
916 | 0 | return false; |
917 | 0 | } |
918 | | |
919 | 4.07k | if (NumberOfCapturingGroups() < n) { |
920 | | // RE has fewer capturing groups than number of Arg pointers passed in. |
921 | 0 | return false; |
922 | 0 | } |
923 | | |
924 | | // Count number of capture groups needed. |
925 | 4.07k | int nvec; |
926 | 4.07k | if (n == 0 && consumed == NULL) |
927 | 4.07k | nvec = 0; |
928 | 0 | else |
929 | 0 | nvec = n+1; |
930 | | |
931 | 4.07k | StringPiece* vec; |
932 | 4.07k | StringPiece stkvec[kVecSize]; |
933 | 4.07k | StringPiece* heapvec = NULL; |
934 | | |
935 | 4.07k | if (nvec <= static_cast<int>(arraysize(stkvec))) { |
936 | 4.07k | vec = stkvec; |
937 | 4.07k | } else { |
938 | 0 | vec = new StringPiece[nvec]; |
939 | 0 | heapvec = vec; |
940 | 0 | } |
941 | | |
942 | 4.07k | if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { |
943 | 3.43k | delete[] heapvec; |
944 | 3.43k | return false; |
945 | 3.43k | } |
946 | | |
947 | 640 | if (consumed != NULL) |
948 | 0 | *consumed = static_cast<size_t>(EndPtr(vec[0]) - BeginPtr(text)); |
949 | | |
950 | 640 | if (n == 0 || args == NULL) { |
951 | | // We are not interested in results |
952 | 640 | delete[] heapvec; |
953 | 640 | return true; |
954 | 640 | } |
955 | | |
956 | | // If we got here, we must have matched the whole pattern. |
957 | 0 | for (int i = 0; i < n; i++) { |
958 | 0 | const StringPiece& s = vec[i+1]; |
959 | 0 | if (!args[i]->Parse(s.data(), s.size())) { |
960 | | // TODO: Should we indicate what the error was? |
961 | 0 | delete[] heapvec; |
962 | 0 | return false; |
963 | 0 | } |
964 | 0 | } |
965 | | |
966 | 0 | delete[] heapvec; |
967 | 0 | return true; |
968 | 0 | } |
969 | | |
970 | | // Checks that the rewrite string is well-formed with respect to this |
971 | | // regular expression. |
972 | | bool RE2::CheckRewriteString(const StringPiece& rewrite, |
973 | 0 | std::string* error) const { |
974 | 0 | int max_token = -1; |
975 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
976 | 0 | s < end; s++) { |
977 | 0 | int c = *s; |
978 | 0 | if (c != '\\') { |
979 | 0 | continue; |
980 | 0 | } |
981 | 0 | if (++s == end) { |
982 | 0 | *error = "Rewrite schema error: '\\' not allowed at end."; |
983 | 0 | return false; |
984 | 0 | } |
985 | 0 | c = *s; |
986 | 0 | if (c == '\\') { |
987 | 0 | continue; |
988 | 0 | } |
989 | 0 | if (!isdigit(c)) { |
990 | 0 | *error = "Rewrite schema error: " |
991 | 0 | "'\\' must be followed by a digit or '\\'."; |
992 | 0 | return false; |
993 | 0 | } |
994 | 0 | int n = (c - '0'); |
995 | 0 | if (max_token < n) { |
996 | 0 | max_token = n; |
997 | 0 | } |
998 | 0 | } |
999 | | |
1000 | 0 | if (max_token > NumberOfCapturingGroups()) { |
1001 | 0 | *error = StringPrintf( |
1002 | 0 | "Rewrite schema requests %d matches, but the regexp only has %d " |
1003 | 0 | "parenthesized subexpressions.", |
1004 | 0 | max_token, NumberOfCapturingGroups()); |
1005 | 0 | return false; |
1006 | 0 | } |
1007 | 0 | return true; |
1008 | 0 | } |
1009 | | |
1010 | | // Returns the maximum submatch needed for the rewrite to be done by Replace(). |
1011 | | // E.g. if rewrite == "foo \\2,\\1", returns 2. |
1012 | 0 | int RE2::MaxSubmatch(const StringPiece& rewrite) { |
1013 | 0 | int max = 0; |
1014 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
1015 | 0 | s < end; s++) { |
1016 | 0 | if (*s == '\\') { |
1017 | 0 | s++; |
1018 | 0 | int c = (s < end) ? *s : -1; |
1019 | 0 | if (isdigit(c)) { |
1020 | 0 | int n = (c - '0'); |
1021 | 0 | if (n > max) |
1022 | 0 | max = n; |
1023 | 0 | } |
1024 | 0 | } |
1025 | 0 | } |
1026 | 0 | return max; |
1027 | 0 | } |
1028 | | |
1029 | | // Append the "rewrite" string, with backslash subsitutions from "vec", |
1030 | | // to string "out". |
1031 | | bool RE2::Rewrite(std::string* out, |
1032 | | const StringPiece& rewrite, |
1033 | | const StringPiece* vec, |
1034 | 0 | int veclen) const { |
1035 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
1036 | 0 | s < end; s++) { |
1037 | 0 | if (*s != '\\') { |
1038 | 0 | out->push_back(*s); |
1039 | 0 | continue; |
1040 | 0 | } |
1041 | 0 | s++; |
1042 | 0 | int c = (s < end) ? *s : -1; |
1043 | 0 | if (isdigit(c)) { |
1044 | 0 | int n = (c - '0'); |
1045 | 0 | if (n >= veclen) { |
1046 | 0 | if (options_.log_errors()) { |
1047 | 0 | LOG(ERROR) << "invalid substitution \\" << n |
1048 | 0 | << " from " << veclen << " groups"; |
1049 | 0 | } |
1050 | 0 | return false; |
1051 | 0 | } |
1052 | 0 | StringPiece snip = vec[n]; |
1053 | 0 | if (!snip.empty()) |
1054 | 0 | out->append(snip.data(), snip.size()); |
1055 | 0 | } else if (c == '\\') { |
1056 | 0 | out->push_back('\\'); |
1057 | 0 | } else { |
1058 | 0 | if (options_.log_errors()) |
1059 | 0 | LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
1060 | 0 | return false; |
1061 | 0 | } |
1062 | 0 | } |
1063 | 0 | return true; |
1064 | 0 | } |
1065 | | |
1066 | | /***** Parsers for various types *****/ |
1067 | | |
1068 | | namespace re2_internal { |
1069 | | |
1070 | | template <> |
1071 | 0 | bool Parse(const char* str, size_t n, void* dest) { |
1072 | | // We fail if somebody asked us to store into a non-NULL void* pointer |
1073 | 0 | return (dest == NULL); |
1074 | 0 | } |
1075 | | |
1076 | | template <> |
1077 | 0 | bool Parse(const char* str, size_t n, std::string* dest) { |
1078 | 0 | if (dest == NULL) return true; |
1079 | 0 | dest->assign(str, n); |
1080 | 0 | return true; |
1081 | 0 | } |
1082 | | |
1083 | | template <> |
1084 | 0 | bool Parse(const char* str, size_t n, StringPiece* dest) { |
1085 | 0 | if (dest == NULL) return true; |
1086 | 0 | *dest = StringPiece(str, n); |
1087 | 0 | return true; |
1088 | 0 | } |
1089 | | |
1090 | | template <> |
1091 | 0 | bool Parse(const char* str, size_t n, char* dest) { |
1092 | 0 | if (n != 1) return false; |
1093 | 0 | if (dest == NULL) return true; |
1094 | 0 | *dest = str[0]; |
1095 | 0 | return true; |
1096 | 0 | } |
1097 | | |
1098 | | template <> |
1099 | 0 | bool Parse(const char* str, size_t n, signed char* dest) { |
1100 | 0 | if (n != 1) return false; |
1101 | 0 | if (dest == NULL) return true; |
1102 | 0 | *dest = str[0]; |
1103 | 0 | return true; |
1104 | 0 | } |
1105 | | |
1106 | | template <> |
1107 | 0 | bool Parse(const char* str, size_t n, unsigned char* dest) { |
1108 | 0 | if (n != 1) return false; |
1109 | 0 | if (dest == NULL) return true; |
1110 | 0 | *dest = str[0]; |
1111 | 0 | return true; |
1112 | 0 | } |
1113 | | |
1114 | | // Largest number spec that we are willing to parse |
1115 | | static const int kMaxNumberLength = 32; |
1116 | | |
1117 | | // REQUIRES "buf" must have length at least nbuf. |
1118 | | // Copies "str" into "buf" and null-terminates. |
1119 | | // Overwrites *np with the new length. |
1120 | | static const char* TerminateNumber(char* buf, size_t nbuf, const char* str, |
1121 | 0 | size_t* np, bool accept_spaces) { |
1122 | 0 | size_t n = *np; |
1123 | 0 | if (n == 0) return ""; |
1124 | 0 | if (n > 0 && isspace(*str)) { |
1125 | | // We are less forgiving than the strtoxxx() routines and do not |
1126 | | // allow leading spaces. We do allow leading spaces for floats. |
1127 | 0 | if (!accept_spaces) { |
1128 | 0 | return ""; |
1129 | 0 | } |
1130 | 0 | while (n > 0 && isspace(*str)) { |
1131 | 0 | n--; |
1132 | 0 | str++; |
1133 | 0 | } |
1134 | 0 | } |
1135 | | |
1136 | | // Although buf has a fixed maximum size, we can still handle |
1137 | | // arbitrarily large integers correctly by omitting leading zeros. |
1138 | | // (Numbers that are still too long will be out of range.) |
1139 | | // Before deciding whether str is too long, |
1140 | | // remove leading zeros with s/000+/00/. |
1141 | | // Leaving the leading two zeros in place means that |
1142 | | // we don't change 0000x123 (invalid) into 0x123 (valid). |
1143 | | // Skip over leading - before replacing. |
1144 | 0 | bool neg = false; |
1145 | 0 | if (n >= 1 && str[0] == '-') { |
1146 | 0 | neg = true; |
1147 | 0 | n--; |
1148 | 0 | str++; |
1149 | 0 | } |
1150 | |
|
1151 | 0 | if (n >= 3 && str[0] == '0' && str[1] == '0') { |
1152 | 0 | while (n >= 3 && str[2] == '0') { |
1153 | 0 | n--; |
1154 | 0 | str++; |
1155 | 0 | } |
1156 | 0 | } |
1157 | |
|
1158 | 0 | if (neg) { // make room in buf for - |
1159 | 0 | n++; |
1160 | 0 | str--; |
1161 | 0 | } |
1162 | |
|
1163 | 0 | if (n > nbuf-1) return ""; |
1164 | | |
1165 | 0 | memmove(buf, str, n); |
1166 | 0 | if (neg) { |
1167 | 0 | buf[0] = '-'; |
1168 | 0 | } |
1169 | 0 | buf[n] = '\0'; |
1170 | 0 | *np = n; |
1171 | 0 | return buf; |
1172 | 0 | } |
1173 | | |
1174 | | template <> |
1175 | 0 | bool Parse(const char* str, size_t n, float* dest) { |
1176 | 0 | if (n == 0) return false; |
1177 | 0 | static const int kMaxLength = 200; |
1178 | 0 | char buf[kMaxLength+1]; |
1179 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1180 | 0 | char* end; |
1181 | 0 | errno = 0; |
1182 | 0 | float r = strtof(str, &end); |
1183 | 0 | if (end != str + n) return false; // Leftover junk |
1184 | 0 | if (errno) return false; |
1185 | 0 | if (dest == NULL) return true; |
1186 | 0 | *dest = r; |
1187 | 0 | return true; |
1188 | 0 | } |
1189 | | |
1190 | | template <> |
1191 | 0 | bool Parse(const char* str, size_t n, double* dest) { |
1192 | 0 | if (n == 0) return false; |
1193 | 0 | static const int kMaxLength = 200; |
1194 | 0 | char buf[kMaxLength+1]; |
1195 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1196 | 0 | char* end; |
1197 | 0 | errno = 0; |
1198 | 0 | double r = strtod(str, &end); |
1199 | 0 | if (end != str + n) return false; // Leftover junk |
1200 | 0 | if (errno) return false; |
1201 | 0 | if (dest == NULL) return true; |
1202 | 0 | *dest = r; |
1203 | 0 | return true; |
1204 | 0 | } |
1205 | | |
1206 | | template <> |
1207 | 0 | bool Parse(const char* str, size_t n, long* dest, int radix) { |
1208 | 0 | if (n == 0) return false; |
1209 | 0 | char buf[kMaxNumberLength+1]; |
1210 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1211 | 0 | char* end; |
1212 | 0 | errno = 0; |
1213 | 0 | long r = strtol(str, &end, radix); |
1214 | 0 | if (end != str + n) return false; // Leftover junk |
1215 | 0 | if (errno) return false; |
1216 | 0 | if (dest == NULL) return true; |
1217 | 0 | *dest = r; |
1218 | 0 | return true; |
1219 | 0 | } |
1220 | | |
1221 | | template <> |
1222 | 0 | bool Parse(const char* str, size_t n, unsigned long* dest, int radix) { |
1223 | 0 | if (n == 0) return false; |
1224 | 0 | char buf[kMaxNumberLength+1]; |
1225 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1226 | 0 | if (str[0] == '-') { |
1227 | | // strtoul() will silently accept negative numbers and parse |
1228 | | // them. This module is more strict and treats them as errors. |
1229 | 0 | return false; |
1230 | 0 | } |
1231 | | |
1232 | 0 | char* end; |
1233 | 0 | errno = 0; |
1234 | 0 | unsigned long r = strtoul(str, &end, radix); |
1235 | 0 | if (end != str + n) return false; // Leftover junk |
1236 | 0 | if (errno) return false; |
1237 | 0 | if (dest == NULL) return true; |
1238 | 0 | *dest = r; |
1239 | 0 | return true; |
1240 | 0 | } |
1241 | | |
1242 | | template <> |
1243 | 0 | bool Parse(const char* str, size_t n, short* dest, int radix) { |
1244 | 0 | long r; |
1245 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1246 | 0 | if ((short)r != r) return false; // Out of range |
1247 | 0 | if (dest == NULL) return true; |
1248 | 0 | *dest = (short)r; |
1249 | 0 | return true; |
1250 | 0 | } |
1251 | | |
1252 | | template <> |
1253 | 0 | bool Parse(const char* str, size_t n, unsigned short* dest, int radix) { |
1254 | 0 | unsigned long r; |
1255 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1256 | 0 | if ((unsigned short)r != r) return false; // Out of range |
1257 | 0 | if (dest == NULL) return true; |
1258 | 0 | *dest = (unsigned short)r; |
1259 | 0 | return true; |
1260 | 0 | } |
1261 | | |
1262 | | template <> |
1263 | 0 | bool Parse(const char* str, size_t n, int* dest, int radix) { |
1264 | 0 | long r; |
1265 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1266 | 0 | if ((int)r != r) return false; // Out of range |
1267 | 0 | if (dest == NULL) return true; |
1268 | 0 | *dest = (int)r; |
1269 | 0 | return true; |
1270 | 0 | } |
1271 | | |
1272 | | template <> |
1273 | 0 | bool Parse(const char* str, size_t n, unsigned int* dest, int radix) { |
1274 | 0 | unsigned long r; |
1275 | 0 | if (!Parse(str, n, &r, radix)) return false; // Could not parse |
1276 | 0 | if ((unsigned int)r != r) return false; // Out of range |
1277 | 0 | if (dest == NULL) return true; |
1278 | 0 | *dest = (unsigned int)r; |
1279 | 0 | return true; |
1280 | 0 | } |
1281 | | |
1282 | | template <> |
1283 | 0 | bool Parse(const char* str, size_t n, long long* dest, int radix) { |
1284 | 0 | if (n == 0) return false; |
1285 | 0 | char buf[kMaxNumberLength+1]; |
1286 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1287 | 0 | char* end; |
1288 | 0 | errno = 0; |
1289 | 0 | long long r = strtoll(str, &end, radix); |
1290 | 0 | if (end != str + n) return false; // Leftover junk |
1291 | 0 | if (errno) return false; |
1292 | 0 | if (dest == NULL) return true; |
1293 | 0 | *dest = r; |
1294 | 0 | return true; |
1295 | 0 | } |
1296 | | |
1297 | | template <> |
1298 | 0 | bool Parse(const char* str, size_t n, unsigned long long* dest, int radix) { |
1299 | 0 | if (n == 0) return false; |
1300 | 0 | char buf[kMaxNumberLength+1]; |
1301 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1302 | 0 | if (str[0] == '-') { |
1303 | | // strtoull() will silently accept negative numbers and parse |
1304 | | // them. This module is more strict and treats them as errors. |
1305 | 0 | return false; |
1306 | 0 | } |
1307 | 0 | char* end; |
1308 | 0 | errno = 0; |
1309 | 0 | unsigned long long r = strtoull(str, &end, radix); |
1310 | 0 | if (end != str + n) return false; // Leftover junk |
1311 | 0 | if (errno) return false; |
1312 | 0 | if (dest == NULL) return true; |
1313 | 0 | *dest = r; |
1314 | 0 | return true; |
1315 | 0 | } |
1316 | | |
1317 | | } // namespace re2_internal |
1318 | | |
1319 | | namespace hooks { |
1320 | | |
1321 | | #ifdef RE2_HAVE_THREAD_LOCAL |
1322 | | thread_local const RE2* context = NULL; |
1323 | | #endif |
1324 | | |
1325 | | template <typename T> |
1326 | | union Hook { |
1327 | 0 | void Store(T* cb) { cb_.store(cb, std::memory_order_release); } Unexecuted instantiation: re2::hooks::Hook<void (re2::hooks::DFAStateCacheReset const&)>::Store(void (*)(re2::hooks::DFAStateCacheReset const&)) Unexecuted instantiation: re2::hooks::Hook<void (re2::hooks::DFASearchFailure const&)>::Store(void (*)(re2::hooks::DFASearchFailure const&)) |
1328 | 181 | T* Load() const { return cb_.load(std::memory_order_acquire); } Unexecuted instantiation: re2::hooks::Hook<void (re2::hooks::DFAStateCacheReset const&)>::Load() const re2::hooks::Hook<void (re2::hooks::DFASearchFailure const&)>::Load() const Line | Count | Source | 1328 | 181 | T* Load() const { return cb_.load(std::memory_order_acquire); } |
|
1329 | | |
1330 | | #if !defined(__clang__) && defined(_MSC_VER) |
1331 | | // Citing https://github.com/protocolbuffers/protobuf/pull/4777 as precedent, |
1332 | | // this is a gross hack to make std::atomic<T*> constant-initialized on MSVC. |
1333 | | static_assert(ATOMIC_POINTER_LOCK_FREE == 2, |
1334 | | "std::atomic<T*> must be always lock-free"); |
1335 | | T* cb_for_constinit_; |
1336 | | #endif |
1337 | | |
1338 | | std::atomic<T*> cb_; |
1339 | | }; |
1340 | | |
1341 | | template <typename T> |
1342 | 181 | static void DoNothing(const T&) {} Unexecuted instantiation: re2.cc:void re2::hooks::DoNothing<re2::hooks::DFAStateCacheReset>(re2::hooks::DFAStateCacheReset const&) re2.cc:void re2::hooks::DoNothing<re2::hooks::DFASearchFailure>(re2::hooks::DFASearchFailure const&) Line | Count | Source | 1342 | 181 | static void DoNothing(const T&) {} |
|
1343 | | |
1344 | | #define DEFINE_HOOK(type, name) \ |
1345 | | static Hook<type##Callback> name##_hook = {{&DoNothing<type>}}; \ |
1346 | 0 | void Set##type##Hook(type##Callback* cb) { name##_hook.Store(cb); } \ Unexecuted instantiation: re2::hooks::SetDFAStateCacheResetHook(void (*)(re2::hooks::DFAStateCacheReset const&)) Unexecuted instantiation: re2::hooks::SetDFASearchFailureHook(void (*)(re2::hooks::DFASearchFailure const&)) |
1347 | 181 | type##Callback* Get##type##Hook() { return name##_hook.Load(); } Unexecuted instantiation: re2::hooks::GetDFAStateCacheResetHook() re2::hooks::GetDFASearchFailureHook() Line | Count | Source | 1347 | 181 | type##Callback* Get##type##Hook() { return name##_hook.Load(); } |
|
1348 | | |
1349 | | DEFINE_HOOK(DFAStateCacheReset, dfa_state_cache_reset) |
1350 | | DEFINE_HOOK(DFASearchFailure, dfa_search_failure) |
1351 | | |
1352 | | #undef DEFINE_HOOK |
1353 | | |
1354 | | } // namespace hooks |
1355 | | |
1356 | | } // namespace re2 |