/src/duckdb/third_party/re2/re2/re2.cc
Line | Count | Source (jump to first uncovered line) |
1 | | // Copyright 2003-2009 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Regular expression interface RE2. |
6 | | // |
7 | | // Originally the PCRE C++ wrapper, but adapted to use |
8 | | // the new automata-based regular expression engines. |
9 | | |
10 | | #include "re2/re2.h" |
11 | | |
12 | | #include <assert.h> |
13 | | #include <ctype.h> |
14 | | #include <errno.h> |
15 | | #include <stdint.h> |
16 | | #include <stdlib.h> |
17 | | #include <string.h> |
18 | | #include <algorithm> |
19 | | #include <iterator> |
20 | | #include <mutex> |
21 | | #include <string> |
22 | | #include <utility> |
23 | | #include <vector> |
24 | | |
25 | | #include "util/util.h" |
26 | | #include "util/logging.h" |
27 | | #include "util/sparse_array.h" |
28 | | #include "util/strutil.h" |
29 | | #include "util/utf.h" |
30 | | #include "re2/prog.h" |
31 | | #include "re2/regexp.h" |
32 | | |
33 | | namespace duckdb_re2 { |
34 | | |
// Maximum number of args we can set
static const int kMaxArgs = 16;
// Length of the fixed-size StringPiece arrays used in this file:
// one slot for the overall match plus one per arg.
static const int kVecSize = 1+kMaxArgs;

// Out-of-class definition of the static data member; the value itself
// comes from the declaration in re2.h.
const int RE2::Options::kDefaultMaxMem;  // initialized in re2.h
40 | | |
41 | | RE2::Options::Options(RE2::CannedOptions opt) |
42 | | : encoding_(opt == RE2::Latin1 ? EncodingLatin1 : EncodingUTF8), |
43 | | posix_syntax_(opt == RE2::POSIX), |
44 | | longest_match_(opt == RE2::POSIX), |
45 | | log_errors_(opt != RE2::Quiet), |
46 | | max_mem_(kDefaultMaxMem), |
47 | | literal_(false), |
48 | | never_nl_(false), |
49 | | dot_nl_(false), |
50 | | never_capture_(false), |
51 | | case_sensitive_(true), |
52 | | perl_classes_(false), |
53 | | word_boundary_(false), |
54 | 0 | one_line_(false) { |
55 | 0 | } |
56 | | |
// Static empty objects for use as const references.
// To avoid global constructors, they are allocated on first use inside
// RE2::Init() (guarded by std::call_once).  RE2::~RE2() compares against
// these pointers so that the shared objects are never deleted.
static const std::string* empty_string;
static const std::map<std::string, int>* empty_named_groups;
static const std::map<int, std::string>* empty_group_names;
62 | | |
63 | | // Converts from Regexp error code to RE2 error code. |
64 | | // Maybe some day they will diverge. In any event, this |
65 | | // hides the existence of Regexp from RE2 users. |
66 | 0 | static RE2::ErrorCode RegexpErrorToRE2(duckdb_re2::RegexpStatusCode code) { |
67 | 0 | switch (code) { |
68 | 0 | case duckdb_re2::kRegexpSuccess: |
69 | 0 | return RE2::NoError; |
70 | 0 | case duckdb_re2::kRegexpInternalError: |
71 | 0 | return RE2::ErrorInternal; |
72 | 0 | case duckdb_re2::kRegexpBadEscape: |
73 | 0 | return RE2::ErrorBadEscape; |
74 | 0 | case duckdb_re2::kRegexpBadCharClass: |
75 | 0 | return RE2::ErrorBadCharClass; |
76 | 0 | case duckdb_re2::kRegexpBadCharRange: |
77 | 0 | return RE2::ErrorBadCharRange; |
78 | 0 | case duckdb_re2::kRegexpMissingBracket: |
79 | 0 | return RE2::ErrorMissingBracket; |
80 | 0 | case duckdb_re2::kRegexpMissingParen: |
81 | 0 | return RE2::ErrorMissingParen; |
82 | 0 | case duckdb_re2::kRegexpTrailingBackslash: |
83 | 0 | return RE2::ErrorTrailingBackslash; |
84 | 0 | case duckdb_re2::kRegexpRepeatArgument: |
85 | 0 | return RE2::ErrorRepeatArgument; |
86 | 0 | case duckdb_re2::kRegexpRepeatSize: |
87 | 0 | return RE2::ErrorRepeatSize; |
88 | 0 | case duckdb_re2::kRegexpRepeatOp: |
89 | 0 | return RE2::ErrorRepeatOp; |
90 | 0 | case duckdb_re2::kRegexpBadPerlOp: |
91 | 0 | return RE2::ErrorBadPerlOp; |
92 | 0 | case duckdb_re2::kRegexpBadUTF8: |
93 | 0 | return RE2::ErrorBadUTF8; |
94 | 0 | case duckdb_re2::kRegexpBadNamedCapture: |
95 | 0 | return RE2::ErrorBadNamedCapture; |
96 | 0 | } |
97 | 0 | return RE2::ErrorInternal; |
98 | 0 | } |
99 | | |
100 | 0 | static std::string trunc(const StringPiece& pattern) { |
101 | 0 | if (pattern.size() < 100) |
102 | 0 | return std::string(pattern); |
103 | 0 | return std::string(pattern.substr(0, 100)) + "..."; |
104 | 0 | } |
105 | | |
106 | | |
107 | 0 | RE2::RE2(const char* pattern) { |
108 | 0 | Init(pattern, DefaultOptions); |
109 | 0 | } |
110 | | |
111 | 0 | RE2::RE2(const std::string& pattern) { |
112 | 0 | Init(pattern, DefaultOptions); |
113 | 0 | } |
114 | | |
115 | 0 | RE2::RE2(const StringPiece& pattern) { |
116 | 0 | Init(pattern, DefaultOptions); |
117 | 0 | } |
118 | | |
119 | 0 | RE2::RE2(const StringPiece& pattern, const Options& options) { |
120 | 0 | Init(pattern, options); |
121 | 0 | } |
122 | | |
123 | 0 | int RE2::Options::ParseFlags() const { |
124 | 0 | int flags = Regexp::ClassNL; |
125 | 0 | switch (encoding()) { |
126 | 0 | default: |
127 | 0 | if (log_errors()) |
128 | 0 | LOG(ERROR) << "Unknown encoding " << encoding(); |
129 | 0 | break; |
130 | 0 | case RE2::Options::EncodingUTF8: |
131 | 0 | break; |
132 | 0 | case RE2::Options::EncodingLatin1: |
133 | 0 | flags |= Regexp::Latin1; |
134 | 0 | break; |
135 | 0 | } |
136 | | |
137 | 0 | if (!posix_syntax()) |
138 | 0 | flags |= Regexp::LikePerl; |
139 | |
|
140 | 0 | if (literal()) |
141 | 0 | flags |= Regexp::Literal; |
142 | |
|
143 | 0 | if (never_nl()) |
144 | 0 | flags |= Regexp::NeverNL; |
145 | |
|
146 | 0 | if (dot_nl()) |
147 | 0 | flags |= Regexp::DotNL; |
148 | |
|
149 | 0 | if (never_capture()) |
150 | 0 | flags |= Regexp::NeverCapture; |
151 | |
|
152 | 0 | if (!case_sensitive()) |
153 | 0 | flags |= Regexp::FoldCase; |
154 | |
|
155 | 0 | if (perl_classes()) |
156 | 0 | flags |= Regexp::PerlClasses; |
157 | |
|
158 | 0 | if (word_boundary()) |
159 | 0 | flags |= Regexp::PerlB; |
160 | |
|
161 | 0 | if (one_line()) |
162 | 0 | flags |= Regexp::OneLine; |
163 | |
|
164 | 0 | return flags; |
165 | 0 | } |
166 | | |
167 | 0 | void RE2::Init(const StringPiece& pattern, const Options& options) { |
168 | 0 | static std::once_flag empty_once; |
169 | 0 | std::call_once(empty_once, []() { |
170 | 0 | empty_string = new std::string; |
171 | 0 | empty_named_groups = new std::map<std::string, int>; |
172 | 0 | empty_group_names = new std::map<int, std::string>; |
173 | 0 | }); |
174 | |
|
175 | 0 | pattern_ = std::string(pattern); |
176 | 0 | options_.Copy(options); |
177 | 0 | entire_regexp_ = NULL; |
178 | 0 | suffix_regexp_ = NULL; |
179 | 0 | prog_ = NULL; |
180 | 0 | num_captures_ = -1; |
181 | 0 | rprog_ = NULL; |
182 | 0 | error_ = empty_string; |
183 | 0 | error_code_ = NoError; |
184 | 0 | named_groups_ = NULL; |
185 | 0 | group_names_ = NULL; |
186 | |
|
187 | 0 | RegexpStatus status; |
188 | 0 | entire_regexp_ = Regexp::Parse( |
189 | 0 | pattern_, |
190 | 0 | static_cast<Regexp::ParseFlags>(options_.ParseFlags()), |
191 | 0 | &status); |
192 | 0 | if (entire_regexp_ == NULL) { |
193 | 0 | if (options_.log_errors()) { |
194 | 0 | LOG(ERROR) << "Error parsing '" << trunc(pattern_) << "': " |
195 | 0 | << status.Text(); |
196 | 0 | } |
197 | 0 | error_ = new std::string(status.Text()); |
198 | 0 | error_code_ = RegexpErrorToRE2(status.code()); |
199 | 0 | error_arg_ = std::string(status.error_arg()); |
200 | 0 | return; |
201 | 0 | } |
202 | | |
203 | 0 | duckdb_re2::Regexp* suffix; |
204 | 0 | if (entire_regexp_->RequiredPrefix(&prefix_, &prefix_foldcase_, &suffix)) |
205 | 0 | suffix_regexp_ = suffix; |
206 | 0 | else |
207 | 0 | suffix_regexp_ = entire_regexp_->Incref(); |
208 | | |
209 | | // Two thirds of the memory goes to the forward Prog, |
210 | | // one third to the reverse prog, because the forward |
211 | | // Prog has two DFAs but the reverse prog has one. |
212 | 0 | prog_ = suffix_regexp_->CompileToProg(options_.max_mem()*2/3); |
213 | 0 | if (prog_ == NULL) { |
214 | 0 | if (options_.log_errors()) |
215 | 0 | LOG(ERROR) << "Error compiling '" << trunc(pattern_) << "'"; |
216 | 0 | error_ = new std::string("pattern too large - compile failed"); |
217 | 0 | error_code_ = RE2::ErrorPatternTooLarge; |
218 | 0 | return; |
219 | 0 | } |
220 | | |
221 | | // We used to compute this lazily, but it's used during the |
222 | | // typical control flow for a match call, so we now compute |
223 | | // it eagerly, which avoids the overhead of std::once_flag. |
224 | 0 | num_captures_ = suffix_regexp_->NumCaptures(); |
225 | | |
226 | | // Could delay this until the first match call that |
227 | | // cares about submatch information, but the one-pass |
228 | | // machine's memory gets cut from the DFA memory budget, |
229 | | // and that is harder to do if the DFA has already |
230 | | // been built. |
231 | 0 | is_one_pass_ = prog_->IsOnePass(); |
232 | 0 | } |
233 | | |
234 | | // Returns rprog_, computing it if needed. |
235 | 0 | duckdb_re2::Prog* RE2::ReverseProg() const { |
236 | 0 | std::call_once(rprog_once_, [](const RE2* re) { |
237 | 0 | re->rprog_ = |
238 | 0 | re->suffix_regexp_->CompileToReverseProg(re->options_.max_mem() / 3); |
239 | 0 | if (re->rprog_ == NULL) { |
240 | 0 | if (re->options_.log_errors()) |
241 | 0 | LOG(ERROR) << "Error reverse compiling '" << trunc(re->pattern_) << "'"; |
242 | 0 | re->error_ = |
243 | 0 | new std::string("pattern too large - reverse compile failed"); |
244 | 0 | re->error_code_ = RE2::ErrorPatternTooLarge; |
245 | 0 | } |
246 | 0 | }, this); |
247 | 0 | return rprog_; |
248 | 0 | } |
249 | | |
250 | 0 | RE2::~RE2() { |
251 | 0 | if (suffix_regexp_) |
252 | 0 | suffix_regexp_->Decref(); |
253 | 0 | if (entire_regexp_) |
254 | 0 | entire_regexp_->Decref(); |
255 | 0 | delete prog_; |
256 | 0 | delete rprog_; |
257 | 0 | if (error_ != empty_string) |
258 | 0 | delete error_; |
259 | 0 | if (named_groups_ != NULL && named_groups_ != empty_named_groups) |
260 | 0 | delete named_groups_; |
261 | 0 | if (group_names_ != NULL && group_names_ != empty_group_names) |
262 | 0 | delete group_names_; |
263 | 0 | } |
264 | | |
265 | 0 | int RE2::ProgramSize() const { |
266 | 0 | if (prog_ == NULL) |
267 | 0 | return -1; |
268 | 0 | return prog_->size(); |
269 | 0 | } |
270 | | |
271 | 0 | int RE2::ReverseProgramSize() const { |
272 | 0 | if (prog_ == NULL) |
273 | 0 | return -1; |
274 | 0 | Prog* prog = ReverseProg(); |
275 | 0 | if (prog == NULL) |
276 | 0 | return -1; |
277 | 0 | return prog->size(); |
278 | 0 | } |
279 | | |
280 | 0 | static int Fanout(Prog* prog, std::map<int, int>* histogram) { |
281 | 0 | SparseArray<int> fanout(prog->size()); |
282 | 0 | prog->Fanout(&fanout); |
283 | 0 | histogram->clear(); |
284 | 0 | for (SparseArray<int>::iterator i = fanout.begin(); i != fanout.end(); ++i) { |
285 | | // TODO(junyer): Optimise this? |
286 | 0 | int bucket = 0; |
287 | 0 | while (1 << bucket < i->value()) { |
288 | 0 | bucket++; |
289 | 0 | } |
290 | 0 | (*histogram)[bucket]++; |
291 | 0 | } |
292 | 0 | return histogram->rbegin()->first; |
293 | 0 | } |
294 | | |
295 | 0 | int RE2::ProgramFanout(std::map<int, int>* histogram) const { |
296 | 0 | if (prog_ == NULL) |
297 | 0 | return -1; |
298 | 0 | return Fanout(prog_, histogram); |
299 | 0 | } |
300 | | |
301 | 0 | int RE2::ReverseProgramFanout(std::map<int, int>* histogram) const { |
302 | 0 | if (prog_ == NULL) |
303 | 0 | return -1; |
304 | 0 | Prog* prog = ReverseProg(); |
305 | 0 | if (prog == NULL) |
306 | 0 | return -1; |
307 | 0 | return Fanout(prog, histogram); |
308 | 0 | } |
309 | | |
310 | | // Returns named_groups_, computing it if needed. |
311 | 0 | const std::map<std::string, int>& RE2::NamedCapturingGroups() const { |
312 | 0 | std::call_once(named_groups_once_, [](const RE2* re) { |
313 | 0 | if (re->suffix_regexp_ != NULL) |
314 | 0 | re->named_groups_ = re->suffix_regexp_->NamedCaptures(); |
315 | 0 | if (re->named_groups_ == NULL) |
316 | 0 | re->named_groups_ = empty_named_groups; |
317 | 0 | }, this); |
318 | 0 | return *named_groups_; |
319 | 0 | } |
320 | | |
321 | | // Returns group_names_, computing it if needed. |
322 | 0 | const std::map<int, std::string>& RE2::CapturingGroupNames() const { |
323 | 0 | std::call_once(group_names_once_, [](const RE2* re) { |
324 | 0 | if (re->suffix_regexp_ != NULL) |
325 | 0 | re->group_names_ = re->suffix_regexp_->CaptureNames(); |
326 | 0 | if (re->group_names_ == NULL) |
327 | 0 | re->group_names_ = empty_group_names; |
328 | 0 | }, this); |
329 | 0 | return *group_names_; |
330 | 0 | } |
331 | | |
332 | | /***** Convenience interfaces *****/ |
333 | | |
334 | | bool RE2::FullMatchN(const StringPiece& text, const RE2& re, |
335 | 0 | const Arg* const args[], int n) { |
336 | 0 | return re.DoMatch(text, ANCHOR_BOTH, NULL, args, n); |
337 | 0 | } |
338 | | |
339 | | bool RE2::PartialMatchN(const StringPiece& text, const RE2& re, |
340 | 0 | const Arg* const args[], int n) { |
341 | 0 | return re.DoMatch(text, UNANCHORED, NULL, args, n); |
342 | 0 | } |
343 | | |
344 | | bool RE2::ConsumeN(StringPiece* input, const RE2& re, |
345 | 0 | const Arg* const args[], int n) { |
346 | 0 | size_t consumed; |
347 | 0 | if (re.DoMatch(*input, ANCHOR_START, &consumed, args, n)) { |
348 | 0 | input->remove_prefix(consumed); |
349 | 0 | return true; |
350 | 0 | } else { |
351 | 0 | return false; |
352 | 0 | } |
353 | 0 | } |
354 | | |
355 | | bool RE2::FindAndConsumeN(StringPiece* input, const RE2& re, |
356 | 0 | const Arg* const args[], int n) { |
357 | 0 | size_t consumed; |
358 | 0 | if (re.DoMatch(*input, UNANCHORED, &consumed, args, n)) { |
359 | 0 | input->remove_prefix(consumed); |
360 | 0 | return true; |
361 | 0 | } else { |
362 | 0 | return false; |
363 | 0 | } |
364 | 0 | } |
365 | | |
366 | | bool RE2::Replace(std::string* str, |
367 | | const RE2& re, |
368 | 0 | const StringPiece& rewrite) { |
369 | 0 | StringPiece vec[kVecSize]; |
370 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
371 | 0 | if (nvec > arraysize(vec)) |
372 | 0 | return false; |
373 | 0 | if (!re.Match(*str, 0, str->size(), UNANCHORED, vec, nvec)) |
374 | 0 | return false; |
375 | | |
376 | 0 | std::string s; |
377 | 0 | if (!re.Rewrite(&s, rewrite, vec, nvec)) |
378 | 0 | return false; |
379 | | |
380 | 0 | assert(vec[0].begin() >= str->data()); |
381 | 0 | assert(vec[0].end() <= str->data()+str->size()); |
382 | 0 | str->replace(vec[0].data() - str->data(), vec[0].size(), s); |
383 | 0 | return true; |
384 | 0 | } |
385 | | |
386 | | int RE2::GlobalReplace(std::string* str, |
387 | | const RE2& re, |
388 | 0 | const StringPiece& rewrite) { |
389 | 0 | StringPiece vec[kVecSize]; |
390 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
391 | 0 | if (nvec > arraysize(vec)) |
392 | 0 | return false; |
393 | | |
394 | 0 | const char* p = str->data(); |
395 | 0 | const char* ep = p + str->size(); |
396 | 0 | const char* lastend = NULL; |
397 | 0 | std::string out; |
398 | 0 | int count = 0; |
399 | 0 | #ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION |
400 | | // Iterate just once when fuzzing. Otherwise, we easily get bogged down |
401 | | // and coverage is unlikely to improve despite significant expense. |
402 | 0 | while (p == str->data()) { |
403 | | #else |
404 | | while (p <= ep) { |
405 | | #endif |
406 | 0 | if (!re.Match(*str, static_cast<size_t>(p - str->data()), |
407 | 0 | str->size(), UNANCHORED, vec, nvec)) |
408 | 0 | break; |
409 | 0 | if (p < vec[0].begin()) |
410 | 0 | out.append(p, vec[0].begin() - p); |
411 | 0 | if (vec[0].begin() == lastend && vec[0].size() == 0) { |
412 | | // Disallow empty match at end of last match: skip ahead. |
413 | | // |
414 | | // fullrune() takes int, not ptrdiff_t. However, it just looks |
415 | | // at the leading byte and treats any length >= 4 the same. |
416 | 0 | if (re.options().encoding() == RE2::Options::EncodingUTF8 && |
417 | 0 | fullrune(p, static_cast<int>(std::min(ptrdiff_t{4}, ep - p)))) { |
418 | | // re is in UTF-8 mode and there is enough left of str |
419 | | // to allow us to advance by up to UTFmax bytes. |
420 | 0 | Rune r; |
421 | 0 | int n = chartorune(&r, p); |
422 | | // Some copies of chartorune have a bug that accepts |
423 | | // encodings of values in (10FFFF, 1FFFFF] as valid. |
424 | 0 | if (r > Runemax) { |
425 | 0 | n = 1; |
426 | 0 | r = Runeerror; |
427 | 0 | } |
428 | 0 | if (!(n == 1 && r == Runeerror)) { // no decoding error |
429 | 0 | out.append(p, n); |
430 | 0 | p += n; |
431 | 0 | continue; |
432 | 0 | } |
433 | 0 | } |
434 | | // Most likely, re is in Latin-1 mode. If it is in UTF-8 mode, |
435 | | // we fell through from above and the GIGO principle applies. |
436 | 0 | if (p < ep) |
437 | 0 | out.append(p, 1); |
438 | 0 | p++; |
439 | 0 | continue; |
440 | 0 | } |
441 | 0 | re.Rewrite(&out, rewrite, vec, nvec); |
442 | 0 | p = vec[0].end(); |
443 | 0 | lastend = p; |
444 | 0 | count++; |
445 | 0 | } |
446 | |
|
447 | 0 | if (count == 0) |
448 | 0 | return 0; |
449 | | |
450 | 0 | if (p < ep) |
451 | 0 | out.append(p, ep - p); |
452 | 0 | using std::swap; |
453 | 0 | swap(out, *str); |
454 | 0 | return count; |
455 | 0 | } |
456 | | |
457 | | bool RE2::Extract(const StringPiece& text, |
458 | | const RE2& re, |
459 | | const StringPiece& rewrite, |
460 | 0 | std::string* out) { |
461 | 0 | StringPiece vec[kVecSize]; |
462 | 0 | int nvec = 1 + MaxSubmatch(rewrite); |
463 | 0 | if (nvec > arraysize(vec)) |
464 | 0 | return false; |
465 | | |
466 | 0 | if (!re.Match(text, 0, text.size(), UNANCHORED, vec, nvec)) |
467 | 0 | return false; |
468 | | |
469 | 0 | out->clear(); |
470 | 0 | return re.Rewrite(out, rewrite, vec, nvec); |
471 | 0 | } |
472 | | |
473 | 0 | std::string RE2::QuoteMeta(const StringPiece& unquoted) { |
474 | 0 | std::string result; |
475 | 0 | result.reserve(unquoted.size() << 1); |
476 | | |
477 | | // Escape any ascii character not in [A-Za-z_0-9]. |
478 | | // |
479 | | // Note that it's legal to escape a character even if it has no |
480 | | // special meaning in a regular expression -- so this function does |
481 | | // that. (This also makes it identical to the perl function of the |
482 | | // same name except for the null-character special case; |
483 | | // see `perldoc -f quotemeta`.) |
484 | 0 | for (size_t ii = 0; ii < unquoted.size(); ++ii) { |
485 | | // Note that using 'isalnum' here raises the benchmark time from |
486 | | // 32ns to 58ns: |
487 | 0 | if ((unquoted[ii] < 'a' || unquoted[ii] > 'z') && |
488 | 0 | (unquoted[ii] < 'A' || unquoted[ii] > 'Z') && |
489 | 0 | (unquoted[ii] < '0' || unquoted[ii] > '9') && |
490 | 0 | unquoted[ii] != '_' && |
491 | | // If this is the part of a UTF8 or Latin1 character, we need |
492 | | // to copy this byte without escaping. Experimentally this is |
493 | | // what works correctly with the regexp library. |
494 | 0 | !(unquoted[ii] & 128)) { |
495 | 0 | if (unquoted[ii] == '\0') { // Special handling for null chars. |
496 | | // Note that this special handling is not strictly required for RE2, |
497 | | // but this quoting is required for other regexp libraries such as |
498 | | // PCRE. |
499 | | // Can't use "\\0" since the next character might be a digit. |
500 | 0 | result += "\\x00"; |
501 | 0 | continue; |
502 | 0 | } |
503 | 0 | result += '\\'; |
504 | 0 | } |
505 | 0 | result += unquoted[ii]; |
506 | 0 | } |
507 | |
|
508 | 0 | return result; |
509 | 0 | } |
510 | | |
511 | | bool RE2::PossibleMatchRange(std::string* min, std::string* max, |
512 | 0 | int maxlen) const { |
513 | 0 | if (prog_ == NULL) |
514 | 0 | return false; |
515 | | |
516 | 0 | int n = static_cast<int>(prefix_.size()); |
517 | 0 | if (n > maxlen) |
518 | 0 | n = maxlen; |
519 | | |
520 | | // Determine initial min max from prefix_ literal. |
521 | 0 | *min = prefix_.substr(0, n); |
522 | 0 | *max = prefix_.substr(0, n); |
523 | 0 | if (prefix_foldcase_) { |
524 | | // prefix is ASCII lowercase; change *min to uppercase. |
525 | 0 | for (int i = 0; i < n; i++) { |
526 | 0 | char& c = (*min)[i]; |
527 | 0 | if ('a' <= c && c <= 'z') |
528 | 0 | c += 'A' - 'a'; |
529 | 0 | } |
530 | 0 | } |
531 | | |
532 | | // Add to prefix min max using PossibleMatchRange on regexp. |
533 | 0 | std::string dmin, dmax; |
534 | 0 | maxlen -= n; |
535 | 0 | if (maxlen > 0 && prog_->PossibleMatchRange(&dmin, &dmax, maxlen)) { |
536 | 0 | min->append(dmin); |
537 | 0 | max->append(dmax); |
538 | 0 | } else if (!max->empty()) { |
539 | | // prog_->PossibleMatchRange has failed us, |
540 | | // but we still have useful information from prefix_. |
541 | | // Round up *max to allow any possible suffix. |
542 | 0 | PrefixSuccessor(max); |
543 | 0 | } else { |
544 | | // Nothing useful. |
545 | 0 | *min = ""; |
546 | 0 | *max = ""; |
547 | 0 | return false; |
548 | 0 | } |
549 | | |
550 | 0 | return true; |
551 | 0 | } |
552 | | |
// Locale-independent, ASCII-only case-insensitive comparison of the first
// `len` bytes of `a` and `b`.  Only `b` is case-folded (A-Z -> a-z); the
// string `a` is expected to be all lowercase already.
// Returns 0 when the ranges are equal, otherwise the difference between
// the first mismatching bytes (taken as unsigned values, `b` folded).
static int ascii_strcasecmp(const char* a, const char* b, size_t len) {
  for (size_t idx = 0; idx < len; ++idx) {
    const uint8_t lhs = static_cast<uint8_t>(a[idx]);
    uint8_t rhs = static_cast<uint8_t>(b[idx]);
    if (rhs >= 'A' && rhs <= 'Z') {
      rhs += 'a' - 'A';
    }
    if (lhs != rhs) {
      return lhs - rhs;
    }
  }
  return 0;
}
568 | | |
569 | | |
570 | | /***** Actual matching and rewriting code *****/ |
571 | | |
// General matching entry point.
//
// Searches text[startpos, endpos) for this regexp under the anchoring
// requested by `re_anchor`.  On success returns true and fills
// submatch[0 .. nsubmatch-1]: entry 0 is the overall match, the following
// entries are the capturing groups, and entries beyond the number of
// groups in the regexp are reset to empty StringPieces.
//
// Returns false when the RE2 object is not ok(), when the
// startpos/endpos pair is invalid, when there is no match, or when one of
// the engines reports an inconsistency.
//
// Strategy, as implemented below: check and strip the required literal
// prefix, use the DFA to decide whether (and where) there is a match,
// then run OnePass, BitState or the NFA only if submatch positions are
// needed or the DFA was skipped / ran out of memory.
bool RE2::Match(const StringPiece& text,
                size_t startpos,
                size_t endpos,
                Anchor re_anchor,
                StringPiece* submatch,
                int nsubmatch) const {
  if (!ok()) {
    if (options_.log_errors())
      LOG(ERROR) << "Invalid RE2: " << *error_;
    return false;
  }

  if (startpos > endpos || endpos > text.size()) {
    if (options_.log_errors())
      LOG(ERROR) << "RE2: invalid startpos, endpos pair. ["
                 << "startpos: " << startpos << ", "
                 << "endpos: " << endpos << ", "
                 << "text size: " << text.size() << "]";
    return false;
  }

  // subtext is the window [startpos, endpos) of text.
  StringPiece subtext = text;
  subtext.remove_prefix(startpos);
  subtext.remove_suffix(text.size() - endpos);

  // Use DFAs to find exact location of match, filter out non-matches.

  // Don't ask for the location if we won't use it.
  // SearchDFA can do extra optimizations in that case.
  StringPiece match;
  StringPiece* matchp = &match;
  if (nsubmatch == 0)
    matchp = NULL;

  // ncap: number of submatch slots the engines actually have to fill.
  int ncap = 1 + NumberOfCapturingGroups();
  if (ncap > nsubmatch)
    ncap = nsubmatch;

  // If the regexp is anchored explicitly, must not be in middle of text.
  if (prog_->anchor_start() && startpos != 0)
    return false;

  // If the regexp is anchored explicitly, update re_anchor
  // so that we can potentially fall into a faster case below.
  if (prog_->anchor_start() && prog_->anchor_end())
    re_anchor = ANCHOR_BOTH;
  else if (prog_->anchor_start() && re_anchor != ANCHOR_BOTH)
    re_anchor = ANCHOR_START;

  // Check for the required prefix, if any.
  size_t prefixlen = 0;
  if (!prefix_.empty()) {
    if (startpos != 0)
      return false;
    prefixlen = prefix_.size();
    if (prefixlen > subtext.size())
      return false;
    if (prefix_foldcase_) {
      if (ascii_strcasecmp(&prefix_[0], subtext.data(), prefixlen) != 0)
        return false;
    } else {
      if (memcmp(&prefix_[0], subtext.data(), prefixlen) != 0)
        return false;
    }
    subtext.remove_prefix(prefixlen);
    // If there is a required prefix, the anchor must be at least ANCHOR_START.
    if (re_anchor != ANCHOR_BOTH)
      re_anchor = ANCHOR_START;
  }

  Prog::Anchor anchor = Prog::kUnanchored;
  Prog::MatchKind kind = Prog::kFirstMatch;
  if (options_.longest_match())
    kind = Prog::kLongestMatch;
  // skipped_test: set when the DFA was bypassed or ran out of memory, so
  // `match` does not hold a trustworthy location.
  bool skipped_test = false;

  bool can_one_pass = (is_one_pass_ && ncap <= Prog::kMaxOnePassCapture);

  // BitState allocates a bitmap of size prog_->list_count() * text.size().
  // It also allocates a stack of 3-word structures which could potentially
  // grow as large as prog_->list_count() * text.size(), but in practice is
  // much smaller.
  const int kMaxBitStateBitmapSize = 256*1024;  // bitmap size <= max (bits)
  bool can_bit_state = prog_->CanBitState();
  size_t bit_state_text_max = kMaxBitStateBitmapSize / prog_->list_count();

  bool dfa_failed = false;
  switch (re_anchor) {
    default:
    case UNANCHORED: {
      if (!prog_->SearchDFA(subtext, text, anchor, kind,
                            matchp, &dfa_failed, NULL)) {
        if (dfa_failed) {
          if (options_.log_errors())
            LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
                       << "bytemap range " << prog_->bytemap_range() << ", "
                       << "list count " << prog_->list_count();
          // Fall back to NFA below.
          skipped_test = true;
          break;
        }
        return false;
      }
      if (matchp == NULL)  // Matched.  Don't care where
        return true;
      // SearchDFA set match[0].end() but didn't know where the
      // match started.  Run the regexp backward from match[0].end()
      // to find the longest possible match -- that's where it started.
      Prog* prog = ReverseProg();
      if (prog == NULL)
        return false;
      if (!prog->SearchDFA(match, text, Prog::kAnchored,
                           Prog::kLongestMatch, &match, &dfa_failed, NULL)) {
        if (dfa_failed) {
          if (options_.log_errors())
            LOG(ERROR) << "DFA out of memory: size " << prog->size() << ", "
                       << "bytemap range " << prog->bytemap_range() << ", "
                       << "list count " << prog->list_count();
          // Fall back to NFA below.
          skipped_test = true;
          break;
        }
        if (options_.log_errors())
          LOG(ERROR) << "SearchDFA inconsistency";
        return false;
      }
      break;
    }

    case ANCHOR_BOTH:
    case ANCHOR_START:
      if (re_anchor == ANCHOR_BOTH)
        kind = Prog::kFullMatch;
      anchor = Prog::kAnchored;

      // If only a small amount of text and need submatch
      // information anyway and we're going to use OnePass or BitState
      // to get it, we might as well not even bother with the DFA:
      // OnePass or BitState will be fast enough.
      // On tiny texts, OnePass outruns even the DFA, and
      // it doesn't have the shared state and occasional mutex that
      // the DFA does.
      if (can_one_pass && text.size() <= 4096 &&
          (ncap > 1 || text.size() <= 8)) {
        skipped_test = true;
        break;
      }
      if (can_bit_state && text.size() <= bit_state_text_max && ncap > 1) {
        skipped_test = true;
        break;
      }
      if (!prog_->SearchDFA(subtext, text, anchor, kind,
                            &match, &dfa_failed, NULL)) {
        if (dfa_failed) {
          if (options_.log_errors())
            LOG(ERROR) << "DFA out of memory: size " << prog_->size() << ", "
                       << "bytemap range " << prog_->bytemap_range() << ", "
                       << "list count " << prog_->list_count();
          // Fall back to NFA below.
          skipped_test = true;
          break;
        }
        return false;
      }
      break;
  }

  if (!skipped_test && ncap <= 1) {
    // We know exactly where it matches.  That's enough.
    if (ncap == 1)
      submatch[0] = match;
  } else {
    StringPiece subtext1;
    if (skipped_test) {
      // DFA ran out of memory or was skipped:
      // need to search in entire original text.
      subtext1 = subtext;
    } else {
      // DFA found the exact match location:
      // let NFA run an anchored, full match search
      // to find submatch locations.
      subtext1 = match;
      anchor = Prog::kAnchored;
      kind = Prog::kFullMatch;
    }

    // Engine choice, in order of preference: OnePass (anchored searches
    // only), BitState (text small enough for its bitmap), then the NFA.
    // A failure is only logged as an inconsistency when the DFA had
    // already reported a match.
    if (can_one_pass && anchor != Prog::kUnanchored) {
      if (!prog_->SearchOnePass(subtext1, text, anchor, kind, submatch, ncap)) {
        if (!skipped_test && options_.log_errors())
          LOG(ERROR) << "SearchOnePass inconsistency";
        return false;
      }
    } else if (can_bit_state && subtext1.size() <= bit_state_text_max) {
      if (!prog_->SearchBitState(subtext1, text, anchor,
                                 kind, submatch, ncap)) {
        if (!skipped_test && options_.log_errors())
          LOG(ERROR) << "SearchBitState inconsistency";
        return false;
      }
    } else {
      if (!prog_->SearchNFA(subtext1, text, anchor, kind, submatch, ncap)) {
        if (!skipped_test && options_.log_errors())
          LOG(ERROR) << "SearchNFA inconsistency";
        return false;
      }
    }
  }

  // Adjust overall match for required prefix that we stripped off.
  if (prefixlen > 0 && nsubmatch > 0)
    submatch[0] = StringPiece(submatch[0].data() - prefixlen,
                              submatch[0].size() + prefixlen);

  // Zero submatches that don't exist in the regexp.
  for (int i = ncap; i < nsubmatch; i++)
    submatch[i] = StringPiece();
  return true;
}
790 | | |
791 | | // Internal matcher - like Match() but takes Args not StringPieces. |
792 | | bool RE2::DoMatch(const StringPiece& text, |
793 | | Anchor re_anchor, |
794 | | size_t* consumed, |
795 | | const Arg* const* args, |
796 | 0 | int n) const { |
797 | 0 | if (!ok()) { |
798 | 0 | if (options_.log_errors()) |
799 | 0 | LOG(ERROR) << "Invalid RE2: " << *error_; |
800 | 0 | return false; |
801 | 0 | } |
802 | | |
803 | 0 | if (NumberOfCapturingGroups() < n) { |
804 | | // RE has fewer capturing groups than number of Arg pointers passed in. |
805 | 0 | return false; |
806 | 0 | } |
807 | | |
808 | | // Count number of capture groups needed. |
809 | 0 | int nvec; |
810 | 0 | if (n == 0 && consumed == NULL) |
811 | 0 | nvec = 0; |
812 | 0 | else |
813 | 0 | nvec = n+1; |
814 | |
|
815 | 0 | StringPiece* vec; |
816 | 0 | StringPiece stkvec[kVecSize]; |
817 | 0 | StringPiece* heapvec = NULL; |
818 | |
|
819 | 0 | if (nvec <= arraysize(stkvec)) { |
820 | 0 | vec = stkvec; |
821 | 0 | } else { |
822 | 0 | vec = new StringPiece[nvec]; |
823 | 0 | heapvec = vec; |
824 | 0 | } |
825 | |
|
826 | 0 | if (!Match(text, 0, text.size(), re_anchor, vec, nvec)) { |
827 | 0 | delete[] heapvec; |
828 | 0 | return false; |
829 | 0 | } |
830 | | |
831 | 0 | if (consumed != NULL) |
832 | 0 | *consumed = static_cast<size_t>(vec[0].end() - text.begin()); |
833 | |
|
834 | 0 | if (n == 0 || args == NULL) { |
835 | | // We are not interested in results |
836 | 0 | delete[] heapvec; |
837 | 0 | return true; |
838 | 0 | } |
839 | | |
840 | | // If we got here, we must have matched the whole pattern. |
841 | 0 | for (int i = 0; i < n; i++) { |
842 | 0 | const StringPiece& s = vec[i+1]; |
843 | 0 | if (!args[i]->Parse(s.data(), s.size())) { |
844 | | // TODO: Should we indicate what the error was? |
845 | 0 | delete[] heapvec; |
846 | 0 | return false; |
847 | 0 | } |
848 | 0 | } |
849 | | |
850 | 0 | delete[] heapvec; |
851 | 0 | return true; |
852 | 0 | } |
853 | | |
854 | | // Checks that the rewrite string is well-formed with respect to this |
855 | | // regular expression. |
856 | | bool RE2::CheckRewriteString(const StringPiece& rewrite, |
857 | 0 | std::string* error) const { |
858 | 0 | int max_token = -1; |
859 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
860 | 0 | s < end; s++) { |
861 | 0 | int c = *s; |
862 | 0 | if (c != '\\') { |
863 | 0 | continue; |
864 | 0 | } |
865 | 0 | if (++s == end) { |
866 | 0 | *error = "Rewrite schema error: '\\' not allowed at end."; |
867 | 0 | return false; |
868 | 0 | } |
869 | 0 | c = *s; |
870 | 0 | if (c == '\\') { |
871 | 0 | continue; |
872 | 0 | } |
873 | 0 | if (!isdigit(c)) { |
874 | 0 | *error = "Rewrite schema error: " |
875 | 0 | "'\\' must be followed by a digit or '\\'."; |
876 | 0 | return false; |
877 | 0 | } |
878 | 0 | int n = (c - '0'); |
879 | 0 | if (max_token < n) { |
880 | 0 | max_token = n; |
881 | 0 | } |
882 | 0 | } |
883 | | |
884 | 0 | if (max_token > NumberOfCapturingGroups()) { |
885 | 0 | SStringPrintf(error, "Rewrite schema requests %d matches, " |
886 | 0 | "but the regexp only has %d parenthesized subexpressions.", |
887 | 0 | max_token, NumberOfCapturingGroups()); |
888 | 0 | return false; |
889 | 0 | } |
890 | 0 | return true; |
891 | 0 | } |
892 | | |
893 | | // Returns the maximum submatch needed for the rewrite to be done by Replace(). |
894 | | // E.g. if rewrite == "foo \\2,\\1", returns 2. |
895 | 0 | int RE2::MaxSubmatch(const StringPiece& rewrite) { |
896 | 0 | int max = 0; |
897 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
898 | 0 | s < end; s++) { |
899 | 0 | if (*s == '\\') { |
900 | 0 | s++; |
901 | 0 | int c = (s < end) ? *s : -1; |
902 | 0 | if (isdigit(c)) { |
903 | 0 | int n = (c - '0'); |
904 | 0 | if (n > max) |
905 | 0 | max = n; |
906 | 0 | } |
907 | 0 | } |
908 | 0 | } |
909 | 0 | return max; |
910 | 0 | } |
911 | | |
912 | | // Append the "rewrite" string, with backslash subsitutions from "vec", |
913 | | // to string "out". |
914 | | bool RE2::Rewrite(std::string* out, |
915 | | const StringPiece& rewrite, |
916 | | const StringPiece* vec, |
917 | 0 | int veclen) const { |
918 | 0 | for (const char *s = rewrite.data(), *end = s + rewrite.size(); |
919 | 0 | s < end; s++) { |
920 | 0 | if (*s != '\\') { |
921 | 0 | out->push_back(*s); |
922 | 0 | continue; |
923 | 0 | } |
924 | 0 | s++; |
925 | 0 | int c = (s < end) ? *s : -1; |
926 | 0 | if (isdigit(c)) { |
927 | 0 | int n = (c - '0'); |
928 | 0 | if (n >= veclen) { |
929 | 0 | if (options_.log_errors()) { |
930 | 0 | LOG(ERROR) << "requested group " << n |
931 | 0 | << " in regexp " << rewrite.data(); |
932 | 0 | } |
933 | 0 | return false; |
934 | 0 | } |
935 | 0 | StringPiece snip = vec[n]; |
936 | 0 | if (snip.size() > 0) |
937 | 0 | out->append(snip.data(), snip.size()); |
938 | 0 | } else if (c == '\\') { |
939 | 0 | out->push_back('\\'); |
940 | 0 | } else { |
941 | 0 | if (options_.log_errors()) |
942 | 0 | LOG(ERROR) << "invalid rewrite pattern: " << rewrite.data(); |
943 | 0 | return false; |
944 | 0 | } |
945 | 0 | } |
946 | 0 | return true; |
947 | 0 | } |
948 | | |
949 | | /***** Parsers for various types *****/ |
950 | | |
951 | 0 | bool RE2::Arg::parse_null(const char* str, size_t n, void* dest) { |
952 | | // We fail if somebody asked us to store into a non-NULL void* pointer |
953 | 0 | return (dest == NULL); |
954 | 0 | } |
955 | | |
956 | 0 | bool RE2::Arg::parse_string(const char* str, size_t n, void* dest) { |
957 | 0 | if (dest == NULL) return true; |
958 | 0 | reinterpret_cast<std::string*>(dest)->assign(str, n); |
959 | 0 | return true; |
960 | 0 | } |
961 | | |
962 | 0 | bool RE2::Arg::parse_stringpiece(const char* str, size_t n, void* dest) { |
963 | 0 | if (dest == NULL) return true; |
964 | 0 | *(reinterpret_cast<StringPiece*>(dest)) = StringPiece(str, n); |
965 | 0 | return true; |
966 | 0 | } |
967 | | |
968 | 0 | bool RE2::Arg::parse_char(const char* str, size_t n, void* dest) { |
969 | 0 | if (n != 1) return false; |
970 | 0 | if (dest == NULL) return true; |
971 | 0 | *(reinterpret_cast<char*>(dest)) = str[0]; |
972 | 0 | return true; |
973 | 0 | } |
974 | | |
975 | 0 | bool RE2::Arg::parse_schar(const char* str, size_t n, void* dest) { |
976 | 0 | if (n != 1) return false; |
977 | 0 | if (dest == NULL) return true; |
978 | 0 | *(reinterpret_cast<signed char*>(dest)) = str[0]; |
979 | 0 | return true; |
980 | 0 | } |
981 | | |
982 | 0 | bool RE2::Arg::parse_uchar(const char* str, size_t n, void* dest) { |
983 | 0 | if (n != 1) return false; |
984 | 0 | if (dest == NULL) return true; |
985 | 0 | *(reinterpret_cast<unsigned char*>(dest)) = str[0]; |
986 | 0 | return true; |
987 | 0 | } |
988 | | |
// Upper bound, in characters, on the spelling of an integer we will parse.
// The integer parsers size their scratch buffers as kMaxNumberLength+1.
static constexpr int kMaxNumberLength = 32;
991 | | |
// REQUIRES "buf" must have length at least nbuf.
// Copies "str" into "buf" and null-terminates.
// Overwrites *np with the new length.
//
// On entry *np is the number of bytes available at "str".  On success the
// function returns "buf" and *np is the length of the text now in "buf".
// On rejection it returns "" and leaves *np untouched; callers detect this
// because parsing "" consumes fewer than *np characters.  Input is rejected
// when it is empty, starts with whitespace while accept_spaces is false,
// consists of whitespace only, or does not fit in "buf" after redundant
// leading zeros have been dropped.
static const char* TerminateNumber(char* buf, size_t nbuf, const char* str,
                                   size_t* np, bool accept_spaces) {
  size_t n = *np;
  if (n == 0) return "";
  // isspace() is only defined for values representable as unsigned char (or
  // EOF), so convert before calling it: a plain char may be negative.
  if (isspace(static_cast<unsigned char>(*str))) {
    // We are less forgiving than the strtoxxx() routines and do not
    // allow leading spaces. We do allow leading spaces for floats.
    if (!accept_spaces) {
      return "";
    }
    while (n > 0 && isspace(static_cast<unsigned char>(*str))) {
      n--;
      str++;
    }
    // Nothing but whitespace is not a number.  Reject it here; otherwise the
    // caller would see an empty string with *np == 0 and accept it as 0.
    if (n == 0) {
      return "";
    }
  }

  // Although buf has a fixed maximum size, we can still handle
  // arbitrarily large integers correctly by omitting leading zeros.
  // (Numbers that are still too long will be out of range.)
  // Before deciding whether str is too long,
  // remove leading zeros with s/000+/00/.
  // Leaving the leading two zeros in place means that
  // we don't change 0000x123 (invalid) into 0x123 (valid).
  // Skip over leading - before replacing.
  bool neg = false;
  if (n >= 1 && str[0] == '-') {
    neg = true;
    n--;
    str++;
  }

  if (n >= 3 && str[0] == '0' && str[1] == '0') {
    while (n >= 3 && str[2] == '0') {
      n--;
      str++;
    }
  }

  if (neg) {  // make room in buf for -
    n++;
    str--;
  }

  // Need n bytes of text plus the terminator.  Checking nbuf == 0 first keeps
  // nbuf-1 from wrapping around.
  if (nbuf == 0 || n > nbuf-1) return "";

  memmove(buf, str, n);
  if (neg) {
    // After zero-squeezing, the byte just copied into slot 0 is a '0' that
    // preceded the kept digits; overwrite it with the sign.
    buf[0] = '-';
  }
  buf[n] = '\0';
  *np = n;
  return buf;
}
1048 | | |
1049 | | bool RE2::Arg::parse_long_radix(const char* str, |
1050 | | size_t n, |
1051 | | void* dest, |
1052 | 0 | int radix) { |
1053 | 0 | if (n == 0) return false; |
1054 | 0 | char buf[kMaxNumberLength+1]; |
1055 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1056 | 0 | char* end; |
1057 | 0 | errno = 0; |
1058 | 0 | long r = strtol(str, &end, radix); |
1059 | 0 | if (end != str + n) return false; // Leftover junk |
1060 | 0 | if (errno) return false; |
1061 | 0 | if (dest == NULL) return true; |
1062 | 0 | *(reinterpret_cast<long*>(dest)) = r; |
1063 | 0 | return true; |
1064 | 0 | } |
1065 | | |
1066 | | bool RE2::Arg::parse_ulong_radix(const char* str, |
1067 | | size_t n, |
1068 | | void* dest, |
1069 | 0 | int radix) { |
1070 | 0 | if (n == 0) return false; |
1071 | 0 | char buf[kMaxNumberLength+1]; |
1072 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1073 | 0 | if (str[0] == '-') { |
1074 | | // strtoul() will silently accept negative numbers and parse |
1075 | | // them. This module is more strict and treats them as errors. |
1076 | 0 | return false; |
1077 | 0 | } |
1078 | | |
1079 | 0 | char* end; |
1080 | 0 | errno = 0; |
1081 | 0 | unsigned long r = strtoul(str, &end, radix); |
1082 | 0 | if (end != str + n) return false; // Leftover junk |
1083 | 0 | if (errno) return false; |
1084 | 0 | if (dest == NULL) return true; |
1085 | 0 | *(reinterpret_cast<unsigned long*>(dest)) = r; |
1086 | 0 | return true; |
1087 | 0 | } |
1088 | | |
1089 | | bool RE2::Arg::parse_short_radix(const char* str, |
1090 | | size_t n, |
1091 | | void* dest, |
1092 | 0 | int radix) { |
1093 | 0 | long r; |
1094 | 0 | if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
1095 | 0 | if ((short)r != r) return false; // Out of range |
1096 | 0 | if (dest == NULL) return true; |
1097 | 0 | *(reinterpret_cast<short*>(dest)) = (short)r; |
1098 | 0 | return true; |
1099 | 0 | } |
1100 | | |
1101 | | bool RE2::Arg::parse_ushort_radix(const char* str, |
1102 | | size_t n, |
1103 | | void* dest, |
1104 | 0 | int radix) { |
1105 | 0 | unsigned long r; |
1106 | 0 | if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
1107 | 0 | if ((unsigned short)r != r) return false; // Out of range |
1108 | 0 | if (dest == NULL) return true; |
1109 | 0 | *(reinterpret_cast<unsigned short*>(dest)) = (unsigned short)r; |
1110 | 0 | return true; |
1111 | 0 | } |
1112 | | |
1113 | | bool RE2::Arg::parse_int_radix(const char* str, |
1114 | | size_t n, |
1115 | | void* dest, |
1116 | 0 | int radix) { |
1117 | 0 | long r; |
1118 | 0 | if (!parse_long_radix(str, n, &r, radix)) return false; // Could not parse |
1119 | 0 | if ((int)r != r) return false; // Out of range |
1120 | 0 | if (dest == NULL) return true; |
1121 | 0 | *(reinterpret_cast<int*>(dest)) = (int)r; |
1122 | 0 | return true; |
1123 | 0 | } |
1124 | | |
1125 | | bool RE2::Arg::parse_uint_radix(const char* str, |
1126 | | size_t n, |
1127 | | void* dest, |
1128 | 0 | int radix) { |
1129 | 0 | unsigned long r; |
1130 | 0 | if (!parse_ulong_radix(str, n, &r, radix)) return false; // Could not parse |
1131 | 0 | if ((unsigned int)r != r) return false; // Out of range |
1132 | 0 | if (dest == NULL) return true; |
1133 | 0 | *(reinterpret_cast<unsigned int*>(dest)) = (unsigned int)r; |
1134 | 0 | return true; |
1135 | 0 | } |
1136 | | |
1137 | | bool RE2::Arg::parse_longlong_radix(const char* str, |
1138 | | size_t n, |
1139 | | void* dest, |
1140 | 0 | int radix) { |
1141 | 0 | if (n == 0) return false; |
1142 | 0 | char buf[kMaxNumberLength+1]; |
1143 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1144 | 0 | char* end; |
1145 | 0 | errno = 0; |
1146 | 0 | long long r = strtoll(str, &end, radix); |
1147 | 0 | if (end != str + n) return false; // Leftover junk |
1148 | 0 | if (errno) return false; |
1149 | 0 | if (dest == NULL) return true; |
1150 | 0 | *(reinterpret_cast<long long*>(dest)) = r; |
1151 | 0 | return true; |
1152 | 0 | } |
1153 | | |
1154 | | bool RE2::Arg::parse_ulonglong_radix(const char* str, |
1155 | | size_t n, |
1156 | | void* dest, |
1157 | 0 | int radix) { |
1158 | 0 | if (n == 0) return false; |
1159 | 0 | char buf[kMaxNumberLength+1]; |
1160 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, false); |
1161 | 0 | if (str[0] == '-') { |
1162 | | // strtoull() will silently accept negative numbers and parse |
1163 | | // them. This module is more strict and treats them as errors. |
1164 | 0 | return false; |
1165 | 0 | } |
1166 | 0 | char* end; |
1167 | 0 | errno = 0; |
1168 | 0 | unsigned long long r = strtoull(str, &end, radix); |
1169 | 0 | if (end != str + n) return false; // Leftover junk |
1170 | 0 | if (errno) return false; |
1171 | 0 | if (dest == NULL) return true; |
1172 | 0 | *(reinterpret_cast<unsigned long long*>(dest)) = r; |
1173 | 0 | return true; |
1174 | 0 | } |
1175 | | |
1176 | | static bool parse_double_float(const char* str, size_t n, bool isfloat, |
1177 | 0 | void* dest) { |
1178 | 0 | if (n == 0) return false; |
1179 | 0 | static const int kMaxLength = 200; |
1180 | 0 | char buf[kMaxLength+1]; |
1181 | 0 | str = TerminateNumber(buf, sizeof buf, str, &n, true); |
1182 | 0 | char* end; |
1183 | 0 | errno = 0; |
1184 | 0 | double r; |
1185 | 0 | if (isfloat) { |
1186 | 0 | r = strtof(str, &end); |
1187 | 0 | } else { |
1188 | 0 | r = strtod(str, &end); |
1189 | 0 | } |
1190 | 0 | if (end != str + n) return false; // Leftover junk |
1191 | 0 | if (errno) return false; |
1192 | 0 | if (dest == NULL) return true; |
1193 | 0 | if (isfloat) { |
1194 | 0 | *(reinterpret_cast<float*>(dest)) = (float)r; |
1195 | 0 | } else { |
1196 | 0 | *(reinterpret_cast<double*>(dest)) = r; |
1197 | 0 | } |
1198 | 0 | return true; |
1199 | 0 | } |
1200 | | |
1201 | 0 | bool RE2::Arg::parse_double(const char* str, size_t n, void* dest) { |
1202 | 0 | return parse_double_float(str, n, false, dest); |
1203 | 0 | } |
1204 | | |
1205 | 0 | bool RE2::Arg::parse_float(const char* str, size_t n, void* dest) { |
1206 | 0 | return parse_double_float(str, n, true, dest); |
1207 | 0 | } |
1208 | | |
1209 | | #define DEFINE_INTEGER_PARSER(name) \ |
1210 | 0 | bool RE2::Arg::parse_##name(const char* str, size_t n, void* dest) { \ |
1211 | 0 | return parse_##name##_radix(str, n, dest, 10); \ |
1212 | 0 | } \ Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_short(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ushort(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_int(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_uint(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_long(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulong(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_longlong(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulonglong(char const*, unsigned long, void*) |
1213 | 0 | bool RE2::Arg::parse_##name##_hex(const char* str, size_t n, void* dest) { \ |
1214 | 0 | return parse_##name##_radix(str, n, dest, 16); \ |
1215 | 0 | } \ Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_short_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ushort_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_int_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_uint_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_long_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulong_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_longlong_hex(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulonglong_hex(char const*, unsigned long, void*) |
1216 | 0 | bool RE2::Arg::parse_##name##_octal(const char* str, size_t n, void* dest) { \ |
1217 | 0 | return parse_##name##_radix(str, n, dest, 8); \ |
1218 | 0 | } \ Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_short_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ushort_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_int_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_uint_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_long_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulong_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_longlong_octal(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulonglong_octal(char const*, unsigned long, void*) |
1219 | | bool RE2::Arg::parse_##name##_cradix(const char* str, size_t n, \ |
1220 | 0 | void* dest) { \ |
1221 | 0 | return parse_##name##_radix(str, n, dest, 0); \ |
1222 | 0 | } Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_short_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ushort_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_int_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_uint_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_long_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulong_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_longlong_cradix(char const*, unsigned long, void*) Unexecuted instantiation: duckdb_re2::RE2::Arg::parse_ulonglong_cradix(char const*, unsigned long, void*) |
1223 | | |
1224 | | DEFINE_INTEGER_PARSER(short) |
1225 | | DEFINE_INTEGER_PARSER(ushort) |
1226 | | DEFINE_INTEGER_PARSER(int) |
1227 | | DEFINE_INTEGER_PARSER(uint) |
1228 | | DEFINE_INTEGER_PARSER(long) |
1229 | | DEFINE_INTEGER_PARSER(ulong) |
1230 | | DEFINE_INTEGER_PARSER(longlong) |
1231 | | DEFINE_INTEGER_PARSER(ulonglong) |
1232 | | |
1233 | | #undef DEFINE_INTEGER_PARSER |
1234 | | |
1235 | | } // namespace duckdb_re2 |