/proc/self/cwd/external/re2~/re2/tostring.cc
Line | Count | Source |
1 | | // Copyright 2006 The RE2 Authors. All Rights Reserved. |
2 | | // Use of this source code is governed by a BSD-style |
3 | | // license that can be found in the LICENSE file. |
4 | | |
5 | | // Format a regular expression structure as a string. |
6 | | // Tested by parse_test.cc |
7 | | |
8 | | #include <string.h> |
9 | | |
10 | | #include <string> |
11 | | |
12 | | #include "absl/log/absl_log.h" |
13 | | #include "absl/strings/str_format.h" |
14 | | #include "re2/regexp.h" |
15 | | #include "re2/walker-inl.h" |
16 | | #include "util/utf.h" |
17 | | |
18 | | namespace re2 { |
19 | | |
20 | | enum { |
21 | | PrecAtom, |
22 | | PrecUnary, |
23 | | PrecConcat, |
24 | | PrecAlternate, |
25 | | PrecEmpty, |
26 | | PrecParen, |
27 | | PrecToplevel, |
28 | | }; |
29 | | |
30 | | // Helper function. See description below. |
31 | | static void AppendCCRange(std::string* t, Rune lo, Rune hi); |
32 | | |
33 | | // Walker to generate string in s_. |
34 | | // The arg pointers are actually integers giving the |
35 | | // context precedence. |
36 | | // The child_args are always NULL. |
37 | | class ToStringWalker : public Regexp::Walker<int> { |
38 | | public: |
39 | 0 | explicit ToStringWalker(std::string* t) : t_(t) {} |
40 | | |
41 | | virtual int PreVisit(Regexp* re, int parent_arg, bool* stop); |
42 | | virtual int PostVisit(Regexp* re, int parent_arg, int pre_arg, |
43 | | int* child_args, int nchild_args); |
44 | 0 | virtual int ShortVisit(Regexp* re, int parent_arg) { |
45 | 0 | return 0; |
46 | 0 | } |
47 | | |
48 | | private: |
49 | | std::string* t_; // The string the walker appends to. |
50 | | |
51 | | ToStringWalker(const ToStringWalker&) = delete; |
52 | | ToStringWalker& operator=(const ToStringWalker&) = delete; |
53 | | }; |
54 | | |
55 | 0 | std::string Regexp::ToString() { |
56 | 0 | std::string t; |
57 | 0 | ToStringWalker w(&t); |
58 | 0 | w.WalkExponential(this, PrecToplevel, 100000); |
59 | 0 | if (w.stopped_early()) |
60 | 0 | t += " [truncated]"; |
61 | 0 | return t; |
62 | 0 | } |
63 | | |
64 | | #define ToString DontCallToString // Avoid accidental recursion. |
65 | | |
66 | | // Visits re before children are processed. |
67 | | // Appends ( if needed and passes new precedence to children. |
68 | 0 | int ToStringWalker::PreVisit(Regexp* re, int parent_arg, bool* stop) { |
69 | 0 | int prec = parent_arg; |
70 | 0 | int nprec = PrecAtom; |
71 | |
|
72 | 0 | switch (re->op()) { |
73 | 0 | case kRegexpNoMatch: |
74 | 0 | case kRegexpEmptyMatch: |
75 | 0 | case kRegexpLiteral: |
76 | 0 | case kRegexpAnyChar: |
77 | 0 | case kRegexpAnyByte: |
78 | 0 | case kRegexpBeginLine: |
79 | 0 | case kRegexpEndLine: |
80 | 0 | case kRegexpBeginText: |
81 | 0 | case kRegexpEndText: |
82 | 0 | case kRegexpWordBoundary: |
83 | 0 | case kRegexpNoWordBoundary: |
84 | 0 | case kRegexpCharClass: |
85 | 0 | case kRegexpHaveMatch: |
86 | 0 | nprec = PrecAtom; |
87 | 0 | break; |
88 | | |
89 | 0 | case kRegexpConcat: |
90 | 0 | case kRegexpLiteralString: |
91 | 0 | if (prec < PrecConcat) |
92 | 0 | t_->append("(?:"); |
93 | 0 | nprec = PrecConcat; |
94 | 0 | break; |
95 | | |
96 | 0 | case kRegexpAlternate: |
97 | 0 | if (prec < PrecAlternate) |
98 | 0 | t_->append("(?:"); |
99 | 0 | nprec = PrecAlternate; |
100 | 0 | break; |
101 | | |
102 | 0 | case kRegexpCapture: |
103 | 0 | t_->append("("); |
104 | 0 | if (re->cap() == 0) |
105 | 0 | ABSL_LOG(DFATAL) << "kRegexpCapture cap() == 0"; |
106 | 0 | if (re->name()) { |
107 | 0 | t_->append("?P<"); |
108 | 0 | t_->append(*re->name()); |
109 | 0 | t_->append(">"); |
110 | 0 | } |
111 | 0 | nprec = PrecParen; |
112 | 0 | break; |
113 | | |
114 | 0 | case kRegexpStar: |
115 | 0 | case kRegexpPlus: |
116 | 0 | case kRegexpQuest: |
117 | 0 | case kRegexpRepeat: |
118 | 0 | if (prec < PrecUnary) |
119 | 0 | t_->append("(?:"); |
120 | | // The subprecedence here is PrecAtom instead of PrecUnary |
121 | | // because PCRE treats two unary ops in a row as a parse error. |
122 | 0 | nprec = PrecAtom; |
123 | 0 | break; |
124 | 0 | } |
125 | | |
126 | 0 | return nprec; |
127 | 0 | } |
128 | | |
129 | 0 | static void AppendLiteral(std::string *t, Rune r, bool foldcase) { |
130 | 0 | if (r != 0 && r < 0x80 && strchr("(){}[]*+?|.^$\\", r)) { |
131 | 0 | t->append(1, '\\'); |
132 | 0 | t->append(1, static_cast<char>(r)); |
133 | 0 | } else if (foldcase && 'a' <= r && r <= 'z') { |
134 | 0 | r -= 'a' - 'A'; |
135 | 0 | t->append(1, '['); |
136 | 0 | t->append(1, static_cast<char>(r)); |
137 | 0 | t->append(1, static_cast<char>(r) + 'a' - 'A'); |
138 | 0 | t->append(1, ']'); |
139 | 0 | } else { |
140 | 0 | AppendCCRange(t, r, r); |
141 | 0 | } |
142 | 0 | } |
143 | | |
144 | | // Visits re after children are processed. |
145 | | // For childless regexps, all the work is done here. |
146 | | // For regexps with children, append any unary suffixes or ). |
147 | | int ToStringWalker::PostVisit(Regexp* re, int parent_arg, int pre_arg, |
148 | 0 | int* child_args, int nchild_args) { |
149 | 0 | int prec = parent_arg; |
150 | 0 | switch (re->op()) { |
151 | 0 | case kRegexpNoMatch: |
152 | | // There's no simple symbol for "no match", but |
153 | | // [^0-Runemax] excludes everything. |
154 | 0 | t_->append("[^\\x00-\\x{10ffff}]"); |
155 | 0 | break; |
156 | | |
157 | 0 | case kRegexpEmptyMatch: |
158 | | // Append (?:) to make empty string visible, |
159 | | // unless this is already being parenthesized. |
160 | 0 | if (prec < PrecEmpty) |
161 | 0 | t_->append("(?:)"); |
162 | 0 | break; |
163 | | |
164 | 0 | case kRegexpLiteral: |
165 | 0 | AppendLiteral(t_, re->rune(), |
166 | 0 | (re->parse_flags() & Regexp::FoldCase) != 0); |
167 | 0 | break; |
168 | | |
169 | 0 | case kRegexpLiteralString: |
170 | 0 | for (int i = 0; i < re->nrunes(); i++) |
171 | 0 | AppendLiteral(t_, re->runes()[i], |
172 | 0 | (re->parse_flags() & Regexp::FoldCase) != 0); |
173 | 0 | if (prec < PrecConcat) |
174 | 0 | t_->append(")"); |
175 | 0 | break; |
176 | | |
177 | 0 | case kRegexpConcat: |
178 | 0 | if (prec < PrecConcat) |
179 | 0 | t_->append(")"); |
180 | 0 | break; |
181 | | |
182 | 0 | case kRegexpAlternate: |
183 | | // Clumsy but workable: the children all appended | |
184 | | // at the end of their strings, so just remove the last one. |
185 | 0 | if ((*t_)[t_->size()-1] == '|') |
186 | 0 | t_->erase(t_->size()-1); |
187 | 0 | else |
188 | 0 | ABSL_LOG(DFATAL) << "Bad final char: " << t_; |
189 | 0 | if (prec < PrecAlternate) |
190 | 0 | t_->append(")"); |
191 | 0 | break; |
192 | | |
193 | 0 | case kRegexpStar: |
194 | 0 | t_->append("*"); |
195 | 0 | if (re->parse_flags() & Regexp::NonGreedy) |
196 | 0 | t_->append("?"); |
197 | 0 | if (prec < PrecUnary) |
198 | 0 | t_->append(")"); |
199 | 0 | break; |
200 | | |
201 | 0 | case kRegexpPlus: |
202 | 0 | t_->append("+"); |
203 | 0 | if (re->parse_flags() & Regexp::NonGreedy) |
204 | 0 | t_->append("?"); |
205 | 0 | if (prec < PrecUnary) |
206 | 0 | t_->append(")"); |
207 | 0 | break; |
208 | | |
209 | 0 | case kRegexpQuest: |
210 | 0 | t_->append("?"); |
211 | 0 | if (re->parse_flags() & Regexp::NonGreedy) |
212 | 0 | t_->append("?"); |
213 | 0 | if (prec < PrecUnary) |
214 | 0 | t_->append(")"); |
215 | 0 | break; |
216 | | |
217 | 0 | case kRegexpRepeat: |
218 | 0 | if (re->max() == -1) |
219 | 0 | t_->append(absl::StrFormat("{%d,}", re->min())); |
220 | 0 | else if (re->min() == re->max()) |
221 | 0 | t_->append(absl::StrFormat("{%d}", re->min())); |
222 | 0 | else |
223 | 0 | t_->append(absl::StrFormat("{%d,%d}", re->min(), re->max())); |
224 | 0 | if (re->parse_flags() & Regexp::NonGreedy) |
225 | 0 | t_->append("?"); |
226 | 0 | if (prec < PrecUnary) |
227 | 0 | t_->append(")"); |
228 | 0 | break; |
229 | | |
230 | 0 | case kRegexpAnyChar: |
231 | 0 | t_->append("."); |
232 | 0 | break; |
233 | | |
234 | 0 | case kRegexpAnyByte: |
235 | 0 | t_->append("\\C"); |
236 | 0 | break; |
237 | | |
238 | 0 | case kRegexpBeginLine: |
239 | 0 | t_->append("^"); |
240 | 0 | break; |
241 | | |
242 | 0 | case kRegexpEndLine: |
243 | 0 | t_->append("$"); |
244 | 0 | break; |
245 | | |
246 | 0 | case kRegexpBeginText: |
247 | 0 | t_->append("(?-m:^)"); |
248 | 0 | break; |
249 | | |
250 | 0 | case kRegexpEndText: |
251 | 0 | if (re->parse_flags() & Regexp::WasDollar) |
252 | 0 | t_->append("(?-m:$)"); |
253 | 0 | else |
254 | 0 | t_->append("\\z"); |
255 | 0 | break; |
256 | | |
257 | 0 | case kRegexpWordBoundary: |
258 | 0 | t_->append("\\b"); |
259 | 0 | break; |
260 | | |
261 | 0 | case kRegexpNoWordBoundary: |
262 | 0 | t_->append("\\B"); |
263 | 0 | break; |
264 | | |
265 | 0 | case kRegexpCharClass: { |
266 | 0 | if (re->cc()->size() == 0) { |
267 | 0 | t_->append("[^\\x00-\\x{10ffff}]"); |
268 | 0 | break; |
269 | 0 | } |
270 | 0 | t_->append("["); |
271 | | // Heuristic: show class as negated if it contains the |
272 | | // non-character 0xFFFE and yet somehow isn't full. |
273 | 0 | CharClass* cc = re->cc(); |
274 | 0 | if (cc->Contains(0xFFFE) && !cc->full()) { |
275 | 0 | cc = cc->Negate(); |
276 | 0 | t_->append("^"); |
277 | 0 | } |
278 | 0 | for (CharClass::iterator i = cc->begin(); i != cc->end(); ++i) |
279 | 0 | AppendCCRange(t_, i->lo, i->hi); |
280 | 0 | if (cc != re->cc()) |
281 | 0 | cc->Delete(); |
282 | 0 | t_->append("]"); |
283 | 0 | break; |
284 | 0 | } |
285 | | |
286 | 0 | case kRegexpCapture: |
287 | 0 | t_->append(")"); |
288 | 0 | break; |
289 | | |
290 | 0 | case kRegexpHaveMatch: |
291 | | // There's no syntax accepted by the parser to generate |
292 | | // this node (it is generated by RE2::Set) so make something |
293 | | // up that is readable but won't compile. |
294 | 0 | t_->append(absl::StrFormat("(?HaveMatch:%d)", re->match_id())); |
295 | 0 | break; |
296 | 0 | } |
297 | | |
298 | | // If the parent is an alternation, append the | for it. |
299 | 0 | if (prec == PrecAlternate) |
300 | 0 | t_->append("|"); |
301 | |
|
302 | 0 | return 0; |
303 | 0 | } |
304 | | |
305 | | // Appends a rune for use in a character class to the string t. |
306 | 0 | static void AppendCCChar(std::string* t, Rune r) { |
307 | 0 | if (0x20 <= r && r <= 0x7E) { |
308 | 0 | if (strchr("[]^-\\", r)) |
309 | 0 | t->append("\\"); |
310 | 0 | t->append(1, static_cast<char>(r)); |
311 | 0 | return; |
312 | 0 | } |
313 | 0 | switch (r) { |
314 | 0 | default: |
315 | 0 | break; |
316 | | |
317 | 0 | case '\r': |
318 | 0 | t->append("\\r"); |
319 | 0 | return; |
320 | | |
321 | 0 | case '\t': |
322 | 0 | t->append("\\t"); |
323 | 0 | return; |
324 | | |
325 | 0 | case '\n': |
326 | 0 | t->append("\\n"); |
327 | 0 | return; |
328 | | |
329 | 0 | case '\f': |
330 | 0 | t->append("\\f"); |
331 | 0 | return; |
332 | 0 | } |
333 | | |
334 | 0 | if (r < 0x100) { |
335 | 0 | *t += absl::StrFormat("\\x%02x", static_cast<int>(r)); |
336 | 0 | return; |
337 | 0 | } |
338 | 0 | *t += absl::StrFormat("\\x{%x}", static_cast<int>(r)); |
339 | 0 | } |
340 | | |
341 | 0 | static void AppendCCRange(std::string* t, Rune lo, Rune hi) { |
342 | 0 | if (lo > hi) |
343 | 0 | return; |
344 | 0 | AppendCCChar(t, lo); |
345 | 0 | if (lo < hi) { |
346 | 0 | t->append("-"); |
347 | 0 | AppendCCChar(t, hi); |
348 | 0 | } |
349 | 0 | } |
350 | | |
351 | | } // namespace re2 |