/proc/self/cwd/internal/strings.cc
Line | Count | Source |
1 | | // Copyright 2021 Google LLC |
2 | | // |
3 | | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | | // you may not use this file except in compliance with the License. |
5 | | // You may obtain a copy of the License at |
6 | | // |
7 | | // https://www.apache.org/licenses/LICENSE-2.0 |
8 | | // |
9 | | // Unless required by applicable law or agreed to in writing, software |
10 | | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | | // See the License for the specific language governing permissions and |
13 | | // limitations under the License. |
14 | | |
15 | | #include "internal/strings.h" |
16 | | |
17 | | #include <string> |
18 | | |
19 | | #include "absl/base/attributes.h" |
20 | | #include "absl/status/status.h" |
21 | | #include "absl/strings/ascii.h" |
22 | | #include "absl/strings/cord.h" |
23 | | #include "absl/strings/escaping.h" |
24 | | #include "absl/strings/match.h" |
25 | | #include "absl/strings/str_cat.h" |
26 | | #include "absl/strings/string_view.h" |
27 | | #include "internal/lexis.h" |
28 | | #include "internal/unicode.h" |
29 | | #include "internal/utf8.h" |
30 | | |
31 | | namespace cel::internal { |
32 | | |
33 | | namespace { |
34 | | |
// Lowercase hexadecimal digits, indexed by nibble value (0-15). Used when
// emitting "\xNN" escapes.
constexpr char kHexTable[] = "0123456789abcdef";
36 | | |
// Converts a single ASCII hexadecimal digit to its numeric value.
//
// Decimal digits already carry their value in the low nibble. Characters
// above '9' (the letters) have low nibble 1..6, so 9 is added first to map
// them onto 10..15. The result is only meaningful when `x` is a hex digit.
constexpr int HexDigitToInt(char x) {
  const int shifted = (x > '9') ? x + 9 : x;
  return shifted & 0xf;
}
43 | | |
// Returns true if `x` is one of the ASCII octal digits '0' through '7'.
constexpr bool IsOctalDigit(char x) {
  return !(x < '0' || x > '7');
}
45 | | |
46 | | // Returns true when following conditions are met: |
47 | | // - <closing_str> is a suffix of <source>. |
48 | | // - No other unescaped occurrence of <closing_str> inside <source> (apart from |
49 | | // being a suffix). |
50 | | // Returns false otherwise. If <error> is non-NULL, returns an error message in |
51 | | // <error>. If <error_offset> is non-NULL, returns the offset in <source> that |
52 | | // corresponds to the location of the error. |
53 | | bool CheckForClosingString(absl::string_view source, |
54 | 395k | absl::string_view closing_str, std::string* error) { |
55 | 395k | if (closing_str.empty()) return true; |
56 | | |
57 | 395k | const char* p = source.data(); |
58 | 395k | const char* end = p + source.size(); |
59 | | |
60 | 395k | bool is_closed = false; |
61 | 4.22M | while (p + closing_str.length() <= end) { |
62 | 3.83M | if (*p != '\\') { |
63 | 3.81M | size_t cur_pos = p - source.data(); |
64 | 3.81M | bool is_closing = |
65 | 3.81M | absl::StartsWith(absl::ClippedSubstr(source, cur_pos), closing_str); |
66 | 3.81M | if (is_closing && p + closing_str.length() < end) { |
67 | 0 | if (error) { |
68 | 0 | *error = |
69 | 0 | absl::StrCat("String cannot contain unescaped ", closing_str); |
70 | 0 | } |
71 | 0 | return false; |
72 | 0 | } |
73 | 3.81M | is_closed = is_closing && (p + closing_str.length() == end); |
74 | 3.81M | } else { |
75 | 15.1k | p++; // Read past the escaped character. |
76 | 15.1k | } |
77 | 3.83M | p++; |
78 | 3.83M | } |
79 | | |
80 | 395k | if (!is_closed) { |
81 | 780 | if (error) { |
82 | 780 | *error = absl::StrCat("String must end with ", closing_str); |
83 | 780 | } |
84 | 780 | return false; |
85 | 780 | } |
86 | | |
87 | 394k | return true; |
88 | 395k | } |
89 | | |
90 | | // ---------------------------------------------------------------------- |
91 | | // CUnescapeInternal() |
92 | | // Unescapes C escape sequences and is the reverse of CEscape(). |
93 | | // |
94 | | // If 'source' is valid, stores the unescaped string and its size in |
95 | | // 'dest' and 'dest_len' respectively, and returns true. Otherwise |
96 | | // returns false and optionally stores the error description in |
97 | | // 'error' and the error offset in 'error_offset'. If 'error' is |
98 | | // nonempty on return, 'error_offset' is in range [0, str.size()]. |
99 | | // Set 'error' and 'error_offset' to NULL to disable error reporting. |
100 | | // |
101 | | // 'dest' must point to a buffer that is at least as big as 'source'. The |
102 | | // unescaped string cannot grow bigger than the source string since no |
103 | | // unescaped sequence is longer than the corresponding escape sequence. |
104 | | // 'source' and 'dest' must not be the same. |
105 | | // |
106 | | // If <closing_str> is non-empty, for <source> to be valid: |
107 | | // - It must end with <closing_str>. |
108 | | // - Should not contain any other unescaped occurrence of <closing_str>. |
109 | | // ---------------------------------------------------------------------- |
110 | | bool UnescapeInternal(absl::string_view source, absl::string_view closing_str, |
111 | | bool is_raw_literal, bool is_bytes_literal, |
112 | 395k | std::string* dest, std::string* error) { |
113 | 395k | if (!CheckForClosingString(source, closing_str, error)) { |
114 | 780 | return false; |
115 | 780 | } |
116 | | |
117 | 394k | if (ABSL_PREDICT_FALSE(source.empty())) { |
118 | 0 | *dest = std::string(); |
119 | 0 | return true; |
120 | 0 | } |
121 | | |
122 | | // Strip off the closing_str from the end before unescaping. |
123 | 394k | source = source.substr(0, source.size() - closing_str.size()); |
124 | 394k | if (!is_bytes_literal) { |
125 | 392k | if (!Utf8IsValid(source)) { |
126 | 0 | if (error) { |
127 | 0 | *error = absl::StrCat("Structurally invalid UTF8 string: ", |
128 | 0 | EscapeBytes(source)); |
129 | 0 | } |
130 | 0 | return false; |
131 | 0 | } |
132 | 392k | } |
133 | | |
134 | 394k | dest->reserve(source.size()); |
135 | | |
136 | 394k | const char* p = source.data(); |
137 | 394k | const char* end = p + source.size(); |
138 | 394k | const char* last_byte = end - 1; |
139 | | |
140 | 3.61M | while (p < end) { |
141 | 3.22M | if (*p != '\\') { |
142 | 3.20M | if (*p != '\r') { |
143 | 3.20M | dest->push_back(*p++); |
144 | 3.20M | } else { |
145 | | // All types of newlines in different platforms i.e. '\r', '\n', '\r\n' |
146 | | // are replaced with '\n'. |
147 | 684 | dest->push_back('\n'); |
148 | 684 | p++; |
149 | 684 | if (p < end && *p == '\n') { |
150 | 221 | p++; |
151 | 221 | } |
152 | 684 | } |
153 | 3.20M | } else { |
154 | 14.2k | if ((p + 1) > last_byte) { |
155 | 0 | if (error) { |
156 | 0 | *error = is_raw_literal |
157 | 0 | ? "Raw literals cannot end with odd number of \\" |
158 | 0 | : is_bytes_literal ? "Bytes literal cannot end with \\" |
159 | 0 | : "String literal cannot end with \\"; |
160 | 0 | } |
161 | 0 | return false; |
162 | 0 | } |
163 | 14.2k | if (is_raw_literal) { |
164 | | // For raw literals, all escapes are valid and those characters ('\\' |
165 | | // and the escaped character) come through literally in the string. |
166 | 1.57k | dest->push_back(*p++); |
167 | 1.57k | dest->push_back(*p++); |
168 | 1.57k | continue; |
169 | 1.57k | } |
170 | | // Any error that occurs in the escape is accounted to the start of |
171 | | // the escape. |
172 | 12.6k | p++; // Read past the escape character. |
173 | | |
174 | 12.6k | switch (*p) { |
175 | 783 | case 'a': |
176 | 783 | dest->push_back('\a'); |
177 | 783 | break; |
178 | 388 | case 'b': |
179 | 388 | dest->push_back('\b'); |
180 | 388 | break; |
181 | 362 | case 'f': |
182 | 362 | dest->push_back('\f'); |
183 | 362 | break; |
184 | 197 | case 'n': |
185 | 197 | dest->push_back('\n'); |
186 | 197 | break; |
187 | 202 | case 'r': |
188 | 202 | dest->push_back('\r'); |
189 | 202 | break; |
190 | 542 | case 't': |
191 | 542 | dest->push_back('\t'); |
192 | 542 | break; |
193 | 253 | case 'v': |
194 | 253 | dest->push_back('\v'); |
195 | 253 | break; |
196 | 3.16k | case '\\': |
197 | 3.16k | dest->push_back('\\'); |
198 | 3.16k | break; |
199 | 288 | case '?': |
200 | 288 | dest->push_back('\?'); |
201 | 288 | break; // \? Who knew? |
202 | 354 | case '\'': |
203 | 354 | dest->push_back('\''); |
204 | 354 | break; |
205 | 691 | case '"': |
206 | 691 | dest->push_back('\"'); |
207 | 691 | break; |
208 | 606 | case '`': |
209 | 606 | dest->push_back('`'); |
210 | 606 | break; |
211 | 583 | case '0': |
212 | 583 | ABSL_FALLTHROUGH_INTENDED; |
213 | 783 | case '1': |
214 | 783 | ABSL_FALLTHROUGH_INTENDED; |
215 | 1.24k | case '2': |
216 | 1.24k | ABSL_FALLTHROUGH_INTENDED; |
217 | 1.53k | case '3': { |
218 | | // Octal escape '\ddd': requires exactly 3 octal digits. Note that |
219 | | // the highest valid escape sequence is '\377'. |
220 | | // For string literals, octal and hex escape sequences are interpreted |
221 | | // as unicode code points, and the related UTF8-encoded character is |
222 | | // added to the destination. For bytes literals, octal and hex |
223 | | // escape sequences are interpreted as a single byte value. |
224 | 1.53k | const char* octal_start = p; |
225 | 1.53k | if (p + 2 >= end) { |
226 | 0 | if (error) { |
227 | 0 | *error = |
228 | 0 | "Illegal escape sequence: Octal escape must be followed by 3 " |
229 | 0 | "octal digits but saw: \\" + |
230 | 0 | std::string(octal_start, end - p); |
231 | 0 | } |
232 | | // Error offset was set to the start of the escape above the switch. |
233 | 0 | return false; |
234 | 0 | } |
235 | 1.53k | const char* octal_end = p + 2; |
236 | 1.53k | char32_t ch = 0; |
237 | 6.13k | for (; p <= octal_end; ++p) { |
238 | 4.59k | if (IsOctalDigit(*p)) { |
239 | 4.59k | ch = ch * 8 + *p - '0'; |
240 | 4.59k | } else { |
241 | 0 | if (error) { |
242 | 0 | *error = |
243 | 0 | "Illegal escape sequence: Octal escape must be followed by " |
244 | 0 | "3 octal digits but saw: \\" + |
245 | 0 | std::string(octal_start, 3); |
246 | 0 | } |
247 | | // Error offset was set to the start of the escape above the |
248 | | // switch. |
249 | 0 | return false; |
250 | 0 | } |
251 | 4.59k | } |
252 | 1.53k | p = octal_end; // p points at last digit. |
253 | 1.53k | if (is_bytes_literal) { |
254 | 439 | dest->push_back(static_cast<char>(ch)); |
255 | 1.09k | } else { |
256 | 1.09k | Utf8Encode(*dest, ch); |
257 | 1.09k | } |
258 | 1.53k | break; |
259 | 1.53k | } |
260 | 218 | case 'x': |
261 | 218 | ABSL_FALLTHROUGH_INTENDED; |
262 | 914 | case 'X': { |
263 | | // Hex escape '\xhh': requires exactly 2 hex digits. |
264 | | // For string literals, octal and hex escape sequences are |
265 | | // interpreted as unicode code points, and the related UTF8-encoded |
266 | | // character is added to the destination. For bytes literals, octal |
267 | | // and hex escape sequences are interpreted as a single byte value. |
268 | 914 | const char* hex_start = p; |
269 | 914 | if (p + 2 >= end) { |
270 | 0 | if (error) { |
271 | 0 | *error = |
272 | 0 | "Illegal escape sequence: Hex escape must be followed by 2 " |
273 | 0 | "hex digits but saw: \\" + |
274 | 0 | std::string(hex_start, end - p); |
275 | 0 | } |
276 | | // Error offset was set to the start of the escape above the switch. |
277 | 0 | return false; |
278 | 0 | } |
279 | 914 | char32_t ch = 0; |
280 | 914 | const char* hex_end = p + 2; |
281 | 2.74k | for (++p; p <= hex_end; ++p) { |
282 | 1.82k | if (absl::ascii_isxdigit(*p)) { |
283 | 1.82k | ch = (ch << 4) + HexDigitToInt(*p); |
284 | 1.82k | } else { |
285 | 0 | if (error) { |
286 | 0 | *error = |
287 | 0 | "Illegal escape sequence: Hex escape must be followed by 2 " |
288 | 0 | "hex digits but saw: \\" + |
289 | 0 | std::string(hex_start, 3); |
290 | 0 | } |
291 | | // Error offset was set to the start of the escape above the |
292 | | // switch. |
293 | 0 | return false; |
294 | 0 | } |
295 | 1.82k | } |
296 | 914 | p = hex_end; // p points at last digit. |
297 | 914 | if (is_bytes_literal) { |
298 | 288 | dest->push_back(static_cast<char>(ch)); |
299 | 626 | } else { |
300 | 626 | Utf8Encode(*dest, ch); |
301 | 626 | } |
302 | 914 | break; |
303 | 914 | } |
304 | 1.46k | case 'u': { |
305 | 1.46k | if (is_bytes_literal) { |
306 | 495 | if (error) { |
307 | 495 | *error = |
308 | 495 | std::string( |
309 | 495 | "Illegal escape sequence: Unicode escape sequence \\") + |
310 | 495 | *p + " cannot be used in bytes literals"; |
311 | 495 | } |
312 | | // Error offset was set to the start of the escape above the switch. |
313 | 495 | return false; |
314 | 495 | } |
315 | | // \uhhhh => Read 4 hex digits as a code point, |
316 | | // then write it as UTF-8 bytes. |
317 | 967 | char32_t cp = 0; |
318 | 967 | const char* hex_start = p; |
319 | 967 | if (p + 4 >= end) { |
320 | 0 | if (error) { |
321 | 0 | *error = |
322 | 0 | "Illegal escape sequence: \\u must be followed by 4 hex " |
323 | 0 | "digits but saw: \\" + |
324 | 0 | std::string(hex_start, end - p); |
325 | 0 | } |
326 | | // Error offset was set to the start of the escape above the switch. |
327 | 0 | return false; |
328 | 0 | } |
329 | 4.83k | for (int i = 0; i < 4; ++i) { |
330 | | // Look one char ahead. |
331 | 3.86k | if (absl::ascii_isxdigit(p[1])) { |
332 | 3.86k | cp = (cp << 4) + HexDigitToInt(*++p); // Advance p. |
333 | 3.86k | } else { |
334 | 0 | if (error) { |
335 | 0 | *error = |
336 | 0 | "Illegal escape sequence: \\u must be followed by 4 " |
337 | 0 | "hex digits but saw: \\" + |
338 | 0 | std::string(hex_start, 5); |
339 | 0 | } |
340 | | // Error offset was set to the start of the escape above the |
341 | | // switch. |
342 | 0 | return false; |
343 | 0 | } |
344 | 3.86k | } |
345 | 967 | if (!UnicodeIsValid(cp)) { |
346 | 170 | if (error) { |
347 | 170 | *error = "Illegal escape sequence: Unicode value \\" + |
348 | 170 | std::string(hex_start, 5) + " is invalid"; |
349 | 170 | } |
350 | | // Error offset was set to the start of the escape above the switch. |
351 | 170 | return false; |
352 | 170 | } |
353 | 797 | Utf8Encode(*dest, cp); |
354 | 797 | break; |
355 | 967 | } |
356 | 939 | case 'U': { |
357 | 939 | if (is_bytes_literal) { |
358 | 229 | if (error) { |
359 | 229 | *error = |
360 | 229 | std::string( |
361 | 229 | "Illegal escape sequence: Unicode escape sequence \\") + |
362 | 229 | *p + " cannot be used in bytes literals"; |
363 | 229 | } |
364 | 229 | return false; |
365 | 229 | } |
366 | | // \Uhhhhhhhh => convert 8 hex digits to UTF-8. Note that the |
367 | | // first two digits must be 00: The valid range is |
368 | | // '\U00000000' to '\U0010FFFF' (excluding surrogates). |
369 | 710 | char32_t cp = 0; |
370 | 710 | const char* hex_start = p; |
371 | 710 | if (p + 8 >= end) { |
372 | 0 | if (error) { |
373 | 0 | *error = |
374 | 0 | "Illegal escape sequence: \\U must be followed by 8 hex " |
375 | 0 | "digits but saw: \\" + |
376 | 0 | std::string(hex_start, end - p); |
377 | 0 | } |
378 | | // Error offset was set to the start of the escape above the switch. |
379 | 0 | return false; |
380 | 0 | } |
381 | 5.97k | for (int i = 0; i < 8; ++i) { |
382 | | // Look one char ahead. |
383 | 5.48k | if (absl::ascii_isxdigit(p[1])) { |
384 | 5.48k | cp = (cp << 4) + HexDigitToInt(*++p); |
385 | 5.48k | if (cp > 0x10FFFF) { |
386 | 216 | if (error) { |
387 | 216 | *error = "Illegal escape sequence: Value of \\" + |
388 | 216 | std::string(hex_start, 9) + |
389 | 216 | " exceeds Unicode limit (0x0010FFFF)"; |
390 | 216 | } |
391 | | // Error offset was set to the start of the escape above the |
392 | | // switch. |
393 | 216 | return false; |
394 | 216 | } |
395 | 5.48k | } else { |
396 | 0 | if (error) { |
397 | 0 | *error = |
398 | 0 | "Illegal escape sequence: \\U must be followed by 8 " |
399 | 0 | "hex digits but saw: \\" + |
400 | 0 | std::string(hex_start, 9); |
401 | 0 | } |
402 | | // Error offset was set to the start of the escape above the |
403 | | // switch. |
404 | 0 | return false; |
405 | 0 | } |
406 | 5.48k | } |
407 | 494 | if (!UnicodeIsValid(cp)) { |
408 | 99 | if (error) { |
409 | 99 | *error = "Illegal escape sequence: Unicode value \\" + |
410 | 99 | std::string(hex_start, 9) + " is invalid"; |
411 | 99 | } |
412 | | // Error offset was set to the start of the escape above the switch. |
413 | 99 | return false; |
414 | 99 | } |
415 | 395 | Utf8Encode(*dest, cp); |
416 | 395 | break; |
417 | 494 | } |
418 | 0 | case '\r': |
419 | 0 | ABSL_FALLTHROUGH_INTENDED; |
420 | 0 | case '\n': { |
421 | 0 | if (error) { |
422 | 0 | *error = "Illegal escaped newline"; |
423 | 0 | } |
424 | | // Error offset was set to the start of the escape above the switch. |
425 | 0 | return false; |
426 | 0 | } |
427 | 0 | default: { |
428 | 0 | if (error) { |
429 | 0 | *error = std::string("Illegal escape sequence: \\") + *p; |
430 | 0 | } |
431 | | // Error offset was set to the start of the escape above the switch. |
432 | 0 | return false; |
433 | 0 | } |
434 | 12.6k | } |
435 | 11.4k | p++; // read past letter we escaped |
436 | 11.4k | } |
437 | 3.22M | } |
438 | | |
439 | 393k | dest->shrink_to_fit(); |
440 | | |
441 | 393k | return true; |
442 | 394k | } |
443 | | |
444 | | std::string EscapeInternal(absl::string_view src, bool escape_all_bytes, |
445 | 0 | char escape_quote_char) { |
446 | 0 | std::string dest; |
447 | | // Worst case size is every byte has to be hex escaped, so 4 char for every |
448 | | // byte. |
449 | 0 | dest.reserve(src.size() * 4); |
450 | 0 | bool last_hex_escape = false; // true if last output char was \xNN. |
451 | 0 | const char* p = src.data(); |
452 | 0 | const char* end = p + src.size(); |
453 | 0 | for (; p < end; ++p) { |
454 | 0 | unsigned char c = static_cast<unsigned char>(*p); |
455 | 0 | bool is_hex_escape = false; |
456 | 0 | switch (c) { |
457 | 0 | case '\n': |
458 | 0 | dest.append("\\n"); |
459 | 0 | break; |
460 | 0 | case '\r': |
461 | 0 | dest.append("\\r"); |
462 | 0 | break; |
463 | 0 | case '\t': |
464 | 0 | dest.append("\\t"); |
465 | 0 | break; |
466 | 0 | case '\\': |
467 | 0 | dest.append("\\\\"); |
468 | 0 | break; |
469 | 0 | case '\'': |
470 | 0 | ABSL_FALLTHROUGH_INTENDED; |
471 | 0 | case '\"': |
472 | 0 | ABSL_FALLTHROUGH_INTENDED; |
473 | 0 | case '`': |
474 | | // Escape only quote chars that match escape_quote_char. |
475 | 0 | if (escape_quote_char == 0 || c == escape_quote_char) { |
476 | 0 | dest.push_back('\\'); |
477 | 0 | } |
478 | 0 | dest.push_back(c); |
479 | 0 | break; |
480 | 0 | default: |
481 | | // Note that if we emit \xNN and the src character after that is a hex |
482 | | // digit then that digit must be escaped too to prevent it being |
483 | | // interpreted as part of the character code by C. |
484 | 0 | if ((!escape_all_bytes || c < 0x80) && |
485 | 0 | (!absl::ascii_isprint(c) || |
486 | 0 | (last_hex_escape && absl::ascii_isxdigit(c)))) { |
487 | 0 | dest.append("\\x"); |
488 | 0 | dest.push_back(kHexTable[c / 16]); |
489 | 0 | dest.push_back(kHexTable[c % 16]); |
490 | 0 | is_hex_escape = true; |
491 | 0 | } else { |
492 | 0 | dest.push_back(c); |
493 | 0 | break; |
494 | 0 | } |
495 | 0 | } |
496 | 0 | last_hex_escape = is_hex_escape; |
497 | 0 | } |
498 | 0 | dest.shrink_to_fit(); |
499 | 0 | return dest; |
500 | 0 | } |
501 | | |
502 | 395k | bool MayBeTripleQuotedString(absl::string_view str) { |
503 | 395k | return (str.size() >= 6 && |
504 | 3.91k | ((absl::StartsWith(str, "\"\"\"") && absl::EndsWith(str, "\"\"\"")) || |
505 | 3.75k | (absl::StartsWith(str, "'''") && absl::EndsWith(str, "'''")))); |
506 | 395k | } |
507 | | |
508 | 392k | bool MayBeStringLiteral(absl::string_view str) { |
509 | 392k | return (str.size() >= 2 && str[0] == str[str.size() - 1] && |
510 | 392k | (str[0] == '\'' || str[0] == '"')); |
511 | 392k | } |
512 | | |
513 | 2.35k | bool MayBeBytesLiteral(absl::string_view str) { |
514 | 2.35k | return (str.size() >= 3 && absl::StartsWithIgnoreCase(str, "b") && |
515 | 2.35k | str[1] == str[str.size() - 1] && (str[1] == '\'' || str[1] == '"')); |
516 | 2.35k | } |
517 | | |
518 | 392k | bool MayBeRawStringLiteral(absl::string_view str) { |
519 | 392k | return (str.size() >= 3 && absl::StartsWithIgnoreCase(str, "r") && |
520 | 628 | str[1] == str[str.size() - 1] && (str[1] == '\'' || str[1] == '"')); |
521 | 392k | } |
522 | | |
523 | 2.35k | bool MayBeRawBytesLiteral(absl::string_view str) { |
524 | 2.35k | return (str.size() >= 4 && |
525 | 1.98k | (absl::StartsWithIgnoreCase(str, "rb") || |
526 | 1.98k | absl::StartsWithIgnoreCase(str, "br")) && |
527 | 651 | (str[2] == str[str.size() - 1]) && (str[2] == '\'' || str[2] == '"')); |
528 | 2.35k | } |
529 | | |
530 | | } // namespace |
531 | | |
532 | 0 | absl::StatusOr<std::string> UnescapeString(absl::string_view str) { |
533 | 0 | std::string out; |
534 | 0 | std::string error; |
535 | 0 | if (!UnescapeInternal(str, "", false, false, &out, &error)) { |
536 | 0 | return absl::InvalidArgumentError( |
537 | 0 | absl::StrCat("Invalid escaped string: ", error)); |
538 | 0 | } |
539 | 0 | return out; |
540 | 0 | } |
541 | | |
542 | 0 | absl::StatusOr<std::string> UnescapeBytes(absl::string_view str) { |
543 | 0 | std::string out; |
544 | 0 | std::string error; |
545 | 0 | if (!UnescapeInternal(str, "", false, true, &out, &error)) { |
546 | 0 | return absl::InvalidArgumentError( |
547 | 0 | absl::StrCat("Invalid escaped bytes: ", error)); |
548 | 0 | } |
549 | 0 | return out; |
550 | 0 | } |
551 | | |
552 | 0 | std::string EscapeString(absl::string_view str) { |
553 | 0 | return EscapeInternal(str, true, '\0'); |
554 | 0 | } |
555 | | |
556 | | std::string EscapeBytes(absl::string_view str, bool escape_all_bytes, |
557 | 0 | char escape_quote_char) { |
558 | 0 | std::string escaped_bytes; |
559 | 0 | const char* p = str.data(); |
560 | 0 | const char* end = p + str.size(); |
561 | 0 | for (; p < end; ++p) { |
562 | 0 | unsigned char c = *p; |
563 | 0 | if (escape_all_bytes || !absl::ascii_isprint(c)) { |
564 | 0 | escaped_bytes += "\\x"; |
565 | 0 | escaped_bytes += absl::BytesToHexString(absl::string_view(p, 1)); |
566 | 0 | } else { |
567 | 0 | switch (c) { |
568 | | // Note that we only handle printable escape characters here. All |
569 | | // unprintable (\n, \r, \t, etc.) are hex escaped above. |
570 | 0 | case '\\': |
571 | 0 | escaped_bytes += "\\\\"; |
572 | 0 | break; |
573 | 0 | case '\'': |
574 | 0 | case '"': |
575 | 0 | case '`': |
576 | | // Escape only quote chars that match escape_quote_char. |
577 | 0 | if (escape_quote_char == 0 || c == escape_quote_char) { |
578 | 0 | escaped_bytes += '\\'; |
579 | 0 | } |
580 | 0 | escaped_bytes += c; |
581 | 0 | break; |
582 | 0 | default: |
583 | 0 | escaped_bytes += c; |
584 | 0 | break; |
585 | 0 | } |
586 | 0 | } |
587 | 0 | } |
588 | 0 | return escaped_bytes; |
589 | 0 | } |
590 | | |
591 | 392k | absl::StatusOr<std::string> ParseStringLiteral(absl::string_view str) { |
592 | 392k | std::string out; |
593 | 392k | bool is_string_literal = MayBeStringLiteral(str); |
594 | 392k | bool is_raw_string_literal = MayBeRawStringLiteral(str); |
595 | 392k | if (!is_string_literal && !is_raw_string_literal) { |
596 | 0 | return absl::InvalidArgumentError("Invalid string literal"); |
597 | 0 | } |
598 | | |
599 | 392k | absl::string_view copy_str = str; |
600 | 392k | if (is_raw_string_literal) { |
601 | | // Strip off the prefix 'r' from the raw string content before parsing. |
602 | 628 | copy_str = absl::ClippedSubstr(copy_str, 1); |
603 | 628 | } |
604 | | |
605 | 392k | bool is_triple_quoted = MayBeTripleQuotedString(copy_str); |
606 | | // Starts after the opening quotes {""", '''} or {", '}. |
607 | 392k | int quotes_length = is_triple_quoted ? 3 : 1; |
608 | 392k | absl::string_view quotes = copy_str.substr(0, quotes_length); |
609 | 392k | copy_str = absl::ClippedSubstr(copy_str, quotes_length); |
610 | 392k | std::string error; |
611 | 392k | if (!UnescapeInternal(copy_str, quotes, is_raw_string_literal, false, &out, |
612 | 392k | &error)) { |
613 | 987 | return absl::InvalidArgumentError( |
614 | 987 | absl::StrCat("Invalid string literal: ", error)); |
615 | 987 | } |
616 | 391k | return out; |
617 | 392k | } |
618 | | |
619 | 2.35k | absl::StatusOr<std::string> ParseBytesLiteral(absl::string_view str) { |
620 | 2.35k | std::string out; |
621 | 2.35k | bool is_bytes_literal = MayBeBytesLiteral(str); |
622 | 2.35k | bool is_raw_bytes_literal = MayBeRawBytesLiteral(str); |
623 | 2.35k | if (!is_bytes_literal && !is_raw_bytes_literal) { |
624 | 0 | return absl::InvalidArgumentError("Invalid bytes literal"); |
625 | 0 | } |
626 | | |
627 | 2.35k | absl::string_view copy_str = str; |
628 | 2.35k | if (is_raw_bytes_literal) { |
629 | | // Strip off the prefix {"rb", "br"} from the raw bytes content before |
630 | 651 | copy_str = absl::ClippedSubstr(copy_str, 2); |
631 | 1.70k | } else { |
632 | | // Strip off the prefix 'b' from the bytes content before parsing. |
633 | 1.70k | copy_str = absl::ClippedSubstr(copy_str, 1); |
634 | 1.70k | } |
635 | | |
636 | 2.35k | bool is_triple_quoted = MayBeTripleQuotedString(copy_str); |
637 | | // Starts after the opening quotes {""", '''} or {", '}. |
638 | 2.35k | int quotes_length = is_triple_quoted ? 3 : 1; |
639 | 2.35k | absl::string_view quotes = copy_str.substr(0, quotes_length); |
640 | | // Includes the closing quotes. |
641 | 2.35k | copy_str = absl::ClippedSubstr(copy_str, quotes_length); |
642 | 2.35k | std::string error; |
643 | 2.35k | if (!UnescapeInternal(copy_str, quotes, is_raw_bytes_literal, true, &out, |
644 | 2.35k | &error)) { |
645 | 1.00k | return absl::InvalidArgumentError( |
646 | 1.00k | absl::StrCat("Invalid bytes literal: ", error)); |
647 | 1.00k | } |
648 | 1.35k | return out; |
649 | 2.35k | } |
650 | | |
651 | 0 | std::string FormatStringLiteral(absl::string_view str) { |
652 | 0 | absl::string_view quote = |
653 | 0 | (str.find('"') != str.npos && str.find('\'') == str.npos) ? "'" : "\""; |
654 | 0 | return absl::StrCat(quote, EscapeInternal(str, true, quote[0]), quote); |
655 | 0 | } |
656 | | |
657 | 0 | std::string FormatStringLiteral(const absl::Cord& str) { |
658 | 0 | if (auto flat = str.TryFlat(); flat) { |
659 | 0 | return FormatStringLiteral(*flat); |
660 | 0 | } |
661 | 0 | return FormatStringLiteral(static_cast<std::string>(str)); |
662 | 0 | } |
663 | | |
664 | 0 | std::string FormatSingleQuotedStringLiteral(absl::string_view str) { |
665 | 0 | return absl::StrCat("'", EscapeInternal(str, true, '\''), "'"); |
666 | 0 | } |
667 | | |
668 | 0 | std::string FormatDoubleQuotedStringLiteral(absl::string_view str) { |
669 | 0 | return absl::StrCat("\"", EscapeInternal(str, true, '"'), "\""); |
670 | 0 | } |
671 | | |
672 | 0 | std::string FormatBytesLiteral(absl::string_view str) { |
673 | 0 | absl::string_view quote = |
674 | 0 | (str.find('"') != str.npos && str.find('\'') == str.npos) ? "'" : "\""; |
675 | 0 | return absl::StrCat("b", quote, EscapeBytes(str, false, quote[0]), quote); |
676 | 0 | } |
677 | | |
678 | 0 | std::string FormatSingleQuotedBytesLiteral(absl::string_view str) { |
679 | 0 | return absl::StrCat("b'", EscapeBytes(str, false, '\''), "'"); |
680 | 0 | } |
681 | | |
682 | 0 | std::string FormatDoubleQuotedBytesLiteral(absl::string_view str) { |
683 | 0 | return absl::StrCat("b\"", EscapeBytes(str, false, '"'), "\""); |
684 | 0 | } |
685 | | |
686 | 0 | absl::StatusOr<std::string> ParseIdentifier(absl::string_view str) { |
687 | 0 | if (!LexisIsIdentifier(str)) { |
688 | 0 | return absl::InvalidArgumentError("Invalid identifier"); |
689 | 0 | } |
690 | 0 | return std::string(str); |
691 | 0 | } |
692 | | |
693 | | } // namespace cel::internal |