/src/jsonnet/core/string_utils.cpp
Line | Count | Source (jump to first uncovered line) |
1 | | /* |
2 | | Copyright 2015 Google Inc. All rights reserved. |
3 | | |
4 | | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | you may not use this file except in compliance with the License. |
6 | | You may obtain a copy of the License at |
7 | | |
8 | | http://www.apache.org/licenses/LICENSE-2.0 |
9 | | |
10 | | Unless required by applicable law or agreed to in writing, software |
11 | | distributed under the License is distributed on an "AS IS" BASIS, |
12 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | See the License for the specific language governing permissions and |
14 | | limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <iomanip> |
18 | | |
19 | | #include "static_error.h" |
20 | | #include "string_utils.h" |
21 | | |
22 | | namespace jsonnet::internal { |
23 | | |
24 | | UString jsonnet_string_unparse(const UString &str, bool single) |
25 | 975k | { |
26 | 975k | UStringStream ss; |
27 | 975k | ss << (single ? U'\'' : U'\"'); |
28 | 975k | ss << jsonnet_string_escape(str, single); |
29 | 975k | ss << (single ? U'\'' : U'\"'); |
30 | 975k | return ss.str(); |
31 | 975k | } |
32 | | |
33 | | UString jsonnet_string_escape(const UString &str, bool single) |
34 | 975k | { |
35 | 975k | UStringStream ss; |
36 | 1.60G | for (std::size_t i = 0; i < str.length(); ++i) { |
37 | 1.60G | char32_t c = str[i]; |
38 | 1.60G | switch (c) { |
39 | 1.54M | case U'\"': ss << (single ? U"\"" : U"\\\""); break; |
40 | 2.15M | case U'\'': ss << (single ? U"\\\'" : U"\'"); break; |
41 | 1.30G | case U'\\': ss << U"\\\\"; break; |
42 | 1.08M | case U'\b': ss << U"\\b"; break; |
43 | 519k | case U'\f': ss << U"\\f"; break; |
44 | 37.4M | case U'\n': ss << U"\\n"; break; |
45 | 3.48M | case U'\r': ss << U"\\r"; break; |
46 | 1.44M | case U'\t': ss << U"\\t"; break; |
47 | 7.51k | case U'\0': ss << U"\\u0000"; break; |
48 | 254M | default: { |
49 | 254M | if (c < 0x20 || (c >= 0x7f && c <= 0x9f)) { |
50 | | // Unprintable, use \u |
51 | 78.7M | std::stringstream ss8; |
52 | 78.7M | ss8 << "\\u" << std::hex << std::setfill('0') << std::setw(4) |
53 | 78.7M | << (unsigned long)(c); |
54 | 78.7M | ss << decode_utf8(ss8.str()); |
55 | 175M | } else { |
56 | | // Printable, write verbatim |
57 | 175M | ss << c; |
58 | 175M | } |
59 | 254M | } |
60 | 1.60G | } |
61 | 1.60G | } |
62 | 975k | return ss.str(); |
63 | 975k | } |
64 | | |
65 | | unsigned long jsonnet_string_parse_unicode(const LocationRange &loc, const char32_t *c) |
66 | 54.5k | { |
67 | 54.5k | unsigned long codepoint = 0; |
68 | | // Expect 4 hex digits. |
69 | 272k | for (unsigned i = 0; i < 4; ++i) { |
70 | 218k | auto x = (unsigned char)(c[i]); |
71 | 218k | unsigned digit; |
72 | 218k | if (x == '\0') { |
73 | 58 | auto msg = "Truncated unicode escape sequence in string literal."; |
74 | 58 | throw StaticError(loc, msg); |
75 | 217k | } else if (x >= '0' && x <= '9') { |
76 | 173k | digit = x - '0'; |
77 | 173k | } else if (x >= 'a' && x <= 'f') { |
78 | 16.3k | digit = x - 'a' + 10; |
79 | 28.5k | } else if (x >= 'A' && x <= 'F') { |
80 | 28.3k | digit = x - 'A' + 10; |
81 | 28.3k | } else { |
82 | 174 | std::stringstream ss; |
83 | 174 | ss << "Malformed unicode escape character, " |
84 | 174 | << "should be hex: '" << x << "'"; |
85 | 174 | throw StaticError(loc, ss.str()); |
86 | 174 | } |
87 | 217k | codepoint *= 16; |
88 | 217k | codepoint += digit; |
89 | 217k | } |
90 | 54.3k | return codepoint; |
91 | 54.5k | } |
92 | | |
/** Tell whether a value can stand alone as a character from a single \uXXXX escape.
 *
 * \param codepoint The value to classify.
 * \returns true for values below 0x10000 that are outside the surrogate range 0xd800..0xdfff;
 *          false for surrogates and for anything at or above 0x10000.
 */
bool is_bmp_codepoint(const unsigned long codepoint)
{
    if (codepoint >= 0x10000) {
        return false;
    }
    const bool is_surrogate = codepoint >= 0xd800 && codepoint <= 0xdfff;
    return !is_surrogate;
}
97 | | |
98 | | char32_t decode_utf16_surrogates(const LocationRange &loc, const unsigned long high, const unsigned long low) |
99 | 2.18k | { |
100 | 2.18k | if (high >= 0xd800 && high < 0xdc00 && low >= 0xdc00 && low < 0xe000) { |
101 | 2.14k | return 0x10000 + ((high & 0x03ff) << 10) + (low & 0x03ff); |
102 | 2.14k | } else { |
103 | 48 | std::stringstream ss; |
104 | 48 | ss << "Invalid UTF-16 bytes"; |
105 | 48 | throw StaticError(loc, ss.str()); |
106 | 48 | } |
107 | 2.18k | } |
108 | | |
109 | | UString jsonnet_string_unescape(const LocationRange &loc, const UString &s) |
110 | 22.4M | { |
111 | 22.4M | UString r; |
112 | 22.4M | const char32_t *s_ptr = s.c_str(); |
113 | 290M | for (const char32_t *c = s_ptr; *c != U'\0'; ++c) { |
114 | 267M | switch (*c) { |
115 | 1.09M | case '\\': |
116 | 1.09M | switch (*(++c)) { |
117 | 42.0k | case '"': |
118 | 44.7k | case '\'': r += *c; break; |
119 | | |
120 | 270k | case '\\': r += *c; break; |
121 | | |
122 | 5.38k | case '/': r += *c; break; |
123 | | |
124 | 26.6k | case 'b': r += '\b'; break; |
125 | | |
126 | 62.6k | case 'f': r += '\f'; break; |
127 | | |
128 | 516k | case 'n': r += '\n'; break; |
129 | | |
130 | 39.2k | case 'r': r += '\r'; break; |
131 | | |
132 | 78.0k | case 't': r += '\t'; break; |
133 | | |
134 | 52.3k | case 'u': { |
135 | 52.3k | ++c; // Consume the 'u'. |
136 | 52.3k | unsigned long codepoint = jsonnet_string_parse_unicode(loc, c); |
137 | | |
138 | | // Leave us on the last char, ready for the ++c at |
139 | | // the outer for loop. |
140 | 52.3k | c += 3; |
141 | 52.3k | if (!is_bmp_codepoint(codepoint)) { |
142 | 2.27k | if (*(++c) != '\\') { |
143 | 53 | std::stringstream ss; |
144 | 53 | ss << "Invalid non-BMP Unicode escape in string literal"; |
145 | 53 | throw StaticError(loc, ss.str()); |
146 | 53 | } |
147 | 2.22k | if (*(++c) != 'u') { |
148 | 12 | std::stringstream ss; |
149 | 12 | ss << "Invalid non-BMP Unicode escape in string literal"; |
150 | 12 | throw StaticError(loc, ss.str()); |
151 | 12 | } |
152 | 2.21k | ++c; |
153 | 2.21k | unsigned long codepoint2 = jsonnet_string_parse_unicode(loc, c); |
154 | 2.21k | c += 3; |
155 | 2.21k | codepoint = decode_utf16_surrogates(loc, codepoint, codepoint2); |
156 | 2.21k | } |
157 | 52.3k | r += codepoint; |
158 | 52.3k | } break; |
159 | | |
160 | 4 | case '\0': { |
161 | 4 | auto msg = "Truncated escape sequence in string literal."; |
162 | 4 | throw StaticError(loc, msg); |
163 | 52.3k | } |
164 | | |
165 | 308 | default: { |
166 | 308 | std::stringstream ss; |
167 | 308 | std::string utf8; |
168 | 308 | encode_utf8(*c, utf8); |
169 | 308 | ss << "Unknown escape sequence in string literal: '" << utf8 << "'"; |
170 | 308 | throw StaticError(loc, ss.str()); |
171 | 52.3k | } |
172 | 1.09M | } |
173 | 1.09M | break; |
174 | | |
175 | 266M | default: |
176 | | // Just a regular letter. |
177 | 266M | r += *c; |
178 | 267M | } |
179 | 267M | } |
180 | 22.4M | return r; |
181 | 22.4M | } |
182 | | |
183 | | } // namespace jsonnet::internal |