/src/jsonnet/core/string_utils.cpp
Line | Count | Source |
1 | | /* |
2 | | Copyright 2015 Google Inc. All rights reserved. |
3 | | |
4 | | Licensed under the Apache License, Version 2.0 (the "License"); |
5 | | you may not use this file except in compliance with the License. |
6 | | You may obtain a copy of the License at |
7 | | |
8 | | http://www.apache.org/licenses/LICENSE-2.0 |
9 | | |
10 | | Unless required by applicable law or agreed to in writing, software |
11 | | distributed under the License is distributed on an "AS IS" BASIS, |
12 | | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | | See the License for the specific language governing permissions and |
14 | | limitations under the License. |
15 | | */ |
16 | | |
17 | | #include <iomanip> |
18 | | |
19 | | #include "static_error.h" |
20 | | #include "string_utils.h" |
21 | | |
22 | | namespace jsonnet::internal { |
23 | | |
24 | | UString jsonnet_string_unparse(const UString &str, bool single) |
25 | 1.67M | { |
26 | 1.67M | UStringStream ss; |
27 | 1.67M | ss << (single ? U'\'' : U'\"'); |
28 | 1.67M | ss << jsonnet_string_escape(str, single); |
29 | 1.67M | ss << (single ? U'\'' : U'\"'); |
30 | 1.67M | return ss.str(); |
31 | 1.67M | } |
32 | | |
33 | | UString jsonnet_string_escape(const UString &str, bool single) |
34 | 1.67M | { |
35 | 1.67M | UStringStream ss; |
36 | 2.71G | for (std::size_t i = 0; i < str.length(); ++i) { |
37 | 2.71G | char32_t c = str[i]; |
38 | 2.71G | switch (c) { |
39 | 1.57M | case U'\"': ss << (single ? U"\"" : U"\\\""); break; |
40 | 1.50M | case U'\'': ss << (single ? U"\\\'" : U"\'"); break; |
41 | 2.44G | case U'\\': ss << U"\\\\"; break; |
42 | 1.62M | case U'\b': ss << U"\\b"; break; |
43 | 4.00M | case U'\f': ss << U"\\f"; break; |
44 | 23.5M | case U'\n': ss << U"\\n"; break; |
45 | 1.99M | case U'\r': ss << U"\\r"; break; |
46 | 4.32M | case U'\t': ss << U"\\t"; break; |
47 | 367k | case U'\0': ss << U"\\u0000"; break; |
48 | 232M | default: { |
49 | 232M | if (c < 0x20 || (c >= 0x7f && c <= 0x9f)) { |
50 | | // Unprintable, use \u |
51 | 46.0M | std::stringstream ss8; |
52 | 46.0M | ss8 << "\\u" << std::hex << std::setfill('0') << std::setw(4) |
53 | 46.0M | << (unsigned long)(c); |
54 | 46.0M | ss << decode_utf8(ss8.str()); |
55 | 186M | } else { |
56 | | // Printable, write verbatim |
57 | 186M | ss << c; |
58 | 186M | } |
59 | 232M | } |
60 | 2.71G | } |
61 | 2.71G | } |
62 | 1.67M | return ss.str(); |
63 | 1.67M | } |
64 | | |
65 | | unsigned long jsonnet_string_parse_unicode(const LocationRange &loc, const char32_t *c) |
66 | 76.8k | { |
67 | 76.8k | unsigned long codepoint = 0; |
68 | | // Expect 4 hex digits. |
69 | 383k | for (unsigned i = 0; i < 4; ++i) { |
70 | 306k | auto x = (unsigned char)(c[i]); |
71 | 306k | unsigned digit; |
72 | 306k | if (x == '\0') { |
73 | 72 | auto msg = "Truncated unicode escape sequence in string literal."; |
74 | 72 | throw StaticError(loc, msg); |
75 | 306k | } else if (x >= '0' && x <= '9') { |
76 | 242k | digit = x - '0'; |
77 | 242k | } else if (x >= 'a' && x <= 'f') { |
78 | 23.8k | digit = x - 'a' + 10; |
79 | 39.9k | } else if (x >= 'A' && x <= 'F') { |
80 | 39.7k | digit = x - 'A' + 10; |
81 | 39.7k | } else { |
82 | 213 | std::stringstream ss; |
83 | 213 | ss << "Malformed unicode escape character, " |
84 | 213 | << "should be hex: '" << x << "'"; |
85 | 213 | throw StaticError(loc, ss.str()); |
86 | 213 | } |
87 | 306k | codepoint *= 16; |
88 | 306k | codepoint += digit; |
89 | 306k | } |
90 | 76.5k | return codepoint; |
91 | 76.8k | } |
92 | | |
/** Test whether a value is a Basic Multilingual Plane code point that is not
 * a UTF-16 surrogate.
 *
 * \param codepoint The value to test.
 * \returns true for values in [0, 0xd800) or [0xe000, 0x10000); false for
 *          surrogates (0xd800-0xdfff) and for anything at or above 0x10000.
 */
bool is_bmp_codepoint(const unsigned long codepoint)
{
    if (codepoint >= 0x10000) {
        return false;
    }
    const bool is_surrogate = codepoint >= 0xd800 && codepoint <= 0xdfff;
    return !is_surrogate;
}
97 | | |
98 | | char32_t decode_utf16_surrogates(const LocationRange &loc, const unsigned long high, const unsigned long low) |
99 | 1.06k | { |
100 | 1.06k | if (high >= 0xd800 && high < 0xdc00 && low >= 0xdc00 && low < 0xe000) { |
101 | 1.02k | return 0x10000 + ((high & 0x03ff) << 10) + (low & 0x03ff); |
102 | 1.02k | } else { |
103 | 44 | std::stringstream ss; |
104 | 44 | ss << "Invalid UTF-16 bytes"; |
105 | 44 | throw StaticError(loc, ss.str()); |
106 | 44 | } |
107 | 1.06k | } |
108 | | |
109 | | UString jsonnet_string_unescape(const LocationRange &loc, const UString &s) |
110 | 10.9M | { |
111 | 10.9M | UString r; |
112 | 10.9M | const char32_t *s_ptr = s.c_str(); |
113 | 187M | for (const char32_t *c = s_ptr; *c != U'\0'; ++c) { |
114 | 176M | switch (*c) { |
115 | 1.32M | case '\\': |
116 | 1.32M | switch (*(++c)) { |
117 | 50.7k | case '"': |
118 | 75.9k | case '\'': r += *c; break; |
119 | | |
120 | 273k | case '\\': r += *c; break; |
121 | | |
122 | 7.79k | case '/': r += *c; break; |
123 | | |
124 | 33.3k | case 'b': r += '\b'; break; |
125 | | |
126 | 84.4k | case 'f': r += '\f'; break; |
127 | | |
128 | 644k | case 'n': r += '\n'; break; |
129 | | |
130 | 46.8k | case 'r': r += '\r'; break; |
131 | | |
132 | 78.9k | case 't': r += '\t'; break; |
133 | | |
134 | 75.7k | case 'u': { |
135 | 75.7k | ++c; // Consume the 'u'. |
136 | 75.7k | unsigned long codepoint = jsonnet_string_parse_unicode(loc, c); |
137 | | |
138 | | // Leave us on the last char, ready for the ++c at |
139 | | // the outer for loop. |
140 | 75.7k | c += 3; |
141 | 75.7k | if (!is_bmp_codepoint(codepoint)) { |
142 | 1.15k | if (*(++c) != '\\') { |
143 | 51 | std::stringstream ss; |
144 | 51 | ss << "Invalid non-BMP Unicode escape in string literal"; |
145 | 51 | throw StaticError(loc, ss.str()); |
146 | 51 | } |
147 | 1.10k | if (*(++c) != 'u') { |
148 | 21 | std::stringstream ss; |
149 | 21 | ss << "Invalid non-BMP Unicode escape in string literal"; |
150 | 21 | throw StaticError(loc, ss.str()); |
151 | 21 | } |
152 | 1.08k | ++c; |
153 | 1.08k | unsigned long codepoint2 = jsonnet_string_parse_unicode(loc, c); |
154 | 1.08k | c += 3; |
155 | 1.08k | codepoint = decode_utf16_surrogates(loc, codepoint, codepoint2); |
156 | 1.08k | } |
157 | 75.6k | r += codepoint; |
158 | 75.6k | } break; |
159 | | |
160 | 8 | case '\0': { |
161 | 8 | auto msg = "Truncated escape sequence in string literal."; |
162 | 8 | throw StaticError(loc, msg); |
163 | 75.7k | } |
164 | | |
165 | 402 | default: { |
166 | 402 | std::stringstream ss; |
167 | 402 | std::string utf8; |
168 | 402 | encode_utf8(*c, utf8); |
169 | 402 | ss << "Unknown escape sequence in string literal: '" << utf8 << "'"; |
170 | 402 | throw StaticError(loc, ss.str()); |
171 | 75.7k | } |
172 | 1.32M | } |
173 | 1.32M | break; |
174 | | |
175 | 175M | default: |
176 | | // Just a regular letter. |
177 | 175M | r += *c; |
178 | 176M | } |
179 | 176M | } |
180 | 10.9M | return r; |
181 | 10.9M | } |
182 | | |
183 | | } // namespace jsonnet::internal |