/src/hermes/lib/VM/JSLib/escape.cpp
Line | Count | Source |
1 | | /* |
2 | | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | | * |
4 | | * This source code is licensed under the MIT license found in the |
5 | | * LICENSE file in the root directory of this source tree. |
6 | | */ |
7 | | |
8 | | #include "JSLibInternal.h" |
9 | | |
10 | | #include "hermes/Support/UTF8.h" |
11 | | #include "hermes/VM/Operations.h" |
12 | | #include "hermes/VM/SmallXString.h" |
13 | | #include "hermes/VM/StringView.h" |
14 | | |
15 | | #include "llvh/Support/ConvertUTF.h" |
16 | | #pragma GCC diagnostic push |
17 | | |
18 | | #ifdef HERMES_COMPILER_SUPPORTS_WSHORTEN_64_TO_32 |
19 | | #pragma GCC diagnostic ignored "-Wshorten-64-to-32" |
20 | | #endif |
21 | | namespace hermes { |
22 | | namespace vm { |
23 | | |
24 | | using llvh::ConversionResult; |
25 | | using llvh::UTF32; |
26 | | using llvh::UTF8; |
27 | | |
/// Membership test for the set of code units that escape() copies through
/// unchanged.
/// \return true if \p c is an ASCII letter, an ASCII digit, or one of the
///   punctuation characters @ * _ + - . /
static inline bool noEscape(char16_t c) {
  // Punctuation that is passed through verbatim.
  switch (c) {
    case u'@':
    case u'*':
    case u'_':
    case u'+':
    case u'-':
    case u'.':
    case u'/':
      return true;
    default:
      break;
  }
  const bool isUpper = c >= u'A' && c <= u'Z';
  const bool isLower = c >= u'a' && c <= u'z';
  const bool isDigit = c >= u'0' && c <= u'9';
  return isUpper || isLower || isDigit;
}
34 | | |
/// Map a 4-bit value to its uppercase hexadecimal digit.
/// \param x must be between 0 and 15 inclusive (asserted).
/// \return u'0'..u'9' for 0..9 and u'A'..u'F' for 10..15.
static inline char16_t toHexChar(int x) {
  assert(0 <= x && x <= 15 && "toHexChar argument out of bounds");
  const bool isDecimalDigit = 0 <= x && x <= 9;
  return isDecimalDigit ? x + u'0' : x - 10 + u'A';
}
44 | | |
/// \return true if \p c is a hexadecimal digit, i.e. in [0-9a-fA-F].
static inline bool isHexChar(char16_t c) {
  if (u'0' <= c && c <= u'9') {
    return true;
  }
  // Setting bit 5 maps 'A'-'F' onto 'a'-'f' and leaves 'a'-'f' untouched, so
  // a single range check covers both cases.
  const char16_t folded = c | 32;
  return folded >= u'a' && folded <= 'f';
}
51 | | |
52 | | /// \param c must be a hex char. |
53 | | /// \return the result of converting c into a number (result is 8 bits). |
54 | 0 | static inline int fromHexChar(char16_t c) { |
55 | 0 | assert(isHexChar(c) && "fromHexChar argument out of bounds"); |
56 | 0 | if (u'0' <= c && c <= u'9') { |
57 | 0 | return c - u'0'; |
58 | 0 | } |
59 | | // Convert to lowercase. |
60 | 0 | c |= 32; |
61 | 0 | return c - u'a' + 10; |
62 | 0 | } |
63 | | |
64 | | /// Convert the argument to string and escape unicode characters. |
65 | 0 | CallResult<HermesValue> escape(void *, Runtime &runtime, NativeArgs args) { |
66 | 0 | auto res = toString_RJS(runtime, args.getArgHandle(0)); |
67 | 0 | if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) { |
68 | 0 | return ExecutionStatus::EXCEPTION; |
69 | 0 | } |
70 | 0 | auto string = runtime.makeHandle(std::move(*res)); |
71 | 0 | auto len = string->getStringLength(); |
72 | 0 | SmallU16String<32> R{}; |
73 | 0 | R.reserve(len); |
74 | |
|
75 | 0 | for (char16_t c : StringPrimitive::createStringView(runtime, string)) { |
76 | 0 | if (noEscape(c)) { |
77 | | // Just append. |
78 | 0 | R.push_back(c); |
79 | 0 | } else if (c < 256) { |
80 | | // R += "%xy" where xy is the 2 bytes of c. |
81 | 0 | R.push_back(u'%'); |
82 | 0 | R.push_back(toHexChar((c >> 4) & 0xf)); |
83 | 0 | R.push_back(toHexChar(c & 0xf)); |
84 | 0 | } else { |
85 | | // R += "%uwxyz" where wxyz is the 4 bytes of c. |
86 | 0 | R.append(u"%u"); |
87 | 0 | R.push_back(toHexChar((c >> 12) & 0xf)); |
88 | 0 | R.push_back(toHexChar((c >> 8) & 0xf)); |
89 | 0 | R.push_back(toHexChar((c >> 4) & 0xf)); |
90 | 0 | R.push_back(toHexChar(c & 0xf)); |
91 | 0 | } |
92 | 0 | } |
93 | |
|
94 | 0 | return StringPrimitive::create(runtime, R); |
95 | 0 | } |
96 | | |
97 | | /// Convert the argument to string and unescape unicode characters. |
98 | 0 | CallResult<HermesValue> unescape(void *, Runtime &runtime, NativeArgs args) { |
99 | 0 | auto res = toString_RJS(runtime, args.getArgHandle(0)); |
100 | 0 | if (LLVM_UNLIKELY(res == ExecutionStatus::EXCEPTION)) { |
101 | 0 | return ExecutionStatus::EXCEPTION; |
102 | 0 | } |
103 | 0 | auto strPrim = runtime.makeHandle(std::move(*res)); |
104 | 0 | auto len = strPrim->getStringLength(); |
105 | 0 | SmallU16String<32> R{}; |
106 | 0 | R.reserve(len); |
107 | |
|
108 | 0 | uint32_t k = 0; |
109 | 0 | auto str = StringPrimitive::createStringView(runtime, strPrim); |
110 | 0 | while (k < len) { |
111 | 0 | char16_t c = str[k]; |
112 | | // Resultant char to append to R. |
113 | 0 | char16_t r = c; |
114 | 0 | if (c == u'%') { |
115 | | // Try to read a hex string instead. |
116 | 0 | if (k + 6 <= len && str[k + 1] == u'u' && |
117 | 0 | std::all_of(str.begin() + k + 2, str.begin() + k + 6, isHexChar)) { |
118 | | // Long form %uwxyz |
119 | 0 | r = (fromHexChar(str[k + 2]) << 12) | (fromHexChar(str[k + 3]) << 8) | |
120 | 0 | (fromHexChar(str[k + 4]) << 4) | fromHexChar(str[k + 5]); |
121 | 0 | k += 5; |
122 | 0 | } else if ( |
123 | 0 | k + 3 <= len && isHexChar(str[k + 1]) && isHexChar(str[k + 2])) { |
124 | | // Short form %xy |
125 | 0 | r = (fromHexChar(str[k + 1]) << 4) | fromHexChar(str[k + 2]); |
126 | 0 | k += 2; |
127 | 0 | } |
128 | 0 | } |
129 | 0 | R.push_back(r); |
130 | 0 | ++k; |
131 | 0 | } |
132 | |
|
133 | 0 | return StringPrimitive::create(runtime, R); |
134 | 0 | } |
135 | | |
136 | | /// Removes one character from the end of \p str. |
137 | | /// Used to remove the null terminator when UTF16Ref is constructed from |
138 | | /// literals. |
139 | 2.99M | static inline UTF16Ref removeNullTerminator(const UTF16Ref str) { |
140 | 2.99M | return str.slice(0, str.size() - 1); |
141 | 2.99M | } |
142 | | |
143 | | /// Function used in place of a set to indicate if \p c is in the unescaped set. |
144 | | using CharSetFn = bool (*)(char16_t c); |
145 | | |
/// Is a member of uriUnescaped: one of the marks -_.!~*'() or an ASCII
/// letter or digit.
static bool uriUnescaped(char16_t c) {
  // uriMark characters.
  switch (c) {
    case u'-':
    case u'_':
    case u'.':
    case u'!':
    case u'~':
    case u'*':
    case u'\'':
    case u'(':
    case u')':
      return true;
    default:
      break;
  }
  if (c >= u'0' && c <= u'9') {
    return true;
  }
  // Setting bit 5 folds 'A'-'Z' onto 'a'-'z', so one range check covers both
  // cases of the alphabet.
  const char16_t folded = c | 32;
  return folded >= u'a' && folded <= u'z';
}
159 | | |
160 | | /// Is a member of uriReserved. |
161 | 1.49M | static bool uriReserved(char16_t c) { |
162 | 1.49M | const UTF16Ref reserved = removeNullTerminator(u";/?:@&=+$,"); |
163 | 1.49M | return std::find(reserved.begin(), reserved.end(), c) != reserved.end(); |
164 | 1.49M | } |
165 | | |
166 | | /// Is a member of uriUnescaped plus '#', or is a member of uriReserved. |
167 | 1.49M | static bool unescapedURISet(char16_t c) { |
168 | 1.49M | return uriReserved(c) || uriUnescaped(c) || c == '#'; |
169 | 1.49M | } |
170 | | |
171 | | /// Is a member of uriReserved plus '#'. |
172 | 0 | static bool reservedURISet(char16_t c) { |
173 | 0 | return uriReserved(c) || c == '#'; |
174 | 0 | } |
175 | | |
176 | | /// ES 5.1 15.1.3 |
177 | | /// Encode abstract method, takes a string and URI encodes it. |
178 | | /// \param unescapedSet a function indicating which characters to not escape. |
179 | | static CallResult<Handle<StringPrimitive>> encode( |
180 | | Runtime &runtime, |
181 | | Handle<StringPrimitive> strHandle, |
182 | 124k | CharSetFn unescapedSet) { |
183 | 124k | auto str = StringPrimitive::createStringView(runtime, strHandle); |
184 | 124k | auto strLen = str.length(); |
185 | 124k | SmallU16String<32> R{}; |
186 | 124k | R.reserve(strLen); |
187 | 1.62M | for (auto itr = str.begin(), e = str.end(); itr != e;) { |
188 | | // Use int32_t to allow for arithmetic past 16 bits. |
189 | 1.49M | uint32_t C = *itr; |
190 | 1.49M | if (unescapedSet(C)) { |
191 | 747k | R.push_back(C); |
192 | 747k | } else { |
193 | 747k | if (C >= 0xdc00 && C <= 0xdfff) { |
194 | 0 | return runtime.raiseURIError("Malformed encodeURI input"); |
195 | 0 | } |
196 | | // Code point to convert to UTF8. |
197 | 747k | uint32_t V; |
198 | 747k | if (C < 0xd800 || C > 0xdbff) { |
199 | 747k | V = C; |
200 | 747k | } else { |
201 | 0 | ++itr; |
202 | 0 | if (itr == e) { |
203 | 0 | return runtime.raiseURIError("Malformed encodeURI input"); |
204 | 0 | } |
205 | 0 | uint32_t kChar = *itr; |
206 | 0 | if (kChar < 0xdc00 || kChar > 0xdfff) { |
207 | 0 | return runtime.raiseURIError("Malformed encodeURI input"); |
208 | 0 | } |
209 | 0 | V = (C - 0xd800) * 0x400 + (kChar - 0xdc00) + 0x10000; |
210 | 0 | } |
211 | 747k | char octets[UNI_MAX_UTF8_BYTES_PER_CODE_POINT]; |
212 | 747k | char *targetStart = octets; |
213 | 747k | hermes::encodeUTF8(targetStart, V); |
214 | | // Length of the octets array. |
215 | 747k | uint32_t L = targetStart - octets; |
216 | 1.49M | for (uint32_t j = 0; j < L; ++j) { |
217 | 747k | auto jOctet = octets[j]; |
218 | 747k | R.push_back(u'%'); |
219 | 747k | R.push_back(toHexChar((jOctet >> 4) & 0xf)); |
220 | 747k | R.push_back(toHexChar(jOctet & 0xf)); |
221 | 747k | } |
222 | 747k | } |
223 | 1.49M | ++itr; |
224 | 1.49M | } |
225 | | |
226 | 124k | auto finalStr = StringPrimitive::create(runtime, R); |
227 | 124k | if (LLVM_UNLIKELY(finalStr == ExecutionStatus::EXCEPTION)) { |
228 | 0 | return ExecutionStatus::EXCEPTION; |
229 | 0 | } |
230 | 124k | return runtime.makeHandle<StringPrimitive>(*finalStr); |
231 | 124k | } |
232 | | |
233 | 124k | CallResult<HermesValue> encodeURI(void *, Runtime &runtime, NativeArgs args) { |
234 | 124k | auto strRes = toString_RJS(runtime, args.getArgHandle(0)); |
235 | 124k | if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) { |
236 | 0 | return ExecutionStatus::EXCEPTION; |
237 | 0 | } |
238 | 124k | auto res = |
239 | 124k | encode(runtime, runtime.makeHandle(std::move(*strRes)), unescapedURISet); |
240 | 124k | if (res == ExecutionStatus::EXCEPTION) |
241 | 0 | return ExecutionStatus::EXCEPTION; |
242 | 124k | return res->getHermesValue(); |
243 | 124k | } |
244 | | |
245 | | CallResult<HermesValue> |
246 | 0 | encodeURIComponent(void *, Runtime &runtime, NativeArgs args) { |
247 | 0 | auto strRes = toString_RJS(runtime, args.getArgHandle(0)); |
248 | 0 | if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) { |
249 | 0 | return ExecutionStatus::EXCEPTION; |
250 | 0 | } |
251 | 0 | auto res = |
252 | 0 | encode(runtime, runtime.makeHandle(std::move(*strRes)), uriUnescaped); |
253 | 0 | if (res == ExecutionStatus::EXCEPTION) |
254 | 0 | return ExecutionStatus::EXCEPTION; |
255 | 0 | return res->getHermesValue(); |
256 | 0 | } |
257 | | |
258 | | /// ES 5.1 15.1.3 |
259 | | /// Decode abstract method, takes a string and URI decodes it. |
260 | | /// \param reservedSet a function indicating which characters to not escape. |
261 | | static CallResult<Handle<StringPrimitive>> decode( |
262 | | Runtime &runtime, |
263 | | Handle<StringPrimitive> strHandle, |
264 | 0 | CharSetFn reservedSet) { |
265 | 0 | auto str = StringPrimitive::createStringView(runtime, strHandle); |
266 | 0 | auto strLen = str.length(); |
267 | 0 | SmallU16String<32> R{}; |
268 | 0 | R.reserve(strLen); |
269 | 0 | for (auto itr = str.begin(), e = str.end(); itr != e;) { |
270 | 0 | char16_t C = *itr; |
271 | 0 | if (C != u'%') { |
272 | | // Regular character, continue. |
273 | 0 | R.push_back(C); |
274 | 0 | } else { |
275 | 0 | auto start = itr; |
276 | 0 | if (itr + 2 >= e || !(isHexChar(*(itr + 1)) && isHexChar(*(itr + 2)))) { |
277 | 0 | return runtime.raiseURIError("Malformed decodeURI input"); |
278 | 0 | } |
279 | 0 | uint8_t B = (fromHexChar(*(itr + 1)) << 4) | fromHexChar(*(itr + 2)); |
280 | 0 | itr += 2; |
281 | 0 | if ((B & 0x80) == 0) { |
282 | | // Most significant bit of B is 0. |
283 | 0 | C = B; |
284 | 0 | if (!reservedSet(C)) { |
285 | 0 | R.push_back(C); |
286 | 0 | } else { |
287 | 0 | R.insert(R.end(), start, itr + 1); |
288 | 0 | } |
289 | 0 | } else { |
290 | | // Most significant bit of B is 1. |
291 | 0 | uint32_t n = 0; |
292 | | // Set n to be smallest such that (B << n) & 0x80 is 0. |
293 | | // n is set to the number of leading 1s in B. |
294 | 0 | for (; n <= 8 && (((B << n) & 0x80) != 0); ++n) { |
295 | 0 | } |
296 | 0 | if (n == 1 || n > 4) { |
297 | 0 | return runtime.raiseURIError("Malformed decodeURI input"); |
298 | 0 | } |
299 | | // Safe because we ensure that n <= 4. |
300 | 0 | UTF8 octets[4]{B}; |
301 | | // Not enough bytes to fill all n octets. |
302 | 0 | if ((itr + (3 * (n - 1))) >= e) { |
303 | 0 | return runtime.raiseURIError("Malformed decodeURI input"); |
304 | 0 | } |
305 | | // Populate octets. |
306 | 0 | for (uint32_t j = 1; j < n; ++j) { |
307 | 0 | ++itr; |
308 | 0 | if (*itr != u'%' || |
309 | 0 | !(isHexChar(*(itr + 1)) && isHexChar(*(itr + 2)))) { |
310 | 0 | return runtime.raiseURIError("Malformed decodeURI input"); |
311 | 0 | } |
312 | 0 | B = (fromHexChar(*(itr + 1)) << 4) | fromHexChar(*(itr + 2)); |
313 | 0 | if (((B >> 6) & 0x3) != 0x2) { |
314 | | // The highest two bits aren't 10. |
315 | 0 | return runtime.raiseURIError("Malformed decodeURI input"); |
316 | 0 | } |
317 | 0 | itr += 2; |
318 | 0 | octets[j] = B; |
319 | 0 | } |
320 | | // Code point encoded by the n octets. |
321 | 0 | uint32_t V; |
322 | 0 | const UTF8 *sourceStart = octets; |
323 | 0 | const UTF8 *sourceEnd = octets + n; |
324 | 0 | UTF32 *targetStart = &V; |
325 | 0 | UTF32 *targetEnd = &V + 1; |
326 | 0 | ConversionResult cRes = ConvertUTF8toUTF32( |
327 | 0 | &sourceStart, |
328 | 0 | sourceEnd, |
329 | 0 | &targetStart, |
330 | 0 | targetEnd, |
331 | 0 | llvh::strictConversion); |
332 | 0 | if (cRes != ConversionResult::conversionOK) { |
333 | 0 | return runtime.raiseURIError("Malformed decodeURI input"); |
334 | 0 | } |
335 | 0 | if (V < 0x10000) { |
336 | | // Safe to cast. |
337 | 0 | C = static_cast<char16_t>(V); |
338 | 0 | if (!reservedSet(C)) { |
339 | 0 | R.push_back(C); |
340 | 0 | } else { |
341 | 0 | R.insert(R.end(), start, itr + 1); |
342 | 0 | } |
343 | 0 | } else { |
344 | | // V >= 0x10000 |
345 | | // Notice that L and H are both only 2 byte values, |
346 | | // because of they way that they're computed. |
347 | 0 | char16_t L = ((V - 0x10000) & 0x3ff) + 0xdc00; |
348 | 0 | char16_t H = (((V - 0x10000) >> 10) & 0x3ff) + 0xd800; |
349 | 0 | R.push_back(H); |
350 | 0 | R.push_back(L); |
351 | 0 | } |
352 | 0 | } |
353 | 0 | } |
354 | 0 | ++itr; |
355 | 0 | } |
356 | | |
357 | 0 | return runtime.makeHandle<StringPrimitive>( |
358 | 0 | *StringPrimitive::create(runtime, R)); |
359 | 0 | } |
360 | | |
361 | 0 | CallResult<HermesValue> decodeURI(void *, Runtime &runtime, NativeArgs args) { |
362 | 0 | auto strRes = toString_RJS(runtime, args.getArgHandle(0)); |
363 | 0 | if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) { |
364 | 0 | return ExecutionStatus::EXCEPTION; |
365 | 0 | } |
366 | 0 | auto res = |
367 | 0 | decode(runtime, runtime.makeHandle(std::move(*strRes)), reservedURISet); |
368 | 0 | if (res == ExecutionStatus::EXCEPTION) |
369 | 0 | return ExecutionStatus::EXCEPTION; |
370 | 0 | return res->getHermesValue(); |
371 | 0 | } |
372 | | |
373 | | CallResult<HermesValue> |
374 | 0 | decodeURIComponent(void *, Runtime &runtime, NativeArgs args) { |
375 | 0 | auto strRes = toString_RJS(runtime, args.getArgHandle(0)); |
376 | 0 | if (LLVM_UNLIKELY(strRes == ExecutionStatus::EXCEPTION)) { |
377 | 0 | return ExecutionStatus::EXCEPTION; |
378 | 0 | } |
379 | 0 | auto emptySet = [](char16_t) { return false; }; |
380 | 0 | auto res = decode(runtime, runtime.makeHandle(std::move(*strRes)), emptySet); |
381 | 0 | if (res == ExecutionStatus::EXCEPTION) |
382 | 0 | return ExecutionStatus::EXCEPTION; |
383 | 0 | return res->getHermesValue(); |
384 | 0 | } |
385 | | |
386 | | } // namespace vm |
387 | | } // namespace hermes |