Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/uri.h"
6 :
7 : #include <vector>
8 :
9 : #include "src/char-predicates-inl.h"
10 : #include "src/handles.h"
11 : #include "src/isolate-inl.h"
12 : #include "src/string-search.h"
13 : #include "src/unicode-inl.h"
14 :
15 : namespace v8 {
16 : namespace internal {
17 :
18 : namespace { // anonymous namespace for DecodeURI helper functions
19 : bool IsReservedPredicate(uc16 c) {
20 : switch (c) {
21 : case '#':
22 : case '$':
23 : case '&':
24 : case '+':
25 : case ',':
26 : case '/':
27 : case ':':
28 : case ';':
29 : case '=':
30 : case '?':
31 : case '@':
32 : return true;
33 : default:
34 : return false;
35 : }
36 : }
37 :
// Returns true iff |octets| is exactly the three-byte UTF-8 encoding
// (0xEF 0xBF 0xBD) of the replacement character U+FFFD from the Unicode
// Specials table. |octets| is only dereferenced when |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xef && octets[1] == 0xbf &&
         octets[2] == 0xbd;
}
47 :
48 5596 : bool DecodeOctets(const uint8_t* octets, int length,
49 : std::vector<uc16>* buffer) {
50 5596 : size_t cursor = 0;
51 5596 : uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
52 5864 : if (value == unibrow::Utf8::kBadChar &&
53 : !IsReplacementCharacter(octets, length)) {
54 : return false;
55 : }
56 :
57 5346 : if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
58 10008 : buffer->push_back(value);
59 : } else {
60 684 : buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
61 684 : buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
62 : }
63 : return true;
64 : }
65 :
66 8142884 : int TwoDigitHex(uc16 character1, uc16 character2) {
67 8142884 : if (character1 > 'f') return -1;
68 8142464 : int high = HexValue(character1);
69 8142464 : if (high == -1) return -1;
70 8142264 : if (character2 > 'f') return -1;
71 8142204 : int low = HexValue(character2);
72 8142204 : if (low == -1) return -1;
73 8142024 : return (high << 4) + low;
74 : }
75 :
76 : template <typename T>
77 3276956 : void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
78 : bool is_uri, std::vector<T>* buffer) {
79 3276956 : if (is_uri && IsReservedPredicate(decoded)) {
80 0 : buffer->push_back('%');
81 0 : uc16 first = uri_content->Get(index + 1);
82 0 : uc16 second = uri_content->Get(index + 2);
83 : DCHECK_GT(std::numeric_limits<T>::max(), first);
84 : DCHECK_GT(std::numeric_limits<T>::max(), second);
85 :
86 0 : buffer->push_back(first);
87 0 : buffer->push_back(second);
88 : } else {
89 6553902 : buffer->push_back(decoded);
90 : }
91 3276956 : }
92 :
93 5546 : bool IntoTwoByte(int index, bool is_uri, int uri_length,
94 5696 : String::FlatContent* uri_content, std::vector<uc16>* buffer) {
95 21804 : for (int k = index; k < uri_length; k++) {
96 5696 : uc16 code = uri_content->Get(k);
97 5696 : if (code == '%') {
98 : int two_digits;
99 11392 : if (k + 2 >= uri_length ||
100 5696 : (two_digits = TwoDigitHex(uri_content->Get(k + 1),
101 11392 : uri_content->Get(k + 2))) < 0) {
102 340 : return false;
103 : }
104 : k += 2;
105 5696 : uc16 decoded = static_cast<uc16>(two_digits);
106 5696 : if (decoded > unibrow::Utf8::kMaxOneByteChar) {
107 : uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
108 5686 : octets[0] = decoded;
109 :
110 : int number_of_continuation_bytes = 0;
111 22582 : while ((decoded << ++number_of_continuation_bytes) & 0x80) {
112 11300 : if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
113 340 : return false;
114 : }
115 33650 : if (uri_content->Get(++k) != '%' ||
116 11210 : (two_digits = TwoDigitHex(uri_content->Get(k + 1),
117 22420 : uri_content->Get(k + 2))) < 0) {
118 : return false;
119 : }
120 : k += 2;
121 : uc16 continuation_byte = static_cast<uc16>(two_digits);
122 11210 : octets[number_of_continuation_bytes] = continuation_byte;
123 : }
124 :
125 5596 : if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
126 : return false;
127 : }
128 : } else {
129 10 : AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
130 : }
131 : } else {
132 0 : buffer->push_back(code);
133 : }
134 : }
135 : return true;
136 : }
137 :
138 5780 : bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
139 : std::vector<uint8_t>* one_byte_buffer,
140 : std::vector<uc16>* two_byte_buffer) {
141 : DisallowHeapAllocation no_gc;
142 5780 : String::FlatContent uri_content = uri->GetFlatContent();
143 :
144 : int uri_length = uri->length();
145 22289010 : for (int k = 0; k < uri_length; k++) {
146 22288796 : uc16 code = uri_content.Get(k);
147 22288796 : if (code == '%') {
148 : int two_digits;
149 6565004 : if (k + 2 >= uri_length ||
150 3282492 : (two_digits = TwoDigitHex(uri_content.Get(k + 1),
151 6564984 : uri_content.Get(k + 2))) < 0) {
152 : return false;
153 : }
154 :
155 3282492 : uc16 decoded = static_cast<uc16>(two_digits);
156 3282492 : if (decoded > unibrow::Utf8::kMaxOneByteChar) {
157 : return IntoTwoByte(k, is_uri, uri_length, &uri_content,
158 5546 : two_byte_buffer);
159 : }
160 :
161 3276946 : AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
162 : k += 2;
163 : } else {
164 19006284 : if (code > unibrow::Utf8::kMaxOneByteChar) {
165 : return IntoTwoByte(k, is_uri, uri_length, &uri_content,
166 0 : two_byte_buffer);
167 : }
168 38012568 : one_byte_buffer->push_back(code);
169 : }
170 : }
171 : return true;
172 : }
173 :
174 : } // anonymous namespace
175 :
176 5780 : MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
177 : bool is_uri) {
178 5780 : uri = String::Flatten(uri);
179 : std::vector<uint8_t> one_byte_buffer;
180 : std::vector<uc16> two_byte_buffer;
181 :
182 5780 : if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
183 360 : THROW_NEW_ERROR(isolate, NewURIError(), String);
184 : }
185 :
186 5420 : if (two_byte_buffer.empty()) {
187 : return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>(
188 642 : one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
189 : }
190 :
191 : Handle<SeqTwoByteString> result;
192 : int result_length =
193 15618 : static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
194 10412 : ASSIGN_RETURN_ON_EXCEPTION(
195 : isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
196 : String);
197 :
198 10412 : CopyChars(result->GetChars(), one_byte_buffer.data(), one_byte_buffer.size());
199 5206 : CopyChars(result->GetChars() + one_byte_buffer.size(), two_byte_buffer.data(),
200 10412 : two_byte_buffer.size());
201 :
202 5206 : return result;
203 : }
204 :
205 : namespace { // anonymous namespace for EncodeURI helper functions
206 22869474 : bool IsUnescapePredicateInUriComponent(uc16 c) {
207 45738948 : if (IsAlphaNumeric(c)) {
208 : return true;
209 : }
210 :
211 5255549 : switch (c) {
212 : case '!':
213 : case '\'':
214 : case '(':
215 : case ')':
216 : case '*':
217 : case '-':
218 : case '.':
219 : case '_':
220 : case '~':
221 : return true;
222 : default:
223 3284037 : return false;
224 : }
225 : }
226 :
227 : bool IsUriSeparator(uc16 c) {
228 : switch (c) {
229 : case '#':
230 : case ':':
231 : case ';':
232 : case '/':
233 : case '?':
234 : case '$':
235 : case '&':
236 : case '+':
237 : case ',':
238 : case '@':
239 : case '=':
240 : return true;
241 : default:
242 : return false;
243 : }
244 : }
245 :
246 3297139 : void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
247 6594278 : buffer->push_back('%');
248 9891417 : buffer->push_back(HexCharOfValue(octet >> 4));
249 9891417 : buffer->push_back(HexCharOfValue(octet & 0x0F));
250 3297139 : }
251 :
252 3283927 : void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) {
253 3283927 : char s[4] = {};
254 : int number_of_bytes;
255 : number_of_bytes =
256 3283927 : unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
257 6579818 : for (int k = 0; k < number_of_bytes; k++) {
258 3295891 : AddEncodedOctetToBuffer(s[k], buffer);
259 : }
260 3283927 : }
261 :
262 312 : void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) {
263 312 : char s[4] = {};
264 : int number_of_bytes =
265 : unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
266 312 : unibrow::Utf16::kNoPreviousCharacter, false);
267 1560 : for (int k = 0; k < number_of_bytes; k++) {
268 1248 : AddEncodedOctetToBuffer(s[k], buffer);
269 : }
270 312 : }
271 :
272 : } // anonymous namespace
273 :
274 78637 : MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
275 : bool is_uri) {
276 78637 : uri = String::Flatten(uri);
277 : int uri_length = uri->length();
278 : std::vector<uint8_t> buffer;
279 78637 : buffer.reserve(uri_length);
280 :
281 : {
282 : DisallowHeapAllocation no_gc;
283 78637 : String::FlatContent uri_content = uri->GetFlatContent();
284 :
285 22948423 : for (int k = 0; k < uri_length; k++) {
286 22874718 : uc16 cc1 = uri_content.Get(k);
287 45749436 : if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
288 5118 : k++;
289 5118 : if (k < uri_length) {
290 : uc16 cc2 = uri->Get(k);
291 10236 : if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
292 312 : EncodePair(cc1, cc2, &buffer);
293 312 : continue;
294 : }
295 : }
296 22869600 : } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
297 45738948 : if (IsUnescapePredicateInUriComponent(cc1) ||
298 3279600 : (is_uri && IsUriSeparator(cc1))) {
299 39171094 : buffer.push_back(cc1);
300 : } else {
301 3283927 : EncodeSingle(cc1, &buffer);
302 : }
303 : continue;
304 : }
305 :
306 : AllowHeapAllocation allocate_error_and_return;
307 4932 : THROW_NEW_ERROR(isolate, NewURIError(), String);
308 : }
309 : }
310 :
311 : return isolate->factory()->NewStringFromOneByte(
312 221115 : Vector<const uint8_t>(buffer.data(), static_cast<int>(buffer.size())));
313 : }
314 :
315 : namespace { // Anonymous namespace for Escape and Unescape
316 :
317 : template <typename Char>
318 14314322 : int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
319 28628644 : uint16_t character = vector[i];
320 : int32_t hi = 0;
321 : int32_t lo = 0;
322 19531370 : if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
323 1485414 : (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
324 1484934 : (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
325 494818 : *step = 6;
326 494818 : return (hi << 8) + lo;
327 17672874 : } else if (character == '%' && i <= length - 3 &&
328 11560110 : (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
329 3852830 : *step = 3;
330 : return lo;
331 : } else {
332 9966674 : *step = 1;
333 9966674 : return character;
334 : }
335 : }
336 :
337 : template <typename Char>
338 61703 : MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
339 : int start_index) {
340 : bool one_byte = true;
341 : int length = string->length();
342 :
343 : int unescaped_length = 0;
344 : {
345 : DisallowHeapAllocation no_allocation;
346 : Vector<const Char> vector = string->GetCharVector<Char>();
347 7218864 : for (int i = start_index; i < length; unescaped_length++) {
348 : int step;
349 7157161 : if (UnescapeChar(vector, i, length, &step) >
350 : String::kMaxOneByteCharCode) {
351 : one_byte = false;
352 : }
353 7157161 : i += step;
354 : }
355 : }
356 :
357 : DCHECK(start_index < length);
358 : Handle<String> first_part =
359 61703 : isolate->factory()->NewProperSubString(string, 0, start_index);
360 :
361 : int dest_position = 0;
362 : Handle<String> second_part;
363 : DCHECK_LE(unescaped_length, String::kMaxLength);
364 61703 : if (one_byte) {
365 : Handle<SeqOneByteString> dest = isolate->factory()
366 : ->NewRawOneByteString(unescaped_length)
367 1908 : .ToHandleChecked();
368 : DisallowHeapAllocation no_allocation;
369 : Vector<const Char> vector = string->GetCharVector<Char>();
370 11787 : for (int i = start_index; i < length; dest_position++) {
371 : int step;
372 10833 : dest->SeqOneByteStringSet(dest_position,
373 10833 : UnescapeChar(vector, i, length, &step));
374 10833 : i += step;
375 : }
376 954 : second_part = dest;
377 : } else {
378 : Handle<SeqTwoByteString> dest = isolate->factory()
379 : ->NewRawTwoByteString(unescaped_length)
380 121498 : .ToHandleChecked();
381 : DisallowHeapAllocation no_allocation;
382 : Vector<const Char> vector = string->GetCharVector<Char>();
383 7207077 : for (int i = start_index; i < length; dest_position++) {
384 : int step;
385 7146328 : dest->SeqTwoByteStringSet(dest_position,
386 7146328 : UnescapeChar(vector, i, length, &step));
387 7146328 : i += step;
388 : }
389 60749 : second_part = dest;
390 : }
391 61703 : return isolate->factory()->NewConsString(first_part, second_part);
392 : }
393 :
394 7095603 : bool IsNotEscaped(uint16_t c) {
395 14191206 : if (IsAlphaNumeric(c)) {
396 : return true;
397 : }
398 : // @*_+-./
399 : switch (c) {
400 : case '@':
401 : case '*':
402 : case '_':
403 : case '+':
404 : case '-':
405 : case '.':
406 : case '/':
407 : return true;
408 : default:
409 1930312 : return false;
410 : }
411 : }
412 :
413 : template <typename Char>
414 129682 : static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
415 : Handle<String> source) {
416 : int index;
417 : {
418 : DisallowHeapAllocation no_allocation;
419 : StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
420 : index = search.Search(source->GetCharVector<Char>(), 0);
421 129682 : if (index < 0) return source;
422 : }
423 61703 : return UnescapeSlow<Char>(isolate, source, index);
424 : }
425 :
426 : template <typename Char>
427 41452 : static MaybeHandle<String> EscapePrivate(Isolate* isolate,
428 : Handle<String> string) {
429 : DCHECK(string->IsFlat());
430 : int escaped_length = 0;
431 : int length = string->length();
432 :
433 : {
434 : DisallowHeapAllocation no_allocation;
435 : Vector<const Char> vector = string->GetCharVector<Char>();
436 3725145 : for (int i = 0; i < length; i++) {
437 7367386 : uint16_t c = vector[i];
438 3676304 : if (c >= 256) {
439 134968 : escaped_length += 6;
440 3548725 : } else if (IsNotEscaped(c)) {
441 2583569 : escaped_length++;
442 : } else {
443 965156 : escaped_length += 3;
444 : }
445 :
446 : // We don't allow strings that are longer than a maximal length.
447 : DCHECK_LT(String::kMaxLength, 0x7fffffff - 6); // Cannot overflow.
448 3683693 : if (escaped_length > String::kMaxLength) break; // Provoke exception.
449 : }
450 : }
451 :
452 : // No length change implies no change. Return original string if no change.
453 41452 : if (escaped_length == length) return string;
454 :
455 : Handle<SeqOneByteString> dest;
456 81348 : ASSIGN_RETURN_ON_EXCEPTION(
457 : isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
458 : String);
459 : int dest_position = 0;
460 :
461 : {
462 : DisallowHeapAllocation no_allocation;
463 : Vector<const Char> vector = string->GetCharVector<Char>();
464 3722520 : for (int i = 0; i < length; i++) {
465 7363692 : uint16_t c = vector[i];
466 3676304 : if (c >= 256) {
467 : dest->SeqOneByteStringSet(dest_position, '%');
468 134968 : dest->SeqOneByteStringSet(dest_position + 1, 'u');
469 269936 : dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
470 : dest->SeqOneByteStringSet(dest_position + 3,
471 269936 : HexCharOfValue((c >> 8) & 0xf));
472 : dest->SeqOneByteStringSet(dest_position + 4,
473 269936 : HexCharOfValue((c >> 4) & 0xf));
474 269936 : dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xf));
475 134968 : dest_position += 6;
476 3546878 : } else if (IsNotEscaped(c)) {
477 : dest->SeqOneByteStringSet(dest_position, c);
478 2581722 : dest_position++;
479 : } else {
480 : dest->SeqOneByteStringSet(dest_position, '%');
481 1930312 : dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
482 1930312 : dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xf));
483 965156 : dest_position += 3;
484 : }
485 : }
486 : }
487 :
488 40674 : return dest;
489 : }
490 :
491 : } // Anonymous namespace
492 :
493 41452 : MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
494 : Handle<String> result;
495 41452 : string = String::Flatten(string);
496 41452 : return string->IsOneByteRepresentationUnderneath()
497 : ? EscapePrivate<uint8_t>(isolate, string)
498 41452 : : EscapePrivate<uc16>(isolate, string);
499 : }
500 :
501 129682 : MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
502 : Handle<String> result;
503 129682 : string = String::Flatten(string);
504 129682 : return string->IsOneByteRepresentationUnderneath()
505 : ? UnescapePrivate<uint8_t>(isolate, string)
506 129682 : : UnescapePrivate<uc16>(isolate, string);
507 : }
508 :
509 : } // namespace internal
510 : } // namespace v8
|