Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/uri.h"
6 :
7 : #include <vector>
8 :
9 : #include "src/char-predicates-inl.h"
10 : #include "src/isolate-inl.h"
11 : #include "src/string-search.h"
12 : #include "src/unicode-inl.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : namespace { // anonymous namespace for DecodeURI helper functions
18 : bool IsReservedPredicate(uc16 c) {
19 : switch (c) {
20 : case '#':
21 : case '$':
22 : case '&':
23 : case '+':
24 : case ',':
25 : case '/':
26 : case ':':
27 : case ';':
28 : case '=':
29 : case '?':
30 : case '@':
31 : return true;
32 : default:
33 : return false;
34 : }
35 : }
36 :
// Returns true only if |octets| is exactly the three-byte sequence
// 0xEF 0xBF 0xBD, i.e. the UTF-8 encoding of the replacement character
// U+FFFD. The length test comes first so shorter inputs are never indexed.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46 :
47 5526 : bool DecodeOctets(const uint8_t* octets, int length,
48 : std::vector<uc16>* buffer) {
49 5526 : size_t cursor = 0;
50 5526 : uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
51 5769 : if (value == unibrow::Utf8::kBadChar &&
52 : !IsReplacementCharacter(octets, length)) {
53 : return false;
54 : }
55 :
56 5301 : if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
57 9936 : buffer->push_back(value);
58 : } else {
59 666 : buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
60 666 : buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
61 : }
62 : return true;
63 : }
64 :
65 7330907 : int TwoDigitHex(uc16 character1, uc16 character2) {
66 7330907 : if (character1 > 'f') return -1;
67 7330529 : int high = HexValue(character1);
68 7330529 : if (high == -1) return -1;
69 7330349 : if (character2 > 'f') return -1;
70 7330295 : int low = HexValue(character2);
71 7330295 : if (low == -1) return -1;
72 7330133 : return (high << 4) + low;
73 : }
74 :
75 : template <typename T>
76 2949264 : void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
77 : bool is_uri, std::vector<T>* buffer) {
78 2949264 : if (is_uri && IsReservedPredicate(decoded)) {
79 0 : buffer->push_back('%');
80 0 : uc16 first = uri_content->Get(index + 1);
81 0 : uc16 second = uri_content->Get(index + 2);
82 : DCHECK_GT(std::numeric_limits<T>::max(), first);
83 : DCHECK_GT(std::numeric_limits<T>::max(), second);
84 :
85 0 : buffer->push_back(first);
86 0 : buffer->push_back(second);
87 : } else {
88 5898519 : buffer->push_back(decoded);
89 : }
90 2949264 : }
91 :
92 5481 : bool IntoTwoByte(int index, bool is_uri, int uri_length,
93 5616 : String::FlatContent* uri_content, std::vector<uc16>* buffer) {
94 21582 : for (int k = index; k < uri_length; k++) {
95 5616 : uc16 code = uri_content->Get(k);
96 5616 : if (code == '%') {
97 : int two_digits;
98 11232 : if (k + 2 >= uri_length ||
99 5616 : (two_digits = TwoDigitHex(uri_content->Get(k + 1),
100 11232 : uri_content->Get(k + 2))) < 0) {
101 306 : return false;
102 : }
103 : k += 2;
104 5616 : uc16 decoded = static_cast<uc16>(two_digits);
105 5616 : if (decoded > unibrow::Utf8::kMaxOneByteChar) {
106 : uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
107 5607 : octets[0] = decoded;
108 :
109 : int number_of_continuation_bytes = 0;
110 22293 : while ((decoded << ++number_of_continuation_bytes) & 0x80) {
111 11160 : if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
112 306 : return false;
113 : }
114 33255 : if (uri_content->Get(++k) != '%' ||
115 11079 : (two_digits = TwoDigitHex(uri_content->Get(k + 1),
116 22158 : uri_content->Get(k + 2))) < 0) {
117 : return false;
118 : }
119 : k += 2;
120 : uc16 continuation_byte = static_cast<uc16>(two_digits);
121 11079 : octets[number_of_continuation_bytes] = continuation_byte;
122 : }
123 :
124 5526 : if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
125 : return false;
126 : }
127 : } else {
128 9 : AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
129 : }
130 : } else {
131 0 : buffer->push_back(code);
132 : }
133 : }
134 : return true;
135 : }
136 :
137 5697 : bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
138 : std::vector<uint8_t>* one_byte_buffer,
139 : std::vector<uc16>* two_byte_buffer) {
140 : DisallowHeapAllocation no_gc;
141 5697 : String::FlatContent uri_content = uri->GetFlatContent(no_gc);
142 :
143 : int uri_length = uri->length();
144 20060613 : for (int k = 0; k < uri_length; k++) {
145 20060415 : uc16 code = uri_content.Get(k);
146 20060415 : if (code == '%') {
147 : int two_digits;
148 5909490 : if (k + 2 >= uri_length ||
149 2954736 : (two_digits = TwoDigitHex(uri_content.Get(k + 1),
150 5909472 : uri_content.Get(k + 2))) < 0) {
151 : return false;
152 : }
153 :
154 2954736 : uc16 decoded = static_cast<uc16>(two_digits);
155 2954736 : if (decoded > unibrow::Utf8::kMaxOneByteChar) {
156 : return IntoTwoByte(k, is_uri, uri_length, &uri_content,
157 5481 : two_byte_buffer);
158 : }
159 :
160 2949255 : AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
161 : k += 2;
162 : } else {
163 17105661 : if (code > unibrow::Utf8::kMaxOneByteChar) {
164 : return IntoTwoByte(k, is_uri, uri_length, &uri_content,
165 0 : two_byte_buffer);
166 : }
167 34211322 : one_byte_buffer->push_back(code);
168 : }
169 : }
170 : return true;
171 : }
172 :
173 : } // anonymous namespace
174 :
175 5697 : MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
176 : bool is_uri) {
177 5697 : uri = String::Flatten(isolate, uri);
178 : std::vector<uint8_t> one_byte_buffer;
179 : std::vector<uc16> two_byte_buffer;
180 :
181 5697 : if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
182 324 : THROW_NEW_ERROR(isolate, NewURIError(), String);
183 : }
184 :
185 5373 : if (two_byte_buffer.empty()) {
186 : return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>(
187 594 : one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
188 : }
189 :
190 : Handle<SeqTwoByteString> result;
191 : int result_length =
192 15525 : static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
193 10350 : ASSIGN_RETURN_ON_EXCEPTION(
194 : isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
195 : String);
196 :
197 : DisallowHeapAllocation no_gc;
198 : CopyChars(result->GetChars(no_gc), one_byte_buffer.data(),
199 5175 : one_byte_buffer.size());
200 5175 : CopyChars(result->GetChars(no_gc) + one_byte_buffer.size(),
201 10350 : two_byte_buffer.data(), two_byte_buffer.size());
202 :
203 5175 : return result;
204 : }
205 :
206 : namespace { // anonymous namespace for EncodeURI helper functions
207 20970508 : bool IsUnescapePredicateInUriComponent(uc16 c) {
208 20970508 : if (IsAlphaNumeric(c)) {
209 : return true;
210 : }
211 :
212 4759391 : switch (c) {
213 : case '!':
214 : case '\'':
215 : case '(':
216 : case ')':
217 : case '*':
218 : case '-':
219 : case '.':
220 : case '_':
221 : case '~':
222 : return true;
223 : default:
224 2956302 : return false;
225 : }
226 : }
227 :
228 : bool IsUriSeparator(uc16 c) {
229 : switch (c) {
230 : case '#':
231 : case ':':
232 : case ';':
233 : case '/':
234 : case '?':
235 : case '$':
236 : case '&':
237 : case '+':
238 : case ',':
239 : case '@':
240 : case '=':
241 : return true;
242 : default:
243 : return false;
244 : }
245 : }
246 :
247 2969074 : void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
248 5938148 : buffer->push_back('%');
249 8907222 : buffer->push_back(HexCharOfValue(octet >> 4));
250 8907222 : buffer->push_back(HexCharOfValue(octet & 0x0F));
251 2969074 : }
252 :
253 2956203 : void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) {
254 2956203 : char s[4] = {};
255 : int number_of_bytes;
256 : number_of_bytes =
257 2956203 : unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
258 5924053 : for (int k = 0; k < number_of_bytes; k++) {
259 2967850 : AddEncodedOctetToBuffer(s[k], buffer);
260 : }
261 2956203 : }
262 :
263 306 : void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) {
264 306 : char s[4] = {};
265 : int number_of_bytes =
266 : unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
267 306 : unibrow::Utf16::kNoPreviousCharacter, false);
268 1530 : for (int k = 0; k < number_of_bytes; k++) {
269 1224 : AddEncodedOctetToBuffer(s[k], buffer);
270 : }
271 306 : }
272 :
273 : } // anonymous namespace
274 :
275 156591 : MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
276 : bool is_uri) {
277 156591 : uri = String::Flatten(isolate, uri);
278 : int uri_length = uri->length();
279 : std::vector<uint8_t> buffer;
280 156597 : buffer.reserve(uri_length);
281 :
282 : {
283 : DisallowHeapAllocation no_gc;
284 156598 : String::FlatContent uri_content = uri->GetFlatContent(no_gc);
285 :
286 21127393 : for (int k = 0; k < uri_length; k++) {
287 20975739 : uc16 cc1 = uri_content.Get(k);
288 20975739 : if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
289 5102 : k++;
290 5102 : if (k < uri_length) {
291 10224 : uc16 cc2 = uri->Get(k);
292 5112 : if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
293 306 : EncodePair(cc1, cc2, &buffer);
294 306 : continue;
295 : }
296 : }
297 20970637 : } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
298 41941001 : if (IsUnescapePredicateInUriComponent(cc1) ||
299 2951874 : (is_uri && IsUriSeparator(cc1))) {
300 36028602 : buffer.push_back(cc1);
301 : } else {
302 2956203 : EncodeSingle(cc1, &buffer);
303 : }
304 : continue;
305 : }
306 :
307 : AllowHeapAllocation allocate_error_and_return;
308 4928 : THROW_NEW_ERROR(isolate, NewURIError(), String);
309 : }
310 : }
311 :
312 151654 : return isolate->factory()->NewStringFromOneByte(VectorOf(buffer));
313 : }
314 :
315 : namespace { // Anonymous namespace for Escape and Unescape
316 :
317 : template <typename Char>
318 12886074 : int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
319 25772148 : uint16_t character = vector[i];
320 : int32_t hi = 0;
321 : int32_t lo = 0;
322 17581790 : if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
323 1336878 : (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
324 1336446 : (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
325 445338 : *step = 6;
326 445338 : return (hi << 8) + lo;
327 15909104 : } else if (character == '%' && i <= length - 3 &&
328 10405104 : (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
329 3467882 : *step = 3;
330 : return lo;
331 : } else {
332 8972854 : *step = 1;
333 8972854 : return character;
334 : }
335 : }
336 :
337 : template <typename Char>
338 55735 : MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
339 : int start_index) {
340 : bool one_byte = true;
341 : int length = string->length();
342 :
343 : int unescaped_length = 0;
344 : {
345 : DisallowHeapAllocation no_allocation;
346 111470 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
347 6498772 : for (int i = start_index; i < length; unescaped_length++) {
348 : int step;
349 6443037 : if (UnescapeChar(vector, i, length, &step) >
350 : String::kMaxOneByteCharCode) {
351 : one_byte = false;
352 : }
353 6443037 : i += step;
354 : }
355 : }
356 :
357 : DCHECK(start_index < length);
358 : Handle<String> first_part =
359 55735 : isolate->factory()->NewProperSubString(string, 0, start_index);
360 :
361 : int dest_position = 0;
362 : Handle<String> second_part;
363 : DCHECK_LE(unescaped_length, String::kMaxLength);
364 55735 : if (one_byte) {
365 : Handle<SeqOneByteString> dest = isolate->factory()
366 : ->NewRawOneByteString(unescaped_length)
367 2120 : .ToHandleChecked();
368 : DisallowHeapAllocation no_allocation;
369 2120 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
370 12400 : for (int i = start_index; i < length; dest_position++) {
371 : int step;
372 11340 : dest->SeqOneByteStringSet(dest_position,
373 11340 : UnescapeChar(vector, i, length, &step));
374 11340 : i += step;
375 : }
376 1060 : second_part = dest;
377 : } else {
378 : Handle<SeqTwoByteString> dest = isolate->factory()
379 : ->NewRawTwoByteString(unescaped_length)
380 109350 : .ToHandleChecked();
381 : DisallowHeapAllocation no_allocation;
382 109350 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
383 6486372 : for (int i = start_index; i < length; dest_position++) {
384 : int step;
385 6431697 : dest->SeqTwoByteStringSet(dest_position,
386 6431697 : UnescapeChar(vector, i, length, &step));
387 6431697 : i += step;
388 : }
389 54675 : second_part = dest;
390 : }
391 55735 : return isolate->factory()->NewConsString(first_part, second_part);
392 : }
393 :
394 6386067 : bool IsNotEscaped(uint16_t c) {
395 6386067 : if (IsAlphaNumeric(c)) {
396 : return true;
397 : }
398 : // @*_+-./
399 : switch (c) {
400 : case '@':
401 : case '*':
402 : case '_':
403 : case '+':
404 : case '-':
405 : case '.':
406 : case '/':
407 : return true;
408 : default:
409 1737288 : return false;
410 : }
411 : }
412 :
413 : template <typename Char>
414 201541 : static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
415 : Handle<String> source) {
416 : int index;
417 : {
418 : DisallowHeapAllocation no_allocation;
419 : StringSearch<uint8_t, Char> search(isolate, StaticCharVector("%"));
420 403068 : index = search.Search(source->GetCharVector<Char>(no_allocation), 0);
421 201525 : if (index < 0) return source;
422 : }
423 55735 : return UnescapeSlow<Char>(isolate, source, index);
424 : }
425 :
426 : template <typename Char>
427 37314 : static MaybeHandle<String> EscapePrivate(Isolate* isolate,
428 : Handle<String> string) {
429 : DCHECK(string->IsFlat());
430 : int escaped_length = 0;
431 : int length = string->length();
432 :
433 : {
434 : DisallowHeapAllocation no_allocation;
435 74628 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
436 3352653 : for (int i = 0; i < length; i++) {
437 6630678 : uint16_t c = vector[i];
438 3308679 : if (c >= 256) {
439 121473 : escaped_length += 6;
440 3193866 : } else if (IsNotEscaped(c)) {
441 2325222 : escaped_length++;
442 : } else {
443 868644 : escaped_length += 3;
444 : }
445 :
446 : // We don't allow strings that are longer than a maximal length.
447 : DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow.
448 3315339 : if (escaped_length > String::kMaxLength) break; // Provoke exception.
449 : }
450 : }
451 :
452 : // No length change implies no change. Return original string if no change.
453 37314 : if (escaped_length == length) return string;
454 :
455 : Handle<SeqOneByteString> dest;
456 73224 : ASSIGN_RETURN_ON_EXCEPTION(
457 : isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
458 : String);
459 : int dest_position = 0;
460 :
461 : {
462 : DisallowHeapAllocation no_allocation;
463 73224 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
464 3350286 : for (int i = 0; i < length; i++) {
465 6627348 : uint16_t c = vector[i];
466 3308679 : if (c >= 256) {
467 : dest->SeqOneByteStringSet(dest_position, '%');
468 121473 : dest->SeqOneByteStringSet(dest_position + 1, 'u');
469 242946 : dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
470 : dest->SeqOneByteStringSet(dest_position + 3,
471 242946 : HexCharOfValue((c >> 8) & 0xF));
472 : dest->SeqOneByteStringSet(dest_position + 4,
473 242946 : HexCharOfValue((c >> 4) & 0xF));
474 242946 : dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF));
475 121473 : dest_position += 6;
476 3192201 : } else if (IsNotEscaped(c)) {
477 : dest->SeqOneByteStringSet(dest_position, c);
478 2323557 : dest_position++;
479 : } else {
480 : dest->SeqOneByteStringSet(dest_position, '%');
481 1737288 : dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
482 1737288 : dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF));
483 868644 : dest_position += 3;
484 : }
485 : }
486 : }
487 :
488 36612 : return dest;
489 : }
490 :
491 : } // Anonymous namespace
492 :
493 37314 : MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
494 : Handle<String> result;
495 37314 : string = String::Flatten(isolate, string);
496 37314 : return String::IsOneByteRepresentationUnderneath(*string)
497 : ? EscapePrivate<uint8_t>(isolate, string)
498 37314 : : EscapePrivate<uc16>(isolate, string);
499 : }
500 :
501 201542 : MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
502 : Handle<String> result;
503 201542 : string = String::Flatten(isolate, string);
504 201553 : return String::IsOneByteRepresentationUnderneath(*string)
505 : ? UnescapePrivate<uint8_t>(isolate, string)
506 201547 : : UnescapePrivate<uc16>(isolate, string);
507 : }
508 :
509 : } // namespace internal
510 183867 : } // namespace v8
|