Line data Source code
1 : // Copyright 2016 the V8 project authors. All rights reserved.
2 : // Use of this source code is governed by a BSD-style license that can be
3 : // found in the LICENSE file.
4 :
5 : #include "src/uri.h"
6 :
7 : #include <vector>
8 :
9 : #include "src/char-predicates-inl.h"
10 : #include "src/isolate-inl.h"
11 : #include "src/string-search.h"
12 : #include "src/unicode-inl.h"
13 :
14 : namespace v8 {
15 : namespace internal {
16 :
17 : namespace { // anonymous namespace for DecodeURI helper functions
18 : bool IsReservedPredicate(uc16 c) {
19 : switch (c) {
20 : case '#':
21 : case '$':
22 : case '&':
23 : case '+':
24 : case ',':
25 : case '/':
26 : case ':':
27 : case ';':
28 : case '=':
29 : case '?':
30 : case '@':
31 : return true;
32 : default:
33 : return false;
34 : }
35 : }
36 :
// Returns true iff |octets| holds exactly the three-byte sequence
// 0xEF 0xBF 0xBD, i.e. the UTF-8 encoding of the replacement character
// U+FFFD from the Unicode Specials table.
// |octets| is only dereferenced when |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46 :
47 5497 : bool DecodeOctets(const uint8_t* octets, int length,
48 : std::vector<uc16>* buffer) {
49 5497 : size_t cursor = 0;
50 10994 : uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
51 5731 : if (value == unibrow::Utf8::kBadChar &&
52 : !IsReplacementCharacter(octets, length)) {
53 : return false;
54 : }
55 :
56 5281 : if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
57 9906 : buffer->push_back(value);
58 : } else {
59 656 : buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
60 656 : buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
61 : }
62 : return true;
63 : }
64 :
65 7329961 : int TwoDigitHex(uc16 character1, uc16 character2) {
66 7329961 : if (character1 > 'f') return -1;
67 7329583 : int high = HexValue(character1);
68 7329583 : if (high == -1) return -1;
69 7329403 : if (character2 > 'f') return -1;
70 7329349 : int low = HexValue(character2);
71 7329349 : if (low == -1) return -1;
72 7329187 : return (high << 4) + low;
73 : }
74 :
75 : template <typename T>
76 2949259 : void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
77 : bool is_uri, std::vector<T>* buffer) {
78 2949259 : if (is_uri && IsReservedPredicate(decoded)) {
79 0 : buffer->push_back('%');
80 0 : uc16 first = uri_content->Get(index + 1);
81 0 : uc16 second = uri_content->Get(index + 2);
82 : DCHECK_GT(std::numeric_limits<T>::max(), first);
83 : DCHECK_GT(std::numeric_limits<T>::max(), second);
84 :
85 0 : buffer->push_back(first);
86 0 : buffer->push_back(second);
87 : } else {
88 5898509 : buffer->push_back(decoded);
89 : }
90 2949259 : }
91 :
92 5459 : bool IntoTwoByte(int index, bool is_uri, int uri_length,
93 : String::FlatContent* uri_content, std::vector<uc16>* buffer) {
94 16039 : for (int k = index; k < uri_length; k++) {
95 5584 : uc16 code = uri_content->Get(k);
96 5584 : if (code == '%') {
97 : int two_digits;
98 11168 : if (k + 2 >= uri_length ||
99 11168 : (two_digits = TwoDigitHex(uri_content->Get(k + 1),
100 : uri_content->Get(k + 2))) < 0) {
101 294 : return false;
102 : }
103 : k += 2;
104 5584 : uc16 decoded = static_cast<uc16>(two_digits);
105 5584 : if (decoded > unibrow::Utf8::kMaxOneByteChar) {
106 : uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
107 5575 : octets[0] = decoded;
108 :
109 : int number_of_continuation_bytes = 0;
110 27611 : while ((decoded << ++number_of_continuation_bytes) & 0x80) {
111 11096 : if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
112 294 : return false;
113 : }
114 33072 : if (uri_content->Get(++k) != '%' ||
115 22036 : (two_digits = TwoDigitHex(uri_content->Get(k + 1),
116 : uri_content->Get(k + 2))) < 0) {
117 : return false;
118 : }
119 : k += 2;
120 : uc16 continuation_byte = static_cast<uc16>(two_digits);
121 11018 : octets[number_of_continuation_bytes] = continuation_byte;
122 : }
123 :
124 5497 : if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
125 : return false;
126 : }
127 : } else {
128 9 : AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
129 : }
130 : } else {
131 0 : buffer->push_back(code);
132 : }
133 : }
134 : return true;
135 : }
136 :
137 5670 : bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
138 : std::vector<uint8_t>* one_byte_buffer,
139 : std::vector<uc16>* two_byte_buffer) {
140 : DisallowHeapAllocation no_gc;
141 5670 : String::FlatContent uri_content = uri->GetFlatContent(no_gc);
142 :
143 : int uri_length = uri->length();
144 40115402 : for (int k = 0; k < uri_length; k++) {
145 : uc16 code = uri_content.Get(k);
146 20060343 : if (code == '%') {
147 : int two_digits;
148 5909436 : if (k + 2 >= uri_length ||
149 5909418 : (two_digits = TwoDigitHex(uri_content.Get(k + 1),
150 : uri_content.Get(k + 2))) < 0) {
151 : return false;
152 : }
153 :
154 2954709 : uc16 decoded = static_cast<uc16>(two_digits);
155 2954709 : if (decoded > unibrow::Utf8::kMaxOneByteChar) {
156 5459 : return IntoTwoByte(k, is_uri, uri_length, &uri_content,
157 5459 : two_byte_buffer);
158 : }
159 :
160 2949250 : AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
161 : k += 2;
162 : } else {
163 17105616 : if (code > unibrow::Utf8::kMaxOneByteChar) {
164 0 : return IntoTwoByte(k, is_uri, uri_length, &uri_content,
165 0 : two_byte_buffer);
166 : }
167 34211232 : one_byte_buffer->push_back(code);
168 : }
169 : }
170 : return true;
171 : }
172 :
173 : } // anonymous namespace
174 :
175 5670 : MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
176 : bool is_uri) {
177 5670 : uri = String::Flatten(isolate, uri);
178 : std::vector<uint8_t> one_byte_buffer;
179 : std::vector<uc16> two_byte_buffer;
180 :
181 5670 : if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
182 312 : THROW_NEW_ERROR(isolate, NewURIError(), String);
183 : }
184 :
185 5358 : if (two_byte_buffer.empty()) {
186 : return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>(
187 386 : one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
188 : }
189 :
190 : Handle<SeqTwoByteString> result;
191 : int result_length =
192 10330 : static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
193 10330 : ASSIGN_RETURN_ON_EXCEPTION(
194 : isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
195 : String);
196 :
197 : DisallowHeapAllocation no_gc;
198 : CopyChars(result->GetChars(no_gc), one_byte_buffer.data(),
199 : one_byte_buffer.size());
200 5165 : CopyChars(result->GetChars(no_gc) + one_byte_buffer.size(),
201 : two_byte_buffer.data(), two_byte_buffer.size());
202 :
203 5165 : return result;
204 : }
205 :
206 : namespace { // anonymous namespace for EncodeURI helper functions
207 20916981 : bool IsUnescapePredicateInUriComponent(uc16 c) {
208 20916981 : if (IsAlphaNumeric(c)) {
209 : return true;
210 : }
211 :
212 4760638 : switch (c) {
213 : case '!':
214 : case '\'':
215 : case '(':
216 : case ')':
217 : case '*':
218 : case '-':
219 : case '.':
220 : case '_':
221 : case '~':
222 : return true;
223 : default:
224 2956080 : return false;
225 : }
226 : }
227 :
228 : bool IsUriSeparator(uc16 c) {
229 : switch (c) {
230 : case '#':
231 : case ':':
232 : case ';':
233 : case '/':
234 : case '?':
235 : case '$':
236 : case '&':
237 : case '+':
238 : case ',':
239 : case '@':
240 : case '=':
241 : return true;
242 : default:
243 : return false;
244 : }
245 : }
246 :
247 2968661 : void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
248 5937322 : buffer->push_back('%');
249 8905983 : buffer->push_back(HexCharOfValue(octet >> 4));
250 8905983 : buffer->push_back(HexCharOfValue(octet & 0x0F));
251 2968661 : }
252 :
253 2955981 : void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) {
254 2955981 : char s[4] = {};
255 : int number_of_bytes;
256 : number_of_bytes =
257 2955981 : unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
258 8890791 : for (int k = 0; k < number_of_bytes; k++) {
259 2967405 : AddEncodedOctetToBuffer(s[k], buffer);
260 : }
261 2955981 : }
262 :
263 314 : void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) {
264 314 : char s[4] = {};
265 : int number_of_bytes =
266 314 : unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
267 314 : unibrow::Utf16::kNoPreviousCharacter, false);
268 2826 : for (int k = 0; k < number_of_bytes; k++) {
269 1256 : AddEncodedOctetToBuffer(s[k], buffer);
270 : }
271 314 : }
272 :
273 : } // anonymous namespace
274 :
275 150128 : MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
276 : bool is_uri) {
277 150128 : uri = String::Flatten(isolate, uri);
278 : int uri_length = uri->length();
279 : std::vector<uint8_t> buffer;
280 150131 : buffer.reserve(uri_length);
281 :
282 : {
283 : DisallowHeapAllocation no_gc;
284 150138 : String::FlatContent uri_content = uri->GetFlatContent(no_gc);
285 :
286 41984725 : for (int k = 0; k < uri_length; k++) {
287 : uc16 cc1 = uri_content.Get(k);
288 20922226 : if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
289 5120 : k++;
290 5120 : if (k < uri_length) {
291 : uc16 cc2 = uri->Get(k);
292 5120 : if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
293 314 : EncodePair(cc1, cc2, &buffer);
294 314 : continue;
295 : }
296 : }
297 20917106 : } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
298 41833966 : if (IsUnescapePredicateInUriComponent(cc1) ||
299 2951874 : (is_uri && IsUriSeparator(cc1))) {
300 35922015 : buffer.push_back(cc1);
301 : } else {
302 2955981 : EncodeSingle(cc1, &buffer);
303 : }
304 : continue;
305 : }
306 :
307 : AllowHeapAllocation allocate_error_and_return;
308 4931 : THROW_NEW_ERROR(isolate, NewURIError(), String);
309 : }
310 : }
311 :
312 145193 : return isolate->factory()->NewStringFromOneByte(VectorOf(buffer));
313 : }
314 :
315 : namespace { // Anonymous namespace for Escape and Unescape
316 :
317 : template <typename Char>
318 12884548 : int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
319 25769096 : uint16_t character = vector[i];
320 : int32_t hi = 0;
321 : int32_t lo = 0;
322 17579480 : if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
323 1336878 : (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
324 1336446 : (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
325 445338 : *step = 6;
326 445338 : return (hi << 8) + lo;
327 15906752 : } else if (character == '%' && i <= length - 3 &&
328 10402626 : (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
329 3467056 : *step = 3;
330 : return lo;
331 : } else {
332 8972154 : *step = 1;
333 8972154 : return character;
334 : }
335 : }
336 :
337 : template <typename Char>
338 55688 : MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
339 : int start_index) {
340 : bool one_byte = true;
341 : int length = string->length();
342 :
343 : int unescaped_length = 0;
344 : {
345 : DisallowHeapAllocation no_allocation;
346 111376 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
347 12940236 : for (int i = start_index; i < length; unescaped_length++) {
348 : int step;
349 6442274 : if (UnescapeChar(vector, i, length, &step) >
350 : String::kMaxOneByteCharCode) {
351 : one_byte = false;
352 : }
353 6442274 : i += step;
354 : }
355 : }
356 :
357 : DCHECK(start_index < length);
358 : Handle<String> first_part =
359 55688 : isolate->factory()->NewProperSubString(string, 0, start_index);
360 :
361 : int dest_position = 0;
362 : Handle<String> second_part;
363 : DCHECK_LE(unescaped_length, String::kMaxLength);
364 55688 : if (one_byte) {
365 : Handle<SeqOneByteString> dest = isolate->factory()
366 : ->NewRawOneByteString(unescaped_length)
367 2026 : .ToHandleChecked();
368 : DisallowHeapAllocation no_allocation;
369 2026 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
370 22167 : for (int i = start_index; i < length; dest_position++) {
371 : int step;
372 : dest->SeqOneByteStringSet(dest_position,
373 10577 : UnescapeChar(vector, i, length, &step));
374 10577 : i += step;
375 : }
376 1013 : second_part = dest;
377 : } else {
378 : Handle<SeqTwoByteString> dest = isolate->factory()
379 : ->NewRawTwoByteString(unescaped_length)
380 109350 : .ToHandleChecked();
381 : DisallowHeapAllocation no_allocation;
382 109350 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
383 12918069 : for (int i = start_index; i < length; dest_position++) {
384 : int step;
385 : dest->SeqTwoByteStringSet(dest_position,
386 6431697 : UnescapeChar(vector, i, length, &step));
387 6431697 : i += step;
388 : }
389 54675 : second_part = dest;
390 : }
391 55688 : return isolate->factory()->NewConsString(first_part, second_part);
392 : }
393 :
394 6385781 : bool IsNotEscaped(uint16_t c) {
395 6385781 : if (IsAlphaNumeric(c)) {
396 : return true;
397 : }
398 : // @*_+-./
399 : switch (c) {
400 : case '@':
401 : case '*':
402 : case '_':
403 : case '+':
404 : case '-':
405 : case '.':
406 : case '/':
407 : return true;
408 : default:
409 1737092 : return false;
410 : }
411 : }
412 :
413 : template <typename Char>
414 195088 : static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
415 : Handle<String> source) {
416 : int index;
417 : {
418 : DisallowHeapAllocation no_allocation;
419 : StringSearch<uint8_t, Char> search(isolate, StaticCharVector("%"));
420 390166 : index = search.Search(source->GetCharVector<Char>(no_allocation), 0);
421 195078 : if (index < 0) return source;
422 : }
423 55688 : return UnescapeSlow<Char>(isolate, source, index);
424 : }
425 :
426 : template <typename Char>
427 37287 : static MaybeHandle<String> EscapePrivate(Isolate* isolate,
428 : Handle<String> string) {
429 : DCHECK(string->IsFlat());
430 : int escaped_length = 0;
431 : int length = string->length();
432 :
433 : {
434 : DisallowHeapAllocation no_allocation;
435 74574 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
436 6667679 : for (int i = 0; i < length; i++) {
437 6630392 : uint16_t c = vector[i];
438 3308679 : if (c >= 256) {
439 121473 : escaped_length += 6;
440 3193723 : } else if (IsNotEscaped(c)) {
441 2325177 : escaped_length++;
442 : } else {
443 868546 : escaped_length += 3;
444 : }
445 :
446 : // We don't allow strings that are longer than a maximal length.
447 : DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow.
448 3315196 : if (escaped_length > String::kMaxLength) break; // Provoke exception.
449 : }
450 : }
451 :
452 : // No length change implies no change. Return original string if no change.
453 37287 : if (escaped_length == length) return string;
454 :
455 : Handle<SeqOneByteString> dest;
456 73170 : ASSIGN_RETURN_ON_EXCEPTION(
457 : isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
458 : String);
459 : int dest_position = 0;
460 :
461 : {
462 : DisallowHeapAllocation no_allocation;
463 73170 : Vector<const Char> vector = string->GetCharVector<Char>(no_allocation);
464 6663647 : for (int i = 0; i < length; i++) {
465 6627062 : uint16_t c = vector[i];
466 3308679 : if (c >= 256) {
467 : dest->SeqOneByteStringSet(dest_position, '%');
468 121473 : dest->SeqOneByteStringSet(dest_position + 1, 'u');
469 242946 : dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
470 121473 : dest->SeqOneByteStringSet(dest_position + 3,
471 121473 : HexCharOfValue((c >> 8) & 0xF));
472 121473 : dest->SeqOneByteStringSet(dest_position + 4,
473 121473 : HexCharOfValue((c >> 4) & 0xF));
474 242946 : dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF));
475 121473 : dest_position += 6;
476 3192058 : } else if (IsNotEscaped(c)) {
477 : dest->SeqOneByteStringSet(dest_position, c);
478 2323512 : dest_position++;
479 : } else {
480 : dest->SeqOneByteStringSet(dest_position, '%');
481 1737092 : dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
482 1737092 : dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF));
483 868546 : dest_position += 3;
484 : }
485 : }
486 : }
487 :
488 36585 : return dest;
489 : }
490 :
491 : } // Anonymous namespace
492 :
493 37287 : MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
494 : Handle<String> result;
495 37287 : string = String::Flatten(isolate, string);
496 37287 : return String::IsOneByteRepresentationUnderneath(*string)
497 : ? EscapePrivate<uint8_t>(isolate, string)
498 37287 : : EscapePrivate<uc16>(isolate, string);
499 : }
500 :
501 195086 : MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
502 : Handle<String> result;
503 195086 : string = String::Flatten(isolate, string);
504 195089 : return String::IsOneByteRepresentationUnderneath(*string)
505 : ? UnescapePrivate<uint8_t>(isolate, string)
506 195087 : : UnescapePrivate<uc16>(isolate, string);
507 : }
508 :
509 : } // namespace internal
510 122004 : } // namespace v8
|