// Copyright 2016 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4
5 #include "src/uri.h"
6
7 #include <vector>
8
9 #include "src/char-predicates-inl.h"
10 #include "src/isolate-inl.h"
11 #include "src/string-search.h"
12 #include "src/unicode-inl.h"
13
14 namespace v8 {
15 namespace internal {
16
17 namespace { // anonymous namespace for DecodeURI helper functions
IsReservedPredicate(uc16 c)18 bool IsReservedPredicate(uc16 c) {
19 switch (c) {
20 case '#':
21 case '$':
22 case '&':
23 case '+':
24 case ',':
25 case '/':
26 case ':':
27 case ';':
28 case '=':
29 case '?':
30 case '@':
31 return true;
32 default:
33 return false;
34 }
35 }
36
// Returns true when |octets| is exactly the three-byte UTF-8 encoding of the
// replacement character U+FFFD (0xEF 0xBF 0xBD).
// |length| is checked first, so no octet is read unless |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46
DecodeOctets(const uint8_t * octets,int length,std::vector<uc16> * buffer)47 bool DecodeOctets(const uint8_t* octets, int length,
48 std::vector<uc16>* buffer) {
49 size_t cursor = 0;
50 uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
51 if (value == unibrow::Utf8::kBadChar &&
52 !IsReplacementCharacter(octets, length)) {
53 return false;
54 }
55
56 if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
57 buffer->push_back(value);
58 } else {
59 buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
60 buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
61 }
62 return true;
63 }
64
TwoDigitHex(uc16 character1,uc16 character2)65 int TwoDigitHex(uc16 character1, uc16 character2) {
66 if (character1 > 'f') return -1;
67 int high = HexValue(character1);
68 if (high == -1) return -1;
69 if (character2 > 'f') return -1;
70 int low = HexValue(character2);
71 if (low == -1) return -1;
72 return (high << 4) + low;
73 }
74
75 template <typename T>
AddToBuffer(uc16 decoded,String::FlatContent * uri_content,int index,bool is_uri,std::vector<T> * buffer)76 void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
77 bool is_uri, std::vector<T>* buffer) {
78 if (is_uri && IsReservedPredicate(decoded)) {
79 buffer->push_back('%');
80 uc16 first = uri_content->Get(index + 1);
81 uc16 second = uri_content->Get(index + 2);
82 DCHECK_GT(std::numeric_limits<T>::max(), first);
83 DCHECK_GT(std::numeric_limits<T>::max(), second);
84
85 buffer->push_back(first);
86 buffer->push_back(second);
87 } else {
88 buffer->push_back(decoded);
89 }
90 }
91
IntoTwoByte(int index,bool is_uri,int uri_length,String::FlatContent * uri_content,std::vector<uc16> * buffer)92 bool IntoTwoByte(int index, bool is_uri, int uri_length,
93 String::FlatContent* uri_content, std::vector<uc16>* buffer) {
94 for (int k = index; k < uri_length; k++) {
95 uc16 code = uri_content->Get(k);
96 if (code == '%') {
97 int two_digits;
98 if (k + 2 >= uri_length ||
99 (two_digits = TwoDigitHex(uri_content->Get(k + 1),
100 uri_content->Get(k + 2))) < 0) {
101 return false;
102 }
103 k += 2;
104 uc16 decoded = static_cast<uc16>(two_digits);
105 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
106 uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
107 octets[0] = decoded;
108
109 int number_of_continuation_bytes = 0;
110 while ((decoded << ++number_of_continuation_bytes) & 0x80) {
111 if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
112 return false;
113 }
114 if (uri_content->Get(++k) != '%' ||
115 (two_digits = TwoDigitHex(uri_content->Get(k + 1),
116 uri_content->Get(k + 2))) < 0) {
117 return false;
118 }
119 k += 2;
120 uc16 continuation_byte = static_cast<uc16>(two_digits);
121 octets[number_of_continuation_bytes] = continuation_byte;
122 }
123
124 if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
125 return false;
126 }
127 } else {
128 AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
129 }
130 } else {
131 buffer->push_back(code);
132 }
133 }
134 return true;
135 }
136
IntoOneAndTwoByte(Handle<String> uri,bool is_uri,std::vector<uint8_t> * one_byte_buffer,std::vector<uc16> * two_byte_buffer)137 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
138 std::vector<uint8_t>* one_byte_buffer,
139 std::vector<uc16>* two_byte_buffer) {
140 DisallowHeapAllocation no_gc;
141 String::FlatContent uri_content = uri->GetFlatContent();
142
143 int uri_length = uri->length();
144 for (int k = 0; k < uri_length; k++) {
145 uc16 code = uri_content.Get(k);
146 if (code == '%') {
147 int two_digits;
148 if (k + 2 >= uri_length ||
149 (two_digits = TwoDigitHex(uri_content.Get(k + 1),
150 uri_content.Get(k + 2))) < 0) {
151 return false;
152 }
153
154 uc16 decoded = static_cast<uc16>(two_digits);
155 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
156 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
157 two_byte_buffer);
158 }
159
160 AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
161 k += 2;
162 } else {
163 if (code > unibrow::Utf8::kMaxOneByteChar) {
164 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
165 two_byte_buffer);
166 }
167 one_byte_buffer->push_back(code);
168 }
169 }
170 return true;
171 }
172
173 } // anonymous namespace
174
Decode(Isolate * isolate,Handle<String> uri,bool is_uri)175 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
176 bool is_uri) {
177 uri = String::Flatten(isolate, uri);
178 std::vector<uint8_t> one_byte_buffer;
179 std::vector<uc16> two_byte_buffer;
180
181 if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
182 THROW_NEW_ERROR(isolate, NewURIError(), String);
183 }
184
185 if (two_byte_buffer.empty()) {
186 return isolate->factory()->NewStringFromOneByte(Vector<const uint8_t>(
187 one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
188 }
189
190 Handle<SeqTwoByteString> result;
191 int result_length =
192 static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
193 ASSIGN_RETURN_ON_EXCEPTION(
194 isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
195 String);
196
197 CopyChars(result->GetChars(), one_byte_buffer.data(), one_byte_buffer.size());
198 CopyChars(result->GetChars() + one_byte_buffer.size(), two_byte_buffer.data(),
199 two_byte_buffer.size());
200
201 return result;
202 }
203
204 namespace { // anonymous namespace for EncodeURI helper functions
IsUnescapePredicateInUriComponent(uc16 c)205 bool IsUnescapePredicateInUriComponent(uc16 c) {
206 if (IsAlphaNumeric(c)) {
207 return true;
208 }
209
210 switch (c) {
211 case '!':
212 case '\'':
213 case '(':
214 case ')':
215 case '*':
216 case '-':
217 case '.':
218 case '_':
219 case '~':
220 return true;
221 default:
222 return false;
223 }
224 }
225
IsUriSeparator(uc16 c)226 bool IsUriSeparator(uc16 c) {
227 switch (c) {
228 case '#':
229 case ':':
230 case ';':
231 case '/':
232 case '?':
233 case '$':
234 case '&':
235 case '+':
236 case ',':
237 case '@':
238 case '=':
239 return true;
240 default:
241 return false;
242 }
243 }
244
AddEncodedOctetToBuffer(uint8_t octet,std::vector<uint8_t> * buffer)245 void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
246 buffer->push_back('%');
247 buffer->push_back(HexCharOfValue(octet >> 4));
248 buffer->push_back(HexCharOfValue(octet & 0x0F));
249 }
250
EncodeSingle(uc16 c,std::vector<uint8_t> * buffer)251 void EncodeSingle(uc16 c, std::vector<uint8_t>* buffer) {
252 char s[4] = {};
253 int number_of_bytes;
254 number_of_bytes =
255 unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
256 for (int k = 0; k < number_of_bytes; k++) {
257 AddEncodedOctetToBuffer(s[k], buffer);
258 }
259 }
260
EncodePair(uc16 cc1,uc16 cc2,std::vector<uint8_t> * buffer)261 void EncodePair(uc16 cc1, uc16 cc2, std::vector<uint8_t>* buffer) {
262 char s[4] = {};
263 int number_of_bytes =
264 unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
265 unibrow::Utf16::kNoPreviousCharacter, false);
266 for (int k = 0; k < number_of_bytes; k++) {
267 AddEncodedOctetToBuffer(s[k], buffer);
268 }
269 }
270
271 } // anonymous namespace
272
Encode(Isolate * isolate,Handle<String> uri,bool is_uri)273 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
274 bool is_uri) {
275 uri = String::Flatten(isolate, uri);
276 int uri_length = uri->length();
277 std::vector<uint8_t> buffer;
278 buffer.reserve(uri_length);
279
280 {
281 DisallowHeapAllocation no_gc;
282 String::FlatContent uri_content = uri->GetFlatContent();
283
284 for (int k = 0; k < uri_length; k++) {
285 uc16 cc1 = uri_content.Get(k);
286 if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
287 k++;
288 if (k < uri_length) {
289 uc16 cc2 = uri->Get(k);
290 if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
291 EncodePair(cc1, cc2, &buffer);
292 continue;
293 }
294 }
295 } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
296 if (IsUnescapePredicateInUriComponent(cc1) ||
297 (is_uri && IsUriSeparator(cc1))) {
298 buffer.push_back(cc1);
299 } else {
300 EncodeSingle(cc1, &buffer);
301 }
302 continue;
303 }
304
305 AllowHeapAllocation allocate_error_and_return;
306 THROW_NEW_ERROR(isolate, NewURIError(), String);
307 }
308 }
309
310 return isolate->factory()->NewStringFromOneByte(
311 Vector<const uint8_t>(buffer.data(), static_cast<int>(buffer.size())));
312 }
313
314 namespace { // Anonymous namespace for Escape and Unescape
315
316 template <typename Char>
UnescapeChar(Vector<const Char> vector,int i,int length,int * step)317 int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
318 uint16_t character = vector[i];
319 int32_t hi = 0;
320 int32_t lo = 0;
321 if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
322 (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
323 (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
324 *step = 6;
325 return (hi << 8) + lo;
326 } else if (character == '%' && i <= length - 3 &&
327 (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
328 *step = 3;
329 return lo;
330 } else {
331 *step = 1;
332 return character;
333 }
334 }
335
336 template <typename Char>
UnescapeSlow(Isolate * isolate,Handle<String> string,int start_index)337 MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
338 int start_index) {
339 bool one_byte = true;
340 int length = string->length();
341
342 int unescaped_length = 0;
343 {
344 DisallowHeapAllocation no_allocation;
345 Vector<const Char> vector = string->GetCharVector<Char>();
346 for (int i = start_index; i < length; unescaped_length++) {
347 int step;
348 if (UnescapeChar(vector, i, length, &step) >
349 String::kMaxOneByteCharCode) {
350 one_byte = false;
351 }
352 i += step;
353 }
354 }
355
356 DCHECK(start_index < length);
357 Handle<String> first_part =
358 isolate->factory()->NewProperSubString(string, 0, start_index);
359
360 int dest_position = 0;
361 Handle<String> second_part;
362 DCHECK_LE(unescaped_length, String::kMaxLength);
363 if (one_byte) {
364 Handle<SeqOneByteString> dest = isolate->factory()
365 ->NewRawOneByteString(unescaped_length)
366 .ToHandleChecked();
367 DisallowHeapAllocation no_allocation;
368 Vector<const Char> vector = string->GetCharVector<Char>();
369 for (int i = start_index; i < length; dest_position++) {
370 int step;
371 dest->SeqOneByteStringSet(dest_position,
372 UnescapeChar(vector, i, length, &step));
373 i += step;
374 }
375 second_part = dest;
376 } else {
377 Handle<SeqTwoByteString> dest = isolate->factory()
378 ->NewRawTwoByteString(unescaped_length)
379 .ToHandleChecked();
380 DisallowHeapAllocation no_allocation;
381 Vector<const Char> vector = string->GetCharVector<Char>();
382 for (int i = start_index; i < length; dest_position++) {
383 int step;
384 dest->SeqTwoByteStringSet(dest_position,
385 UnescapeChar(vector, i, length, &step));
386 i += step;
387 }
388 second_part = dest;
389 }
390 return isolate->factory()->NewConsString(first_part, second_part);
391 }
392
IsNotEscaped(uint16_t c)393 bool IsNotEscaped(uint16_t c) {
394 if (IsAlphaNumeric(c)) {
395 return true;
396 }
397 // @*_+-./
398 switch (c) {
399 case '@':
400 case '*':
401 case '_':
402 case '+':
403 case '-':
404 case '.':
405 case '/':
406 return true;
407 default:
408 return false;
409 }
410 }
411
412 template <typename Char>
UnescapePrivate(Isolate * isolate,Handle<String> source)413 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
414 Handle<String> source) {
415 int index;
416 {
417 DisallowHeapAllocation no_allocation;
418 StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
419 index = search.Search(source->GetCharVector<Char>(), 0);
420 if (index < 0) return source;
421 }
422 return UnescapeSlow<Char>(isolate, source, index);
423 }
424
425 template <typename Char>
EscapePrivate(Isolate * isolate,Handle<String> string)426 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
427 Handle<String> string) {
428 DCHECK(string->IsFlat());
429 int escaped_length = 0;
430 int length = string->length();
431
432 {
433 DisallowHeapAllocation no_allocation;
434 Vector<const Char> vector = string->GetCharVector<Char>();
435 for (int i = 0; i < length; i++) {
436 uint16_t c = vector[i];
437 if (c >= 256) {
438 escaped_length += 6;
439 } else if (IsNotEscaped(c)) {
440 escaped_length++;
441 } else {
442 escaped_length += 3;
443 }
444
445 // We don't allow strings that are longer than a maximal length.
446 DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow.
447 if (escaped_length > String::kMaxLength) break; // Provoke exception.
448 }
449 }
450
451 // No length change implies no change. Return original string if no change.
452 if (escaped_length == length) return string;
453
454 Handle<SeqOneByteString> dest;
455 ASSIGN_RETURN_ON_EXCEPTION(
456 isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
457 String);
458 int dest_position = 0;
459
460 {
461 DisallowHeapAllocation no_allocation;
462 Vector<const Char> vector = string->GetCharVector<Char>();
463 for (int i = 0; i < length; i++) {
464 uint16_t c = vector[i];
465 if (c >= 256) {
466 dest->SeqOneByteStringSet(dest_position, '%');
467 dest->SeqOneByteStringSet(dest_position + 1, 'u');
468 dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
469 dest->SeqOneByteStringSet(dest_position + 3,
470 HexCharOfValue((c >> 8) & 0xF));
471 dest->SeqOneByteStringSet(dest_position + 4,
472 HexCharOfValue((c >> 4) & 0xF));
473 dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xF));
474 dest_position += 6;
475 } else if (IsNotEscaped(c)) {
476 dest->SeqOneByteStringSet(dest_position, c);
477 dest_position++;
478 } else {
479 dest->SeqOneByteStringSet(dest_position, '%');
480 dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
481 dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xF));
482 dest_position += 3;
483 }
484 }
485 }
486
487 return dest;
488 }
489
490 } // Anonymous namespace
491
Escape(Isolate * isolate,Handle<String> string)492 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
493 Handle<String> result;
494 string = String::Flatten(isolate, string);
495 return string->IsOneByteRepresentationUnderneath()
496 ? EscapePrivate<uint8_t>(isolate, string)
497 : EscapePrivate<uc16>(isolate, string);
498 }
499
Unescape(Isolate * isolate,Handle<String> string)500 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
501 Handle<String> result;
502 string = String::Flatten(isolate, string);
503 return string->IsOneByteRepresentationUnderneath()
504 ? UnescapePrivate<uint8_t>(isolate, string)
505 : UnescapePrivate<uc16>(isolate, string);
506 }
507
508 } // namespace internal
509 } // namespace v8
510