1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/strings/uri.h"
6
7 #include <vector>
8
9 #include "src/execution/isolate-inl.h"
10 #include "src/strings/char-predicates-inl.h"
11 #include "src/strings/string-search.h"
12 #include "src/strings/unicode-inl.h"
13
14 namespace v8 {
15 namespace internal {
16
17 namespace { // anonymous namespace for DecodeURI helper functions
IsReservedPredicate(base::uc16 c)18 bool IsReservedPredicate(base::uc16 c) {
19 switch (c) {
20 case '#':
21 case '$':
22 case '&':
23 case '+':
24 case ',':
25 case '/':
26 case ':':
27 case ';':
28 case '=':
29 case '?':
30 case '@':
31 return true;
32 default:
33 return false;
34 }
35 }
36
// Tells whether the |length| bytes starting at |octets| are exactly the UTF-8
// encoding of U+FFFD (the replacement character), i.e. 0xEF 0xBF 0xBD.
// |octets| is only dereferenced when |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xEF && octets[1] == 0xBF &&
         octets[2] == 0xBD;
}
46
DecodeOctets(const uint8_t * octets,int length,std::vector<base::uc16> * buffer)47 bool DecodeOctets(const uint8_t* octets, int length,
48 std::vector<base::uc16>* buffer) {
49 size_t cursor = 0;
50 base::uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
51 if (value == unibrow::Utf8::kBadChar &&
52 !IsReplacementCharacter(octets, length)) {
53 return false;
54 }
55
56 if (value <=
57 static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
58 buffer->push_back(value);
59 } else {
60 buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
61 buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
62 }
63 return true;
64 }
65
TwoDigitHex(base::uc16 character1,base::uc16 character2)66 int TwoDigitHex(base::uc16 character1, base::uc16 character2) {
67 if (character1 > 'f') return -1;
68 int high = base::HexValue(character1);
69 if (high == -1) return -1;
70 if (character2 > 'f') return -1;
71 int low = base::HexValue(character2);
72 if (low == -1) return -1;
73 return (high << 4) + low;
74 }
75
76 template <typename T>
AddToBuffer(base::uc16 decoded,String::FlatContent * uri_content,int index,bool is_uri,std::vector<T> * buffer)77 void AddToBuffer(base::uc16 decoded, String::FlatContent* uri_content,
78 int index, bool is_uri, std::vector<T>* buffer) {
79 if (is_uri && IsReservedPredicate(decoded)) {
80 buffer->push_back('%');
81 base::uc16 first = uri_content->Get(index + 1);
82 base::uc16 second = uri_content->Get(index + 2);
83 DCHECK_GT(std::numeric_limits<T>::max(), first);
84 DCHECK_GT(std::numeric_limits<T>::max(), second);
85
86 buffer->push_back(first);
87 buffer->push_back(second);
88 } else {
89 buffer->push_back(decoded);
90 }
91 }
92
// Decodes |uri_content| from position |index| up to (excluding) |uri_length|
// and appends the result to |buffer| as two-byte characters.
//
// Characters other than '%' are copied through. A "%XX" escape whose value is
// at most unibrow::Utf8::kMaxOneByteChar is handed to AddToBuffer (which may
// keep reserved characters escaped when |is_uri| is set). A larger value is
// treated as the first byte of a multi-byte UTF-8 sequence whose remaining
// bytes must follow as further "%XX" escapes.
//
// Returns false on a truncated or non-hexadecimal escape, on a missing
// continuation escape, or when DecodeOctets rejects the collected bytes.
bool IntoTwoByte(int index, bool is_uri, int uri_length,
                 String::FlatContent* uri_content,
                 std::vector<base::uc16>* buffer) {
  for (int k = index; k < uri_length; k++) {
    base::uc16 code = uri_content->Get(k);
    if (code == '%') {
      int two_digits;
      // The escape needs two more characters after the '%', and both have to
      // be hexadecimal digits.
      if (k + 2 >= uri_length ||
          (two_digits = TwoDigitHex(uri_content->Get(k + 1),
                                    uri_content->Get(k + 2))) < 0) {
        return false;
      }
      // From here on k addresses the last digit of the escape just read.
      k += 2;
      base::uc16 decoded = static_cast<base::uc16>(two_digits);
      if (decoded > unibrow::Utf8::kMaxOneByteChar) {
        uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
        octets[0] = decoded;

        int number_of_continuation_bytes = 0;
        // Shifting left by n and masking with 0x80 tests bit (7 - n) of the
        // first byte, so the loop runs once for each consecutive set bit below
        // the top bit. Because of the pre-increment, the counter equals the
        // total number of bytes stored in |octets| once the loop exits.
        while ((decoded << ++number_of_continuation_bytes) & 0x80) {
          // Refuse sequences longer than four bytes, and make sure a complete
          // "%XX" (positions k + 1 .. k + 3) is still available.
          if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
            return false;
          }
          if (uri_content->Get(++k) != '%' ||
              (two_digits = TwoDigitHex(uri_content->Get(k + 1),
                                        uri_content->Get(k + 2))) < 0) {
            return false;
          }
          // Again leave k on the last digit of the escape just consumed.
          k += 2;
          base::uc16 continuation_byte = static_cast<base::uc16>(two_digits);
          octets[number_of_continuation_bytes] = continuation_byte;
        }

        if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
          return false;
        }
      } else {
        // k - 2 is the position of the '%' that started this escape.
        AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
      }
    } else {
      buffer->push_back(code);
    }
  }
  return true;
}
138
IntoOneAndTwoByte(Handle<String> uri,bool is_uri,std::vector<uint8_t> * one_byte_buffer,std::vector<base::uc16> * two_byte_buffer)139 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
140 std::vector<uint8_t>* one_byte_buffer,
141 std::vector<base::uc16>* two_byte_buffer) {
142 DisallowGarbageCollection no_gc;
143 String::FlatContent uri_content = uri->GetFlatContent(no_gc);
144
145 int uri_length = uri->length();
146 for (int k = 0; k < uri_length; k++) {
147 base::uc16 code = uri_content.Get(k);
148 if (code == '%') {
149 int two_digits;
150 if (k + 2 >= uri_length ||
151 (two_digits = TwoDigitHex(uri_content.Get(k + 1),
152 uri_content.Get(k + 2))) < 0) {
153 return false;
154 }
155
156 base::uc16 decoded = static_cast<base::uc16>(two_digits);
157 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
158 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
159 two_byte_buffer);
160 }
161
162 AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
163 k += 2;
164 } else {
165 if (code > unibrow::Utf8::kMaxOneByteChar) {
166 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
167 two_byte_buffer);
168 }
169 one_byte_buffer->push_back(code);
170 }
171 }
172 return true;
173 }
174
175 } // anonymous namespace
176
Decode(Isolate * isolate,Handle<String> uri,bool is_uri)177 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
178 bool is_uri) {
179 uri = String::Flatten(isolate, uri);
180 std::vector<uint8_t> one_byte_buffer;
181 std::vector<base::uc16> two_byte_buffer;
182
183 if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
184 THROW_NEW_ERROR(isolate, NewURIError(), String);
185 }
186
187 if (two_byte_buffer.empty()) {
188 return isolate->factory()->NewStringFromOneByte(base::Vector<const uint8_t>(
189 one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
190 }
191
192 Handle<SeqTwoByteString> result;
193 int result_length =
194 static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
195 ASSIGN_RETURN_ON_EXCEPTION(
196 isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
197 String);
198
199 DisallowGarbageCollection no_gc;
200 base::uc16* chars = result->GetChars(no_gc);
201 if (!one_byte_buffer.empty()) {
202 CopyChars(chars, one_byte_buffer.data(), one_byte_buffer.size());
203 chars += one_byte_buffer.size();
204 }
205 if (!two_byte_buffer.empty()) {
206 CopyChars(chars, two_byte_buffer.data(), two_byte_buffer.size());
207 }
208
209 return result;
210 }
211
212 namespace { // anonymous namespace for EncodeURI helper functions
IsUnescapePredicateInUriComponent(base::uc16 c)213 bool IsUnescapePredicateInUriComponent(base::uc16 c) {
214 if (IsAlphaNumeric(c)) {
215 return true;
216 }
217
218 switch (c) {
219 case '!':
220 case '\'':
221 case '(':
222 case ')':
223 case '*':
224 case '-':
225 case '.':
226 case '_':
227 case '~':
228 return true;
229 default:
230 return false;
231 }
232 }
233
IsUriSeparator(base::uc16 c)234 bool IsUriSeparator(base::uc16 c) {
235 switch (c) {
236 case '#':
237 case ':':
238 case ';':
239 case '/':
240 case '?':
241 case '$':
242 case '&':
243 case '+':
244 case ',':
245 case '@':
246 case '=':
247 return true;
248 default:
249 return false;
250 }
251 }
252
AddEncodedOctetToBuffer(uint8_t octet,std::vector<uint8_t> * buffer)253 void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
254 buffer->push_back('%');
255 buffer->push_back(base::HexCharOfValue(octet >> 4));
256 buffer->push_back(base::HexCharOfValue(octet & 0x0F));
257 }
258
EncodeSingle(base::uc16 c,std::vector<uint8_t> * buffer)259 void EncodeSingle(base::uc16 c, std::vector<uint8_t>* buffer) {
260 char s[4] = {};
261 int number_of_bytes;
262 number_of_bytes =
263 unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
264 for (int k = 0; k < number_of_bytes; k++) {
265 AddEncodedOctetToBuffer(s[k], buffer);
266 }
267 }
268
EncodePair(base::uc16 cc1,base::uc16 cc2,std::vector<uint8_t> * buffer)269 void EncodePair(base::uc16 cc1, base::uc16 cc2, std::vector<uint8_t>* buffer) {
270 char s[4] = {};
271 int number_of_bytes =
272 unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
273 unibrow::Utf16::kNoPreviousCharacter, false);
274 for (int k = 0; k < number_of_bytes; k++) {
275 AddEncodedOctetToBuffer(s[k], buffer);
276 }
277 }
278
279 } // anonymous namespace
280
Encode(Isolate * isolate,Handle<String> uri,bool is_uri)281 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
282 bool is_uri) {
283 uri = String::Flatten(isolate, uri);
284 int uri_length = uri->length();
285 std::vector<uint8_t> buffer;
286 buffer.reserve(uri_length);
287
288 bool throw_error = false;
289 {
290 DisallowGarbageCollection no_gc;
291 String::FlatContent uri_content = uri->GetFlatContent(no_gc);
292
293 for (int k = 0; k < uri_length; k++) {
294 base::uc16 cc1 = uri_content.Get(k);
295 if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
296 k++;
297 if (k < uri_length) {
298 base::uc16 cc2 = uri->Get(k);
299 if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
300 EncodePair(cc1, cc2, &buffer);
301 continue;
302 }
303 }
304 } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
305 if (IsUnescapePredicateInUriComponent(cc1) ||
306 (is_uri && IsUriSeparator(cc1))) {
307 buffer.push_back(cc1);
308 } else {
309 EncodeSingle(cc1, &buffer);
310 }
311 continue;
312 }
313
314 // String::FlatContent DCHECKs its contents did not change during its
315 // lifetime. Throwing the error inside the loop may cause GC and move the
316 // string contents.
317 throw_error = true;
318 break;
319 }
320 }
321
322 if (throw_error) THROW_NEW_ERROR(isolate, NewURIError(), String);
323 return isolate->factory()->NewStringFromOneByte(base::VectorOf(buffer));
324 }
325
326 namespace { // Anonymous namespace for Escape and Unescape
327
328 template <typename Char>
UnescapeChar(base::Vector<const Char> vector,int i,int length,int * step)329 int UnescapeChar(base::Vector<const Char> vector, int i, int length,
330 int* step) {
331 uint16_t character = vector[i];
332 int32_t hi = 0;
333 int32_t lo = 0;
334 if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
335 (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
336 (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
337 *step = 6;
338 return (hi << 8) + lo;
339 } else if (character == '%' && i <= length - 3 &&
340 (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
341 *step = 3;
342 return lo;
343 } else {
344 *step = 1;
345 return character;
346 }
347 }
348
// Unescapes |string| from |start_index| onwards. The characters before
// |start_index| are reused as a substring of |string|; the unescaped tail is
// written into a freshly allocated sequential string, and the two parts are
// returned as a cons string.
//
// The tail is scanned twice: once to determine its unescaped length and
// whether any unescaped character exceeds String::kMaxOneByteCharCode, and
// once more to fill the destination string of the chosen width.
template <typename Char>
MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
                                 int start_index) {
  bool one_byte = true;
  int length = string->length();

  int unescaped_length = 0;
  {
    // First pass: count the output characters and pick the result width.
    DisallowGarbageCollection no_gc;
    base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
    for (int i = start_index; i < length; unescaped_length++) {
      int step;
      if (UnescapeChar(vector, i, length, &step) >
          String::kMaxOneByteCharCode) {
        one_byte = false;
      }
      i += step;
    }
  }

  DCHECK(start_index < length);
  Handle<String> first_part =
      isolate->factory()->NewProperSubString(string, 0, start_index);

  int dest_position = 0;
  Handle<String> second_part;
  DCHECK_LE(unescaped_length, String::kMaxLength);
  if (one_byte) {
    // The result of the allocation is unwrapped with ToHandleChecked(); no
    // failure path is handled here.
    Handle<SeqOneByteString> dest = isolate->factory()
                                        ->NewRawOneByteString(unescaped_length)
                                        .ToHandleChecked();
    // The character vector is fetched again after the allocations above,
    // under a new no-GC scope.
    DisallowGarbageCollection no_gc;
    base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
    // Second pass: write one output character per iteration.
    for (int i = start_index; i < length; dest_position++) {
      int step;
      dest->SeqOneByteStringSet(dest_position,
                                UnescapeChar(vector, i, length, &step));
      i += step;
    }
    second_part = dest;
  } else {
    // Same as the branch above, but with a two-byte destination.
    Handle<SeqTwoByteString> dest = isolate->factory()
                                        ->NewRawTwoByteString(unescaped_length)
                                        .ToHandleChecked();
    DisallowGarbageCollection no_gc;
    base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
    for (int i = start_index; i < length; dest_position++) {
      int step;
      dest->SeqTwoByteStringSet(dest_position,
                                UnescapeChar(vector, i, length, &step));
      i += step;
    }
    second_part = dest;
  }
  return isolate->factory()->NewConsString(first_part, second_part);
}
405
IsNotEscaped(uint16_t c)406 bool IsNotEscaped(uint16_t c) {
407 if (IsAlphaNumeric(c)) {
408 return true;
409 }
410 // @*_+-./
411 switch (c) {
412 case '@':
413 case '*':
414 case '_':
415 case '+':
416 case '-':
417 case '.':
418 case '/':
419 return true;
420 default:
421 return false;
422 }
423 }
424
425 template <typename Char>
UnescapePrivate(Isolate * isolate,Handle<String> source)426 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
427 Handle<String> source) {
428 int index;
429 {
430 DisallowGarbageCollection no_gc;
431 StringSearch<uint8_t, Char> search(isolate, base::StaticOneByteVector("%"));
432 index = search.Search(source->GetCharVector<Char>(no_gc), 0);
433 if (index < 0) return source;
434 }
435 return UnescapeSlow<Char>(isolate, source, index);
436 }
437
438 template <typename Char>
EscapePrivate(Isolate * isolate,Handle<String> string)439 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
440 Handle<String> string) {
441 DCHECK(string->IsFlat());
442 int escaped_length = 0;
443 int length = string->length();
444
445 {
446 DisallowGarbageCollection no_gc;
447 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
448 for (int i = 0; i < length; i++) {
449 uint16_t c = vector[i];
450 if (c >= 256) {
451 escaped_length += 6;
452 } else if (IsNotEscaped(c)) {
453 escaped_length++;
454 } else {
455 escaped_length += 3;
456 }
457
458 // We don't allow strings that are longer than a maximal length.
459 DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6); // Cannot overflow.
460 if (escaped_length > String::kMaxLength) break; // Provoke exception.
461 }
462 }
463
464 // No length change implies no change. Return original string if no change.
465 if (escaped_length == length) return string;
466
467 Handle<SeqOneByteString> dest;
468 ASSIGN_RETURN_ON_EXCEPTION(
469 isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
470 String);
471 int dest_position = 0;
472
473 {
474 DisallowGarbageCollection no_gc;
475 base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
476 for (int i = 0; i < length; i++) {
477 uint16_t c = vector[i];
478 if (c >= 256) {
479 dest->SeqOneByteStringSet(dest_position, '%');
480 dest->SeqOneByteStringSet(dest_position + 1, 'u');
481 dest->SeqOneByteStringSet(dest_position + 2,
482 base::HexCharOfValue(c >> 12));
483 dest->SeqOneByteStringSet(dest_position + 3,
484 base::HexCharOfValue((c >> 8) & 0xF));
485 dest->SeqOneByteStringSet(dest_position + 4,
486 base::HexCharOfValue((c >> 4) & 0xF));
487 dest->SeqOneByteStringSet(dest_position + 5,
488 base::HexCharOfValue(c & 0xF));
489 dest_position += 6;
490 } else if (IsNotEscaped(c)) {
491 dest->SeqOneByteStringSet(dest_position, c);
492 dest_position++;
493 } else {
494 dest->SeqOneByteStringSet(dest_position, '%');
495 dest->SeqOneByteStringSet(dest_position + 1,
496 base::HexCharOfValue(c >> 4));
497 dest->SeqOneByteStringSet(dest_position + 2,
498 base::HexCharOfValue(c & 0xF));
499 dest_position += 3;
500 }
501 }
502 }
503
504 return dest;
505 }
506
507 } // anonymous namespace
508
Escape(Isolate * isolate,Handle<String> string)509 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
510 Handle<String> result;
511 string = String::Flatten(isolate, string);
512 return String::IsOneByteRepresentationUnderneath(*string)
513 ? EscapePrivate<uint8_t>(isolate, string)
514 : EscapePrivate<base::uc16>(isolate, string);
515 }
516
Unescape(Isolate * isolate,Handle<String> string)517 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
518 Handle<String> result;
519 string = String::Flatten(isolate, string);
520 return String::IsOneByteRepresentationUnderneath(*string)
521 ? UnescapePrivate<uint8_t>(isolate, string)
522 : UnescapePrivate<base::uc16>(isolate, string);
523 }
524
525 } // namespace internal
526 } // namespace v8
527