1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/uri.h"
6
7 #include "src/char-predicates-inl.h"
8 #include "src/handles.h"
9 #include "src/isolate-inl.h"
10 #include "src/list.h"
11 #include "src/string-search.h"
12
13 namespace v8 {
14 namespace internal {
15
16 namespace { // anonymous namespace for DecodeURI helper functions
IsReservedPredicate(uc16 c)17 bool IsReservedPredicate(uc16 c) {
18 switch (c) {
19 case '#':
20 case '$':
21 case '&':
22 case '+':
23 case ',':
24 case '/':
25 case ':':
26 case ';':
27 case '=':
28 case '?':
29 case '@':
30 return true;
31 default:
32 return false;
33 }
34 }
35
// Returns true if the |length| bytes at |octets| are exactly the UTF-8
// encoding of the replacement character U+FFFD, i.e. 0xEF 0xBF 0xBD.
// |octets| is not read unless |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  return length == 3 && octets[0] == 0xef && octets[1] == 0xbf &&
         octets[2] == 0xbd;
}
45
DecodeOctets(const uint8_t * octets,int length,List<uc16> * buffer)46 bool DecodeOctets(const uint8_t* octets, int length, List<uc16>* buffer) {
47 size_t cursor = 0;
48 uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
49 if (value == unibrow::Utf8::kBadChar &&
50 !IsReplacementCharacter(octets, length)) {
51 return false;
52 }
53
54 if (value <= static_cast<uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
55 buffer->Add(value);
56 } else {
57 buffer->Add(unibrow::Utf16::LeadSurrogate(value));
58 buffer->Add(unibrow::Utf16::TrailSurrogate(value));
59 }
60 return true;
61 }
62
TwoDigitHex(uc16 character1,uc16 character2)63 int TwoDigitHex(uc16 character1, uc16 character2) {
64 if (character1 > 'f') return -1;
65 int high = HexValue(character1);
66 if (high == -1) return -1;
67 if (character2 > 'f') return -1;
68 int low = HexValue(character2);
69 if (low == -1) return -1;
70 return (high << 4) + low;
71 }
72
73 template <typename T>
AddToBuffer(uc16 decoded,String::FlatContent * uri_content,int index,bool is_uri,List<T> * buffer)74 void AddToBuffer(uc16 decoded, String::FlatContent* uri_content, int index,
75 bool is_uri, List<T>* buffer) {
76 if (is_uri && IsReservedPredicate(decoded)) {
77 buffer->Add('%');
78 uc16 first = uri_content->Get(index + 1);
79 uc16 second = uri_content->Get(index + 2);
80 DCHECK_GT(std::numeric_limits<T>::max(), first);
81 DCHECK_GT(std::numeric_limits<T>::max(), second);
82
83 buffer->Add(first);
84 buffer->Add(second);
85 } else {
86 buffer->Add(decoded);
87 }
88 }
89
IntoTwoByte(int index,bool is_uri,int uri_length,String::FlatContent * uri_content,List<uc16> * buffer)90 bool IntoTwoByte(int index, bool is_uri, int uri_length,
91 String::FlatContent* uri_content, List<uc16>* buffer) {
92 for (int k = index; k < uri_length; k++) {
93 uc16 code = uri_content->Get(k);
94 if (code == '%') {
95 int two_digits;
96 if (k + 2 >= uri_length ||
97 (two_digits = TwoDigitHex(uri_content->Get(k + 1),
98 uri_content->Get(k + 2))) < 0) {
99 return false;
100 }
101 k += 2;
102 uc16 decoded = static_cast<uc16>(two_digits);
103 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
104 uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
105 octets[0] = decoded;
106
107 int number_of_continuation_bytes = 0;
108 while ((decoded << ++number_of_continuation_bytes) & 0x80) {
109 if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
110 return false;
111 }
112 if (uri_content->Get(++k) != '%' ||
113 (two_digits = TwoDigitHex(uri_content->Get(k + 1),
114 uri_content->Get(k + 2))) < 0) {
115 return false;
116 }
117 k += 2;
118 uc16 continuation_byte = static_cast<uc16>(two_digits);
119 octets[number_of_continuation_bytes] = continuation_byte;
120 }
121
122 if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
123 return false;
124 }
125 } else {
126 AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
127 }
128 } else {
129 buffer->Add(code);
130 }
131 }
132 return true;
133 }
134
IntoOneAndTwoByte(Handle<String> uri,bool is_uri,List<uint8_t> * one_byte_buffer,List<uc16> * two_byte_buffer)135 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
136 List<uint8_t>* one_byte_buffer,
137 List<uc16>* two_byte_buffer) {
138 DisallowHeapAllocation no_gc;
139 String::FlatContent uri_content = uri->GetFlatContent();
140
141 int uri_length = uri->length();
142 for (int k = 0; k < uri_length; k++) {
143 uc16 code = uri_content.Get(k);
144 if (code == '%') {
145 int two_digits;
146 if (k + 2 >= uri_length ||
147 (two_digits = TwoDigitHex(uri_content.Get(k + 1),
148 uri_content.Get(k + 2))) < 0) {
149 return false;
150 }
151
152 uc16 decoded = static_cast<uc16>(two_digits);
153 if (decoded > unibrow::Utf8::kMaxOneByteChar) {
154 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
155 two_byte_buffer);
156 }
157
158 AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
159 k += 2;
160 } else {
161 if (code > unibrow::Utf8::kMaxOneByteChar) {
162 return IntoTwoByte(k, is_uri, uri_length, &uri_content,
163 two_byte_buffer);
164 }
165 one_byte_buffer->Add(code);
166 }
167 }
168 return true;
169 }
170
171 } // anonymous namespace
172
Decode(Isolate * isolate,Handle<String> uri,bool is_uri)173 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
174 bool is_uri) {
175 uri = String::Flatten(uri);
176 List<uint8_t> one_byte_buffer;
177 List<uc16> two_byte_buffer;
178
179 if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
180 THROW_NEW_ERROR(isolate, NewURIError(), String);
181 }
182
183 if (two_byte_buffer.is_empty()) {
184 return isolate->factory()->NewStringFromOneByte(
185 one_byte_buffer.ToConstVector());
186 }
187
188 Handle<SeqTwoByteString> result;
189 ASSIGN_RETURN_ON_EXCEPTION(
190 isolate, result, isolate->factory()->NewRawTwoByteString(
191 one_byte_buffer.length() + two_byte_buffer.length()),
192 String);
193
194 CopyChars(result->GetChars(), one_byte_buffer.ToConstVector().start(),
195 one_byte_buffer.length());
196 CopyChars(result->GetChars() + one_byte_buffer.length(),
197 two_byte_buffer.ToConstVector().start(), two_byte_buffer.length());
198
199 return result;
200 }
201
202 namespace { // anonymous namespace for EncodeURI helper functions
IsUnescapePredicateInUriComponent(uc16 c)203 bool IsUnescapePredicateInUriComponent(uc16 c) {
204 if (IsAlphaNumeric(c)) {
205 return true;
206 }
207
208 switch (c) {
209 case '!':
210 case '\'':
211 case '(':
212 case ')':
213 case '*':
214 case '-':
215 case '.':
216 case '_':
217 case '~':
218 return true;
219 default:
220 return false;
221 }
222 }
223
IsUriSeparator(uc16 c)224 bool IsUriSeparator(uc16 c) {
225 switch (c) {
226 case '#':
227 case ':':
228 case ';':
229 case '/':
230 case '?':
231 case '$':
232 case '&':
233 case '+':
234 case ',':
235 case '@':
236 case '=':
237 return true;
238 default:
239 return false;
240 }
241 }
242
AddEncodedOctetToBuffer(uint8_t octet,List<uint8_t> * buffer)243 void AddEncodedOctetToBuffer(uint8_t octet, List<uint8_t>* buffer) {
244 buffer->Add('%');
245 buffer->Add(HexCharOfValue(octet >> 4));
246 buffer->Add(HexCharOfValue(octet & 0x0F));
247 }
248
EncodeSingle(uc16 c,List<uint8_t> * buffer)249 void EncodeSingle(uc16 c, List<uint8_t>* buffer) {
250 char s[4] = {};
251 int number_of_bytes;
252 number_of_bytes =
253 unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
254 for (int k = 0; k < number_of_bytes; k++) {
255 AddEncodedOctetToBuffer(s[k], buffer);
256 }
257 }
258
EncodePair(uc16 cc1,uc16 cc2,List<uint8_t> * buffer)259 void EncodePair(uc16 cc1, uc16 cc2, List<uint8_t>* buffer) {
260 char s[4] = {};
261 int number_of_bytes =
262 unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
263 unibrow::Utf16::kNoPreviousCharacter, false);
264 for (int k = 0; k < number_of_bytes; k++) {
265 AddEncodedOctetToBuffer(s[k], buffer);
266 }
267 }
268
269 } // anonymous namespace
270
Encode(Isolate * isolate,Handle<String> uri,bool is_uri)271 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
272 bool is_uri) {
273 uri = String::Flatten(uri);
274 int uri_length = uri->length();
275 List<uint8_t> buffer(uri_length);
276
277 {
278 DisallowHeapAllocation no_gc;
279 String::FlatContent uri_content = uri->GetFlatContent();
280
281 for (int k = 0; k < uri_length; k++) {
282 uc16 cc1 = uri_content.Get(k);
283 if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
284 k++;
285 if (k < uri_length) {
286 uc16 cc2 = uri->Get(k);
287 if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
288 EncodePair(cc1, cc2, &buffer);
289 continue;
290 }
291 }
292 } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
293 if (IsUnescapePredicateInUriComponent(cc1) ||
294 (is_uri && IsUriSeparator(cc1))) {
295 buffer.Add(cc1);
296 } else {
297 EncodeSingle(cc1, &buffer);
298 }
299 continue;
300 }
301
302 AllowHeapAllocation allocate_error_and_return;
303 THROW_NEW_ERROR(isolate, NewURIError(), String);
304 }
305 }
306
307 return isolate->factory()->NewStringFromOneByte(buffer.ToConstVector());
308 }
309
310 namespace { // Anonymous namespace for Escape and Unescape
311
312 template <typename Char>
UnescapeChar(Vector<const Char> vector,int i,int length,int * step)313 int UnescapeChar(Vector<const Char> vector, int i, int length, int* step) {
314 uint16_t character = vector[i];
315 int32_t hi = 0;
316 int32_t lo = 0;
317 if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
318 (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
319 (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
320 *step = 6;
321 return (hi << 8) + lo;
322 } else if (character == '%' && i <= length - 3 &&
323 (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
324 *step = 3;
325 return lo;
326 } else {
327 *step = 1;
328 return character;
329 }
330 }
331
332 template <typename Char>
UnescapeSlow(Isolate * isolate,Handle<String> string,int start_index)333 MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
334 int start_index) {
335 bool one_byte = true;
336 int length = string->length();
337
338 int unescaped_length = 0;
339 {
340 DisallowHeapAllocation no_allocation;
341 Vector<const Char> vector = string->GetCharVector<Char>();
342 for (int i = start_index; i < length; unescaped_length++) {
343 int step;
344 if (UnescapeChar(vector, i, length, &step) >
345 String::kMaxOneByteCharCode) {
346 one_byte = false;
347 }
348 i += step;
349 }
350 }
351
352 DCHECK(start_index < length);
353 Handle<String> first_part =
354 isolate->factory()->NewProperSubString(string, 0, start_index);
355
356 int dest_position = 0;
357 Handle<String> second_part;
358 DCHECK(unescaped_length <= String::kMaxLength);
359 if (one_byte) {
360 Handle<SeqOneByteString> dest = isolate->factory()
361 ->NewRawOneByteString(unescaped_length)
362 .ToHandleChecked();
363 DisallowHeapAllocation no_allocation;
364 Vector<const Char> vector = string->GetCharVector<Char>();
365 for (int i = start_index; i < length; dest_position++) {
366 int step;
367 dest->SeqOneByteStringSet(dest_position,
368 UnescapeChar(vector, i, length, &step));
369 i += step;
370 }
371 second_part = dest;
372 } else {
373 Handle<SeqTwoByteString> dest = isolate->factory()
374 ->NewRawTwoByteString(unescaped_length)
375 .ToHandleChecked();
376 DisallowHeapAllocation no_allocation;
377 Vector<const Char> vector = string->GetCharVector<Char>();
378 for (int i = start_index; i < length; dest_position++) {
379 int step;
380 dest->SeqTwoByteStringSet(dest_position,
381 UnescapeChar(vector, i, length, &step));
382 i += step;
383 }
384 second_part = dest;
385 }
386 return isolate->factory()->NewConsString(first_part, second_part);
387 }
388
IsNotEscaped(uint16_t c)389 bool IsNotEscaped(uint16_t c) {
390 if (IsAlphaNumeric(c)) {
391 return true;
392 }
393 // @*_+-./
394 switch (c) {
395 case '@':
396 case '*':
397 case '_':
398 case '+':
399 case '-':
400 case '.':
401 case '/':
402 return true;
403 default:
404 return false;
405 }
406 }
407
408 template <typename Char>
UnescapePrivate(Isolate * isolate,Handle<String> source)409 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
410 Handle<String> source) {
411 int index;
412 {
413 DisallowHeapAllocation no_allocation;
414 StringSearch<uint8_t, Char> search(isolate, STATIC_CHAR_VECTOR("%"));
415 index = search.Search(source->GetCharVector<Char>(), 0);
416 if (index < 0) return source;
417 }
418 return UnescapeSlow<Char>(isolate, source, index);
419 }
420
421 template <typename Char>
EscapePrivate(Isolate * isolate,Handle<String> string)422 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
423 Handle<String> string) {
424 DCHECK(string->IsFlat());
425 int escaped_length = 0;
426 int length = string->length();
427
428 {
429 DisallowHeapAllocation no_allocation;
430 Vector<const Char> vector = string->GetCharVector<Char>();
431 for (int i = 0; i < length; i++) {
432 uint16_t c = vector[i];
433 if (c >= 256) {
434 escaped_length += 6;
435 } else if (IsNotEscaped(c)) {
436 escaped_length++;
437 } else {
438 escaped_length += 3;
439 }
440
441 // We don't allow strings that are longer than a maximal length.
442 DCHECK(String::kMaxLength < 0x7fffffff - 6); // Cannot overflow.
443 if (escaped_length > String::kMaxLength) break; // Provoke exception.
444 }
445 }
446
447 // No length change implies no change. Return original string if no change.
448 if (escaped_length == length) return string;
449
450 Handle<SeqOneByteString> dest;
451 ASSIGN_RETURN_ON_EXCEPTION(
452 isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
453 String);
454 int dest_position = 0;
455
456 {
457 DisallowHeapAllocation no_allocation;
458 Vector<const Char> vector = string->GetCharVector<Char>();
459 for (int i = 0; i < length; i++) {
460 uint16_t c = vector[i];
461 if (c >= 256) {
462 dest->SeqOneByteStringSet(dest_position, '%');
463 dest->SeqOneByteStringSet(dest_position + 1, 'u');
464 dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c >> 12));
465 dest->SeqOneByteStringSet(dest_position + 3,
466 HexCharOfValue((c >> 8) & 0xf));
467 dest->SeqOneByteStringSet(dest_position + 4,
468 HexCharOfValue((c >> 4) & 0xf));
469 dest->SeqOneByteStringSet(dest_position + 5, HexCharOfValue(c & 0xf));
470 dest_position += 6;
471 } else if (IsNotEscaped(c)) {
472 dest->SeqOneByteStringSet(dest_position, c);
473 dest_position++;
474 } else {
475 dest->SeqOneByteStringSet(dest_position, '%');
476 dest->SeqOneByteStringSet(dest_position + 1, HexCharOfValue(c >> 4));
477 dest->SeqOneByteStringSet(dest_position + 2, HexCharOfValue(c & 0xf));
478 dest_position += 3;
479 }
480 }
481 }
482
483 return dest;
484 }
485
486 } // Anonymous namespace
487
Escape(Isolate * isolate,Handle<String> string)488 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
489 Handle<String> result;
490 string = String::Flatten(string);
491 return string->IsOneByteRepresentationUnderneath()
492 ? EscapePrivate<uint8_t>(isolate, string)
493 : EscapePrivate<uc16>(isolate, string);
494 }
495
Unescape(Isolate * isolate,Handle<String> string)496 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
497 Handle<String> result;
498 string = String::Flatten(string);
499 return string->IsOneByteRepresentationUnderneath()
500 ? UnescapePrivate<uint8_t>(isolate, string)
501 : UnescapePrivate<uc16>(isolate, string);
502 }
503
504 } // namespace internal
505 } // namespace v8
506