• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2016 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/strings/uri.h"
6 
7 #include <vector>
8 
9 #include "src/execution/isolate-inl.h"
10 #include "src/strings/char-predicates-inl.h"
11 #include "src/strings/string-search.h"
12 #include "src/strings/unicode-inl.h"
13 
14 namespace v8 {
15 namespace internal {
16 
17 namespace {  // anonymous namespace for DecodeURI helper functions
IsReservedPredicate(base::uc16 c)18 bool IsReservedPredicate(base::uc16 c) {
19   switch (c) {
20     case '#':
21     case '$':
22     case '&':
23     case '+':
24     case ',':
25     case '/':
26     case ':':
27     case ';':
28     case '=':
29     case '?':
30     case '@':
31       return true;
32     default:
33       return false;
34   }
35 }
36 
// Returns true iff |octets| holds exactly the three bytes 0xEF 0xBF 0xBD,
// i.e. the UTF-8 encoding of the replacement character U+FFFD.
// |octets| is only dereferenced when |length| is 3.
bool IsReplacementCharacter(const uint8_t* octets, int length) {
  if (length != 3) return false;
  return octets[0] == 0xEF && octets[1] == 0xBF && octets[2] == 0xBD;
}
46 
DecodeOctets(const uint8_t * octets,int length,std::vector<base::uc16> * buffer)47 bool DecodeOctets(const uint8_t* octets, int length,
48                   std::vector<base::uc16>* buffer) {
49   size_t cursor = 0;
50   base::uc32 value = unibrow::Utf8::ValueOf(octets, length, &cursor);
51   if (value == unibrow::Utf8::kBadChar &&
52       !IsReplacementCharacter(octets, length)) {
53     return false;
54   }
55 
56   if (value <=
57       static_cast<base::uc32>(unibrow::Utf16::kMaxNonSurrogateCharCode)) {
58     buffer->push_back(value);
59   } else {
60     buffer->push_back(unibrow::Utf16::LeadSurrogate(value));
61     buffer->push_back(unibrow::Utf16::TrailSurrogate(value));
62   }
63   return true;
64 }
65 
TwoDigitHex(base::uc16 character1,base::uc16 character2)66 int TwoDigitHex(base::uc16 character1, base::uc16 character2) {
67   if (character1 > 'f') return -1;
68   int high = base::HexValue(character1);
69   if (high == -1) return -1;
70   if (character2 > 'f') return -1;
71   int low = base::HexValue(character2);
72   if (low == -1) return -1;
73   return (high << 4) + low;
74 }
75 
76 template <typename T>
AddToBuffer(base::uc16 decoded,String::FlatContent * uri_content,int index,bool is_uri,std::vector<T> * buffer)77 void AddToBuffer(base::uc16 decoded, String::FlatContent* uri_content,
78                  int index, bool is_uri, std::vector<T>* buffer) {
79   if (is_uri && IsReservedPredicate(decoded)) {
80     buffer->push_back('%');
81     base::uc16 first = uri_content->Get(index + 1);
82     base::uc16 second = uri_content->Get(index + 2);
83     DCHECK_GT(std::numeric_limits<T>::max(), first);
84     DCHECK_GT(std::numeric_limits<T>::max(), second);
85 
86     buffer->push_back(first);
87     buffer->push_back(second);
88   } else {
89     buffer->push_back(decoded);
90   }
91 }
92 
// Decodes |uri_content| from position |index| up to |uri_length|, appending
// UTF-16 code units to |buffer|.
//
// Characters other than '%' are copied unchanged. A "%XX" escape whose value
// is at most unibrow::Utf8::kMaxOneByteChar is handed to AddToBuffer; a larger
// value starts a multi-octet sequence whose remaining octets must follow as
// further "%XX" escapes and which is then decoded by DecodeOctets.
//
// Returns false if an escape is truncated, contains non-hex digits, a required
// continuation escape is missing, or DecodeOctets rejects the sequence;
// returns true once the whole range has been processed.
IntoTwoByte(int index,bool is_uri,int uri_length,String::FlatContent * uri_content,std::vector<base::uc16> * buffer)93 bool IntoTwoByte(int index, bool is_uri, int uri_length,
94                  String::FlatContent* uri_content,
95                  std::vector<base::uc16>* buffer) {
96   for (int k = index; k < uri_length; k++) {
97     base::uc16 code = uri_content->Get(k);
98     if (code == '%') {
99       int two_digits;
      // Both hex digits must lie inside the string and parse successfully.
100       if (k + 2 >= uri_length ||
101           (two_digits = TwoDigitHex(uri_content->Get(k + 1),
102                                     uri_content->Get(k + 2))) < 0) {
103         return false;
104       }
      // k now rests on the second hex digit; the for-loop's k++ steps past it.
105       k += 2;
106       base::uc16 decoded = static_cast<base::uc16>(two_digits);
107       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
108         uint8_t octets[unibrow::Utf8::kMaxEncodedSize];
109         octets[0] = decoded;
110 
111         int number_of_continuation_bytes = 0;
        // Each pass shifts the next-lower bit of the first octet into bit 7;
        // every set bit found this way demands one more "%XX" escape.
112         while ((decoded << ++number_of_continuation_bytes) & 0x80) {
          // Reject a fourth continuation octet, and make sure the next
          // '%' plus two hex digits (k + 1 .. k + 3) are inside the string.
113           if (number_of_continuation_bytes > 3 || k + 3 >= uri_length) {
114             return false;
115           }
116           if (uri_content->Get(++k) != '%' ||
117               (two_digits = TwoDigitHex(uri_content->Get(k + 1),
118                                         uri_content->Get(k + 2))) < 0) {
119             return false;
120           }
121           k += 2;
122           base::uc16 continuation_byte = static_cast<base::uc16>(two_digits);
123           octets[number_of_continuation_bytes] = continuation_byte;
124         }
125 
        // When the loop exits, the counter equals the number of octets stored
        // (first octet plus the continuation octets read above).
126         if (!DecodeOctets(octets, number_of_continuation_bytes, buffer)) {
127           return false;
128         }
129       } else {
        // k - 2 is the position of the '%' that introduced this escape.
130         AddToBuffer(decoded, uri_content, k - 2, is_uri, buffer);
131       }
132     } else {
133       buffer->push_back(code);
134     }
135   }
136   return true;
137 }
138 
IntoOneAndTwoByte(Handle<String> uri,bool is_uri,std::vector<uint8_t> * one_byte_buffer,std::vector<base::uc16> * two_byte_buffer)139 bool IntoOneAndTwoByte(Handle<String> uri, bool is_uri,
140                        std::vector<uint8_t>* one_byte_buffer,
141                        std::vector<base::uc16>* two_byte_buffer) {
142   DisallowGarbageCollection no_gc;
143   String::FlatContent uri_content = uri->GetFlatContent(no_gc);
144 
145   int uri_length = uri->length();
146   for (int k = 0; k < uri_length; k++) {
147     base::uc16 code = uri_content.Get(k);
148     if (code == '%') {
149       int two_digits;
150       if (k + 2 >= uri_length ||
151           (two_digits = TwoDigitHex(uri_content.Get(k + 1),
152                                     uri_content.Get(k + 2))) < 0) {
153         return false;
154       }
155 
156       base::uc16 decoded = static_cast<base::uc16>(two_digits);
157       if (decoded > unibrow::Utf8::kMaxOneByteChar) {
158         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
159                            two_byte_buffer);
160       }
161 
162       AddToBuffer(decoded, &uri_content, k, is_uri, one_byte_buffer);
163       k += 2;
164     } else {
165       if (code > unibrow::Utf8::kMaxOneByteChar) {
166         return IntoTwoByte(k, is_uri, uri_length, &uri_content,
167                            two_byte_buffer);
168       }
169       one_byte_buffer->push_back(code);
170     }
171   }
172   return true;
173 }
174 
175 }  // anonymous namespace
176 
Decode(Isolate * isolate,Handle<String> uri,bool is_uri)177 MaybeHandle<String> Uri::Decode(Isolate* isolate, Handle<String> uri,
178                                 bool is_uri) {
179   uri = String::Flatten(isolate, uri);
180   std::vector<uint8_t> one_byte_buffer;
181   std::vector<base::uc16> two_byte_buffer;
182 
183   if (!IntoOneAndTwoByte(uri, is_uri, &one_byte_buffer, &two_byte_buffer)) {
184     THROW_NEW_ERROR(isolate, NewURIError(), String);
185   }
186 
187   if (two_byte_buffer.empty()) {
188     return isolate->factory()->NewStringFromOneByte(base::Vector<const uint8_t>(
189         one_byte_buffer.data(), static_cast<int>(one_byte_buffer.size())));
190   }
191 
192   Handle<SeqTwoByteString> result;
193   int result_length =
194       static_cast<int>(one_byte_buffer.size() + two_byte_buffer.size());
195   ASSIGN_RETURN_ON_EXCEPTION(
196       isolate, result, isolate->factory()->NewRawTwoByteString(result_length),
197       String);
198 
199   DisallowGarbageCollection no_gc;
200   base::uc16* chars = result->GetChars(no_gc);
201   if (!one_byte_buffer.empty()) {
202     CopyChars(chars, one_byte_buffer.data(), one_byte_buffer.size());
203     chars += one_byte_buffer.size();
204   }
205   if (!two_byte_buffer.empty()) {
206     CopyChars(chars, two_byte_buffer.data(), two_byte_buffer.size());
207   }
208 
209   return result;
210 }
211 
212 namespace {  // anonymous namespace for EncodeURI helper functions
IsUnescapePredicateInUriComponent(base::uc16 c)213 bool IsUnescapePredicateInUriComponent(base::uc16 c) {
214   if (IsAlphaNumeric(c)) {
215     return true;
216   }
217 
218   switch (c) {
219     case '!':
220     case '\'':
221     case '(':
222     case ')':
223     case '*':
224     case '-':
225     case '.':
226     case '_':
227     case '~':
228       return true;
229     default:
230       return false;
231   }
232 }
233 
IsUriSeparator(base::uc16 c)234 bool IsUriSeparator(base::uc16 c) {
235   switch (c) {
236     case '#':
237     case ':':
238     case ';':
239     case '/':
240     case '?':
241     case '$':
242     case '&':
243     case '+':
244     case ',':
245     case '@':
246     case '=':
247       return true;
248     default:
249       return false;
250   }
251 }
252 
AddEncodedOctetToBuffer(uint8_t octet,std::vector<uint8_t> * buffer)253 void AddEncodedOctetToBuffer(uint8_t octet, std::vector<uint8_t>* buffer) {
254   buffer->push_back('%');
255   buffer->push_back(base::HexCharOfValue(octet >> 4));
256   buffer->push_back(base::HexCharOfValue(octet & 0x0F));
257 }
258 
EncodeSingle(base::uc16 c,std::vector<uint8_t> * buffer)259 void EncodeSingle(base::uc16 c, std::vector<uint8_t>* buffer) {
260   char s[4] = {};
261   int number_of_bytes;
262   number_of_bytes =
263       unibrow::Utf8::Encode(s, c, unibrow::Utf16::kNoPreviousCharacter, false);
264   for (int k = 0; k < number_of_bytes; k++) {
265     AddEncodedOctetToBuffer(s[k], buffer);
266   }
267 }
268 
EncodePair(base::uc16 cc1,base::uc16 cc2,std::vector<uint8_t> * buffer)269 void EncodePair(base::uc16 cc1, base::uc16 cc2, std::vector<uint8_t>* buffer) {
270   char s[4] = {};
271   int number_of_bytes =
272       unibrow::Utf8::Encode(s, unibrow::Utf16::CombineSurrogatePair(cc1, cc2),
273                             unibrow::Utf16::kNoPreviousCharacter, false);
274   for (int k = 0; k < number_of_bytes; k++) {
275     AddEncodedOctetToBuffer(s[k], buffer);
276   }
277 }
278 
279 }  // anonymous namespace
280 
Encode(Isolate * isolate,Handle<String> uri,bool is_uri)281 MaybeHandle<String> Uri::Encode(Isolate* isolate, Handle<String> uri,
282                                 bool is_uri) {
283   uri = String::Flatten(isolate, uri);
284   int uri_length = uri->length();
285   std::vector<uint8_t> buffer;
286   buffer.reserve(uri_length);
287 
288   bool throw_error = false;
289   {
290     DisallowGarbageCollection no_gc;
291     String::FlatContent uri_content = uri->GetFlatContent(no_gc);
292 
293     for (int k = 0; k < uri_length; k++) {
294       base::uc16 cc1 = uri_content.Get(k);
295       if (unibrow::Utf16::IsLeadSurrogate(cc1)) {
296         k++;
297         if (k < uri_length) {
298           base::uc16 cc2 = uri->Get(k);
299           if (unibrow::Utf16::IsTrailSurrogate(cc2)) {
300             EncodePair(cc1, cc2, &buffer);
301             continue;
302           }
303         }
304       } else if (!unibrow::Utf16::IsTrailSurrogate(cc1)) {
305         if (IsUnescapePredicateInUriComponent(cc1) ||
306             (is_uri && IsUriSeparator(cc1))) {
307           buffer.push_back(cc1);
308         } else {
309           EncodeSingle(cc1, &buffer);
310         }
311         continue;
312       }
313 
314       // String::FlatContent DCHECKs its contents did not change during its
315       // lifetime. Throwing the error inside the loop may cause GC and move the
316       // string contents.
317       throw_error = true;
318       break;
319     }
320   }
321 
322   if (throw_error) THROW_NEW_ERROR(isolate, NewURIError(), String);
323   return isolate->factory()->NewStringFromOneByte(base::VectorOf(buffer));
324 }
325 
326 namespace {  // Anonymous namespace for Escape and Unescape
327 
328 template <typename Char>
UnescapeChar(base::Vector<const Char> vector,int i,int length,int * step)329 int UnescapeChar(base::Vector<const Char> vector, int i, int length,
330                  int* step) {
331   uint16_t character = vector[i];
332   int32_t hi = 0;
333   int32_t lo = 0;
334   if (character == '%' && i <= length - 6 && vector[i + 1] == 'u' &&
335       (hi = TwoDigitHex(vector[i + 2], vector[i + 3])) > -1 &&
336       (lo = TwoDigitHex(vector[i + 4], vector[i + 5])) > -1) {
337     *step = 6;
338     return (hi << 8) + lo;
339   } else if (character == '%' && i <= length - 3 &&
340              (lo = TwoDigitHex(vector[i + 1], vector[i + 2])) > -1) {
341     *step = 3;
342     return lo;
343   } else {
344     *step = 1;
345     return character;
346   }
347 }
348 
349 template <typename Char>
UnescapeSlow(Isolate * isolate,Handle<String> string,int start_index)350 MaybeHandle<String> UnescapeSlow(Isolate* isolate, Handle<String> string,
351                                  int start_index) {
352   bool one_byte = true;
353   int length = string->length();
354 
355   int unescaped_length = 0;
356   {
357     DisallowGarbageCollection no_gc;
358     base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
359     for (int i = start_index; i < length; unescaped_length++) {
360       int step;
361       if (UnescapeChar(vector, i, length, &step) >
362           String::kMaxOneByteCharCode) {
363         one_byte = false;
364       }
365       i += step;
366     }
367   }
368 
369   DCHECK(start_index < length);
370   Handle<String> first_part =
371       isolate->factory()->NewProperSubString(string, 0, start_index);
372 
373   int dest_position = 0;
374   Handle<String> second_part;
375   DCHECK_LE(unescaped_length, String::kMaxLength);
376   if (one_byte) {
377     Handle<SeqOneByteString> dest = isolate->factory()
378                                         ->NewRawOneByteString(unescaped_length)
379                                         .ToHandleChecked();
380     DisallowGarbageCollection no_gc;
381     base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
382     for (int i = start_index; i < length; dest_position++) {
383       int step;
384       dest->SeqOneByteStringSet(dest_position,
385                                 UnescapeChar(vector, i, length, &step));
386       i += step;
387     }
388     second_part = dest;
389   } else {
390     Handle<SeqTwoByteString> dest = isolate->factory()
391                                         ->NewRawTwoByteString(unescaped_length)
392                                         .ToHandleChecked();
393     DisallowGarbageCollection no_gc;
394     base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
395     for (int i = start_index; i < length; dest_position++) {
396       int step;
397       dest->SeqTwoByteStringSet(dest_position,
398                                 UnescapeChar(vector, i, length, &step));
399       i += step;
400     }
401     second_part = dest;
402   }
403   return isolate->factory()->NewConsString(first_part, second_part);
404 }
405 
IsNotEscaped(uint16_t c)406 bool IsNotEscaped(uint16_t c) {
407   if (IsAlphaNumeric(c)) {
408     return true;
409   }
410   //  @*_+-./
411   switch (c) {
412     case '@':
413     case '*':
414     case '_':
415     case '+':
416     case '-':
417     case '.':
418     case '/':
419       return true;
420     default:
421       return false;
422   }
423 }
424 
425 template <typename Char>
UnescapePrivate(Isolate * isolate,Handle<String> source)426 static MaybeHandle<String> UnescapePrivate(Isolate* isolate,
427                                            Handle<String> source) {
428   int index;
429   {
430     DisallowGarbageCollection no_gc;
431     StringSearch<uint8_t, Char> search(isolate, base::StaticOneByteVector("%"));
432     index = search.Search(source->GetCharVector<Char>(no_gc), 0);
433     if (index < 0) return source;
434   }
435   return UnescapeSlow<Char>(isolate, source, index);
436 }
437 
438 template <typename Char>
EscapePrivate(Isolate * isolate,Handle<String> string)439 static MaybeHandle<String> EscapePrivate(Isolate* isolate,
440                                          Handle<String> string) {
441   DCHECK(string->IsFlat());
442   int escaped_length = 0;
443   int length = string->length();
444 
445   {
446     DisallowGarbageCollection no_gc;
447     base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
448     for (int i = 0; i < length; i++) {
449       uint16_t c = vector[i];
450       if (c >= 256) {
451         escaped_length += 6;
452       } else if (IsNotEscaped(c)) {
453         escaped_length++;
454       } else {
455         escaped_length += 3;
456       }
457 
458       // We don't allow strings that are longer than a maximal length.
459       DCHECK_LT(String::kMaxLength, 0x7FFFFFFF - 6);   // Cannot overflow.
460       if (escaped_length > String::kMaxLength) break;  // Provoke exception.
461     }
462   }
463 
464   // No length change implies no change.  Return original string if no change.
465   if (escaped_length == length) return string;
466 
467   Handle<SeqOneByteString> dest;
468   ASSIGN_RETURN_ON_EXCEPTION(
469       isolate, dest, isolate->factory()->NewRawOneByteString(escaped_length),
470       String);
471   int dest_position = 0;
472 
473   {
474     DisallowGarbageCollection no_gc;
475     base::Vector<const Char> vector = string->GetCharVector<Char>(no_gc);
476     for (int i = 0; i < length; i++) {
477       uint16_t c = vector[i];
478       if (c >= 256) {
479         dest->SeqOneByteStringSet(dest_position, '%');
480         dest->SeqOneByteStringSet(dest_position + 1, 'u');
481         dest->SeqOneByteStringSet(dest_position + 2,
482                                   base::HexCharOfValue(c >> 12));
483         dest->SeqOneByteStringSet(dest_position + 3,
484                                   base::HexCharOfValue((c >> 8) & 0xF));
485         dest->SeqOneByteStringSet(dest_position + 4,
486                                   base::HexCharOfValue((c >> 4) & 0xF));
487         dest->SeqOneByteStringSet(dest_position + 5,
488                                   base::HexCharOfValue(c & 0xF));
489         dest_position += 6;
490       } else if (IsNotEscaped(c)) {
491         dest->SeqOneByteStringSet(dest_position, c);
492         dest_position++;
493       } else {
494         dest->SeqOneByteStringSet(dest_position, '%');
495         dest->SeqOneByteStringSet(dest_position + 1,
496                                   base::HexCharOfValue(c >> 4));
497         dest->SeqOneByteStringSet(dest_position + 2,
498                                   base::HexCharOfValue(c & 0xF));
499         dest_position += 3;
500       }
501     }
502   }
503 
504   return dest;
505 }
506 
507 }  // anonymous namespace
508 
Escape(Isolate * isolate,Handle<String> string)509 MaybeHandle<String> Uri::Escape(Isolate* isolate, Handle<String> string) {
510   Handle<String> result;
511   string = String::Flatten(isolate, string);
512   return String::IsOneByteRepresentationUnderneath(*string)
513              ? EscapePrivate<uint8_t>(isolate, string)
514              : EscapePrivate<base::uc16>(isolate, string);
515 }
516 
Unescape(Isolate * isolate,Handle<String> string)517 MaybeHandle<String> Uri::Unescape(Isolate* isolate, Handle<String> string) {
518   Handle<String> result;
519   string = String::Flatten(isolate, string);
520   return String::IsOneByteRepresentationUnderneath(*string)
521              ? UnescapePrivate<uint8_t>(isolate, string)
522              : UnescapePrivate<base::uc16>(isolate, string);
523 }
524 
525 }  // namespace internal
526 }  // namespace v8
527