// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #include "base/check.h"
6 #include "base/cpu_reduction_experiment.h"
7 #include "url/url_canon.h"
8 #include "url/url_canon_internal.h"
9 #include "url/url_features.h"
10 
11 namespace url {
12 
13 namespace {
14 
// clang-format off
//
// For reference, here's what IE supports:
// Key: 0 (disallowed: failure if present in the input)
//      + (allowed either escaped or unescaped, and unmodified)
//      U (allowed escaped or unescaped but always unescaped if present in
//         escaped form)
//      E (allowed escaped or unescaped but always escaped if present in
//         unescaped form)
//      % (only allowed escaped in the input, will be unmodified).
//      I left blank alpha numeric characters.
//
//    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
//    -----------------------------------------------
// 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
// 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
// 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
// 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
// 4   %
// 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
// 6   E                                               <-- That's  `
// 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
//
// NOTE: I didn't actually test all the control characters. Some may be
// disallowed in the input, but they are all accepted escaped except for 0.
// I also didn't test if characters affecting HTML parsing are allowed
// unescaped, e.g. (") or (#), which would indicate the beginning of the path.
// Surprisingly, space is accepted in the input and always escaped.
//
// TODO(https://crbug.com/1416013): Remove the above historical reference
// information once we are 100% standard compliant to the URL Standard.
//
// The tables below list the canonical version of every 7-bit character we
// allow in the input, indexed by the character's code. An entry of 0 means the
// character is disallowed. The magic kEsc value means the character is
// accepted but must be percent-escaped in the output. Any other entry is the
// canonical (lower-cased) character to emit. We are a little more restrictive
// than IE, but less restrictive than Firefox.
//
constexpr unsigned char kEsc = 0xff;
constexpr unsigned char kHostCharLookup[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
   kEsc,kEsc,kEsc,kEsc,kEsc,  0, kEsc,kEsc,kEsc,kEsc,kEsc, '+',kEsc, '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':',  0 ,kEsc,kEsc,kEsc,  0 ,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0 , ']',  0 , '_',
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
   kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',kEsc,kEsc,kEsc,  0 ,  0 };

// The following table is used when kStandardCompliantHostCharLookup feature is
// enabled. See https://crbug.com/1416013 for details. At present, ' ' (SPACE)
// and '*' (asterisk) are still non-compliant to the URL Standard.
constexpr unsigned char kStandardCompliantHostCharLookup[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
//  ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
    kEsc,'!', '"',  0,  '$',  0,  '&', '\'','(', ')', kEsc, '+', ',', '-', '.',  0,
//   0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';' , 0,  '=',  0,   0,
//   @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0,  'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '[',  0,  ']',  0,  '_',
//   `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
    '`', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//   p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~  DEL
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '{',  0, '}',  '~',  0 };
// clang-format on
90 
91 // RFC1034 maximum FQDN length.
92 constexpr size_t kMaxHostLength = 253;
93 
94 // Generous padding to account for the fact that UTS#46 normalization can cause
95 // a long string to actually shrink and fit within the 253 character RFC1034
96 // FQDN length limit. Note that this can still be too short for pathological
97 // cases: An arbitrary number of characters (e.g. U+00AD SOFT HYPHEN) can be
98 // removed from the input by UTS#46 processing. However, this should be
99 // sufficient for all normally-encountered, non-abusive hostname strings.
100 constexpr size_t kMaxHostBufferLength = kMaxHostLength * 5;
101 
102 constexpr size_t kTempHostBufferLen = 1024;
103 using StackBuffer = RawCanonOutputT<char, kTempHostBufferLen>;
104 using StackBufferW = RawCanonOutputT<char16_t, kTempHostBufferLen>;
105 
106 // Scans a host name and fills in the output flags according to what we find.
107 // |has_non_ascii| will be true if there are any non-7-bit characters, and
108 // |has_escaped| will be true if there is a percent sign.
109 template<typename CHAR, typename UCHAR>
ScanHostname(const CHAR * spec,const Component & host,bool * has_non_ascii,bool * has_escaped)110 void ScanHostname(const CHAR* spec,
111                   const Component& host,
112                   bool* has_non_ascii,
113                   bool* has_escaped) {
114   int end = host.end();
115   *has_non_ascii = false;
116   *has_escaped = false;
117   for (int i = host.begin; i < end; i++) {
118     if (static_cast<UCHAR>(spec[i]) >= 0x80)
119       *has_non_ascii = true;
120     else if (spec[i] == '%')
121       *has_escaped = true;
122   }
123 }
124 
125 // Canonicalizes a host name that is entirely 8-bit characters (even though
126 // the type holding them may be 16 bits. Escaped characters will be unescaped.
127 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
128 //
129 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
130 // the output.
131 //
132 // This function is used in two situations:
133 //
134 //  * When the caller knows there is no non-ASCII or percent escaped
135 //    characters. This is what DoHost does. The result will be a completely
136 //    canonicalized host since we know nothing weird can happen (escaped
137 //    characters could be unescaped to non-7-bit, so they have to be treated
138 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
139 //
140 //  * When the caller has an 8-bit string that may need unescaping.
141 //    DoComplexHost calls us this situation to do unescaping and validation.
142 //    After this, it may do other IDN operations depending on the value of the
143 //    |*has_non_ascii| flag.
144 //
145 // The return value indicates if the output is a potentially valid host name.
146 template <typename INCHAR, typename OUTCHAR>
DoSimpleHost(const INCHAR * host,size_t host_len,CanonOutputT<OUTCHAR> * output,bool * has_non_ascii)147 bool DoSimpleHost(const INCHAR* host,
148                   size_t host_len,
149                   CanonOutputT<OUTCHAR>* output,
150                   bool* has_non_ascii) {
151   *has_non_ascii = false;
152 
153   bool success = true;
154   for (size_t i = 0; i < host_len; ++i) {
155     unsigned int source = host[i];
156     if (source == '%') {
157       // Unescape first, if possible.
158       // Source will be used only if decode operation was successful.
159       if (!DecodeEscaped(host, &i, host_len,
160                          reinterpret_cast<unsigned char*>(&source))) {
161         // Invalid escaped character. There is nothing that can make this
162         // host valid. We append an escaped percent so the URL looks reasonable
163         // and mark as failed.
164         AppendEscapedChar('%', output);
165         success = false;
166         continue;
167       }
168     }
169 
170     if (source < 0x80) {
171       // We have ASCII input, we can use our lookup table.
172       unsigned char replacement;
173       if (url::IsUsingStandardCompliantHostCharacters()) {
174         replacement = kStandardCompliantHostCharLookup[source];
175       } else {
176         replacement = kHostCharLookup[source];
177       }
178       if (!replacement) {
179         // Invalid character, add it as percent-escaped and mark as failed.
180         AppendEscapedChar(source, output);
181         success = false;
182       } else if (replacement == kEsc) {
183         // This character is valid but should be escaped.
184         AppendEscapedChar(source, output);
185       } else {
186         // Common case, the given character is valid in a hostname, the lookup
187         // table tells us the canonical representation of that character (lower
188         // cased).
189         output->push_back(replacement);
190       }
191     } else {
192       // It's a non-ascii char. Just push it to the output.
193       // In case where we have char16 input, and char output it's safe to
194       // cast char16->char only if input string was converted to ASCII.
195       output->push_back(static_cast<OUTCHAR>(source));
196       *has_non_ascii = true;
197     }
198   }
199   return success;
200 }
201 
202 // Canonicalizes a host that requires IDN conversion. Returns true on success
DoIDNHost(const char16_t * src,size_t src_len,CanonOutput * output)203 bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) {
204   int original_output_len = output->length();  // So we can rewind below.
205 
206   // We need to escape URL before doing IDN conversion, since punicode strings
207   // cannot be escaped after they are created.
208   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
209   bool has_non_ascii;
210   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
211   if (url_escaped_host.length() > kMaxHostBufferLength) {
212     AppendInvalidNarrowString(src, 0, src_len, output);
213     return false;
214   }
215 
216   StackBufferW wide_output;
217   if (!IDNToASCII(url_escaped_host.view(), &wide_output)) {
218     // Some error, give up. This will write some reasonable looking
219     // representation of the string to the output.
220     AppendInvalidNarrowString(src, 0, src_len, output);
221     return false;
222   }
223 
224   // Now we check the ASCII output like a normal host. It will also handle
225   // unescaping. Although we unescaped everything before this function call, if
226   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
227   bool success = DoSimpleHost(wide_output.data(), wide_output.length(), output,
228                               &has_non_ascii);
229   if (has_non_ascii) {
230     // ICU generated something that DoSimpleHost didn't think looked like
231     // ASCII. This is quite rare, but ICU might convert some characters to
232     // percent signs which might generate new escape sequences which might in
233     // turn be invalid. An example is U+FE6A "small percent" which ICU will
234     // name prep into an ASCII percent and then we can interpret the following
235     // characters as escaped characters.
236     //
237     // If DoSimpleHost didn't think the output was ASCII, just escape the
238     // thing we gave ICU and give up. DoSimpleHost will have handled a further
239     // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
240     // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
241     // do more (like handle escaped non-ASCII sequences). Handling the escaped
242     // ASCII isn't strictly necessary, but DoSimpleHost handles this case
243     // anyway so we handle it/
244     output->set_length(original_output_len);
245     AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
246                               output);
247     return false;
248   }
249   return success;
250 }
251 
252 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
253 // UTF-16. The has_escaped flag should be set if the input string requires
254 // unescaping.
DoComplexHost(const char * host,size_t host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)255 bool DoComplexHost(const char* host,
256                    size_t host_len,
257                    bool has_non_ascii,
258                    bool has_escaped,
259                    CanonOutput* output) {
260   // Save the current position in the output. We may write stuff and rewind it
261   // below, so we need to know where to rewind to.
262   size_t begin_length = output->length();
263 
264   // Points to the UTF-8 data we want to convert. This will either be the
265   // input or the unescaped version written to |*output| if necessary.
266   const char* utf8_source;
267   size_t utf8_source_len;
268   bool are_all_escaped_valid = true;
269   if (has_escaped) {
270     // Unescape before converting to UTF-16 for IDN. We write this into the
271     // output because it most likely does not require IDNization, and we can
272     // save another huge stack buffer. It will be replaced below if it requires
273     // IDN. This will also update our non-ASCII flag so we know whether the
274     // unescaped input requires IDN.
275     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
276       // Error with some escape sequence. We'll call the current output
277       // complete. DoSimpleHost will have written some "reasonable" output
278       // for the invalid escapes, but the output could be non-ASCII and
279       // needs to go through re-encoding below.
280       are_all_escaped_valid = false;
281     }
282 
283     // Unescaping may have left us with ASCII input, in which case the
284     // unescaped version we wrote to output is complete.
285     if (!has_non_ascii) {
286       return are_all_escaped_valid;
287     }
288 
289     // Save the pointer into the data was just converted (it may be appended to
290     // other data in the output buffer).
291     utf8_source = &output->data()[begin_length];
292     utf8_source_len = output->length() - begin_length;
293   } else {
294     // We don't need to unescape, use input for IDNization later. (We know the
295     // input has non-ASCII, or the simple version would have been called
296     // instead of us.)
297     utf8_source = host;
298     utf8_source_len = host_len;
299   }
300 
301   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
302   // Above, we may have used the output to write the unescaped values to, so
303   // we have to rewind it to where we started after we convert it to UTF-16.
304   StackBufferW utf16;
305   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
306     // In this error case, the input may or may not be the output.
307     StackBuffer utf8;
308     for (size_t i = 0; i < utf8_source_len; i++)
309       utf8.push_back(utf8_source[i]);
310     output->set_length(begin_length);
311     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
312     return false;
313   }
314   output->set_length(begin_length);
315 
316   // This will call DoSimpleHost which will do normal ASCII canonicalization
317   // and also check for IP addresses in the outpt.
318   return DoIDNHost(utf16.data(), utf16.length(), output) &&
319          are_all_escaped_valid;
320 }
321 
322 // UTF-16 convert host to its ASCII version. The set up is already ready for
323 // the backend, so we just pass through. The has_escaped flag should be set if
324 // the input string requires unescaping.
DoComplexHost(const char16_t * host,size_t host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)325 bool DoComplexHost(const char16_t* host,
326                    size_t host_len,
327                    bool has_non_ascii,
328                    bool has_escaped,
329                    CanonOutput* output) {
330   if (has_escaped) {
331     // Yikes, we have escaped characters with wide input. The escaped
332     // characters should be interpreted as UTF-8. To solve this problem,
333     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
334     //
335     // We don't bother to optimize the conversion in the ASCII case (which
336     // *could* just be a copy) and use the UTF-8 path, because it should be
337     // very rare that host names have escaped characters, and it is relatively
338     // fast to do the conversion anyway.
339     StackBuffer utf8;
340     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
341       AppendInvalidNarrowString(host, 0, host_len, output);
342       return false;
343     }
344 
345     // Once we convert to UTF-8, we can use the 8-bit version of the complex
346     // host handling code above.
347     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, has_escaped,
348                          output);
349   }
350 
351   // No unescaping necessary, we can safely pass the input to ICU. This
352   // function will only get called if we either have escaped or non-ascii
353   // input, so it's safe to just use ICU now. Even if the input is ASCII,
354   // this function will do the right thing (just slower than we could).
355   return DoIDNHost(host, host_len, output);
356 }
357 
358 template <typename CHAR, typename UCHAR>
DoHostSubstring(const CHAR * spec,const Component & host,CanonOutput * output)359 bool DoHostSubstring(const CHAR* spec,
360                      const Component& host,
361                      CanonOutput* output) {
362   DCHECK(host.is_valid());
363 
364   bool has_non_ascii, has_escaped;
365   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
366 
367   if (has_non_ascii || has_escaped) {
368     return DoComplexHost(&spec[host.begin], static_cast<size_t>(host.len),
369                          has_non_ascii, has_escaped, output);
370   }
371 
372   const bool success = DoSimpleHost(
373       &spec[host.begin], static_cast<size_t>(host.len), output, &has_non_ascii);
374   DCHECK(!has_non_ascii);
375   return success;
376 }
377 
378 template <typename CHAR, typename UCHAR>
DoHost(const CHAR * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)379 void DoHost(const CHAR* spec,
380             const Component& host,
381             CanonOutput* output,
382             CanonHostInfo* host_info) {
383   if (host.is_empty()) {
384     // Empty hosts don't need anything.
385     host_info->family = CanonHostInfo::NEUTRAL;
386     host_info->out_host = Component();
387     return;
388   }
389 
390   // Keep track of output's initial length, so we can rewind later.
391   const int output_begin = output->length();
392 
393   if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
394     // After all the other canonicalization, check if we ended up with an IP
395     // address. IP addresses are small, so writing into this temporary buffer
396     // should not cause an allocation.
397     RawCanonOutput<64> canon_ip;
398     CanonicalizeIPAddress(output->data(),
399                           MakeRange(output_begin, output->length()),
400                           &canon_ip, host_info);
401 
402     // If we got an IPv4/IPv6 address, copy the canonical form back to the
403     // real buffer. Otherwise, it's a hostname or broken IP, in which case
404     // we just leave it in place.
405     if (host_info->IsIPAddress()) {
406       output->set_length(output_begin);
407       output->Append(canon_ip.view());
408     }
409   } else {
410     // Canonicalization failed. Set BROKEN to notify the caller.
411     host_info->family = CanonHostInfo::BROKEN;
412   }
413 
414   host_info->out_host = MakeRange(output_begin, output->length());
415 }
416 
417 }  // namespace
418 
CanonicalizeHost(const char * spec,const Component & host,CanonOutput * output,Component * out_host)419 bool CanonicalizeHost(const char* spec,
420                       const Component& host,
421                       CanonOutput* output,
422                       Component* out_host) {
423   CanonHostInfo host_info;
424   DoHost<char, unsigned char>(spec, host, output, &host_info);
425   *out_host = host_info.out_host;
426   return (host_info.family != CanonHostInfo::BROKEN);
427 }
428 
CanonicalizeHost(const char16_t * spec,const Component & host,CanonOutput * output,Component * out_host)429 bool CanonicalizeHost(const char16_t* spec,
430                       const Component& host,
431                       CanonOutput* output,
432                       Component* out_host) {
433   CanonHostInfo host_info;
434   DoHost<char16_t, char16_t>(spec, host, output, &host_info);
435   *out_host = host_info.out_host;
436   return (host_info.family != CanonHostInfo::BROKEN);
437 }
438 
CanonicalizeHostVerbose(const char * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)439 void CanonicalizeHostVerbose(const char* spec,
440                              const Component& host,
441                              CanonOutput* output,
442                              CanonHostInfo* host_info) {
443   DoHost<char, unsigned char>(spec, host, output, host_info);
444 }
445 
CanonicalizeHostVerbose(const char16_t * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)446 void CanonicalizeHostVerbose(const char16_t* spec,
447                              const Component& host,
448                              CanonOutput* output,
449                              CanonHostInfo* host_info) {
450   DoHost<char16_t, char16_t>(spec, host, output, host_info);
451 }
452 
CanonicalizeHostSubstring(const char * spec,const Component & host,CanonOutput * output)453 bool CanonicalizeHostSubstring(const char* spec,
454                                const Component& host,
455                                CanonOutput* output) {
456   return DoHostSubstring<char, unsigned char>(spec, host, output);
457 }
458 
CanonicalizeHostSubstring(const char16_t * spec,const Component & host,CanonOutput * output)459 bool CanonicalizeHostSubstring(const char16_t* spec,
460                                const Component& host,
461                                CanonOutput* output) {
462   return DoHostSubstring<char16_t, char16_t>(spec, host, output);
463 }
464 
465 }  // namespace url
466