• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/check.h"
6 #include "base/cpu_reduction_experiment.h"
7 #include "url/url_canon.h"
8 #include "url/url_canon_internal.h"
9 
10 namespace url {
11 
12 namespace {
13 
14 // For reference, here's what IE supports:
15 // Key: 0 (disallowed: failure if present in the input)
16 //      + (allowed either escaped or unescaped, and unmodified)
17 //      U (allowed escaped or unescaped but always unescaped if present in
18 //         escaped form)
19 //      E (allowed escaped or unescaped but always escaped if present in
20 //         unescaped form)
21 //      % (only allowed escaped in the input, will be unmodified).
22 //      I left blank alpha numeric characters.
23 //
24 //    00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
25 //    -----------------------------------------------
26 // 0   0  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
27 // 1   E  E  E  E  E  E  E  E  E  E  E  E  E  E  E  E
28 // 2   E  +  E  E  +  E  +  +  +  +  +  +  +  U  U  0
29 // 3                                 %  %  E  +  E  0  <-- Those are  : ; < = > ?
30 // 4   %
31 // 5                                    U  0  U  U  U  <-- Those are  [ \ ] ^ _
32 // 6   E                                               <-- That's  `
33 // 7                                    E  E  E  U  E  <-- Those are { | } ~ (UNPRINTABLE)
34 //
35 // NOTE: I didn't actually test all the control characters. Some may be
36 // disallowed in the input, but they are all accepted escaped except for 0.
37 // I also didn't test if characters affecting HTML parsing are allowed
38 // unescaped, e.g. (") or (#), which would indicate the beginning of the path.
39 // Surprisingly, space is accepted in the input and always escaped.
40 
41 // This table lists the canonical version of all characters we allow in the
42 // input, with 0 indicating it is disallowed. We use the magic kEscapedHostChar
43 // value to indicate that this character should be escaped. We are a little more
44 // restrictive than IE, but less restrictive than Firefox.
45 //
46 // Note that we disallow the % character. We will allow it when part of an
47 // escape sequence, of course, but this disallows "%25". Even though IE allows
48 // it, allowing it would put us in a funny state. If there was an invalid
49 // escape sequence like "%zz", we'll add "%25zz" to the output and fail.
50 // Allowing percents means we'll succeed a second time, so validity would change
51 // based on how many times you run the canonicalizer. We prefer to always report
// the same validity, so reject this.
// Marker stored in kHostCharLookup for characters that are accepted in the
// input but must be written to the output in percent-escaped form.
constexpr unsigned char kEsc = 0xff;

// Canonical output byte for every 7-bit input character, indexed by the
// character's value. An entry of 0 means the character is disallowed, kEsc
// means it is allowed but must be escaped, and any other value is the byte to
// emit (upper-case letters map to their lower-case form).
constexpr unsigned char kHostCharLookup[0x80] = {
    // 0x00 - 0x0f: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10 - 0x1f: control characters, all invalid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // ' '  !     "     #     $     %  &     '
    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
    // (    )     *     +    ,     -    .    /
    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
    // 0  1    2    3    4    5    6    7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 8  9    :    ;  <     =     >     ?
    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
    // @    A    B    C    D    E    F    G
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // H  I    J    K    L    M    N    O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // P  Q    R    S    T    U    V    W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // X  Y    Z    [    \  ]    ^  _
    'x', 'y', 'z', '[', 0, ']', 0, '_',
    // `    a    b    c    d    e    f    g
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // h  i    j    k    l    m    n    o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // p  q    r    s    t    u    v    w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // x  y    z    {     |     }     ~  (0x7f)
    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0};
70 
// Maximum length of a fully-qualified domain name according to RFC1034.
constexpr size_t kMaxHostLength{253};

// Size limit applied to a host before IDN processing. It is deliberately much
// larger than kMaxHostLength because UTS#46 normalization can shrink a long
// string until it fits within the 253 character RFC1034 FQDN limit. The
// padding can still be too small for pathological inputs: UTS#46 processing
// may remove an arbitrary number of characters (e.g. U+00AD SOFT HYPHEN).
// It should nevertheless be sufficient for all normally-encountered,
// non-abusive hostname strings.
constexpr size_t kMaxHostBufferLength{5 * kMaxHostLength};
81 
82 constexpr size_t kTempHostBufferLen = 1024;
83 using StackBuffer = RawCanonOutputT<char, kTempHostBufferLen>;
84 using StackBufferW = RawCanonOutputT<char16_t, kTempHostBufferLen>;
85 
86 // Scans a host name and fills in the output flags according to what we find.
87 // |has_non_ascii| will be true if there are any non-7-bit characters, and
88 // |has_escaped| will be true if there is a percent sign.
89 template<typename CHAR, typename UCHAR>
ScanHostname(const CHAR * spec,const Component & host,bool * has_non_ascii,bool * has_escaped)90 void ScanHostname(const CHAR* spec,
91                   const Component& host,
92                   bool* has_non_ascii,
93                   bool* has_escaped) {
94   int end = host.end();
95   *has_non_ascii = false;
96   *has_escaped = false;
97   for (int i = host.begin; i < end; i++) {
98     if (static_cast<UCHAR>(spec[i]) >= 0x80)
99       *has_non_ascii = true;
100     else if (spec[i] == '%')
101       *has_escaped = true;
102   }
103 }
104 
105 // Canonicalizes a host name that is entirely 8-bit characters (even though
106 // the type holding them may be 16 bits. Escaped characters will be unescaped.
107 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
108 //
109 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
110 // the output.
111 //
112 // This function is used in two situations:
113 //
114 //  * When the caller knows there is no non-ASCII or percent escaped
115 //    characters. This is what DoHost does. The result will be a completely
116 //    canonicalized host since we know nothing weird can happen (escaped
117 //    characters could be unescaped to non-7-bit, so they have to be treated
118 //    with suspicion at this point). It does not use the |has_non_ascii| flag.
119 //
120 //  * When the caller has an 8-bit string that may need unescaping.
121 //    DoComplexHost calls us this situation to do unescaping and validation.
122 //    After this, it may do other IDN operations depending on the value of the
123 //    |*has_non_ascii| flag.
124 //
125 // The return value indicates if the output is a potentially valid host name.
126 template <typename INCHAR, typename OUTCHAR>
DoSimpleHost(const INCHAR * host,size_t host_len,CanonOutputT<OUTCHAR> * output,bool * has_non_ascii)127 bool DoSimpleHost(const INCHAR* host,
128                   size_t host_len,
129                   CanonOutputT<OUTCHAR>* output,
130                   bool* has_non_ascii) {
131   *has_non_ascii = false;
132 
133   bool success = true;
134   for (size_t i = 0; i < host_len; ++i) {
135     unsigned int source = host[i];
136     if (source == '%') {
137       // Unescape first, if possible.
138       // Source will be used only if decode operation was successful.
139       if (!DecodeEscaped(host, &i, host_len,
140                          reinterpret_cast<unsigned char*>(&source))) {
141         // Invalid escaped character. There is nothing that can make this
142         // host valid. We append an escaped percent so the URL looks reasonable
143         // and mark as failed.
144         AppendEscapedChar('%', output);
145         success = false;
146         continue;
147       }
148     }
149 
150     if (source < 0x80) {
151       // We have ASCII input, we can use our lookup table.
152       unsigned char replacement = kHostCharLookup[source];
153       if (!replacement) {
154         // Invalid character, add it as percent-escaped and mark as failed.
155         AppendEscapedChar(source, output);
156         success = false;
157       } else if (replacement == kEsc) {
158         // This character is valid but should be escaped.
159         AppendEscapedChar(source, output);
160       } else {
161         // Common case, the given character is valid in a hostname, the lookup
162         // table tells us the canonical representation of that character (lower
163         // cased).
164         output->push_back(replacement);
165       }
166     } else {
167       // It's a non-ascii char. Just push it to the output.
168       // In case where we have char16 input, and char output it's safe to
169       // cast char16->char only if input string was converted to ASCII.
170       output->push_back(static_cast<OUTCHAR>(source));
171       *has_non_ascii = true;
172     }
173   }
174   return success;
175 }
176 
177 // Canonicalizes a host that requires IDN conversion. Returns true on success
DoIDNHost(const char16_t * src,size_t src_len,CanonOutput * output)178 bool DoIDNHost(const char16_t* src, size_t src_len, CanonOutput* output) {
179   int original_output_len = output->length();  // So we can rewind below.
180 
181   // We need to escape URL before doing IDN conversion, since punicode strings
182   // cannot be escaped after they are created.
183   RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
184   bool has_non_ascii;
185   DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
186   if (url_escaped_host.length() > kMaxHostBufferLength) {
187     AppendInvalidNarrowString(src, 0, src_len, output);
188     return false;
189   }
190 
191   StackBufferW wide_output;
192   if (!IDNToASCII(url_escaped_host.data(),
193                   url_escaped_host.length(),
194                   &wide_output)) {
195     // Some error, give up. This will write some reasonable looking
196     // representation of the string to the output.
197     AppendInvalidNarrowString(src, 0, src_len, output);
198     return false;
199   }
200 
201   // Now we check the ASCII output like a normal host. It will also handle
202   // unescaping. Although we unescaped everything before this function call, if
203   // somebody does %00 as fullwidth, ICU will convert this to ASCII.
204   bool success = DoSimpleHost(wide_output.data(), wide_output.length(), output,
205                               &has_non_ascii);
206   if (has_non_ascii) {
207     // ICU generated something that DoSimpleHost didn't think looked like
208     // ASCII. This is quite rare, but ICU might convert some characters to
209     // percent signs which might generate new escape sequences which might in
210     // turn be invalid. An example is U+FE6A "small percent" which ICU will
211     // name prep into an ASCII percent and then we can interpret the following
212     // characters as escaped characters.
213     //
214     // If DoSimpleHost didn't think the output was ASCII, just escape the
215     // thing we gave ICU and give up. DoSimpleHost will have handled a further
216     // level of escaping from ICU for simple ASCII cases (i.e. if ICU generates
217     // a new escaped ASCII sequence like "%41" we'll unescape it) but it won't
218     // do more (like handle escaped non-ASCII sequences). Handling the escaped
219     // ASCII isn't strictly necessary, but DoSimpleHost handles this case
220     // anyway so we handle it/
221     output->set_length(original_output_len);
222     AppendInvalidNarrowString(wide_output.data(), 0, wide_output.length(),
223                               output);
224     return false;
225   }
226   return success;
227 }
228 
229 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
230 // UTF-16. The has_escaped flag should be set if the input string requires
231 // unescaping.
DoComplexHost(const char * host,size_t host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)232 bool DoComplexHost(const char* host,
233                    size_t host_len,
234                    bool has_non_ascii,
235                    bool has_escaped,
236                    CanonOutput* output) {
237   // Save the current position in the output. We may write stuff and rewind it
238   // below, so we need to know where to rewind to.
239   size_t begin_length = output->length();
240 
241   // Points to the UTF-8 data we want to convert. This will either be the
242   // input or the unescaped version written to |*output| if necessary.
243   const char* utf8_source;
244   size_t utf8_source_len;
245   bool are_all_escaped_valid = true;
246   if (has_escaped) {
247     // Unescape before converting to UTF-16 for IDN. We write this into the
248     // output because it most likely does not require IDNization, and we can
249     // save another huge stack buffer. It will be replaced below if it requires
250     // IDN. This will also update our non-ASCII flag so we know whether the
251     // unescaped input requires IDN.
252     if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
253       // Error with some escape sequence. We'll call the current output
254       // complete. DoSimpleHost will have written some "reasonable" output
255       // for the invalid escapes, but the output could be non-ASCII and
256       // needs to go through re-encoding below.
257       are_all_escaped_valid = false;
258     }
259 
260     // Unescaping may have left us with ASCII input, in which case the
261     // unescaped version we wrote to output is complete.
262     if (!has_non_ascii) {
263       return are_all_escaped_valid;
264     }
265 
266     // Save the pointer into the data was just converted (it may be appended to
267     // other data in the output buffer).
268     utf8_source = &output->data()[begin_length];
269     utf8_source_len = output->length() - begin_length;
270   } else {
271     // We don't need to unescape, use input for IDNization later. (We know the
272     // input has non-ASCII, or the simple version would have been called
273     // instead of us.)
274     utf8_source = host;
275     utf8_source_len = host_len;
276   }
277 
278   // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
279   // Above, we may have used the output to write the unescaped values to, so
280   // we have to rewind it to where we started after we convert it to UTF-16.
281   StackBufferW utf16;
282   if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
283     // In this error case, the input may or may not be the output.
284     StackBuffer utf8;
285     for (size_t i = 0; i < utf8_source_len; i++)
286       utf8.push_back(utf8_source[i]);
287     output->set_length(begin_length);
288     AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
289     return false;
290   }
291   output->set_length(begin_length);
292 
293   // This will call DoSimpleHost which will do normal ASCII canonicalization
294   // and also check for IP addresses in the outpt.
295   return DoIDNHost(utf16.data(), utf16.length(), output) &&
296          are_all_escaped_valid;
297 }
298 
299 // UTF-16 convert host to its ASCII version. The set up is already ready for
300 // the backend, so we just pass through. The has_escaped flag should be set if
301 // the input string requires unescaping.
DoComplexHost(const char16_t * host,size_t host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)302 bool DoComplexHost(const char16_t* host,
303                    size_t host_len,
304                    bool has_non_ascii,
305                    bool has_escaped,
306                    CanonOutput* output) {
307   if (has_escaped) {
308     // Yikes, we have escaped characters with wide input. The escaped
309     // characters should be interpreted as UTF-8. To solve this problem,
310     // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
311     //
312     // We don't bother to optimize the conversion in the ASCII case (which
313     // *could* just be a copy) and use the UTF-8 path, because it should be
314     // very rare that host names have escaped characters, and it is relatively
315     // fast to do the conversion anyway.
316     StackBuffer utf8;
317     if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
318       AppendInvalidNarrowString(host, 0, host_len, output);
319       return false;
320     }
321 
322     // Once we convert to UTF-8, we can use the 8-bit version of the complex
323     // host handling code above.
324     return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii, has_escaped,
325                          output);
326   }
327 
328   // No unescaping necessary, we can safely pass the input to ICU. This
329   // function will only get called if we either have escaped or non-ascii
330   // input, so it's safe to just use ICU now. Even if the input is ASCII,
331   // this function will do the right thing (just slower than we could).
332   return DoIDNHost(host, host_len, output);
333 }
334 
335 template <typename CHAR, typename UCHAR>
DoHostSubstring(const CHAR * spec,const Component & host,CanonOutput * output)336 bool DoHostSubstring(const CHAR* spec,
337                      const Component& host,
338                      CanonOutput* output) {
339   DCHECK(host.is_valid());
340 
341   bool has_non_ascii, has_escaped;
342   ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
343 
344   if (has_non_ascii || has_escaped) {
345     return DoComplexHost(&spec[host.begin], static_cast<size_t>(host.len),
346                          has_non_ascii, has_escaped, output);
347   }
348 
349   const bool success = DoSimpleHost(
350       &spec[host.begin], static_cast<size_t>(host.len), output, &has_non_ascii);
351   DCHECK(!has_non_ascii);
352   return success;
353 }
354 
355 template <typename CHAR, typename UCHAR>
DoHost(const CHAR * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)356 void DoHost(const CHAR* spec,
357             const Component& host,
358             CanonOutput* output,
359             CanonHostInfo* host_info) {
360   if (host.is_empty()) {
361     // Empty hosts don't need anything.
362     host_info->family = CanonHostInfo::NEUTRAL;
363     host_info->out_host = Component();
364     return;
365   }
366 
367   // Keep track of output's initial length, so we can rewind later.
368   const int output_begin = output->length();
369 
370   if (DoHostSubstring<CHAR, UCHAR>(spec, host, output)) {
371     // After all the other canonicalization, check if we ended up with an IP
372     // address. IP addresses are small, so writing into this temporary buffer
373     // should not cause an allocation.
374     RawCanonOutput<64> canon_ip;
375     CanonicalizeIPAddress(output->data(),
376                           MakeRange(output_begin, output->length()),
377                           &canon_ip, host_info);
378 
379     // If we got an IPv4/IPv6 address, copy the canonical form back to the
380     // real buffer. Otherwise, it's a hostname or broken IP, in which case
381     // we just leave it in place.
382     if (host_info->IsIPAddress()) {
383       output->set_length(output_begin);
384       output->Append(canon_ip.data(), canon_ip.length());
385     }
386   } else {
387     // Canonicalization failed. Set BROKEN to notify the caller.
388     host_info->family = CanonHostInfo::BROKEN;
389   }
390 
391   host_info->out_host = MakeRange(output_begin, output->length());
392 }
393 
394 }  // namespace
395 
CanonicalizeHost(const char * spec,const Component & host,CanonOutput * output,Component * out_host)396 bool CanonicalizeHost(const char* spec,
397                       const Component& host,
398                       CanonOutput* output,
399                       Component* out_host) {
400   CanonHostInfo host_info;
401   DoHost<char, unsigned char>(spec, host, output, &host_info);
402   *out_host = host_info.out_host;
403   return (host_info.family != CanonHostInfo::BROKEN);
404 }
405 
CanonicalizeHost(const char16_t * spec,const Component & host,CanonOutput * output,Component * out_host)406 bool CanonicalizeHost(const char16_t* spec,
407                       const Component& host,
408                       CanonOutput* output,
409                       Component* out_host) {
410   CanonHostInfo host_info;
411   DoHost<char16_t, char16_t>(spec, host, output, &host_info);
412   *out_host = host_info.out_host;
413   return (host_info.family != CanonHostInfo::BROKEN);
414 }
415 
CanonicalizeHostVerbose(const char * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)416 void CanonicalizeHostVerbose(const char* spec,
417                              const Component& host,
418                              CanonOutput* output,
419                              CanonHostInfo* host_info) {
420   DoHost<char, unsigned char>(spec, host, output, host_info);
421 }
422 
CanonicalizeHostVerbose(const char16_t * spec,const Component & host,CanonOutput * output,CanonHostInfo * host_info)423 void CanonicalizeHostVerbose(const char16_t* spec,
424                              const Component& host,
425                              CanonOutput* output,
426                              CanonHostInfo* host_info) {
427   DoHost<char16_t, char16_t>(spec, host, output, host_info);
428 }
429 
CanonicalizeHostSubstring(const char * spec,const Component & host,CanonOutput * output)430 bool CanonicalizeHostSubstring(const char* spec,
431                                const Component& host,
432                                CanonOutput* output) {
433   return DoHostSubstring<char, unsigned char>(spec, host, output);
434 }
435 
CanonicalizeHostSubstring(const char16_t * spec,const Component & host,CanonOutput * output)436 bool CanonicalizeHostSubstring(const char16_t* spec,
437                                const Component& host,
438                                CanonOutput* output) {
439   return DoHostSubstring<char16_t, char16_t>(spec, host, output);
440 }
441 
442 }  // namespace url
443