1 // Copyright 2013 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/logging.h"
6 #include "url/url_canon.h"
7 #include "url/url_canon_internal.h"
8
9 namespace url_canon {
10
11 namespace {
12
13 // For reference, here's what IE supports:
14 // Key: 0 (disallowed: failure if present in the input)
15 // + (allowed either escaped or unescaped, and unmodified)
16 // U (allowed escaped or unescaped but always unescaped if present in
17 // escaped form)
18 // E (allowed escaped or unescaped but always escaped if present in
19 // unescaped form)
20 // % (only allowed escaped in the input, will be unmodified).
21 // Alphanumeric characters are left blank.
22 //
23 // 00 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f
24 // -----------------------------------------------
25 // 0 0 E E E E E E E E E E E E E E E
26 // 1 E E E E E E E E E E E E E E E E
27 // 2 E + E E + E + + + + + + + U U 0
28 // 3 % % E + E 0 <-- Those are : ; < = > ?
29 // 4 %
30 // 5 U 0 U U U <-- Those are [ \ ] ^ _
31 // 6 E <-- That's `
32 // 7 E E E U E <-- Those are { | } ~ (UNPRINTABLE)
33 //
34 // NOTE: I didn't actually test all the control characters. Some may be
35 // disallowed in the input, but they are all accepted escaped except for 0.
36 // I also didn't test if characters affecting HTML parsing are allowed
37 // unescaped, eg. (") or (#), which would indicate the beginning of the path.
38 // Surprisingly, space is accepted in the input and always escaped.
39
// Canonical form of every 7-bit character that may appear in a host name.
// An entry of 0 means the character is disallowed, and the magic kEsc value
// means the character is accepted but must be percent-escaped in the output.
// Any other entry is the canonical (lower-cased) character to emit. We are a
// little more restrictive than IE, but less restrictive than Firefox.
//
// Note that the % character itself is disallowed. It is still accepted as the
// start of an escape sequence, of course, but this disallows "%25". Even
// though IE allows it, allowing it would put us in a funny state: for an
// invalid escape sequence like "%zz" we add "%25zz" to the output and fail.
// If percents were allowed, canonicalizing that output again would succeed,
// so validity would depend on how many times the canonicalizer is run. We
// prefer to always report the same validity, so it is rejected.
const unsigned char kEsc = 0xff;
const unsigned char kHostCharLookup[0x80] = {
    // 0x00-0x0f: all are invalid.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10-0x1f: all are invalid.
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x27: ' '  !  "  #  $  %  &  '
    kEsc, kEsc, kEsc, kEsc, kEsc, 0, kEsc, kEsc,
    // 0x28-0x2f: (  )  *  +  ,  -  .  /
    kEsc, kEsc, kEsc, '+', kEsc, '-', '.', 0,
    // 0x30-0x37: 0  1  2  3  4  5  6  7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38-0x3f: 8  9  :  ;  <  =  >  ?
    '8', '9', ':', 0, kEsc, kEsc, kEsc, 0,
    // 0x40-0x47: @  A  B  C  D  E  F  G
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48-0x4f: H  I  J  K  L  M  N  O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x57: P  Q  R  S  T  U  V  W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58-0x5f: X  Y  Z  [  \  ]  ^  _
    'x', 'y', 'z', '[', 0, ']', 0, '_',
    // 0x60-0x67: `  a  b  c  d  e  f  g
    kEsc, 'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68-0x6f: h  i  j  k  l  m  n  o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x77: p  q  r  s  t  u  v  w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78-0x7f: x  y  z  {  |  }  ~  (DEL)
    'x', 'y', 'z', kEsc, kEsc, kEsc, 0, 0,
};
69
70 const int kTempHostBufferLen = 1024;
71 typedef RawCanonOutputT<char, kTempHostBufferLen> StackBuffer;
72 typedef RawCanonOutputT<base::char16, kTempHostBufferLen> StackBufferW;
73
74 // Scans a host name and fills in the output flags according to what we find.
75 // |has_non_ascii| will be true if there are any non-7-bit characters, and
76 // |has_escaped| will be true if there is a percent sign.
77 template<typename CHAR, typename UCHAR>
ScanHostname(const CHAR * spec,const url_parse::Component & host,bool * has_non_ascii,bool * has_escaped)78 void ScanHostname(const CHAR* spec, const url_parse::Component& host,
79 bool* has_non_ascii, bool* has_escaped) {
80 int end = host.end();
81 *has_non_ascii = false;
82 *has_escaped = false;
83 for (int i = host.begin; i < end; i++) {
84 if (static_cast<UCHAR>(spec[i]) >= 0x80)
85 *has_non_ascii = true;
86 else if (spec[i] == '%')
87 *has_escaped = true;
88 }
89 }
90
91 // Canonicalizes a host name that is entirely 8-bit characters (even though
92 // the type holding them may be 16 bits. Escaped characters will be unescaped.
93 // Non-7-bit characters (for example, UTF-8) will be passed unchanged.
94 //
95 // The |*has_non_ascii| flag will be true if there are non-7-bit characters in
96 // the output.
97 //
98 // This function is used in two situations:
99 //
100 // * When the caller knows there is no non-ASCII or percent escaped
101 // characters. This is what DoHost does. The result will be a completely
102 // canonicalized host since we know nothing weird can happen (escaped
103 // characters could be unescaped to non-7-bit, so they have to be treated
104 // with suspicion at this point). It does not use the |has_non_ascii| flag.
105 //
106 // * When the caller has an 8-bit string that may need unescaping.
107 // DoComplexHost calls us this situation to do unescaping and validation.
108 // After this, it may do other IDN operations depending on the value of the
109 // |*has_non_ascii| flag.
110 //
111 // The return value indicates if the output is a potentially valid host name.
112 template<typename INCHAR, typename OUTCHAR>
DoSimpleHost(const INCHAR * host,int host_len,CanonOutputT<OUTCHAR> * output,bool * has_non_ascii)113 bool DoSimpleHost(const INCHAR* host,
114 int host_len,
115 CanonOutputT<OUTCHAR>* output,
116 bool* has_non_ascii) {
117 *has_non_ascii = false;
118
119 bool success = true;
120 for (int i = 0; i < host_len; ++i) {
121 unsigned int source = host[i];
122 if (source == '%') {
123 // Unescape first, if possible.
124 // Source will be used only if decode operation was successful.
125 if (!DecodeEscaped(host, &i, host_len,
126 reinterpret_cast<unsigned char*>(&source))) {
127 // Invalid escaped character. There is nothing that can make this
128 // host valid. We append an escaped percent so the URL looks reasonable
129 // and mark as failed.
130 AppendEscapedChar('%', output);
131 success = false;
132 continue;
133 }
134 }
135
136 if (source < 0x80) {
137 // We have ASCII input, we can use our lookup table.
138 unsigned char replacement = kHostCharLookup[source];
139 if (!replacement) {
140 // Invalid character, add it as percent-escaped and mark as failed.
141 AppendEscapedChar(source, output);
142 success = false;
143 } else if (replacement == kEsc) {
144 // This character is valid but should be escaped.
145 AppendEscapedChar(source, output);
146 } else {
147 // Common case, the given character is valid in a hostname, the lookup
148 // table tells us the canonical representation of that character (lower
149 // cased).
150 output->push_back(replacement);
151 }
152 } else {
153 // It's a non-ascii char. Just push it to the output.
154 // In case where we have char16 input, and char output it's safe to
155 // cast char16->char only if input string was converted to ASCII.
156 output->push_back(static_cast<OUTCHAR>(source));
157 *has_non_ascii = true;
158 }
159 }
160
161 return success;
162 }
163
164 // Canonicalizes a host that requires IDN conversion. Returns true on success
DoIDNHost(const base::char16 * src,int src_len,CanonOutput * output)165 bool DoIDNHost(const base::char16* src, int src_len, CanonOutput* output) {
166 // We need to escape URL before doing IDN conversion, since punicode strings
167 // cannot be escaped after they are created.
168 RawCanonOutputW<kTempHostBufferLen> url_escaped_host;
169 bool has_non_ascii;
170 DoSimpleHost(src, src_len, &url_escaped_host, &has_non_ascii);
171
172 StackBufferW wide_output;
173 if (!IDNToASCII(url_escaped_host.data(),
174 url_escaped_host.length(),
175 &wide_output)) {
176 // Some error, give up. This will write some reasonable looking
177 // representation of the string to the output.
178 AppendInvalidNarrowString(src, 0, src_len, output);
179 return false;
180 }
181
182 // Now we check the ASCII output like a normal host. It will also handle
183 // unescaping. Although we unescaped everything before this function call, if
184 // somebody does %00 as fullwidth, ICU will convert this to ASCII.
185 bool success = DoSimpleHost(wide_output.data(),
186 wide_output.length(),
187 output, &has_non_ascii);
188 DCHECK(!has_non_ascii);
189 return success;
190 }
191
192 // 8-bit convert host to its ASCII version: this converts the UTF-8 input to
193 // UTF-16. The has_escaped flag should be set if the input string requires
194 // unescaping.
DoComplexHost(const char * host,int host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)195 bool DoComplexHost(const char* host, int host_len,
196 bool has_non_ascii, bool has_escaped, CanonOutput* output) {
197 // Save the current position in the output. We may write stuff and rewind it
198 // below, so we need to know where to rewind to.
199 int begin_length = output->length();
200
201 // Points to the UTF-8 data we want to convert. This will either be the
202 // input or the unescaped version written to |*output| if necessary.
203 const char* utf8_source;
204 int utf8_source_len;
205 if (has_escaped) {
206 // Unescape before converting to UTF-16 for IDN. We write this into the
207 // output because it most likely does not require IDNization, and we can
208 // save another huge stack buffer. It will be replaced below if it requires
209 // IDN. This will also update our non-ASCII flag so we know whether the
210 // unescaped input requires IDN.
211 if (!DoSimpleHost(host, host_len, output, &has_non_ascii)) {
212 // Error with some escape sequence. We'll call the current output
213 // complete. DoSimpleHost will have written some "reasonable" output.
214 return false;
215 }
216
217 // Unescaping may have left us with ASCII input, in which case the
218 // unescaped version we wrote to output is complete.
219 if (!has_non_ascii) {
220 return true;
221 }
222
223 // Save the pointer into the data was just converted (it may be appended to
224 // other data in the output buffer).
225 utf8_source = &output->data()[begin_length];
226 utf8_source_len = output->length() - begin_length;
227 } else {
228 // We don't need to unescape, use input for IDNization later. (We know the
229 // input has non-ASCII, or the simple version would have been called
230 // instead of us.)
231 utf8_source = host;
232 utf8_source_len = host_len;
233 }
234
235 // Non-ASCII input requires IDN, convert to UTF-16 and do the IDN conversion.
236 // Above, we may have used the output to write the unescaped values to, so
237 // we have to rewind it to where we started after we convert it to UTF-16.
238 StackBufferW utf16;
239 if (!ConvertUTF8ToUTF16(utf8_source, utf8_source_len, &utf16)) {
240 // In this error case, the input may or may not be the output.
241 StackBuffer utf8;
242 for (int i = 0; i < utf8_source_len; i++)
243 utf8.push_back(utf8_source[i]);
244 output->set_length(begin_length);
245 AppendInvalidNarrowString(utf8.data(), 0, utf8.length(), output);
246 return false;
247 }
248 output->set_length(begin_length);
249
250 // This will call DoSimpleHost which will do normal ASCII canonicalization
251 // and also check for IP addresses in the outpt.
252 return DoIDNHost(utf16.data(), utf16.length(), output);
253 }
254
255 // UTF-16 convert host to its ASCII version. The set up is already ready for
256 // the backend, so we just pass through. The has_escaped flag should be set if
257 // the input string requires unescaping.
DoComplexHost(const base::char16 * host,int host_len,bool has_non_ascii,bool has_escaped,CanonOutput * output)258 bool DoComplexHost(const base::char16* host, int host_len,
259 bool has_non_ascii, bool has_escaped, CanonOutput* output) {
260 if (has_escaped) {
261 // Yikes, we have escaped characters with wide input. The escaped
262 // characters should be interpreted as UTF-8. To solve this problem,
263 // we convert to UTF-8, unescape, then convert back to UTF-16 for IDN.
264 //
265 // We don't bother to optimize the conversion in the ASCII case (which
266 // *could* just be a copy) and use the UTF-8 path, because it should be
267 // very rare that host names have escaped characters, and it is relatively
268 // fast to do the conversion anyway.
269 StackBuffer utf8;
270 if (!ConvertUTF16ToUTF8(host, host_len, &utf8)) {
271 AppendInvalidNarrowString(host, 0, host_len, output);
272 return false;
273 }
274
275 // Once we convert to UTF-8, we can use the 8-bit version of the complex
276 // host handling code above.
277 return DoComplexHost(utf8.data(), utf8.length(), has_non_ascii,
278 has_escaped, output);
279 }
280
281 // No unescaping necessary, we can safely pass the input to ICU. This
282 // function will only get called if we either have escaped or non-ascii
283 // input, so it's safe to just use ICU now. Even if the input is ASCII,
284 // this function will do the right thing (just slower than we could).
285 return DoIDNHost(host, host_len, output);
286 }
287
288 template<typename CHAR, typename UCHAR>
DoHost(const CHAR * spec,const url_parse::Component & host,CanonOutput * output,CanonHostInfo * host_info)289 void DoHost(const CHAR* spec,
290 const url_parse::Component& host,
291 CanonOutput* output,
292 CanonHostInfo* host_info) {
293 if (host.len <= 0) {
294 // Empty hosts don't need anything.
295 host_info->family = CanonHostInfo::NEUTRAL;
296 host_info->out_host = url_parse::Component();
297 return;
298 }
299
300 bool has_non_ascii, has_escaped;
301 ScanHostname<CHAR, UCHAR>(spec, host, &has_non_ascii, &has_escaped);
302
303 // Keep track of output's initial length, so we can rewind later.
304 const int output_begin = output->length();
305
306 bool success;
307 if (!has_non_ascii && !has_escaped) {
308 success = DoSimpleHost(&spec[host.begin], host.len,
309 output, &has_non_ascii);
310 DCHECK(!has_non_ascii);
311 } else {
312 success = DoComplexHost(&spec[host.begin], host.len,
313 has_non_ascii, has_escaped, output);
314 }
315
316 if (!success) {
317 // Canonicalization failed. Set BROKEN to notify the caller.
318 host_info->family = CanonHostInfo::BROKEN;
319 } else {
320 // After all the other canonicalization, check if we ended up with an IP
321 // address. IP addresses are small, so writing into this temporary buffer
322 // should not cause an allocation.
323 RawCanonOutput<64> canon_ip;
324 CanonicalizeIPAddress(output->data(),
325 url_parse::MakeRange(output_begin, output->length()),
326 &canon_ip, host_info);
327
328 // If we got an IPv4/IPv6 address, copy the canonical form back to the
329 // real buffer. Otherwise, it's a hostname or broken IP, in which case
330 // we just leave it in place.
331 if (host_info->IsIPAddress()) {
332 output->set_length(output_begin);
333 output->Append(canon_ip.data(), canon_ip.length());
334 }
335 }
336
337 host_info->out_host = url_parse::MakeRange(output_begin, output->length());
338 }
339
340 } // namespace
341
CanonicalizeHost(const char * spec,const url_parse::Component & host,CanonOutput * output,url_parse::Component * out_host)342 bool CanonicalizeHost(const char* spec,
343 const url_parse::Component& host,
344 CanonOutput* output,
345 url_parse::Component* out_host) {
346 CanonHostInfo host_info;
347 DoHost<char, unsigned char>(spec, host, output, &host_info);
348 *out_host = host_info.out_host;
349 return (host_info.family != CanonHostInfo::BROKEN);
350 }
351
CanonicalizeHost(const base::char16 * spec,const url_parse::Component & host,CanonOutput * output,url_parse::Component * out_host)352 bool CanonicalizeHost(const base::char16* spec,
353 const url_parse::Component& host,
354 CanonOutput* output,
355 url_parse::Component* out_host) {
356 CanonHostInfo host_info;
357 DoHost<base::char16, base::char16>(spec, host, output, &host_info);
358 *out_host = host_info.out_host;
359 return (host_info.family != CanonHostInfo::BROKEN);
360 }
361
CanonicalizeHostVerbose(const char * spec,const url_parse::Component & host,CanonOutput * output,CanonHostInfo * host_info)362 void CanonicalizeHostVerbose(const char* spec,
363 const url_parse::Component& host,
364 CanonOutput* output,
365 CanonHostInfo *host_info) {
366 DoHost<char, unsigned char>(spec, host, output, host_info);
367 }
368
CanonicalizeHostVerbose(const base::char16 * spec,const url_parse::Component & host,CanonOutput * output,CanonHostInfo * host_info)369 void CanonicalizeHostVerbose(const base::char16* spec,
370 const url_parse::Component& host,
371 CanonOutput* output,
372 CanonHostInfo *host_info) {
373 DoHost<base::char16, base::char16>(spec, host, output, host_info);
374 }
375
376 } // namespace url_canon
377