1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/base/net_util.h"
6
7 #include <algorithm>
8 #include <map>
9 #include <unicode/ucnv.h>
10 #include <unicode/uidna.h>
11 #include <unicode/ulocdata.h>
12 #include <unicode/uniset.h>
13 #include <unicode/uscript.h>
14 #include <unicode/uset.h>
15
16 #include "build/build_config.h"
17
18 #if defined(OS_WIN)
19 #include <windows.h>
20 #include <winsock2.h>
21 #include <ws2tcpip.h>
22 #include <wspiapi.h> // Needed for Win2k compat.
23 #elif defined(OS_POSIX)
24 #include <netdb.h>
25 #include <sys/socket.h>
26 #include <fcntl.h>
27 #endif
28
29 #include "base/base64.h"
30 #include "base/basictypes.h"
31 #include "base/file_path.h"
32 #include "base/file_util.h"
33 #include "base/i18n/file_util_icu.h"
34 #include "base/i18n/icu_string_conversions.h"
35 #include "base/i18n/time_formatting.h"
36 #include "base/json/string_escape.h"
37 #include "base/lock.h"
38 #include "base/logging.h"
39 #include "base/message_loop.h"
40 #include "base/path_service.h"
41 #include "base/singleton.h"
42 #include "base/stl_util-inl.h"
43 #include "base/string_piece.h"
44 #include "base/string_tokenizer.h"
45 #include "base/string_util.h"
46 #include "base/sys_string_conversions.h"
47 #include "base/time.h"
48 #include "base/utf_offset_string_conversions.h"
49 #include "grit/net_resources.h"
50 #include "googleurl/src/gurl.h"
51 #include "googleurl/src/url_canon.h"
52 #include "googleurl/src/url_parse.h"
53 #include "net/base/escape.h"
54 #include "net/base/net_module.h"
55 #if defined(OS_WIN)
56 #include "net/base/winsock_init.h"
57 #endif
58 #include "unicode/datefmt.h"
59
60
61 using base::Time;
62
63 namespace {
64
65 // Prefix prepended to a file path to turn it into a file: URL; used by
// FilePathToFileURL().
66 static const FilePath::CharType kFileURLPrefix[] =
67 FILE_PATH_LITERAL("file:///");
68
69 // The general list of blocked ports. Will be blocked unless a specific
70 // protocol overrides it. (Ex: ftp can use ports 20 and 21)
71 static const int kRestrictedPorts[] = {
72 1, // tcpmux
73 7, // echo
74 9, // discard
75 11, // systat
76 13, // daytime
77 15, // netstat
78 17, // qotd
79 19, // chargen
80 20, // ftp data
81 21, // ftp access
82 22, // ssh
83 23, // telnet
84 25, // smtp
85 37, // time
86 42, // name
87 43, // nicname
88 53, // domain
89 77, // priv-rjs
90 79, // finger
91 87, // ttylink
92 95, // supdup
93 101, // hostname
94 102, // iso-tsap
95 103, // gppitnp
96 104, // acr-nema
97 109, // pop2
98 110, // pop3
99 111, // sunrpc
100 113, // auth
101 115, // sftp
102 117, // uucp-path
103 119, // nntp
104 123, // NTP
105 135, // loc-srv /epmap
106 139, // netbios
107 143, // imap2
108 179, // BGP
109 389, // ldap
110 465, // smtp+ssl
111 512, // print / exec
112 513, // login
113 514, // shell
114 515, // printer
115 526, // tempo
116 530, // courier
117 531, // chat
118 532, // netnews
119 540, // uucp
120 556, // remotefs
121 563, // nntp+ssl
122 587, // smtp submission
123 601, // syslog-conn
124 636, // ldap+ssl
125 993, // imap+ssl
126 995, // pop3+ssl
127 2049, // nfs
128 3659, // apple-sasl / PasswordServer
129 4045, // lockd
130 6000, // X11
131 };
132
133 // FTP overrides the following restricted ports.
134 static const int kAllowedFtpPorts[] = {
135 21, // ftp control (kRestrictedPorts lists 20 as the data port)
136 22, // ssh
137 };
138
139 template<typename STR>
GetSpecificHeaderT(const STR & headers,const STR & name)140 STR GetSpecificHeaderT(const STR& headers, const STR& name) {
141 // We want to grab the Value from the "Key: Value" pairs in the headers,
142 // which should look like this (no leading spaces, \n-separated) (we format
143 // them this way in url_request_inet.cc):
144 // HTTP/1.1 200 OK\n
145 // ETag: "6d0b8-947-24f35ec0"\n
146 // Content-Length: 2375\n
147 // Content-Type: text/html; charset=UTF-8\n
148 // Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
149 if (headers.empty())
150 return STR();
151
152 STR match;
153 match.push_back('\n');
154 match.append(name);
155 match.push_back(':');
156
157 typename STR::const_iterator begin =
158 search(headers.begin(), headers.end(), match.begin(), match.end(),
159 CaseInsensitiveCompareASCII<typename STR::value_type>());
160
161 if (begin == headers.end())
162 return STR();
163
164 begin += match.length();
165
166 typename STR::const_iterator end = find(begin, headers.end(), '\n');
167
168 STR ret;
169 TrimWhitespace(STR(begin, end), TRIM_ALL, &ret);
170 return ret;
171 }
172
// TODO(jungshik): We have almost identical hex-decoding code elsewhere.
// Consider refactoring and moving it somewhere (base?). Bug 1224311

// Returns true if |c| is an ASCII hexadecimal digit: 0-9, A-F or a-f.
inline bool IsHexDigit(unsigned char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'A' && c <= 'F')
    return true;
  return c >= 'a' && c <= 'f';
}
179
HexToInt(unsigned char c)180 inline unsigned char HexToInt(unsigned char c) {
181 DCHECK(IsHexDigit(c));
182 static unsigned char kOffset[4] = {0, 0x30u, 0x37u, 0x57u};
183 return c - kOffset[(c >> 5) & 3];
184 }
185
186 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence
187 // of bytes. If input is invalid, return false.
QPDecode(const std::string & input,std::string * output)188 bool QPDecode(const std::string& input, std::string* output) {
189 std::string temp;
190 temp.reserve(input.size());
191 std::string::const_iterator it = input.begin();
192 while (it != input.end()) {
193 if (*it == '_') {
194 temp.push_back(' ');
195 } else if (*it == '=') {
196 if (input.end() - it < 3) {
197 return false;
198 }
199 if (IsHexDigit(static_cast<unsigned char>(*(it + 1))) &&
200 IsHexDigit(static_cast<unsigned char>(*(it + 2)))) {
201 unsigned char ch = HexToInt(*(it + 1)) * 16 + HexToInt(*(it + 2));
202 temp.push_back(static_cast<char>(ch));
203 ++it;
204 ++it;
205 } else {
206 return false;
207 }
208 } else if (0x20 < *it && *it < 0x7F) {
209 // In a Q-encoded word, only printable ASCII characters
210 // represent themselves. Besides, space, '=', '_' and '?' are
211 // not allowed, but they're already filtered out.
212 DCHECK(*it != 0x3D && *it != 0x5F && *it != 0x3F);
213 temp.push_back(*it);
214 } else {
215 return false;
216 }
217 ++it;
218 }
219 output->swap(temp);
220 return true;
221 }
222
223 enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};
DecodeBQEncoding(const std::string & part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)224 bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type,
225 const std::string& charset, std::string* output) {
226 std::string decoded;
227 if (enc_type == B_ENCODING) {
228 if (!base::Base64Decode(part, &decoded)) {
229 return false;
230 }
231 } else {
232 if (!QPDecode(part, &decoded)) {
233 return false;
234 }
235 }
236
237 UErrorCode err = U_ZERO_ERROR;
238 UConverter* converter(ucnv_open(charset.c_str(), &err));
239 if (U_FAILURE(err)) {
240 return false;
241 }
242
243 // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
244 // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
245 // in UTF-8. Therefore, the expansion ratio is 3 at most.
246 int length = static_cast<int>(decoded.length());
247 char* buf = WriteInto(output, length * 3);
248 length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, length * 3,
249 decoded.data(), length, &err);
250 ucnv_close(converter);
251 if (U_FAILURE(err)) {
252 return false;
253 }
254 output->resize(length);
255 return true;
256 }
257
DecodeWord(const std::string & encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output)258 bool DecodeWord(const std::string& encoded_word,
259 const std::string& referrer_charset,
260 bool *is_rfc2047,
261 std::string* output) {
262 if (!IsStringASCII(encoded_word)) {
263 // Try UTF-8, referrer_charset and the native OS default charset in turn.
264 if (IsStringUTF8(encoded_word)) {
265 *output = encoded_word;
266 } else {
267 std::wstring wide_output;
268 if (!referrer_charset.empty() &&
269 base::CodepageToWide(encoded_word, referrer_charset.c_str(),
270 base::OnStringConversionError::FAIL,
271 &wide_output)) {
272 *output = WideToUTF8(wide_output);
273 } else {
274 *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
275 }
276 }
277 *is_rfc2047 = false;
278 return true;
279 }
280
281 // RFC 2047 : one of encoding methods supported by Firefox and relatively
282 // widely used by web servers.
283 // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
284 // We don't care about the length restriction (72 bytes) because
285 // many web servers generate encoded words longer than the limit.
286 std::string tmp;
287 *is_rfc2047 = true;
288 int part_index = 0;
289 std::string charset;
290 StringTokenizer t(encoded_word, "?");
291 RFC2047EncodingType enc_type = Q_ENCODING;
292 while (*is_rfc2047 && t.GetNext()) {
293 std::string part = t.token();
294 switch (part_index) {
295 case 0:
296 if (part != "=") {
297 *is_rfc2047 = false;
298 break;
299 }
300 ++part_index;
301 break;
302 case 1:
303 // Do we need charset validity check here?
304 charset = part;
305 ++part_index;
306 break;
307 case 2:
308 if (part.size() > 1 ||
309 part.find_first_of("bBqQ") == std::string::npos) {
310 *is_rfc2047 = false;
311 break;
312 }
313 if (part[0] == 'b' || part[0] == 'B') {
314 enc_type = B_ENCODING;
315 }
316 ++part_index;
317 break;
318 case 3:
319 *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
320 if (!*is_rfc2047) {
321 // Last minute failure. Invalid B/Q encoding. Rather than
322 // passing it through, return now.
323 return false;
324 }
325 ++part_index;
326 break;
327 case 4:
328 if (part != "=") {
329 // Another last minute failure !
330 // Likely to be a case of two encoded-words in a row or
331 // an encoded word followed by a non-encoded word. We can be
332 // generous, but it does not help much in terms of compatibility,
333 // I believe. Return immediately.
334 *is_rfc2047 = false;
335 return false;
336 }
337 ++part_index;
338 break;
339 default:
340 *is_rfc2047 = false;
341 return false;
342 }
343 }
344
345 if (*is_rfc2047) {
346 if (*(encoded_word.end() - 1) == '=') {
347 output->swap(tmp);
348 return true;
349 }
350 // encoded_word ending prematurelly with '?' or extra '?'
351 *is_rfc2047 = false;
352 return false;
353 }
354
355 // We're not handling 'especial' characters quoted with '\', but
356 // it should be Ok because we're not an email client but a
357 // web browser.
358
359 // What IE6/7 does: %-escaped UTF-8. We could extend this to
360 // support a rudimentary form of RFC 2231 with charset label, but
361 // it'd gain us little in terms of compatibility.
362 tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
363 if (IsStringUTF8(tmp)) {
364 output->swap(tmp);
365 return true;
366 // We can try either the OS default charset or 'origin charset' here,
367 // As far as I can tell, IE does not support it. However, I've seen
368 // web servers emit %-escaped string in a legacy encoding (usually
369 // origin charset).
370 // TODO(jungshik) : Test IE further and consider adding a fallback here.
371 }
372 return false;
373 }
374
DecodeParamValue(const std::string & input,const std::string & referrer_charset,std::string * output)375 bool DecodeParamValue(const std::string& input,
376 const std::string& referrer_charset,
377 std::string* output) {
378 std::string tmp;
379 // Tokenize with whitespace characters.
380 StringTokenizer t(input, " \t\n\r");
381 t.set_options(StringTokenizer::RETURN_DELIMS);
382 bool is_previous_token_rfc2047 = true;
383 while (t.GetNext()) {
384 if (t.token_is_delim()) {
385 // If the previous non-delimeter token is not RFC2047-encoded,
386 // put in a space in its place. Otheriwse, skip over it.
387 if (!is_previous_token_rfc2047) {
388 tmp.push_back(' ');
389 }
390 continue;
391 }
392 // We don't support a single multibyte character split into
393 // adjacent encoded words. Some broken mail clients emit headers
394 // with that problem, but most web servers usually encode a filename
395 // in a single encoded-word. Firefox/Thunderbird do not support
396 // it, either.
397 std::string decoded;
398 if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
399 &decoded))
400 return false;
401 tmp.append(decoded);
402 }
403 output->swap(tmp);
404 return true;
405 }
406
407 // TODO(mpcomplete): This is a quick and dirty implementation for now. I'm
408 // sure this doesn't properly handle all (most?) cases.
409 template<typename STR>
GetHeaderParamValueT(const STR & header,const STR & param_name)410 STR GetHeaderParamValueT(const STR& header, const STR& param_name) {
411 // This assumes args are formatted exactly like "bla; arg1=value; arg2=value".
412 typename STR::const_iterator param_begin =
413 search(header.begin(), header.end(), param_name.begin(), param_name.end(),
414 CaseInsensitiveCompareASCII<typename STR::value_type>());
415
416 if (param_begin == header.end())
417 return STR();
418 param_begin += param_name.length();
419
420 STR whitespace;
421 whitespace.push_back(' ');
422 whitespace.push_back('\t');
423 const typename STR::size_type equals_offset =
424 header.find_first_not_of(whitespace, param_begin - header.begin());
425 if (equals_offset == STR::npos || header.at(equals_offset) != '=')
426 return STR();
427
428 param_begin = header.begin() + equals_offset + 1;
429 if (param_begin == header.end())
430 return STR();
431
432 typename STR::const_iterator param_end;
433 if (*param_begin == '"') {
434 param_end = find(param_begin+1, header.end(), '"');
435 if (param_end == header.end())
436 return STR(); // poorly formatted param?
437
438 ++param_begin; // skip past the quote.
439 } else {
440 param_end = find(param_begin+1, header.end(), ';');
441 }
442
443 return STR(param_begin, param_end);
444 }
445
446 // Does some simple normalization of scripts so we can allow certain scripts
447 // to exist together.
448 // TODO(brettw) bug 880223: we should allow some other languages to be
449 // oombined such as Chinese and Latin. We will probably need a more
450 // complicated system of language pairs to have more fine-grained control.
NormalizeScript(UScriptCode code)451 UScriptCode NormalizeScript(UScriptCode code) {
452 switch (code) {
453 case USCRIPT_KATAKANA:
454 case USCRIPT_HIRAGANA:
455 case USCRIPT_KATAKANA_OR_HIRAGANA:
456 case USCRIPT_HANGUL: // This one is arguable.
457 return USCRIPT_HAN;
458 default:
459 return code;
460 }
461 }
462
IsIDNComponentInSingleScript(const char16 * str,int str_len)463 bool IsIDNComponentInSingleScript(const char16* str, int str_len) {
464 UScriptCode first_script = USCRIPT_INVALID_CODE;
465 bool is_first = true;
466
467 int i = 0;
468 while (i < str_len) {
469 unsigned code_point;
470 U16_NEXT(str, i, str_len, code_point);
471
472 UErrorCode err = U_ZERO_ERROR;
473 UScriptCode cur_script = uscript_getScript(code_point, &err);
474 if (err != U_ZERO_ERROR)
475 return false; // Report mixed on error.
476 cur_script = NormalizeScript(cur_script);
477
478 // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.
479 if (is_first && cur_script != USCRIPT_COMMON) {
480 first_script = cur_script;
481 is_first = false;
482 } else {
483 if (cur_script != USCRIPT_COMMON && cur_script != first_script)
484 return false;
485 }
486 }
487 return true;
488 }
489
// Check if the script of a language can be 'safely' mixed with
// Latin letters in the ASCII range. |lang| is a language tag; only its first
// two characters are looked at, and they must be exactly "zh", "ja" or "ko".
bool IsCompatibleWithASCIILetters(const std::string& lang) {
  // For now, just list Chinese, Japanese and Korean (positive list).
  // An alternative is negative-listing (languages using Greek and
  // Cyrillic letters), but it can be more dangerous.
  const std::string primary = lang.substr(0, 2);
  return primary == "zh" || primary == "ja" || primary == "ko";
}
500
501 typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;
502
503 class LangToExemplarSet {
504 private:
505 LangToExemplarSetMap map;
LangToExemplarSet()506 LangToExemplarSet() { }
~LangToExemplarSet()507 ~LangToExemplarSet() {
508 STLDeleteContainerPairSecondPointers(map.begin(), map.end());
509 }
510
511 friend class Singleton<LangToExemplarSet>;
512 friend struct DefaultSingletonTraits<LangToExemplarSet>;
513 friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);
514 friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);
515
516 DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);
517 };
518
GetExemplarSetForLang(const std::string & lang,icu::UnicodeSet ** lang_set)519 bool GetExemplarSetForLang(const std::string& lang,
520 icu::UnicodeSet** lang_set) {
521 const LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map;
522 LangToExemplarSetMap::const_iterator pos = map.find(lang);
523 if (pos != map.end()) {
524 *lang_set = pos->second;
525 return true;
526 }
527 return false;
528 }
529
SetExemplarSetForLang(const std::string & lang,icu::UnicodeSet * lang_set)530 void SetExemplarSetForLang(const std::string& lang,
531 icu::UnicodeSet* lang_set) {
532 LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map;
533 map.insert(std::make_pair(lang, lang_set));
534 }
535
536 static Lock lang_set_lock;
537
538 // Returns true if all the characters in component_characters are used by
539 // the language |lang|.
IsComponentCoveredByLang(const icu::UnicodeSet & component_characters,const std::string & lang)540 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,
541 const std::string& lang) {
542 static const icu::UnicodeSet kASCIILetters(0x61, 0x7a); // [a-z]
543 icu::UnicodeSet* lang_set;
544 // We're called from both the UI thread and the history thread.
545 {
546 AutoLock lock(lang_set_lock);
547 if (!GetExemplarSetForLang(lang, &lang_set)) {
548 UErrorCode status = U_ZERO_ERROR;
549 ULocaleData* uld = ulocdata_open(lang.c_str(), &status);
550 // TODO(jungshik) Turn this check on when the ICU data file is
551 // rebuilt with the minimal subset of locale data for languages
552 // to which Chrome is not localized but which we offer in the list
553 // of languages selectable for Accept-Languages. With the rebuilt ICU
554 // data, ulocdata_open never should fall back to the default locale.
555 // (issue 2078)
556 // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);
557 if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {
558 lang_set = reinterpret_cast<icu::UnicodeSet *>(
559 ulocdata_getExemplarSet(uld, NULL, 0,
560 ULOCDATA_ES_STANDARD, &status));
561 // If |lang| is compatible with ASCII Latin letters, add them.
562 if (IsCompatibleWithASCIILetters(lang))
563 lang_set->addAll(kASCIILetters);
564 } else {
565 lang_set = new icu::UnicodeSet(1, 0);
566 }
567 lang_set->freeze();
568 SetExemplarSetForLang(lang, lang_set);
569 ulocdata_close(uld);
570 }
571 }
572 return !lang_set->isEmpty() && lang_set->containsAll(component_characters);
573 }
574
575 // Returns true if the given Unicode host component is safe to display to the
576 // user.
IsIDNComponentSafe(const char16 * str,int str_len,const std::wstring & languages)577 bool IsIDNComponentSafe(const char16* str,
578 int str_len,
579 const std::wstring& languages) {
580 // Most common cases (non-IDN) do not reach here so that we don't
581 // need a fast return path.
582 // TODO(jungshik) : Check if there's any character inappropriate
583 // (although allowed) for domain names.
584 // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
585 // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
586 // For now, we borrow the list from Mozilla and tweaked it slightly.
587 // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
588 // they're gonna be canonicalized to U+0020 and full stop before
589 // reaching here.)
590 // The original list is available at
591 // http://kb.mozillazine.org/Network.IDN.blacklist_chars and
592 // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703
593
594 UErrorCode status = U_ZERO_ERROR;
595 #ifdef U_WCHAR_IS_UTF16
596 icu::UnicodeSet dangerous_characters(icu::UnicodeString(
597 L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338"
598 L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"
599 L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"
600 L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"
601 L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"
602 L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"
603 L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"
604 L"[\ufffa-\ufffd]]"), status);
605 #else
606 icu::UnicodeSet dangerous_characters(icu::UnicodeString(
607 "[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338"
608 "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"
609 "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"
610 "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"
611 "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"
612 "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"
613 "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"
614 "[\\ufffa-\\ufffd]]", -1, US_INV), status);
615 #endif
616 DCHECK(U_SUCCESS(status));
617 icu::UnicodeSet component_characters;
618 component_characters.addAll(icu::UnicodeString(str, str_len));
619 if (dangerous_characters.containsSome(component_characters))
620 return false;
621
622 // If the language list is empty, the result is completely determined
623 // by whether a component is a single script or not. This will block
624 // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are
625 // allowed with |languages| (while it blocks Chinese + Latin letters with
626 // an accent as should be the case), but we want to err on the safe side
627 // when |languages| is empty.
628 if (languages.empty())
629 return IsIDNComponentInSingleScript(str, str_len);
630
631 // |common_characters| is made up of ASCII numbers, hyphen, plus and
632 // underscore that are used across scripts and allowed in domain names.
633 // (sync'd with characters allowed in url_canon_host with square
634 // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
635 icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
636 status);
637 DCHECK(U_SUCCESS(status));
638 // Subtract common characters because they're always allowed so that
639 // we just have to check if a language-specific set contains
640 // the remainder.
641 component_characters.removeAll(common_characters);
642
643 std::string languages_list(WideToASCII(languages));
644 StringTokenizer t(languages_list, ",");
645 while (t.GetNext()) {
646 if (IsComponentCoveredByLang(component_characters, t.token()))
647 return true;
648 }
649 return false;
650 }
651
652 // Converts one component of a host (between dots) to IDN if safe. The result
653 // will be APPENDED to the given output string and will be the same as the input
654 // if it is not IDN or the IDN is unsafe to display. Returns whether any
655 // conversion was performed.
IDNToUnicodeOneComponent(const char16 * comp,size_t comp_len,const std::wstring & languages,string16 * out)656 bool IDNToUnicodeOneComponent(const char16* comp,
657 size_t comp_len,
658 const std::wstring& languages,
659 string16* out) {
660 DCHECK(out);
661 if (comp_len == 0)
662 return false;
663
664 // Only transform if the input can be an IDN component.
665 static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
666 if ((comp_len > arraysize(kIdnPrefix)) &&
667 !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) {
668 // Repeatedly expand the output string until it's big enough. It looks like
669 // ICU will return the required size of the buffer, but that's not
670 // documented, so we'll just grow by 2x. This should be rare and is not on a
671 // critical path.
672 size_t original_length = out->length();
673 for (int extra_space = 64; ; extra_space *= 2) {
674 UErrorCode status = U_ZERO_ERROR;
675 out->resize(out->length() + extra_space);
676 int output_chars = uidna_IDNToUnicode(comp,
677 static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space,
678 UIDNA_DEFAULT, NULL, &status);
679 if (status == U_ZERO_ERROR) {
680 // Converted successfully.
681 out->resize(original_length + output_chars);
682 if (IsIDNComponentSafe(out->data() + original_length, output_chars,
683 languages))
684 return true;
685 }
686
687 if (status != U_BUFFER_OVERFLOW_ERROR)
688 break;
689 }
690 // Failed, revert back to original string.
691 out->resize(original_length);
692 }
693
694 // We get here with no IDN or on error, in which case we just append the
695 // literal input.
696 out->append(comp, comp_len);
697 return false;
698 }
699
700 // Helper for FormatUrl().
FormatViewSourceUrl(const GURL & url,const std::wstring & languages,bool omit_username_password,UnescapeRule::Type unescape_rules,url_parse::Parsed * new_parsed,size_t * prefix_end,size_t * offset_for_adjustment)701 std::wstring FormatViewSourceUrl(const GURL& url,
702 const std::wstring& languages,
703 bool omit_username_password,
704 UnescapeRule::Type unescape_rules,
705 url_parse::Parsed* new_parsed,
706 size_t* prefix_end,
707 size_t* offset_for_adjustment) {
708 DCHECK(new_parsed);
709 const wchar_t* const kWideViewSource = L"view-source:";
710 const size_t kViewSourceLengthPlus1 = 12;
711
712 GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1));
713 size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ?
714 std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1);
715 size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ?
716 NULL : &temp_offset;
717 std::wstring result = net::FormatUrl(real_url, languages,
718 omit_username_password, unescape_rules, new_parsed, prefix_end,
719 temp_offset_ptr);
720 result.insert(0, kWideViewSource);
721
722 // Adjust position values.
723 if (new_parsed->scheme.is_nonempty()) {
724 // Assume "view-source:real-scheme" as a scheme.
725 new_parsed->scheme.len += kViewSourceLengthPlus1;
726 } else {
727 new_parsed->scheme.begin = 0;
728 new_parsed->scheme.len = kViewSourceLengthPlus1 - 1;
729 }
730 if (new_parsed->username.is_nonempty())
731 new_parsed->username.begin += kViewSourceLengthPlus1;
732 if (new_parsed->password.is_nonempty())
733 new_parsed->password.begin += kViewSourceLengthPlus1;
734 if (new_parsed->host.is_nonempty())
735 new_parsed->host.begin += kViewSourceLengthPlus1;
736 if (new_parsed->port.is_nonempty())
737 new_parsed->port.begin += kViewSourceLengthPlus1;
738 if (new_parsed->path.is_nonempty())
739 new_parsed->path.begin += kViewSourceLengthPlus1;
740 if (new_parsed->query.is_nonempty())
741 new_parsed->query.begin += kViewSourceLengthPlus1;
742 if (new_parsed->ref.is_nonempty())
743 new_parsed->ref.begin += kViewSourceLengthPlus1;
744 if (prefix_end)
745 *prefix_end += kViewSourceLengthPlus1;
746 if (temp_offset_ptr) {
747 *offset_for_adjustment = (temp_offset == std::wstring::npos) ?
748 std::wstring::npos : (temp_offset + kViewSourceLengthPlus1);
749 }
750 return result;
751 }
752
753 } // namespace
754
755 namespace net {
756
// Set of port numbers with external linkage in namespace net.
// NOTE(review): no reader or writer of this set is visible in this part of
// the file; presumably ports listed here are exempt from kRestrictedPorts --
// confirm against the port-checking code.
757 std::set<int> explicitly_allowed_ports;
758
759 // Appends the substring |in_component| inside of the URL |spec| to |output|,
760 // and the resulting range will be filled into |out_component|. |unescape_rules|
761 // defines how to clean the URL for human readability. |offset_for_adjustment|
762 // is an offset into |output| which will be adjusted based on how it maps to the
763 // component being converted; if it is less than output->length(), it will be
764 // untouched, and if it is greater than output->length() + in_component.len it
765 // will be shortened by the difference in lengths between the input and output
766 // components. Otherwise it points into the component being converted, and is
767 // adjusted to point to the same logical place in |output|.
768 // |offset_for_adjustment| may not be NULL.
769 static void AppendFormattedComponent(const std::string& spec,
770 const url_parse::Component& in_component,
771 UnescapeRule::Type unescape_rules,
772 std::wstring* output,
773 url_parse::Component* out_component,
774 size_t* offset_for_adjustment);
775
FilePathToFileURL(const FilePath & path)776 GURL FilePathToFileURL(const FilePath& path) {
777 // Produce a URL like "file:///C:/foo" for a regular file, or
778 // "file://///server/path" for UNC. The URL canonicalizer will fix up the
779 // latter case to be the canonical UNC form: "file://server/path"
780 FilePath::StringType url_string(kFileURLPrefix);
781 url_string.append(path.value());
782
783 // Now do replacement of some characters. Since we assume the input is a
784 // literal filename, anything the URL parser might consider special should
785 // be escaped here.
786
787 // must be the first substitution since others will introduce percents as the
788 // escape character
789 ReplaceSubstringsAfterOffset(&url_string, 0,
790 FILE_PATH_LITERAL("%"), FILE_PATH_LITERAL("%25"));
791
792 // semicolon is supposed to be some kind of separator according to RFC 2396
793 ReplaceSubstringsAfterOffset(&url_string, 0,
794 FILE_PATH_LITERAL(";"), FILE_PATH_LITERAL("%3B"));
795
796 ReplaceSubstringsAfterOffset(&url_string, 0,
797 FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23"));
798
799 #if defined(OS_POSIX)
800 ReplaceSubstringsAfterOffset(&url_string, 0,
801 FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C"));
802 #endif
803
804 return GURL(url_string);
805 }
806
GetSpecificHeader(const std::wstring & headers,const std::wstring & name)807 std::wstring GetSpecificHeader(const std::wstring& headers,
808 const std::wstring& name) {
809 return GetSpecificHeaderT(headers, name);
810 }
811
GetSpecificHeader(const std::string & headers,const std::string & name)812 std::string GetSpecificHeader(const std::string& headers,
813 const std::string& name) {
814 return GetSpecificHeaderT(headers, name);
815 }
816
GetFileNameFromCD(const std::string & header,const std::string & referrer_charset)817 std::string GetFileNameFromCD(const std::string& header,
818 const std::string& referrer_charset) {
819 std::string param_value = GetHeaderParamValue(header, "filename");
820 if (param_value.empty()) {
821 // Some servers use 'name' parameter.
822 param_value = GetHeaderParamValue(header, "name");
823 }
824 if (param_value.empty())
825 return std::string();
826 std::string decoded;
827 if (DecodeParamValue(param_value, referrer_charset, &decoded))
828 return decoded;
829 return std::string();
830 }
831
GetHeaderParamValue(const std::wstring & field,const std::wstring & param_name)832 std::wstring GetHeaderParamValue(const std::wstring& field,
833 const std::wstring& param_name) {
834 return GetHeaderParamValueT(field, param_name);
835 }
836
GetHeaderParamValue(const std::string & field,const std::string & param_name)837 std::string GetHeaderParamValue(const std::string& field,
838 const std::string& param_name) {
839 return GetHeaderParamValueT(field, param_name);
840 }
841
// TODO(brettw) bug 734373: check the scripts for each host component and
// don't un-IDN-ize if there is more than one. Alternatively, only IDN for
// scripts that the user has installed. For now, just put the entire
// path through IDN. Maybe this feature can be implemented in ICU itself?
//
// We may want to skip this step in the case of file URLs to allow unicode
// UNC hostnames regardless of encodings.
//
// Converts the |host_len| characters at |host| into a wide string, handing
// each dot-separated component to IDNToUnicodeOneComponent() together with
// |languages|. Empty components (consecutive dots) are not converted; the dots
// themselves are copied through.
//
// |offset_for_adjustment| may be NULL. When non-NULL it is read as an offset
// into the input and rewritten for the output: it is shifted by the length
// change of every component that starts before it, and becomes
// std::wstring::npos when it falls inside a component that was converted.
// The final UTF-16 to wide conversion may adjust it once more.
std::wstring IDNToUnicode(const char* host,
                          size_t host_len,
                          const std::wstring& languages,
                          size_t* offset_for_adjustment) {
  // Convert the ASCII input to a wide string for ICU.
  string16 input16;
  input16.reserve(host_len);
  std::copy(host, host + host_len, std::back_inserter(input16));

  string16 out16;
  // Running output-side offset; npos means "nothing to track".
  size_t output_offset = offset_for_adjustment ?
      *offset_for_adjustment : std::wstring::npos;

  // Do each component of the host separately, since we enforce script matching
  // on a per-component basis.
  for (size_t component_start = 0, component_end;
       component_start < input16.length();
       component_start = component_end + 1) {
    // Find the end of the component.
    component_end = input16.find('.', component_start);
    if (component_end == string16::npos)
      component_end = input16.length();  // For getting the last component.
    size_t component_length = component_end - component_start;

    size_t output_component_start = out16.length();
    bool converted_idn = false;
    if (component_end > component_start) {
      // Add the substring that we just found.
      converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,
          component_length, languages, &out16);
    }
    size_t output_component_length = out16.length() - output_component_start;

    // |output_offset| can only be non-npos when |offset_for_adjustment| is
    // non-NULL, so the dereferences below are safe. The comparisons use the
    // caller's original (input-side) offset, which is not written until after
    // the loop.
    if ((output_offset != std::wstring::npos) &&
        (*offset_for_adjustment > component_start)) {
      if ((*offset_for_adjustment < component_end) && converted_idn)
        output_offset = std::wstring::npos;
      else
        output_offset += output_component_length - component_length;
    }

    // Need to add the dot we just found (if we found one).
    if (component_end < input16.length())
      out16.push_back('.');
  }

  if (offset_for_adjustment)
    *offset_for_adjustment = output_offset;

  return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment);
}
900
CanonicalizeHost(const std::string & host,url_canon::CanonHostInfo * host_info)901 std::string CanonicalizeHost(const std::string& host,
902 url_canon::CanonHostInfo* host_info) {
903 // Try to canonicalize the host.
904 const url_parse::Component raw_host_component(
905 0, static_cast<int>(host.length()));
906 std::string canon_host;
907 url_canon::StdStringCanonOutput canon_host_output(&canon_host);
908 url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component,
909 &canon_host_output, host_info);
910
911 if (host_info->out_host.is_nonempty() &&
912 host_info->family != url_canon::CanonHostInfo::BROKEN) {
913 // Success! Assert that there's no extra garbage.
914 canon_host_output.Complete();
915 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
916 } else {
917 // Empty host, or canonicalization failed. We'll return empty.
918 canon_host.clear();
919 }
920
921 return canon_host;
922 }
923
CanonicalizeHost(const std::wstring & host,url_canon::CanonHostInfo * host_info)924 std::string CanonicalizeHost(const std::wstring& host,
925 url_canon::CanonHostInfo* host_info) {
926 std::string converted_host;
927 WideToUTF8(host.c_str(), host.length(), &converted_host);
928 return CanonicalizeHost(converted_host, host_info);
929 }
930
GetDirectoryListingHeader(const string16 & title)931 std::string GetDirectoryListingHeader(const string16& title) {
932 static const base::StringPiece header(
933 NetModule::GetResource(IDR_DIR_HEADER_HTML));
934 // This can be null in unit tests.
935 DLOG_IF(WARNING, header.empty()) <<
936 "Missing resource: directory listing header";
937
938 std::string result;
939 if (!header.empty())
940 result.assign(header.data(), header.size());
941
942 result.append("<script>start(");
943 base::JsonDoubleQuote(title, true, &result);
944 result.append(");</script>\n");
945
946 return result;
947 }
948
// Returns true when |c| is an ASCII lowercase letter. Uppercase letters are
// deliberately not accepted: callers pass canonicalized hosts, in which
// uppercase characters have already been normalized away.
inline bool IsHostCharAlpha(char c) {
  return 'a' <= c && c <= 'z';
}
954
// Returns true when |c| is an ASCII decimal digit ('0' through '9').
inline bool IsHostCharDigit(char c) {
  return '0' <= c && c <= '9';
}
958
IsCanonicalizedHostCompliant(const std::string & host)959 bool IsCanonicalizedHostCompliant(const std::string& host) {
960 if (host.empty())
961 return false;
962
963 bool in_component = false;
964 bool most_recent_component_started_alpha = false;
965 bool last_char_was_hyphen_or_underscore = false;
966
967 for (std::string::const_iterator i(host.begin()); i != host.end(); ++i) {
968 const char c = *i;
969 if (!in_component) {
970 most_recent_component_started_alpha = IsHostCharAlpha(c);
971 if (!most_recent_component_started_alpha && !IsHostCharDigit(c))
972 return false;
973 in_component = true;
974 } else {
975 if (c == '.') {
976 if (last_char_was_hyphen_or_underscore)
977 return false;
978 in_component = false;
979 } else if (IsHostCharAlpha(c) || IsHostCharDigit(c)) {
980 last_char_was_hyphen_or_underscore = false;
981 } else if ((c == '-') || (c == '_')) {
982 last_char_was_hyphen_or_underscore = true;
983 } else {
984 return false;
985 }
986 }
987 }
988
989 return most_recent_component_started_alpha;
990 }
991
GetDirectoryListingEntry(const string16 & name,const std::string & raw_bytes,bool is_dir,int64 size,Time modified)992 std::string GetDirectoryListingEntry(const string16& name,
993 const std::string& raw_bytes,
994 bool is_dir,
995 int64 size,
996 Time modified) {
997 std::string result;
998 result.append("<script>addRow(");
999 base::JsonDoubleQuote(name, true, &result);
1000 result.append(",");
1001 if (raw_bytes.empty()) {
1002 base::JsonDoubleQuote(EscapePath(UTF16ToUTF8(name)),
1003 true, &result);
1004 } else {
1005 base::JsonDoubleQuote(EscapePath(raw_bytes), true, &result);
1006 }
1007 if (is_dir) {
1008 result.append(",1,");
1009 } else {
1010 result.append(",0,");
1011 }
1012
1013 base::JsonDoubleQuote(
1014 WideToUTF16Hack(FormatBytes(size, GetByteDisplayUnits(size), true)), true,
1015 &result);
1016
1017 result.append(",");
1018
1019 string16 modified_str;
1020 // |modified| can be NULL in FTP listings.
1021 if (!modified.is_null()) {
1022 modified_str = WideToUTF16Hack(base::TimeFormatShortDateAndTime(modified));
1023 }
1024 base::JsonDoubleQuote(modified_str, true, &result);
1025
1026 result.append(");</script>\n");
1027
1028 return result;
1029 }
1030
// Returns |text| with one leading "www." removed. Text that does not begin
// with exactly that prefix (case-sensitive) is returned unchanged.
std::wstring StripWWW(const std::wstring& text) {
  const std::wstring prefix(L"www.");
  if (text.compare(0, prefix.length(), prefix) != 0)
    return text;
  return text.substr(prefix.length());
}
1036
// Picks a file name to suggest for a download of |url|.
//
// Candidates are tried in this order:
//   1. the name from |content_disposition| (via GetFileNameFromCD() with
//      |referrer_charset|), reduced to its base name;
//   2. the unescaped file-name part of |url|, when |url| is valid;
//   3. |default_name|;
//   4. the host of |url|, when |url| is valid and has a host;
//   5. the untranslated literal "download".
// Leading and trailing '.' characters are trimmed from candidates 1 and 2, and
// characters that are illegal in a path are replaced with '-' in the result.
// about: and data: URLs return |default_name| (or "download") directly.
FilePath GetSuggestedFilename(const GURL& url,
                              const std::string& content_disposition,
                              const std::string& referrer_charset,
                              const FilePath& default_name) {
  // We don't translate this fallback string, "download". If localization is
  // needed, the caller should provide localized fallback default_name.
  static const FilePath::CharType kFinalFallbackName[] =
      FILE_PATH_LITERAL("download");

  // about: and data: URLs don't have file names, but esp. data: URLs may
  // contain parts that look like ones (i.e., contain a slash).
  // Therefore we don't attempt to divine a file name out of them.
  if (url.SchemeIs("about") || url.SchemeIs("data")) {
    return default_name.empty() ? FilePath(kFinalFallbackName) : default_name;
  }

  const std::string filename_from_cd = GetFileNameFromCD(content_disposition,
                                                         referrer_charset);
#if defined(OS_WIN)
  FilePath::StringType filename = UTF8ToWide(filename_from_cd);
#elif defined(OS_POSIX)
  FilePath::StringType filename = filename_from_cd;
#endif

  if (!filename.empty()) {
    // Remove any path information the server may have sent, take the name
    // only.
    filename = FilePath(filename).BaseName().value();

    // Next, remove "." from the beginning and end of the file name to avoid
    // tricks with hidden files, "..", and "."
    TrimString(filename, FILE_PATH_LITERAL("."), &filename);
  }
  if (filename.empty()) {
    if (url.is_valid()) {
      const std::string unescaped_url_filename = UnescapeURLComponent(
          url.ExtractFileName(),
          UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
#if defined(OS_WIN)
      filename = UTF8ToWide(unescaped_url_filename);
#elif defined(OS_POSIX)
      filename = unescaped_url_filename;
#endif
    }
  }

  // Trim '.' once more.
  TrimString(filename, FILE_PATH_LITERAL("."), &filename);

  // If there's no filename or it gets trimmed to be empty, use
  // the URL hostname or default_name
  if (filename.empty()) {
    if (!default_name.empty()) {
      filename = default_name.value();
    } else if (url.is_valid()) {
      // Some schemes (e.g. file) do not have a hostname. Even though it's
      // not likely to reach here, let's hardcode the last fallback name.
      // TODO(jungshik) : Decode a 'punycoded' IDN hostname. (bug 1264451)
      filename = url.host().empty() ? kFinalFallbackName :
#if defined(OS_WIN)
          UTF8ToWide(url.host());
#elif defined(OS_POSIX)
          url.host();
#endif
    } else {
      // Callers are expected to pass a valid URL or a non-empty default name.
      // In case NOTREACHED() does not abort, still use the hardcoded fallback
      // so that an empty path is never returned.
      NOTREACHED();
      filename = kFinalFallbackName;
    }
  }

  file_util::ReplaceIllegalCharactersInPath(&filename, '-');
  return FilePath(filename);
}
1109
IsPortAllowedByDefault(int port)1110 bool IsPortAllowedByDefault(int port) {
1111 int array_size = arraysize(kRestrictedPorts);
1112 for (int i = 0; i < array_size; i++) {
1113 if (kRestrictedPorts[i] == port) {
1114 return false;
1115 }
1116 }
1117 return true;
1118 }
1119
IsPortAllowedByFtp(int port)1120 bool IsPortAllowedByFtp(int port) {
1121 int array_size = arraysize(kAllowedFtpPorts);
1122 for (int i = 0; i < array_size; i++) {
1123 if (kAllowedFtpPorts[i] == port) {
1124 return true;
1125 }
1126 }
1127 // Port not explicitly allowed by FTP, so return the default restrictions.
1128 return IsPortAllowedByDefault(port);
1129 }
1130
IsPortAllowedByOverride(int port)1131 bool IsPortAllowedByOverride(int port) {
1132 if (explicitly_allowed_ports.empty())
1133 return false;
1134
1135 std::set<int>::const_iterator it =
1136 std::find(explicitly_allowed_ports.begin(),
1137 explicitly_allowed_ports.end(),
1138 port);
1139
1140 return it != explicitly_allowed_ports.end();
1141 }
1142
// Puts |fd| into non-blocking mode and returns the result of the underlying
// call: ioctlsocket(FIONBIO) on Windows, fcntl(F_SETFL) on POSIX.
int SetNonBlocking(int fd) {
#if defined(OS_WIN)
  unsigned long non_blocking_mode = 1;
  return ioctlsocket(fd, FIONBIO, &non_blocking_mode);
#elif defined(OS_POSIX)
  const int current_flags = fcntl(fd, F_GETFL, 0);
  // If the current flags cannot be read, start from no flags at all.
  const int base_flags = (current_flags == -1) ? 0 : current_flags;
  return fcntl(fd, F_SETFL, base_flags | O_NONBLOCK);
#endif
}
1154
ParseHostAndPort(std::string::const_iterator host_and_port_begin,std::string::const_iterator host_and_port_end,std::string * host,int * port)1155 bool ParseHostAndPort(std::string::const_iterator host_and_port_begin,
1156 std::string::const_iterator host_and_port_end,
1157 std::string* host,
1158 int* port) {
1159 if (host_and_port_begin >= host_and_port_end)
1160 return false;
1161
1162 // When using url_parse, we use char*.
1163 const char* auth_begin = &(*host_and_port_begin);
1164 int auth_len = host_and_port_end - host_and_port_begin;
1165
1166 url_parse::Component auth_component(0, auth_len);
1167 url_parse::Component username_component;
1168 url_parse::Component password_component;
1169 url_parse::Component hostname_component;
1170 url_parse::Component port_component;
1171
1172 url_parse::ParseAuthority(auth_begin, auth_component, &username_component,
1173 &password_component, &hostname_component, &port_component);
1174
1175 // There shouldn't be a username/password.
1176 if (username_component.is_valid() || password_component.is_valid())
1177 return false;
1178
1179 if (!hostname_component.is_nonempty())
1180 return false; // Failed parsing.
1181
1182 int parsed_port_number = -1;
1183 if (port_component.is_nonempty()) {
1184 parsed_port_number = url_parse::ParsePort(auth_begin, port_component);
1185
1186 // If parsing failed, port_number will be either PORT_INVALID or
1187 // PORT_UNSPECIFIED, both of which are negative.
1188 if (parsed_port_number < 0)
1189 return false; // Failed parsing the port number.
1190 }
1191
1192 if (port_component.len == 0)
1193 return false; // Reject inputs like "foo:"
1194
1195 // Pass results back to caller.
1196 host->assign(auth_begin + hostname_component.begin, hostname_component.len);
1197 *port = parsed_port_number;
1198
1199 return true; // Success.
1200 }
1201
// Convenience overload: parses the whole of |host_and_port| by forwarding its
// begin()/end() iterators, |host| and |port| to the iterator-based
// ParseHostAndPort() and returning its result.
bool ParseHostAndPort(const std::string& host_and_port,
                      std::string* host,
                      int* port) {
  return ParseHostAndPort(
      host_and_port.begin(), host_and_port.end(), host, port);
}
1208
GetHostAndPort(const GURL & url)1209 std::string GetHostAndPort(const GURL& url) {
1210 // For IPv6 literals, GURL::host() already includes the brackets so it is
1211 // safe to just append a colon.
1212 return StringPrintf("%s:%d", url.host().c_str(), url.EffectiveIntPort());
1213 }
1214
GetHostAndOptionalPort(const GURL & url)1215 std::string GetHostAndOptionalPort(const GURL& url) {
1216 // For IPv6 literals, GURL::host() already includes the brackets
1217 // so it is safe to just append a colon.
1218 if (url.has_port())
1219 return StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
1220 return url.host();
1221 }
1222
NetAddressToString(const struct addrinfo * net_address)1223 std::string NetAddressToString(const struct addrinfo* net_address) {
1224 #if defined(OS_WIN)
1225 EnsureWinsockInit();
1226 #endif
1227
1228 // This buffer is large enough to fit the biggest IPv6 string.
1229 char buffer[INET6_ADDRSTRLEN];
1230
1231 int result = getnameinfo(net_address->ai_addr,
1232 net_address->ai_addrlen, buffer, sizeof(buffer), NULL, 0, NI_NUMERICHOST);
1233
1234 if (result != 0) {
1235 DLOG(INFO) << "getnameinfo() failed with " << result;
1236 buffer[0] = '\0';
1237 }
1238 return std::string(buffer);
1239 }
1240
GetHostName()1241 std::string GetHostName() {
1242 #if defined(OS_WIN)
1243 EnsureWinsockInit();
1244 #endif
1245
1246 // Host names are limited to 255 bytes.
1247 char buffer[256];
1248 int result = gethostname(buffer, sizeof(buffer));
1249 if (result != 0) {
1250 DLOG(INFO) << "gethostname() failed with " << result;
1251 buffer[0] = '\0';
1252 }
1253 return std::string(buffer);
1254 }
1255
GetIdentityFromURL(const GURL & url,std::wstring * username,std::wstring * password)1256 void GetIdentityFromURL(const GURL& url,
1257 std::wstring* username,
1258 std::wstring* password) {
1259 UnescapeRule::Type flags = UnescapeRule::SPACES;
1260 *username = UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(url.username(),
1261 flags, NULL));
1262 *password = UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(url.password(),
1263 flags, NULL));
1264 }
1265
// Appends the host of |url|, passed through net::IDNToUnicode() with
// |languages|, to |output|.
//
// |output| must be non-NULL. |new_parsed| may be NULL; when given, its host
// component is set to the span appended here, or reset when |url| has no
// host. |offset_for_adjustment| may be NULL; when given, it is an offset into
// the text being built that is kept pointing at the same logical position
// (or set to std::wstring::npos when IDNToUnicode() reports that the position
// was lost).
void AppendFormattedHost(const GURL& url,
                         const std::wstring& languages,
                         std::wstring* output,
                         url_parse::Parsed* new_parsed,
                         size_t* offset_for_adjustment) {
  DCHECK(output);
  const url_parse::Component& host =
      url.parsed_for_possibly_invalid_spec().host;

  if (host.is_nonempty()) {
    // Handle possible IDN in the host name.
    int new_host_begin = static_cast<int>(output->length());
    if (new_parsed)
      new_parsed->host.begin = new_host_begin;
    // The offset measured from the end of what has been written so far, or
    // npos when there is no offset to track or it lies before that point.
    size_t offset_past_current_output =
        (!offset_for_adjustment ||
         (*offset_for_adjustment == std::wstring::npos) ||
         (*offset_for_adjustment < output->length())) ?
            std::wstring::npos : (*offset_for_adjustment - output->length());
    // Non-NULL only when the offset falls inside the host itself; npos always
    // compares >= host.len, so it yields NULL too.
    size_t* offset_into_host =
        (offset_past_current_output >= static_cast<size_t>(host.len)) ?
            NULL : &offset_past_current_output;

    const std::string& spec = url.possibly_invalid_spec();
    DCHECK(host.begin >= 0 &&
           ((spec.length() == 0 && host.begin == 0) ||
            host.begin < static_cast<int>(spec.length())));
    output->append(net::IDNToUnicode(&spec[host.begin],
        static_cast<size_t>(host.len), languages, offset_into_host));

    int new_host_len = static_cast<int>(output->length()) - new_host_begin;
    if (new_parsed)
      new_parsed->host.len = new_host_len;
    if (offset_into_host) {
      // The offset was inside the host: translate the host-relative result
      // back to an absolute offset, or propagate npos.
      *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ?
          std::wstring::npos : (new_host_begin + *offset_into_host);
    } else if (offset_past_current_output != std::wstring::npos) {
      // The offset is beyond the host: shift it by the host's length change.
      *offset_for_adjustment += new_host_len - host.len;
    }
  } else if (new_parsed) {
    new_parsed->host.reset();
  }
}
1309
/* static */
// Appends the |in_component| span of |spec| to |output| as wide text. With
// UnescapeRule::NONE the span is only converted from UTF-8; with any other
// |unescape_rules| it is unescaped and decoded by
// UnescapeAndDecodeUTF8URLComponent().
//
// |out_component| receives the span occupied in |output|, or is reset when
// |in_component| is empty; it is dereferenced unconditionally.
// |output| and |offset_for_adjustment| must be non-NULL. The offset is kept
// pointing at the same logical position in the text being built, or set to
// std::wstring::npos when the conversion reports that the position was lost.
void AppendFormattedComponent(const std::string& spec,
                              const url_parse::Component& in_component,
                              UnescapeRule::Type unescape_rules,
                              std::wstring* output,
                              url_parse::Component* out_component,
                              size_t* offset_for_adjustment) {
  DCHECK(output);
  DCHECK(offset_for_adjustment);
  if (in_component.is_nonempty()) {
    out_component->begin = static_cast<int>(output->length());
    // The offset measured from the end of what has been written so far, or
    // npos when it is already npos or lies before that point.
    size_t offset_past_current_output =
        ((*offset_for_adjustment == std::wstring::npos) ||
         (*offset_for_adjustment < output->length())) ?
            std::wstring::npos : (*offset_for_adjustment - output->length());
    // Non-NULL only when the offset falls inside this component; npos always
    // compares >= in_component.len, so it yields NULL too.
    size_t* offset_into_component =
        (offset_past_current_output >= static_cast<size_t>(in_component.len)) ?
            NULL : &offset_past_current_output;
    if (unescape_rules == UnescapeRule::NONE) {
      output->append(UTF8ToWideAndAdjustOffset(
          spec.substr(in_component.begin, in_component.len),
          offset_into_component));
    } else {
      output->append(UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(
          spec.substr(in_component.begin, in_component.len), unescape_rules,
          offset_into_component)));
    }
    out_component->len =
        static_cast<int>(output->length()) - out_component->begin;
    if (offset_into_component) {
      // The offset was inside the component: translate the component-relative
      // result back to an absolute offset, or propagate npos.
      *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ?
          std::wstring::npos : (out_component->begin + *offset_into_component);
    } else if (offset_past_current_output != std::wstring::npos) {
      // The offset is beyond the component: shift it by the length change.
      *offset_for_adjustment += out_component->len - in_component.len;
    }
  } else {
    out_component->reset();
  }
}
1349
// Builds a wide-string form of |url| from its possibly-invalid spec.
//
//   languages              - passed to the IDN host conversion.
//   omit_username_password - when true, the username and password are left
//                            out of the result.
//   unescape_rules         - applied to username, password, path and query.
//   new_parsed             - optional; receives the component spans of the
//                            returned string.
//   prefix_end             - optional; receives the length of the output
//                            written before the host (0 for an empty URL).
//   offset_for_adjustment  - optional; on input an offset into the spec, on
//                            output the matching offset into the returned
//                            string, or std::wstring::npos when that position
//                            is not represented in the output.
//
// view-source: URLs (other than view-source:view-source:...) are delegated to
// FormatViewSourceUrl().
std::wstring FormatUrl(const GURL& url,
                       const std::wstring& languages,
                       bool omit_username_password,
                       UnescapeRule::Type unescape_rules,
                       url_parse::Parsed* new_parsed,
                       size_t* prefix_end,
                       size_t* offset_for_adjustment) {
  // Point the optional out-parameters at local dummies so that the code below
  // can write through them unconditionally.
  url_parse::Parsed parsed_temp;
  if (!new_parsed)
    new_parsed = &parsed_temp;
  size_t offset_temp = std::wstring::npos;
  if (!offset_for_adjustment)
    offset_for_adjustment = &offset_temp;

  std::wstring url_string;

  // Check for empty URLs.
  if (url.is_empty()) {
    if (prefix_end)
      *prefix_end = 0;
    *offset_for_adjustment = std::wstring::npos;
    return url_string;
  }

  // Special handling for view-source:. Don't use chrome::kViewSourceScheme
  // because this library shouldn't depend on chrome.
  const char* const kViewSource = "view-source";
  const char* const kViewSourceTwice = "view-source:view-source:";
  // Rejects view-source:view-source:... to avoid deep recursive call.
  if (url.SchemeIs(kViewSource) &&
      !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
    return FormatViewSourceUrl(url, languages, omit_username_password,
        unescape_rules, new_parsed, prefix_end, offset_for_adjustment);
  }

  // We handle both valid and invalid URLs (this will give us the spec
  // regardless of validity).
  const std::string& spec = url.possibly_invalid_spec();
  const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
  // Clamp the offset: anything at or past the end of the spec is untrackable.
  if (*offset_for_adjustment >= spec.length())
    *offset_for_adjustment = std::wstring::npos;

  // Copy everything before the username (the scheme and the separators.)
  // These are ASCII.
  std::copy(spec.begin(),
      spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
                                                  true),
      std::back_inserter(url_string));
  new_parsed->scheme = parsed.scheme;

  if (omit_username_password) {
    // Remove the username and password fields. We don't want to display those
    // to the user since they can be used for attacks,
    // e.g. "http://google.com:search@evil.ru/"
    new_parsed->username.reset();
    new_parsed->password.reset();
    if ((*offset_for_adjustment != std::wstring::npos) &&
        (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
      if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
        // The seeming off-by-one and off-by-two in these first two lines are to
        // account for the ':' after the username and '@' after the password.
        if (*offset_for_adjustment >
            static_cast<size_t>(parsed.password.end())) {
          *offset_for_adjustment -=
              (parsed.username.len + parsed.password.len + 2);
        } else if (*offset_for_adjustment >
                   static_cast<size_t>(parsed.username.begin)) {
          // The offset pointed into the text being removed.
          *offset_for_adjustment = std::wstring::npos;
        }
      } else {
        const url_parse::Component* nonempty_component =
            parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
        // The seeming off-by-one in these first two lines is to account for the
        // '@' after the username/password.
        if (*offset_for_adjustment >
            static_cast<size_t>(nonempty_component->end())) {
          *offset_for_adjustment -= (nonempty_component->len + 1);
        } else if (*offset_for_adjustment >
                   static_cast<size_t>(nonempty_component->begin)) {
          // The offset pointed into the text being removed.
          *offset_for_adjustment = std::wstring::npos;
        }
      }
    }
  } else {
    AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string,
                             &new_parsed->username, offset_for_adjustment);
    if (parsed.password.is_valid()) {
      url_string.push_back(':');
    }
    AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string,
                             &new_parsed->password, offset_for_adjustment);
    if (parsed.username.is_valid() || parsed.password.is_valid()) {
      url_string.push_back('@');
    }
  }
  // Everything written so far (scheme, separators, credentials) is the prefix.
  if (prefix_end)
    *prefix_end = static_cast<size_t>(url_string.length());

  AppendFormattedHost(url, languages, &url_string, new_parsed,
                      offset_for_adjustment);

  // Port.
  if (parsed.port.is_nonempty()) {
    url_string.push_back(':');
    new_parsed->port.begin = url_string.length();
    std::copy(spec.begin() + parsed.port.begin,
              spec.begin() + parsed.port.end(), std::back_inserter(url_string));
    new_parsed->port.len = url_string.length() - new_parsed->port.begin;
  } else {
    new_parsed->port.reset();
  }

  // Path and query both get the same general unescape & convert treatment.
  AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string,
                           &new_parsed->path, offset_for_adjustment);
  if (parsed.query.is_valid())
    url_string.push_back('?');
  AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string,
                           &new_parsed->query, offset_for_adjustment);

  // Reference is stored in valid, unescaped UTF-8, so we can just convert.
  if (parsed.ref.is_valid()) {
    url_string.push_back('#');
    new_parsed->ref.begin = url_string.length();
    // Same offset bookkeeping as in AppendFormattedComponent(): compute the
    // offset relative to the current end of output, and track it through the
    // conversion only when it falls inside the ref.
    size_t offset_past_current_output =
        ((*offset_for_adjustment == std::wstring::npos) ||
         (*offset_for_adjustment < url_string.length())) ?
            std::wstring::npos : (*offset_for_adjustment - url_string.length());
    size_t* offset_into_ref =
        (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ?
            NULL : &offset_past_current_output;
    if (parsed.ref.len > 0) {
      url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin,
                                                              parsed.ref.len),
                                                  offset_into_ref));
    }
    new_parsed->ref.len = url_string.length() - new_parsed->ref.begin;
    if (offset_into_ref) {
      *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ?
          std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref);
    } else if (offset_past_current_output != std::wstring::npos) {
      // We clamped the offset near the beginning of this function to ensure it
      // was within the input URL. If we reach here, the input was something
      // invalid and non-parseable such that the offset was past any component
      // we could figure out. In this case it won't be represented in the
      // output string, so reset it.
      *offset_for_adjustment = std::wstring::npos;
    }
  }

  return url_string;
}
1502
// Returns a copy of |url| with its username, password and ref cleared.
// |url| is expected to be valid (checked in debug builds).
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  GURL::Replacements strip_identity_and_ref;
  strip_identity_and_ref.ClearUsername();
  strip_identity_and_ref.ClearPassword();
  strip_identity_and_ref.ClearRef();

  const GURL simplified = url.ReplaceComponents(strip_identity_and_ref);
  return simplified;
}
1511
1512 // Specifies a comma separated list of port numbers that should be accepted
1513 // despite bans. If the string is invalid no allowed ports are stored.
SetExplicitlyAllowedPorts(const std::wstring & allowed_ports)1514 void SetExplicitlyAllowedPorts(const std::wstring& allowed_ports) {
1515 if (allowed_ports.empty())
1516 return;
1517
1518 std::set<int> ports;
1519 size_t last = 0;
1520 size_t size = allowed_ports.size();
1521 // The comma delimiter.
1522 const std::wstring::value_type kComma = L',';
1523
1524 // Overflow is still possible for evil user inputs.
1525 for (size_t i = 0; i <= size; ++i) {
1526 // The string should be composed of only digits and commas.
1527 if (i != size && !IsAsciiDigit(allowed_ports[i]) &&
1528 (allowed_ports[i] != kComma))
1529 return;
1530 if (i == size || allowed_ports[i] == kComma) {
1531 size_t length = i - last;
1532 if (length > 0)
1533 ports.insert(StringToInt(WideToASCII(
1534 allowed_ports.substr(last, length))));
1535 last = i + 1;
1536 }
1537 }
1538 explicitly_allowed_ports = ports;
1539 }
1540
1541 } // namespace net
1542