• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2009 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/net_util.h"
6 
7 #include <algorithm>
8 #include <map>
9 #include <unicode/ucnv.h>
10 #include <unicode/uidna.h>
11 #include <unicode/ulocdata.h>
12 #include <unicode/uniset.h>
13 #include <unicode/uscript.h>
14 #include <unicode/uset.h>
15 
16 #include "build/build_config.h"
17 
18 #if defined(OS_WIN)
19 #include <windows.h>
20 #include <winsock2.h>
21 #include <ws2tcpip.h>
22 #include <wspiapi.h>  // Needed for Win2k compat.
23 #elif defined(OS_POSIX)
24 #include <netdb.h>
25 #include <sys/socket.h>
26 #include <fcntl.h>
27 #endif
28 
29 #include "base/base64.h"
30 #include "base/basictypes.h"
31 #include "base/file_path.h"
32 #include "base/file_util.h"
33 #include "base/i18n/file_util_icu.h"
34 #include "base/i18n/icu_string_conversions.h"
35 #include "base/i18n/time_formatting.h"
36 #include "base/json/string_escape.h"
37 #include "base/lock.h"
38 #include "base/logging.h"
39 #include "base/message_loop.h"
40 #include "base/path_service.h"
41 #include "base/singleton.h"
42 #include "base/stl_util-inl.h"
43 #include "base/string_piece.h"
44 #include "base/string_tokenizer.h"
45 #include "base/string_util.h"
46 #include "base/sys_string_conversions.h"
47 #include "base/time.h"
48 #include "base/utf_offset_string_conversions.h"
49 #include "grit/net_resources.h"
50 #include "googleurl/src/gurl.h"
51 #include "googleurl/src/url_canon.h"
52 #include "googleurl/src/url_parse.h"
53 #include "net/base/escape.h"
54 #include "net/base/net_module.h"
55 #if defined(OS_WIN)
56 #include "net/base/winsock_init.h"
57 #endif
58 #include "unicode/datefmt.h"
59 
60 
61 using base::Time;
62 
63 namespace {
64 
// Prefix prepended to a file path to turn it into a file: URL (see
// FilePathToFileURL()).
static const FilePath::CharType kFileURLPrefix[] =
    FILE_PATH_LITERAL("file:///");
68 
// The general list of blocked ports. A port listed here is blocked unless a
// specific protocol overrides it (e.g. FTP may use the ports listed in
// kAllowedFtpPorts, 21 and 22).
static const int kRestrictedPorts[] = {
  1,    // tcpmux
  7,    // echo
  9,    // discard
  11,   // systat
  13,   // daytime
  15,   // netstat
  17,   // qotd
  19,   // chargen
  20,   // ftp data
  21,   // ftp control
  22,   // ssh
  23,   // telnet
  25,   // smtp
  37,   // time
  42,   // name
  43,   // nicname
  53,   // domain
  77,   // priv-rjs
  79,   // finger
  87,   // ttylink
  95,   // supdup
  101,  // hostname
  102,  // iso-tsap
  103,  // gppitnp
  104,  // acr-nema
  109,  // pop2
  110,  // pop3
  111,  // sunrpc
  113,  // auth
  115,  // sftp
  117,  // uucp-path
  119,  // nntp
  123,  // NTP
  135,  // loc-srv /epmap
  139,  // netbios
  143,  // imap2
  179,  // BGP
  389,  // ldap
  465,  // smtp+ssl
  512,  // print / exec
  513,  // login
  514,  // shell
  515,  // printer
  526,  // tempo
  530,  // courier
  531,  // chat
  532,  // netnews
  540,  // uucp
  556,  // remotefs
  563,  // nntp+ssl
  587,  // smtp message submission
  601,  // syslog-conn (per the IANA registry)
  636,  // ldap+ssl
  993,  // imap+ssl
  995,  // pop3+ssl
  2049, // nfs
  3659, // apple-sasl / PasswordServer
  4045, // lockd
  6000, // X11
};
132 
// FTP overrides the following restricted ports. Both values also appear in
// kRestrictedPorts.
static const int kAllowedFtpPorts[] = {
  21,   // ftp control
  22,   // ssh
};
138 
139 template<typename STR>
GetSpecificHeaderT(const STR & headers,const STR & name)140 STR GetSpecificHeaderT(const STR& headers, const STR& name) {
141   // We want to grab the Value from the "Key: Value" pairs in the headers,
142   // which should look like this (no leading spaces, \n-separated) (we format
143   // them this way in url_request_inet.cc):
144   //    HTTP/1.1 200 OK\n
145   //    ETag: "6d0b8-947-24f35ec0"\n
146   //    Content-Length: 2375\n
147   //    Content-Type: text/html; charset=UTF-8\n
148   //    Last-Modified: Sun, 03 Sep 2006 04:34:43 GMT\n
149   if (headers.empty())
150     return STR();
151 
152   STR match;
153   match.push_back('\n');
154   match.append(name);
155   match.push_back(':');
156 
157   typename STR::const_iterator begin =
158       search(headers.begin(), headers.end(), match.begin(), match.end(),
159              CaseInsensitiveCompareASCII<typename STR::value_type>());
160 
161   if (begin == headers.end())
162     return STR();
163 
164   begin += match.length();
165 
166   typename STR::const_iterator end = find(begin, headers.end(), '\n');
167 
168   STR ret;
169   TrimWhitespace(STR(begin, end), TRIM_ALL, &ret);
170   return ret;
171 }
172 
// TODO(jungshik): We have almost identical hex-decoding code elsewhere.
// Consider refactoring and moving it somewhere(base?). Bug 1224311

// Returns true if |c| is an ASCII hexadecimal digit: 0-9, A-F or a-f.
inline bool IsHexDigit(unsigned char c) {
  if (c >= '0' && c <= '9')
    return true;
  if (c >= 'A' && c <= 'F')
    return true;
  return c >= 'a' && c <= 'f';
}
179 
HexToInt(unsigned char c)180 inline unsigned char HexToInt(unsigned char c) {
181   DCHECK(IsHexDigit(c));
182   static unsigned char kOffset[4] = {0, 0x30u, 0x37u, 0x57u};
183   return c - kOffset[(c >> 5) & 3];
184 }
185 
186 // Similar to Base64Decode. Decodes a Q-encoded string to a sequence
187 // of bytes. If input is invalid, return false.
QPDecode(const std::string & input,std::string * output)188 bool QPDecode(const std::string& input, std::string* output) {
189   std::string temp;
190   temp.reserve(input.size());
191   std::string::const_iterator it = input.begin();
192   while (it != input.end()) {
193     if (*it == '_') {
194       temp.push_back(' ');
195     } else if (*it == '=') {
196       if (input.end() - it < 3) {
197         return false;
198       }
199       if (IsHexDigit(static_cast<unsigned char>(*(it + 1))) &&
200           IsHexDigit(static_cast<unsigned char>(*(it + 2)))) {
201         unsigned char ch = HexToInt(*(it + 1)) * 16 + HexToInt(*(it + 2));
202         temp.push_back(static_cast<char>(ch));
203         ++it;
204         ++it;
205       } else {
206         return false;
207       }
208     } else if (0x20 < *it && *it < 0x7F) {
209       // In a Q-encoded word, only printable ASCII characters
210       // represent themselves. Besides, space, '=', '_' and '?' are
211       // not allowed, but they're already filtered out.
212       DCHECK(*it != 0x3D && *it != 0x5F && *it != 0x3F);
213       temp.push_back(*it);
214     } else {
215       return false;
216     }
217     ++it;
218   }
219   output->swap(temp);
220   return true;
221 }
222 
223 enum RFC2047EncodingType {Q_ENCODING, B_ENCODING};
DecodeBQEncoding(const std::string & part,RFC2047EncodingType enc_type,const std::string & charset,std::string * output)224 bool DecodeBQEncoding(const std::string& part, RFC2047EncodingType enc_type,
225                        const std::string& charset, std::string* output) {
226   std::string decoded;
227   if (enc_type == B_ENCODING) {
228     if (!base::Base64Decode(part, &decoded)) {
229       return false;
230     }
231   } else {
232     if (!QPDecode(part, &decoded)) {
233       return false;
234     }
235   }
236 
237   UErrorCode err = U_ZERO_ERROR;
238   UConverter* converter(ucnv_open(charset.c_str(), &err));
239   if (U_FAILURE(err)) {
240     return false;
241   }
242 
243   // A single byte in a legacy encoding can be expanded to 3 bytes in UTF-8.
244   // A 'two-byte character' in a legacy encoding can be expanded to 4 bytes
245   // in UTF-8. Therefore, the expansion ratio is 3 at most.
246   int length = static_cast<int>(decoded.length());
247   char* buf = WriteInto(output, length * 3);
248   length = ucnv_toAlgorithmic(UCNV_UTF8, converter, buf, length * 3,
249       decoded.data(), length, &err);
250   ucnv_close(converter);
251   if (U_FAILURE(err)) {
252     return false;
253   }
254   output->resize(length);
255   return true;
256 }
257 
DecodeWord(const std::string & encoded_word,const std::string & referrer_charset,bool * is_rfc2047,std::string * output)258 bool DecodeWord(const std::string& encoded_word,
259                 const std::string& referrer_charset,
260                 bool *is_rfc2047,
261                 std::string* output) {
262   if (!IsStringASCII(encoded_word)) {
263     // Try UTF-8, referrer_charset and the native OS default charset in turn.
264     if (IsStringUTF8(encoded_word)) {
265       *output = encoded_word;
266     } else {
267       std::wstring wide_output;
268       if (!referrer_charset.empty() &&
269           base::CodepageToWide(encoded_word, referrer_charset.c_str(),
270                                base::OnStringConversionError::FAIL,
271                                &wide_output)) {
272         *output = WideToUTF8(wide_output);
273       } else {
274         *output = WideToUTF8(base::SysNativeMBToWide(encoded_word));
275       }
276     }
277     *is_rfc2047 = false;
278     return true;
279   }
280 
281   // RFC 2047 : one of encoding methods supported by Firefox and relatively
282   // widely used by web servers.
283   // =?charset?<E>?<encoded string>?= where '<E>' is either 'B' or 'Q'.
284   // We don't care about the length restriction (72 bytes) because
285   // many web servers generate encoded words longer than the limit.
286   std::string tmp;
287   *is_rfc2047 = true;
288   int part_index = 0;
289   std::string charset;
290   StringTokenizer t(encoded_word, "?");
291   RFC2047EncodingType enc_type = Q_ENCODING;
292   while (*is_rfc2047 && t.GetNext()) {
293     std::string part = t.token();
294     switch (part_index) {
295       case 0:
296         if (part != "=") {
297           *is_rfc2047 = false;
298           break;
299         }
300         ++part_index;
301         break;
302       case 1:
303         // Do we need charset validity check here?
304         charset = part;
305         ++part_index;
306         break;
307       case 2:
308         if (part.size() > 1 ||
309             part.find_first_of("bBqQ") == std::string::npos) {
310           *is_rfc2047 = false;
311           break;
312         }
313         if (part[0] == 'b' || part[0] == 'B') {
314           enc_type = B_ENCODING;
315         }
316         ++part_index;
317         break;
318       case 3:
319         *is_rfc2047 = DecodeBQEncoding(part, enc_type, charset, &tmp);
320         if (!*is_rfc2047) {
321           // Last minute failure. Invalid B/Q encoding. Rather than
322           // passing it through, return now.
323           return false;
324         }
325         ++part_index;
326         break;
327       case 4:
328         if (part != "=") {
329           // Another last minute failure !
330           // Likely to be a case of two encoded-words in a row or
331           // an encoded word followed by a non-encoded word. We can be
332           // generous, but it does not help much in terms of compatibility,
333           // I believe. Return immediately.
334           *is_rfc2047 = false;
335           return false;
336         }
337         ++part_index;
338         break;
339       default:
340         *is_rfc2047 = false;
341         return false;
342     }
343   }
344 
345   if (*is_rfc2047) {
346     if (*(encoded_word.end() - 1) == '=') {
347       output->swap(tmp);
348       return true;
349     }
350     // encoded_word ending prematurelly with '?' or extra '?'
351     *is_rfc2047 = false;
352     return false;
353   }
354 
355   // We're not handling 'especial' characters quoted with '\', but
356   // it should be Ok because we're not an email client but a
357   // web browser.
358 
359   // What IE6/7 does: %-escaped UTF-8. We could extend this to
360   // support a rudimentary form of RFC 2231 with charset label, but
361   // it'd gain us little in terms of compatibility.
362   tmp = UnescapeURLComponent(encoded_word, UnescapeRule::SPACES);
363   if (IsStringUTF8(tmp)) {
364     output->swap(tmp);
365     return true;
366     // We can try either the OS default charset or 'origin charset' here,
367     // As far as I can tell, IE does not support it. However, I've seen
368     // web servers emit %-escaped string in a legacy encoding (usually
369     // origin charset).
370     // TODO(jungshik) : Test IE further and consider adding a fallback here.
371   }
372   return false;
373 }
374 
DecodeParamValue(const std::string & input,const std::string & referrer_charset,std::string * output)375 bool DecodeParamValue(const std::string& input,
376                       const std::string& referrer_charset,
377                       std::string* output) {
378   std::string tmp;
379   // Tokenize with whitespace characters.
380   StringTokenizer t(input, " \t\n\r");
381   t.set_options(StringTokenizer::RETURN_DELIMS);
382   bool is_previous_token_rfc2047 = true;
383   while (t.GetNext()) {
384     if (t.token_is_delim()) {
385       // If the previous non-delimeter token is not RFC2047-encoded,
386       // put in a space in its place. Otheriwse, skip over it.
387       if (!is_previous_token_rfc2047) {
388         tmp.push_back(' ');
389       }
390       continue;
391     }
392     // We don't support a single multibyte character split into
393     // adjacent encoded words. Some broken mail clients emit headers
394     // with that problem, but most web servers usually encode a filename
395     // in a single encoded-word. Firefox/Thunderbird do not support
396     // it, either.
397     std::string decoded;
398     if (!DecodeWord(t.token(), referrer_charset, &is_previous_token_rfc2047,
399                     &decoded))
400       return false;
401     tmp.append(decoded);
402   }
403   output->swap(tmp);
404   return true;
405 }
406 
407 // TODO(mpcomplete): This is a quick and dirty implementation for now.  I'm
408 // sure this doesn't properly handle all (most?) cases.
409 template<typename STR>
GetHeaderParamValueT(const STR & header,const STR & param_name)410 STR GetHeaderParamValueT(const STR& header, const STR& param_name) {
411   // This assumes args are formatted exactly like "bla; arg1=value; arg2=value".
412   typename STR::const_iterator param_begin =
413       search(header.begin(), header.end(), param_name.begin(), param_name.end(),
414              CaseInsensitiveCompareASCII<typename STR::value_type>());
415 
416   if (param_begin == header.end())
417     return STR();
418   param_begin += param_name.length();
419 
420   STR whitespace;
421   whitespace.push_back(' ');
422   whitespace.push_back('\t');
423   const typename STR::size_type equals_offset =
424       header.find_first_not_of(whitespace, param_begin - header.begin());
425   if (equals_offset == STR::npos || header.at(equals_offset) != '=')
426     return STR();
427 
428   param_begin = header.begin() + equals_offset + 1;
429   if (param_begin == header.end())
430     return STR();
431 
432   typename STR::const_iterator param_end;
433   if (*param_begin == '"') {
434     param_end = find(param_begin+1, header.end(), '"');
435     if (param_end == header.end())
436       return STR();  // poorly formatted param?
437 
438     ++param_begin;  // skip past the quote.
439   } else {
440     param_end = find(param_begin+1, header.end(), ';');
441   }
442 
443   return STR(param_begin, param_end);
444 }
445 
446 // Does some simple normalization of scripts so we can allow certain scripts
447 // to exist together.
448 // TODO(brettw) bug 880223: we should allow some other languages to be
449 // oombined such as Chinese and Latin. We will probably need a more
450 // complicated system of language pairs to have more fine-grained control.
NormalizeScript(UScriptCode code)451 UScriptCode NormalizeScript(UScriptCode code) {
452   switch (code) {
453     case USCRIPT_KATAKANA:
454     case USCRIPT_HIRAGANA:
455     case USCRIPT_KATAKANA_OR_HIRAGANA:
456     case USCRIPT_HANGUL:  // This one is arguable.
457       return USCRIPT_HAN;
458     default:
459       return code;
460   }
461 }
462 
IsIDNComponentInSingleScript(const char16 * str,int str_len)463 bool IsIDNComponentInSingleScript(const char16* str, int str_len) {
464   UScriptCode first_script = USCRIPT_INVALID_CODE;
465   bool is_first = true;
466 
467   int i = 0;
468   while (i < str_len) {
469     unsigned code_point;
470     U16_NEXT(str, i, str_len, code_point);
471 
472     UErrorCode err = U_ZERO_ERROR;
473     UScriptCode cur_script = uscript_getScript(code_point, &err);
474     if (err != U_ZERO_ERROR)
475       return false;  // Report mixed on error.
476     cur_script = NormalizeScript(cur_script);
477 
478     // TODO(brettw) We may have to check for USCRIPT_INHERENT as well.
479     if (is_first && cur_script != USCRIPT_COMMON) {
480       first_script = cur_script;
481       is_first = false;
482     } else {
483       if (cur_script != USCRIPT_COMMON && cur_script != first_script)
484         return false;
485     }
486   }
487   return true;
488 }
489 
// Check if the script of a language can be 'safely' mixed with
// Latin letters in the ASCII range. |lang| is a language code; only its
// first two characters are examined.
bool IsCompatibleWithASCIILetters(const std::string& lang) {
  // For now, just list Chinese, Japanese and Korean (positive list).
  // An alternative is negative-listing (languages using Greek and
  // Cyrillic letters), but it can be more dangerous.
  const std::string prefix(lang, 0, 2);
  return prefix == "zh" || prefix == "ja" || prefix == "ko";
}
500 
// Maps a language code to the character set cached for that language.
typedef std::map<std::string, icu::UnicodeSet*> LangToExemplarSetMap;

// Process-wide cache of per-language character sets, reached through
// Singleton<LangToExemplarSet>. Everything is private: the cache is only
// read and written by the friend functions GetExemplarSetForLang() and
// SetExemplarSetForLang().
class LangToExemplarSet {
 private:
  // The cached sets, keyed by language. The mapped pointers are released via
  // STLDeleteContainerPairSecondPointers() in the destructor.
  LangToExemplarSetMap map;
  LangToExemplarSet() { }
  ~LangToExemplarSet() {
    STLDeleteContainerPairSecondPointers(map.begin(), map.end());
  }

  // The singleton machinery needs access to the private constructor and
  // destructor.
  friend class Singleton<LangToExemplarSet>;
  friend struct DefaultSingletonTraits<LangToExemplarSet>;
  // Accessors that operate directly on |map|.
  friend bool GetExemplarSetForLang(const std::string&, icu::UnicodeSet**);
  friend void SetExemplarSetForLang(const std::string&, icu::UnicodeSet*);

  DISALLOW_COPY_AND_ASSIGN(LangToExemplarSet);
};
518 
GetExemplarSetForLang(const std::string & lang,icu::UnicodeSet ** lang_set)519 bool GetExemplarSetForLang(const std::string& lang,
520                            icu::UnicodeSet** lang_set) {
521   const LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map;
522   LangToExemplarSetMap::const_iterator pos = map.find(lang);
523   if (pos != map.end()) {
524     *lang_set = pos->second;
525     return true;
526   }
527   return false;
528 }
529 
SetExemplarSetForLang(const std::string & lang,icu::UnicodeSet * lang_set)530 void SetExemplarSetForLang(const std::string& lang,
531                            icu::UnicodeSet* lang_set) {
532   LangToExemplarSetMap& map = Singleton<LangToExemplarSet>()->map;
533   map.insert(std::make_pair(lang, lang_set));
534 }
535 
536 static Lock lang_set_lock;
537 
538 // Returns true if all the characters in component_characters are used by
539 // the language |lang|.
IsComponentCoveredByLang(const icu::UnicodeSet & component_characters,const std::string & lang)540 bool IsComponentCoveredByLang(const icu::UnicodeSet& component_characters,
541                               const std::string& lang) {
542   static const icu::UnicodeSet kASCIILetters(0x61, 0x7a);  // [a-z]
543   icu::UnicodeSet* lang_set;
544   // We're called from both the UI thread and the history thread.
545   {
546     AutoLock lock(lang_set_lock);
547     if (!GetExemplarSetForLang(lang, &lang_set)) {
548       UErrorCode status = U_ZERO_ERROR;
549       ULocaleData* uld = ulocdata_open(lang.c_str(), &status);
550       // TODO(jungshik) Turn this check on when the ICU data file is
551       // rebuilt with the minimal subset of locale data for languages
552       // to which Chrome is not localized but which we offer in the list
553       // of languages selectable for Accept-Languages. With the rebuilt ICU
554       // data, ulocdata_open never should fall back to the default locale.
555       // (issue 2078)
556       // DCHECK(U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING);
557       if (U_SUCCESS(status) && status != U_USING_DEFAULT_WARNING) {
558         lang_set = reinterpret_cast<icu::UnicodeSet *>(
559             ulocdata_getExemplarSet(uld, NULL, 0,
560                                     ULOCDATA_ES_STANDARD, &status));
561         // If |lang| is compatible with ASCII Latin letters, add them.
562         if (IsCompatibleWithASCIILetters(lang))
563           lang_set->addAll(kASCIILetters);
564       } else {
565         lang_set = new icu::UnicodeSet(1, 0);
566       }
567       lang_set->freeze();
568       SetExemplarSetForLang(lang, lang_set);
569       ulocdata_close(uld);
570     }
571   }
572   return !lang_set->isEmpty() && lang_set->containsAll(component_characters);
573 }
574 
575 // Returns true if the given Unicode host component is safe to display to the
576 // user.
IsIDNComponentSafe(const char16 * str,int str_len,const std::wstring & languages)577 bool IsIDNComponentSafe(const char16* str,
578                         int str_len,
579                         const std::wstring& languages) {
580   // Most common cases (non-IDN) do not reach here so that we don't
581   // need a fast return path.
582   // TODO(jungshik) : Check if there's any character inappropriate
583   // (although allowed) for domain names.
584   // See http://www.unicode.org/reports/tr39/#IDN_Security_Profiles and
585   // http://www.unicode.org/reports/tr39/data/xidmodifications.txt
586   // For now, we borrow the list from Mozilla and tweaked it slightly.
587   // (e.g. Characters like U+00A0, U+3000, U+3002 are omitted because
588   //  they're gonna be canonicalized to U+0020 and full stop before
589   //  reaching here.)
590   // The original list is available at
591   // http://kb.mozillazine.org/Network.IDN.blacklist_chars and
592   // at http://mxr.mozilla.org/seamonkey/source/modules/libpref/src/init/all.js#703
593 
594   UErrorCode status = U_ZERO_ERROR;
595 #ifdef U_WCHAR_IS_UTF16
596   icu::UnicodeSet dangerous_characters(icu::UnicodeString(
597       L"[[\\ \u00bc\u00bd\u01c3\u0337\u0338"
598       L"\u05c3\u05f4\u06d4\u0702\u115f\u1160][\u2000-\u200b]"
599       L"[\u2024\u2027\u2028\u2029\u2039\u203a\u2044\u205f]"
600       L"[\u2154-\u2156][\u2159-\u215b][\u215f\u2215\u23ae"
601       L"\u29f6\u29f8\u2afb\u2afd][\u2ff0-\u2ffb][\u3014"
602       L"\u3015\u3033\u3164\u321d\u321e\u33ae\u33af\u33c6\u33df\ufe14"
603       L"\ufe15\ufe3f\ufe5d\ufe5e\ufeff\uff0e\uff06\uff61\uffa0\ufff9]"
604       L"[\ufffa-\ufffd]]"), status);
605 #else
606   icu::UnicodeSet dangerous_characters(icu::UnicodeString(
607       "[[\\u0020\\u00bc\\u00bd\\u01c3\\u0337\\u0338"
608       "\\u05c3\\u05f4\\u06d4\\u0702\\u115f\\u1160][\\u2000-\\u200b]"
609       "[\\u2024\\u2027\\u2028\\u2029\\u2039\\u203a\\u2044\\u205f]"
610       "[\\u2154-\\u2156][\\u2159-\\u215b][\\u215f\\u2215\\u23ae"
611       "\\u29f6\\u29f8\\u2afb\\u2afd][\\u2ff0-\\u2ffb][\\u3014"
612       "\\u3015\\u3033\\u3164\\u321d\\u321e\\u33ae\\u33af\\u33c6\\u33df\\ufe14"
613       "\\ufe15\\ufe3f\\ufe5d\\ufe5e\\ufeff\\uff0e\\uff06\\uff61\\uffa0\\ufff9]"
614       "[\\ufffa-\\ufffd]]", -1, US_INV), status);
615 #endif
616   DCHECK(U_SUCCESS(status));
617   icu::UnicodeSet component_characters;
618   component_characters.addAll(icu::UnicodeString(str, str_len));
619   if (dangerous_characters.containsSome(component_characters))
620     return false;
621 
622   // If the language list is empty, the result is completely determined
623   // by whether a component is a single script or not. This will block
624   // even "safe" script mixing cases like <Chinese, Latin-ASCII> that are
625   // allowed with |languages| (while it blocks Chinese + Latin letters with
626   // an accent as should be the case), but we want to err on the safe side
627   // when |languages| is empty.
628   if (languages.empty())
629     return IsIDNComponentInSingleScript(str, str_len);
630 
631   // |common_characters| is made up of  ASCII numbers, hyphen, plus and
632   // underscore that are used across scripts and allowed in domain names.
633   // (sync'd with characters allowed in url_canon_host with square
634   // brackets excluded.) See kHostCharLookup[] array in url_canon_host.cc.
635   icu::UnicodeSet common_characters(UNICODE_STRING_SIMPLE("[[0-9]\\-_+\\ ]"),
636                                     status);
637   DCHECK(U_SUCCESS(status));
638   // Subtract common characters because they're always allowed so that
639   // we just have to check if a language-specific set contains
640   // the remainder.
641   component_characters.removeAll(common_characters);
642 
643   std::string languages_list(WideToASCII(languages));
644   StringTokenizer t(languages_list, ",");
645   while (t.GetNext()) {
646     if (IsComponentCoveredByLang(component_characters, t.token()))
647       return true;
648   }
649   return false;
650 }
651 
652 // Converts one component of a host (between dots) to IDN if safe. The result
653 // will be APPENDED to the given output string and will be the same as the input
654 // if it is not IDN or the IDN is unsafe to display.  Returns whether any
655 // conversion was performed.
IDNToUnicodeOneComponent(const char16 * comp,size_t comp_len,const std::wstring & languages,string16 * out)656 bool IDNToUnicodeOneComponent(const char16* comp,
657                               size_t comp_len,
658                               const std::wstring& languages,
659                               string16* out) {
660   DCHECK(out);
661   if (comp_len == 0)
662     return false;
663 
664   // Only transform if the input can be an IDN component.
665   static const char16 kIdnPrefix[] = {'x', 'n', '-', '-'};
666   if ((comp_len > arraysize(kIdnPrefix)) &&
667       !memcmp(comp, kIdnPrefix, arraysize(kIdnPrefix) * sizeof(char16))) {
668     // Repeatedly expand the output string until it's big enough.  It looks like
669     // ICU will return the required size of the buffer, but that's not
670     // documented, so we'll just grow by 2x. This should be rare and is not on a
671     // critical path.
672     size_t original_length = out->length();
673     for (int extra_space = 64; ; extra_space *= 2) {
674       UErrorCode status = U_ZERO_ERROR;
675       out->resize(out->length() + extra_space);
676       int output_chars = uidna_IDNToUnicode(comp,
677           static_cast<int32_t>(comp_len), &(*out)[original_length], extra_space,
678           UIDNA_DEFAULT, NULL, &status);
679       if (status == U_ZERO_ERROR) {
680         // Converted successfully.
681         out->resize(original_length + output_chars);
682         if (IsIDNComponentSafe(out->data() + original_length, output_chars,
683                                languages))
684           return true;
685       }
686 
687       if (status != U_BUFFER_OVERFLOW_ERROR)
688         break;
689     }
690     // Failed, revert back to original string.
691     out->resize(original_length);
692   }
693 
694   // We get here with no IDN or on error, in which case we just append the
695   // literal input.
696   out->append(comp, comp_len);
697   return false;
698 }
699 
700 // Helper for FormatUrl().
FormatViewSourceUrl(const GURL & url,const std::wstring & languages,bool omit_username_password,UnescapeRule::Type unescape_rules,url_parse::Parsed * new_parsed,size_t * prefix_end,size_t * offset_for_adjustment)701 std::wstring FormatViewSourceUrl(const GURL& url,
702                                  const std::wstring& languages,
703                                  bool omit_username_password,
704                                  UnescapeRule::Type unescape_rules,
705                                  url_parse::Parsed* new_parsed,
706                                  size_t* prefix_end,
707                                  size_t* offset_for_adjustment) {
708   DCHECK(new_parsed);
709   const wchar_t* const kWideViewSource = L"view-source:";
710   const size_t kViewSourceLengthPlus1 = 12;
711 
712   GURL real_url(url.possibly_invalid_spec().substr(kViewSourceLengthPlus1));
713   size_t temp_offset = (*offset_for_adjustment == std::wstring::npos) ?
714       std::wstring::npos : (*offset_for_adjustment - kViewSourceLengthPlus1);
715   size_t* temp_offset_ptr = (*offset_for_adjustment < kViewSourceLengthPlus1) ?
716       NULL : &temp_offset;
717   std::wstring result = net::FormatUrl(real_url, languages,
718       omit_username_password, unescape_rules, new_parsed, prefix_end,
719       temp_offset_ptr);
720   result.insert(0, kWideViewSource);
721 
722   // Adjust position values.
723   if (new_parsed->scheme.is_nonempty()) {
724     // Assume "view-source:real-scheme" as a scheme.
725     new_parsed->scheme.len += kViewSourceLengthPlus1;
726   } else {
727     new_parsed->scheme.begin = 0;
728     new_parsed->scheme.len = kViewSourceLengthPlus1 - 1;
729   }
730   if (new_parsed->username.is_nonempty())
731     new_parsed->username.begin += kViewSourceLengthPlus1;
732   if (new_parsed->password.is_nonempty())
733     new_parsed->password.begin += kViewSourceLengthPlus1;
734   if (new_parsed->host.is_nonempty())
735     new_parsed->host.begin += kViewSourceLengthPlus1;
736   if (new_parsed->port.is_nonempty())
737     new_parsed->port.begin += kViewSourceLengthPlus1;
738   if (new_parsed->path.is_nonempty())
739     new_parsed->path.begin += kViewSourceLengthPlus1;
740   if (new_parsed->query.is_nonempty())
741     new_parsed->query.begin += kViewSourceLengthPlus1;
742   if (new_parsed->ref.is_nonempty())
743     new_parsed->ref.begin += kViewSourceLengthPlus1;
744   if (prefix_end)
745     *prefix_end += kViewSourceLengthPlus1;
746   if (temp_offset_ptr) {
747     *offset_for_adjustment = (temp_offset == std::wstring::npos) ?
748         std::wstring::npos : (temp_offset + kViewSourceLengthPlus1);
749   }
750   return result;
751 }
752 
753 }  // namespace
754 
755 namespace net {
756 
// Set of port numbers with external linkage in namespace net.
// NOTE(review): presumably the ports exempted from kRestrictedPorts; confirm
// against the code that consults it.
std::set<int> explicitly_allowed_ports;
758 
// Forward declaration. Appends the substring |in_component| inside of the URL
// |spec| to |output|, and the resulting range will be filled into
// |out_component|. |unescape_rules| defines how to clean the URL for human
// readability.
//
// |offset_for_adjustment| is an offset into |output| which will be adjusted
// based on how it maps to the component being converted:
//   - if it is less than output->length(), it will be untouched;
//   - if it is greater than output->length() + in_component.len, it will be
//     shortened by the difference in lengths between the input and output
//     components;
//   - otherwise it points into the component being converted, and is adjusted
//     to point to the same logical place in |output|.
// |offset_for_adjustment| may not be NULL.
static void AppendFormattedComponent(const std::string& spec,
                                     const url_parse::Component& in_component,
                                     UnescapeRule::Type unescape_rules,
                                     std::wstring* output,
                                     url_parse::Component* out_component,
                                     size_t* offset_for_adjustment);
775 
FilePathToFileURL(const FilePath & path)776 GURL FilePathToFileURL(const FilePath& path) {
777   // Produce a URL like "file:///C:/foo" for a regular file, or
778   // "file://///server/path" for UNC. The URL canonicalizer will fix up the
779   // latter case to be the canonical UNC form: "file://server/path"
780   FilePath::StringType url_string(kFileURLPrefix);
781   url_string.append(path.value());
782 
783   // Now do replacement of some characters. Since we assume the input is a
784   // literal filename, anything the URL parser might consider special should
785   // be escaped here.
786 
787   // must be the first substitution since others will introduce percents as the
788   // escape character
789   ReplaceSubstringsAfterOffset(&url_string, 0,
790       FILE_PATH_LITERAL("%"), FILE_PATH_LITERAL("%25"));
791 
792   // semicolon is supposed to be some kind of separator according to RFC 2396
793   ReplaceSubstringsAfterOffset(&url_string, 0,
794       FILE_PATH_LITERAL(";"), FILE_PATH_LITERAL("%3B"));
795 
796   ReplaceSubstringsAfterOffset(&url_string, 0,
797       FILE_PATH_LITERAL("#"), FILE_PATH_LITERAL("%23"));
798 
799 #if defined(OS_POSIX)
800   ReplaceSubstringsAfterOffset(&url_string, 0,
801       FILE_PATH_LITERAL("\\"), FILE_PATH_LITERAL("%5C"));
802 #endif
803 
804   return GURL(url_string);
805 }
806 
GetSpecificHeader(const std::wstring & headers,const std::wstring & name)807 std::wstring GetSpecificHeader(const std::wstring& headers,
808                                const std::wstring& name) {
809   return GetSpecificHeaderT(headers, name);
810 }
811 
GetSpecificHeader(const std::string & headers,const std::string & name)812 std::string GetSpecificHeader(const std::string& headers,
813                                const std::string& name) {
814   return GetSpecificHeaderT(headers, name);
815 }
816 
GetFileNameFromCD(const std::string & header,const std::string & referrer_charset)817 std::string GetFileNameFromCD(const std::string& header,
818                               const std::string& referrer_charset) {
819   std::string param_value = GetHeaderParamValue(header, "filename");
820   if (param_value.empty()) {
821     // Some servers use 'name' parameter.
822     param_value = GetHeaderParamValue(header, "name");
823   }
824   if (param_value.empty())
825     return std::string();
826   std::string decoded;
827   if (DecodeParamValue(param_value, referrer_charset, &decoded))
828     return decoded;
829   return std::string();
830 }
831 
GetHeaderParamValue(const std::wstring & field,const std::wstring & param_name)832 std::wstring GetHeaderParamValue(const std::wstring& field,
833                                  const std::wstring& param_name) {
834   return GetHeaderParamValueT(field, param_name);
835 }
836 
GetHeaderParamValue(const std::string & field,const std::string & param_name)837 std::string GetHeaderParamValue(const std::string& field,
838                                 const std::string& param_name) {
839   return GetHeaderParamValueT(field, param_name);
840 }
841 
842 // TODO(brettw) bug 734373: check the scripts for each host component and
843 // don't un-IDN-ize if there is more than one. Alternatively, only IDN for
844 // scripts that the user has installed. For now, just put the entire
845 // path through IDN. Maybe this feature can be implemented in ICU itself?
846 //
847 // We may want to skip this step in the case of file URLs to allow unicode
848 // UNC hostnames regardless of encodings.
IDNToUnicode(const char * host,size_t host_len,const std::wstring & languages,size_t * offset_for_adjustment)849 std::wstring IDNToUnicode(const char* host,
850                           size_t host_len,
851                           const std::wstring& languages,
852                           size_t* offset_for_adjustment) {
853   // Convert the ASCII input to a wide string for ICU.
854   string16 input16;
855   input16.reserve(host_len);
856   std::copy(host, host + host_len, std::back_inserter(input16));
857 
858   string16 out16;
859   size_t output_offset = offset_for_adjustment ?
860       *offset_for_adjustment : std::wstring::npos;
861 
862   // Do each component of the host separately, since we enforce script matching
863   // on a per-component basis.
864   for (size_t component_start = 0, component_end;
865        component_start < input16.length();
866        component_start = component_end + 1) {
867     // Find the end of the component.
868     component_end = input16.find('.', component_start);
869     if (component_end == string16::npos)
870       component_end = input16.length();  // For getting the last component.
871     size_t component_length = component_end - component_start;
872 
873     size_t output_component_start = out16.length();
874     bool converted_idn = false;
875     if (component_end > component_start) {
876       // Add the substring that we just found.
877       converted_idn = IDNToUnicodeOneComponent(input16.data() + component_start,
878           component_length, languages, &out16);
879     }
880     size_t output_component_length = out16.length() - output_component_start;
881 
882     if ((output_offset != std::wstring::npos) &&
883         (*offset_for_adjustment > component_start)) {
884       if ((*offset_for_adjustment < component_end) && converted_idn)
885         output_offset = std::wstring::npos;
886       else
887         output_offset += output_component_length - component_length;
888     }
889 
890     // Need to add the dot we just found (if we found one).
891     if (component_end < input16.length())
892       out16.push_back('.');
893   }
894 
895   if (offset_for_adjustment)
896     *offset_for_adjustment = output_offset;
897 
898   return UTF16ToWideAndAdjustOffset(out16, offset_for_adjustment);
899 }
900 
CanonicalizeHost(const std::string & host,url_canon::CanonHostInfo * host_info)901 std::string CanonicalizeHost(const std::string& host,
902                              url_canon::CanonHostInfo* host_info) {
903   // Try to canonicalize the host.
904   const url_parse::Component raw_host_component(
905       0, static_cast<int>(host.length()));
906   std::string canon_host;
907   url_canon::StdStringCanonOutput canon_host_output(&canon_host);
908   url_canon::CanonicalizeHostVerbose(host.c_str(), raw_host_component,
909                                      &canon_host_output, host_info);
910 
911   if (host_info->out_host.is_nonempty() &&
912       host_info->family != url_canon::CanonHostInfo::BROKEN) {
913     // Success!  Assert that there's no extra garbage.
914     canon_host_output.Complete();
915     DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
916   } else {
917     // Empty host, or canonicalization failed.  We'll return empty.
918     canon_host.clear();
919   }
920 
921   return canon_host;
922 }
923 
CanonicalizeHost(const std::wstring & host,url_canon::CanonHostInfo * host_info)924 std::string CanonicalizeHost(const std::wstring& host,
925                              url_canon::CanonHostInfo* host_info) {
926   std::string converted_host;
927   WideToUTF8(host.c_str(), host.length(), &converted_host);
928   return CanonicalizeHost(converted_host, host_info);
929 }
930 
GetDirectoryListingHeader(const string16 & title)931 std::string GetDirectoryListingHeader(const string16& title) {
932   static const base::StringPiece header(
933       NetModule::GetResource(IDR_DIR_HEADER_HTML));
934   // This can be null in unit tests.
935   DLOG_IF(WARNING, header.empty()) <<
936       "Missing resource: directory listing header";
937 
938   std::string result;
939   if (!header.empty())
940     result.assign(header.data(), header.size());
941 
942   result.append("<script>start(");
943   base::JsonDoubleQuote(title, true, &result);
944   result.append(");</script>\n");
945 
946   return result;
947 }
948 
// True for 'a'..'z' only. Uppercase letters are deliberately not accepted:
// the host is expected to have been lowercased already.
inline bool IsHostCharAlpha(char c) {
  return ('a' <= c) && (c <= 'z');
}
954 
// True for the ASCII digits '0'..'9'.
inline bool IsHostCharDigit(char c) {
  return ('0' <= c) && (c <= '9');
}
958 
// Checks an already-canonicalized |host| against these rules:
//   * the host is not empty;
//   * every dot-separated component starts with a lowercase letter or digit;
//   * the rest of a component is lowercase letters, digits, '-' or '_';
//   * a component is not immediately followed by '.' after '-' or '_';
//   * the last component that was started begins with a letter.
bool IsCanonicalizedHostCompliant(const std::string& host) {
  if (host.empty())
    return false;

  bool at_component_start = true;
  bool component_began_with_letter = false;
  bool previous_was_hyphen_or_underscore = false;

  for (size_t pos = 0; pos < host.length(); ++pos) {
    const char c = host[pos];
    // Lowercase only: uppercase has already been normalized away.
    const bool is_letter = (c >= 'a') && (c <= 'z');
    const bool is_digit = (c >= '0') && (c <= '9');

    if (at_component_start) {
      if (!is_letter && !is_digit)
        return false;
      component_began_with_letter = is_letter;
      at_component_start = false;
      continue;
    }

    if (c == '.') {
      if (previous_was_hyphen_or_underscore)
        return false;
      at_component_start = true;
    } else if (is_letter || is_digit) {
      previous_was_hyphen_or_underscore = false;
    } else if ((c == '-') || (c == '_')) {
      previous_was_hyphen_or_underscore = true;
    } else {
      return false;
    }
  }

  return component_began_with_letter;
}
991 
GetDirectoryListingEntry(const string16 & name,const std::string & raw_bytes,bool is_dir,int64 size,Time modified)992 std::string GetDirectoryListingEntry(const string16& name,
993                                      const std::string& raw_bytes,
994                                      bool is_dir,
995                                      int64 size,
996                                      Time modified) {
997   std::string result;
998   result.append("<script>addRow(");
999   base::JsonDoubleQuote(name, true, &result);
1000   result.append(",");
1001   if (raw_bytes.empty()) {
1002     base::JsonDoubleQuote(EscapePath(UTF16ToUTF8(name)),
1003                                    true, &result);
1004   } else {
1005     base::JsonDoubleQuote(EscapePath(raw_bytes), true, &result);
1006   }
1007   if (is_dir) {
1008     result.append(",1,");
1009   } else {
1010     result.append(",0,");
1011   }
1012 
1013   base::JsonDoubleQuote(
1014       WideToUTF16Hack(FormatBytes(size, GetByteDisplayUnits(size), true)), true,
1015       &result);
1016 
1017   result.append(",");
1018 
1019   string16 modified_str;
1020   // |modified| can be NULL in FTP listings.
1021   if (!modified.is_null()) {
1022     modified_str = WideToUTF16Hack(base::TimeFormatShortDateAndTime(modified));
1023   }
1024   base::JsonDoubleQuote(modified_str, true, &result);
1025 
1026   result.append(");</script>\n");
1027 
1028   return result;
1029 }
1030 
// Returns |text| without a leading "www." (exact, case-sensitive match);
// any other input is returned unchanged.
std::wstring StripWWW(const std::wstring& text) {
  const std::wstring prefix(L"www.");
  if (text.compare(0, prefix.length(), prefix) != 0)
    return text;
  return text.substr(prefix.length());
}
1036 
GetSuggestedFilename(const GURL & url,const std::string & content_disposition,const std::string & referrer_charset,const FilePath & default_name)1037 FilePath GetSuggestedFilename(const GURL& url,
1038                               const std::string& content_disposition,
1039                               const std::string& referrer_charset,
1040                               const FilePath& default_name) {
1041   // We don't translate this fallback string, "download". If localization is
1042   // needed, the caller should provide localized fallback default_name.
1043   static const FilePath::CharType kFinalFallbackName[] =
1044       FILE_PATH_LITERAL("download");
1045 
1046   // about: and data: URLs don't have file names, but esp. data: URLs may
1047   // contain parts that look like ones (i.e., contain a slash).
1048   // Therefore we don't attempt to divine a file name out of them.
1049   if (url.SchemeIs("about") || url.SchemeIs("data")) {
1050     return default_name.empty() ? FilePath(kFinalFallbackName) : default_name;
1051   }
1052 
1053   const std::string filename_from_cd = GetFileNameFromCD(content_disposition,
1054                                                           referrer_charset);
1055 #if defined(OS_WIN)
1056   FilePath::StringType filename = UTF8ToWide(filename_from_cd);
1057 #elif defined(OS_POSIX)
1058   FilePath::StringType filename = filename_from_cd;
1059 #endif
1060 
1061   if (!filename.empty()) {
1062     // Remove any path information the server may have sent, take the name
1063     // only.
1064     filename = FilePath(filename).BaseName().value();
1065 
1066     // Next, remove "." from the beginning and end of the file name to avoid
1067     // tricks with hidden files, "..", and "."
1068     TrimString(filename, FILE_PATH_LITERAL("."), &filename);
1069   }
1070   if (filename.empty()) {
1071     if (url.is_valid()) {
1072       const std::string unescaped_url_filename = UnescapeURLComponent(
1073           url.ExtractFileName(),
1074           UnescapeRule::SPACES | UnescapeRule::URL_SPECIAL_CHARS);
1075 #if defined(OS_WIN)
1076       filename = UTF8ToWide(unescaped_url_filename);
1077 #elif defined(OS_POSIX)
1078       filename = unescaped_url_filename;
1079 #endif
1080     }
1081   }
1082 
1083   // Trim '.' once more.
1084   TrimString(filename, FILE_PATH_LITERAL("."), &filename);
1085 
1086   // If there's no filename or it gets trimed to be empty, use
1087   // the URL hostname or default_name
1088   if (filename.empty()) {
1089     if (!default_name.empty()) {
1090       filename = default_name.value();
1091     } else if (url.is_valid()) {
1092       // Some schemes (e.g. file) do not have a hostname. Even though it's
1093       // not likely to reach here, let's hardcode the last fallback name.
1094       // TODO(jungshik) : Decode a 'punycoded' IDN hostname. (bug 1264451)
1095       filename = url.host().empty() ? kFinalFallbackName :
1096 #if defined(OS_WIN)
1097           UTF8ToWide(url.host());
1098 #elif defined(OS_POSIX)
1099           url.host();
1100 #endif
1101     } else {
1102       NOTREACHED();
1103     }
1104   }
1105 
1106   file_util::ReplaceIllegalCharactersInPath(&filename, '-');
1107   return FilePath(filename);
1108 }
1109 
IsPortAllowedByDefault(int port)1110 bool IsPortAllowedByDefault(int port) {
1111   int array_size = arraysize(kRestrictedPorts);
1112   for (int i = 0; i < array_size; i++) {
1113     if (kRestrictedPorts[i] == port) {
1114       return false;
1115     }
1116   }
1117   return true;
1118 }
1119 
IsPortAllowedByFtp(int port)1120 bool IsPortAllowedByFtp(int port) {
1121   int array_size = arraysize(kAllowedFtpPorts);
1122   for (int i = 0; i < array_size; i++) {
1123     if (kAllowedFtpPorts[i] == port) {
1124         return true;
1125     }
1126   }
1127   // Port not explicitly allowed by FTP, so return the default restrictions.
1128   return IsPortAllowedByDefault(port);
1129 }
1130 
IsPortAllowedByOverride(int port)1131 bool IsPortAllowedByOverride(int port) {
1132   if (explicitly_allowed_ports.empty())
1133     return false;
1134 
1135   std::set<int>::const_iterator it =
1136       std::find(explicitly_allowed_ports.begin(),
1137                 explicitly_allowed_ports.end(),
1138                 port);
1139 
1140   return it != explicitly_allowed_ports.end();
1141 }
1142 
// Switches |fd| to non-blocking mode and returns the result of the
// underlying system call (ioctlsocket() on Windows, fcntl() on POSIX).
int SetNonBlocking(int fd) {
#if defined(OS_WIN)
  unsigned long enable = 1;
  return ioctlsocket(fd, FIONBIO, &enable);
#elif defined(OS_POSIX)
  const int current_flags = fcntl(fd, F_GETFL, 0);
  // If the current flags cannot be read, start from an empty flag set.
  const int base_flags = (current_flags == -1) ? 0 : current_flags;
  return fcntl(fd, F_SETFL, base_flags | O_NONBLOCK);
#endif
}
1154 
ParseHostAndPort(std::string::const_iterator host_and_port_begin,std::string::const_iterator host_and_port_end,std::string * host,int * port)1155 bool ParseHostAndPort(std::string::const_iterator host_and_port_begin,
1156                       std::string::const_iterator host_and_port_end,
1157                       std::string* host,
1158                       int* port) {
1159   if (host_and_port_begin >= host_and_port_end)
1160     return false;
1161 
1162   // When using url_parse, we use char*.
1163   const char* auth_begin = &(*host_and_port_begin);
1164   int auth_len = host_and_port_end - host_and_port_begin;
1165 
1166   url_parse::Component auth_component(0, auth_len);
1167   url_parse::Component username_component;
1168   url_parse::Component password_component;
1169   url_parse::Component hostname_component;
1170   url_parse::Component port_component;
1171 
1172   url_parse::ParseAuthority(auth_begin, auth_component, &username_component,
1173       &password_component, &hostname_component, &port_component);
1174 
1175   // There shouldn't be a username/password.
1176   if (username_component.is_valid() || password_component.is_valid())
1177     return false;
1178 
1179   if (!hostname_component.is_nonempty())
1180     return false;  // Failed parsing.
1181 
1182   int parsed_port_number = -1;
1183   if (port_component.is_nonempty()) {
1184     parsed_port_number = url_parse::ParsePort(auth_begin, port_component);
1185 
1186     // If parsing failed, port_number will be either PORT_INVALID or
1187     // PORT_UNSPECIFIED, both of which are negative.
1188     if (parsed_port_number < 0)
1189       return false;  // Failed parsing the port number.
1190   }
1191 
1192   if (port_component.len == 0)
1193     return false;  // Reject inputs like "foo:"
1194 
1195   // Pass results back to caller.
1196   host->assign(auth_begin + hostname_component.begin, hostname_component.len);
1197   *port = parsed_port_number;
1198 
1199   return true;  // Success.
1200 }
1201 
ParseHostAndPort(const std::string & host_and_port,std::string * host,int * port)1202 bool ParseHostAndPort(const std::string& host_and_port,
1203                       std::string* host,
1204                       int* port) {
1205   return ParseHostAndPort(
1206       host_and_port.begin(), host_and_port.end(), host, port);
1207 }
1208 
GetHostAndPort(const GURL & url)1209 std::string GetHostAndPort(const GURL& url) {
1210   // For IPv6 literals, GURL::host() already includes the brackets so it is
1211   // safe to just append a colon.
1212   return StringPrintf("%s:%d", url.host().c_str(), url.EffectiveIntPort());
1213 }
1214 
GetHostAndOptionalPort(const GURL & url)1215 std::string GetHostAndOptionalPort(const GURL& url) {
1216   // For IPv6 literals, GURL::host() already includes the brackets
1217   // so it is safe to just append a colon.
1218   if (url.has_port())
1219     return StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
1220   return url.host();
1221 }
1222 
NetAddressToString(const struct addrinfo * net_address)1223 std::string NetAddressToString(const struct addrinfo* net_address) {
1224 #if defined(OS_WIN)
1225   EnsureWinsockInit();
1226 #endif
1227 
1228   // This buffer is large enough to fit the biggest IPv6 string.
1229   char buffer[INET6_ADDRSTRLEN];
1230 
1231   int result = getnameinfo(net_address->ai_addr,
1232       net_address->ai_addrlen, buffer, sizeof(buffer), NULL, 0, NI_NUMERICHOST);
1233 
1234   if (result != 0) {
1235     DLOG(INFO) << "getnameinfo() failed with " << result;
1236     buffer[0] = '\0';
1237   }
1238   return std::string(buffer);
1239 }
1240 
GetHostName()1241 std::string GetHostName() {
1242 #if defined(OS_WIN)
1243   EnsureWinsockInit();
1244 #endif
1245 
1246   // Host names are limited to 255 bytes.
1247   char buffer[256];
1248   int result = gethostname(buffer, sizeof(buffer));
1249   if (result != 0) {
1250     DLOG(INFO) << "gethostname() failed with " << result;
1251     buffer[0] = '\0';
1252   }
1253   return std::string(buffer);
1254 }
1255 
GetIdentityFromURL(const GURL & url,std::wstring * username,std::wstring * password)1256 void GetIdentityFromURL(const GURL& url,
1257                         std::wstring* username,
1258                         std::wstring* password) {
1259   UnescapeRule::Type flags = UnescapeRule::SPACES;
1260   *username = UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(url.username(),
1261                                                                 flags, NULL));
1262   *password = UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(url.password(),
1263                                                                 flags, NULL));
1264 }
1265 
AppendFormattedHost(const GURL & url,const std::wstring & languages,std::wstring * output,url_parse::Parsed * new_parsed,size_t * offset_for_adjustment)1266 void AppendFormattedHost(const GURL& url,
1267                          const std::wstring& languages,
1268                          std::wstring* output,
1269                          url_parse::Parsed* new_parsed,
1270                          size_t* offset_for_adjustment) {
1271   DCHECK(output);
1272   const url_parse::Component& host =
1273       url.parsed_for_possibly_invalid_spec().host;
1274 
1275   if (host.is_nonempty()) {
1276     // Handle possible IDN in the host name.
1277     int new_host_begin = static_cast<int>(output->length());
1278     if (new_parsed)
1279       new_parsed->host.begin = new_host_begin;
1280     size_t offset_past_current_output =
1281         (!offset_for_adjustment ||
1282          (*offset_for_adjustment == std::wstring::npos) ||
1283          (*offset_for_adjustment < output->length())) ?
1284             std::wstring::npos : (*offset_for_adjustment - output->length());
1285     size_t* offset_into_host =
1286         (offset_past_current_output >= static_cast<size_t>(host.len)) ?
1287             NULL : &offset_past_current_output;
1288 
1289     const std::string& spec = url.possibly_invalid_spec();
1290     DCHECK(host.begin >= 0 &&
1291            ((spec.length() == 0 && host.begin == 0) ||
1292             host.begin < static_cast<int>(spec.length())));
1293     output->append(net::IDNToUnicode(&spec[host.begin],
1294                    static_cast<size_t>(host.len), languages, offset_into_host));
1295 
1296     int new_host_len = static_cast<int>(output->length()) - new_host_begin;
1297     if (new_parsed)
1298       new_parsed->host.len = new_host_len;
1299     if (offset_into_host) {
1300       *offset_for_adjustment = (*offset_into_host == std::wstring::npos) ?
1301           std::wstring::npos : (new_host_begin + *offset_into_host);
1302     } else if (offset_past_current_output != std::wstring::npos) {
1303       *offset_for_adjustment += new_host_len - host.len;
1304     }
1305   } else if (new_parsed) {
1306     new_parsed->host.reset();
1307   }
1308 }
1309 
1310 /* static */
AppendFormattedComponent(const std::string & spec,const url_parse::Component & in_component,UnescapeRule::Type unescape_rules,std::wstring * output,url_parse::Component * out_component,size_t * offset_for_adjustment)1311 void AppendFormattedComponent(const std::string& spec,
1312                               const url_parse::Component& in_component,
1313                               UnescapeRule::Type unescape_rules,
1314                               std::wstring* output,
1315                               url_parse::Component* out_component,
1316                               size_t* offset_for_adjustment) {
1317   DCHECK(output);
1318   DCHECK(offset_for_adjustment);
1319   if (in_component.is_nonempty()) {
1320     out_component->begin = static_cast<int>(output->length());
1321     size_t offset_past_current_output =
1322         ((*offset_for_adjustment == std::wstring::npos) ||
1323          (*offset_for_adjustment < output->length())) ?
1324             std::wstring::npos : (*offset_for_adjustment - output->length());
1325     size_t* offset_into_component =
1326         (offset_past_current_output >= static_cast<size_t>(in_component.len)) ?
1327             NULL : &offset_past_current_output;
1328     if (unescape_rules == UnescapeRule::NONE) {
1329       output->append(UTF8ToWideAndAdjustOffset(
1330           spec.substr(in_component.begin, in_component.len),
1331           offset_into_component));
1332     } else {
1333       output->append(UTF16ToWideHack(UnescapeAndDecodeUTF8URLComponent(
1334           spec.substr(in_component.begin, in_component.len), unescape_rules,
1335           offset_into_component)));
1336     }
1337     out_component->len =
1338         static_cast<int>(output->length()) - out_component->begin;
1339     if (offset_into_component) {
1340       *offset_for_adjustment = (*offset_into_component == std::wstring::npos) ?
1341           std::wstring::npos : (out_component->begin + *offset_into_component);
1342     } else if (offset_past_current_output != std::wstring::npos) {
1343       *offset_for_adjustment += out_component->len - in_component.len;
1344     }
1345   } else {
1346     out_component->reset();
1347   }
1348 }
1349 
FormatUrl(const GURL & url,const std::wstring & languages,bool omit_username_password,UnescapeRule::Type unescape_rules,url_parse::Parsed * new_parsed,size_t * prefix_end,size_t * offset_for_adjustment)1350 std::wstring FormatUrl(const GURL& url,
1351                        const std::wstring& languages,
1352                        bool omit_username_password,
1353                        UnescapeRule::Type unescape_rules,
1354                        url_parse::Parsed* new_parsed,
1355                        size_t* prefix_end,
1356                        size_t* offset_for_adjustment) {
1357   url_parse::Parsed parsed_temp;
1358   if (!new_parsed)
1359     new_parsed = &parsed_temp;
1360   size_t offset_temp = std::wstring::npos;
1361   if (!offset_for_adjustment)
1362     offset_for_adjustment = &offset_temp;
1363 
1364   std::wstring url_string;
1365 
1366   // Check for empty URLs or 0 available text width.
1367   if (url.is_empty()) {
1368     if (prefix_end)
1369       *prefix_end = 0;
1370     *offset_for_adjustment = std::wstring::npos;
1371     return url_string;
1372   }
1373 
1374   // Special handling for view-source:.  Don't use chrome::kViewSourceScheme
1375   // because this library shouldn't depend on chrome.
1376   const char* const kViewSource = "view-source";
1377   const char* const kViewSourceTwice = "view-source:view-source:";
1378   // Rejects view-source:view-source:... to avoid deep recursive call.
1379   if (url.SchemeIs(kViewSource) &&
1380       !StartsWithASCII(url.possibly_invalid_spec(), kViewSourceTwice, false)) {
1381     return FormatViewSourceUrl(url, languages, omit_username_password,
1382         unescape_rules, new_parsed, prefix_end, offset_for_adjustment);
1383   }
1384 
1385   // We handle both valid and invalid URLs (this will give us the spec
1386   // regardless of validity).
1387   const std::string& spec = url.possibly_invalid_spec();
1388   const url_parse::Parsed& parsed = url.parsed_for_possibly_invalid_spec();
1389   if (*offset_for_adjustment >= spec.length())
1390     *offset_for_adjustment = std::wstring::npos;
1391 
1392   // Copy everything before the username (the scheme and the separators.)
1393   // These are ASCII.
1394   std::copy(spec.begin(),
1395       spec.begin() + parsed.CountCharactersBefore(url_parse::Parsed::USERNAME,
1396                                                   true),
1397       std::back_inserter(url_string));
1398   new_parsed->scheme = parsed.scheme;
1399 
1400   if (omit_username_password) {
1401     // Remove the username and password fields. We don't want to display those
1402     // to the user since they can be used for attacks,
1403     // e.g. "http://google.com:search@evil.ru/"
1404     new_parsed->username.reset();
1405     new_parsed->password.reset();
1406     if ((*offset_for_adjustment != std::wstring::npos) &&
1407         (parsed.username.is_nonempty() || parsed.password.is_nonempty())) {
1408       if (parsed.username.is_nonempty() && parsed.password.is_nonempty()) {
1409         // The seeming off-by-one and off-by-two in these first two lines are to
1410         // account for the ':' after the username and '@' after the password.
1411         if (*offset_for_adjustment >
1412             static_cast<size_t>(parsed.password.end())) {
1413           *offset_for_adjustment -=
1414               (parsed.username.len + parsed.password.len + 2);
1415         } else if (*offset_for_adjustment >
1416                    static_cast<size_t>(parsed.username.begin)) {
1417           *offset_for_adjustment = std::wstring::npos;
1418         }
1419       } else {
1420         const url_parse::Component* nonempty_component =
1421             parsed.username.is_nonempty() ? &parsed.username : &parsed.password;
1422         // The seeming off-by-one in these first two lines is to account for the
1423         // '@' after the username/password.
1424         if (*offset_for_adjustment >
1425             static_cast<size_t>(nonempty_component->end())) {
1426           *offset_for_adjustment -= (nonempty_component->len + 1);
1427         } else if (*offset_for_adjustment >
1428                    static_cast<size_t>(nonempty_component->begin)) {
1429           *offset_for_adjustment = std::wstring::npos;
1430         }
1431       }
1432     }
1433   } else {
1434     AppendFormattedComponent(spec, parsed.username, unescape_rules, &url_string,
1435                              &new_parsed->username, offset_for_adjustment);
1436     if (parsed.password.is_valid()) {
1437       url_string.push_back(':');
1438     }
1439     AppendFormattedComponent(spec, parsed.password, unescape_rules, &url_string,
1440                              &new_parsed->password, offset_for_adjustment);
1441     if (parsed.username.is_valid() || parsed.password.is_valid()) {
1442       url_string.push_back('@');
1443     }
1444   }
1445   if (prefix_end)
1446     *prefix_end = static_cast<size_t>(url_string.length());
1447 
1448   AppendFormattedHost(url, languages, &url_string, new_parsed,
1449                       offset_for_adjustment);
1450 
1451   // Port.
1452   if (parsed.port.is_nonempty()) {
1453     url_string.push_back(':');
1454     new_parsed->port.begin = url_string.length();
1455     std::copy(spec.begin() + parsed.port.begin,
1456               spec.begin() + parsed.port.end(), std::back_inserter(url_string));
1457     new_parsed->port.len = url_string.length() - new_parsed->port.begin;
1458   } else {
1459     new_parsed->port.reset();
1460   }
1461 
1462   // Path and query both get the same general unescape & convert treatment.
1463   AppendFormattedComponent(spec, parsed.path, unescape_rules, &url_string,
1464                            &new_parsed->path, offset_for_adjustment);
1465   if (parsed.query.is_valid())
1466     url_string.push_back('?');
1467   AppendFormattedComponent(spec, parsed.query, unescape_rules, &url_string,
1468                            &new_parsed->query, offset_for_adjustment);
1469 
1470   // Reference is stored in valid, unescaped UTF-8, so we can just convert.
1471   if (parsed.ref.is_valid()) {
1472     url_string.push_back('#');
1473     new_parsed->ref.begin = url_string.length();
1474     size_t offset_past_current_output =
1475         ((*offset_for_adjustment == std::wstring::npos) ||
1476          (*offset_for_adjustment < url_string.length())) ?
1477             std::wstring::npos : (*offset_for_adjustment - url_string.length());
1478     size_t* offset_into_ref =
1479         (offset_past_current_output >= static_cast<size_t>(parsed.ref.len)) ?
1480             NULL : &offset_past_current_output;
1481     if (parsed.ref.len > 0) {
1482       url_string.append(UTF8ToWideAndAdjustOffset(spec.substr(parsed.ref.begin,
1483                                                               parsed.ref.len),
1484                                                   offset_into_ref));
1485     }
1486     new_parsed->ref.len = url_string.length() - new_parsed->ref.begin;
1487     if (offset_into_ref) {
1488       *offset_for_adjustment = (*offset_into_ref == std::wstring::npos) ?
1489           std::wstring::npos : (new_parsed->ref.begin + *offset_into_ref);
1490     } else if (offset_past_current_output != std::wstring::npos) {
1491       // We clamped the offset near the beginning of this function to ensure it
1492       // was within the input URL.  If we reach here, the input was something
1493       // invalid and non-parseable such that the offset was past any component
1494       // we could figure out.  In this case it won't be represented in the
1495       // output string, so reset it.
1496       *offset_for_adjustment = std::wstring::npos;
1497     }
1498   }
1499 
1500   return url_string;
1501 }
1502 
// Returns a copy of |url| with the username, password and reference cleared.
// |url| must be valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  GURL::Replacements strip_parts;
  strip_parts.ClearUsername();
  strip_parts.ClearPassword();
  strip_parts.ClearRef();

  const GURL simplified = url.ReplaceComponents(strip_parts);
  return simplified;
}
1511 
1512 // Specifies a comma separated list of port numbers that should be accepted
1513 // despite bans. If the string is invalid no allowed ports are stored.
SetExplicitlyAllowedPorts(const std::wstring & allowed_ports)1514 void SetExplicitlyAllowedPorts(const std::wstring& allowed_ports) {
1515   if (allowed_ports.empty())
1516     return;
1517 
1518   std::set<int> ports;
1519   size_t last = 0;
1520   size_t size = allowed_ports.size();
1521   // The comma delimiter.
1522   const std::wstring::value_type kComma = L',';
1523 
1524   // Overflow is still possible for evil user inputs.
1525   for (size_t i = 0; i <= size; ++i) {
1526     // The string should be composed of only digits and commas.
1527     if (i != size && !IsAsciiDigit(allowed_ports[i]) &&
1528         (allowed_ports[i] != kComma))
1529       return;
1530     if (i == size || allowed_ports[i] == kComma) {
1531       size_t length = i - last;
1532       if (length > 0)
1533         ports.insert(StringToInt(WideToASCII(
1534             allowed_ports.substr(last, length))));
1535       last = i + 1;
1536     }
1537   }
1538   explicitly_allowed_ports = ports;
1539 }
1540 
1541 }  // namespace net
1542