• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/url_util.h"
6 
7 #include "build/build_config.h"
8 
9 #if BUILDFLAG(IS_POSIX)
10 #include <netinet/in.h>
11 #elif BUILDFLAG(IS_WIN)
12 #include <ws2tcpip.h>
13 #endif
14 
15 #include "base/check_op.h"
16 #include "base/containers/fixed_flat_set.h"
17 #include "base/strings/escape.h"
18 #include "base/strings/strcat.h"
19 #include "base/strings/string_piece.h"
20 #include "base/strings/string_util.h"
21 #include "base/strings/stringprintf.h"
22 #include "base/strings/utf_string_conversions.h"
23 #include "net/base/ip_address.h"
24 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
25 #include "third_party/abseil-cpp/absl/types/optional.h"
26 #include "url/gurl.h"
27 #include "url/scheme_host_port.h"
28 #include "url/url_canon.h"
29 #include "url/url_canon_internal.h"
30 #include "url/url_canon_ip.h"
31 #include "url/url_constants.h"
32 #include "url/url_util.h"
33 
34 namespace net {
35 
36 namespace {
37 
// Returns true if |c| is a lowercase ASCII letter or an ASCII digit.
bool IsHostCharAlphanumeric(char c) {
  // Uppercase letters are deliberately not accepted: callers pass hosts whose
  // uppercase characters have already been normalized away.
  const bool is_lowercase_letter = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return is_lowercase_letter || is_digit;
}
43 
IsNormalizedLocalhostTLD(base::StringPiece host)44 bool IsNormalizedLocalhostTLD(base::StringPiece host) {
45   return base::EndsWith(host, ".localhost",
46                         base::CompareCase::INSENSITIVE_ASCII);
47 }
48 
49 // Helper function used by GetIdentityFromURL. If |escaped_text| can be "safely
50 // unescaped" to a valid UTF-8 string, return that string, as UTF-16. Otherwise,
51 // convert it as-is to UTF-16. "Safely unescaped" is defined as having no
52 // escaped character between '0x00' and '0x1F', inclusive.
UnescapeIdentityString(base::StringPiece escaped_text)53 std::u16string UnescapeIdentityString(base::StringPiece escaped_text) {
54   std::string unescaped_text;
55   if (base::UnescapeBinaryURLComponentSafe(
56           escaped_text, false /* fail_on_path_separators */, &unescaped_text)) {
57     std::u16string result;
58     if (base::UTF8ToUTF16(unescaped_text.data(), unescaped_text.length(),
59                           &result)) {
60       return result;
61     }
62   }
63   return base::UTF8ToUTF16(escaped_text);
64 }
65 
66 }  // namespace
67 
// Returns a copy of |url| whose query has "name=value" appended to it. Both
// |name| and |value| are escaped with base::EscapeQueryParamValue() first, and
// an '&' separator is inserted when the existing query is not empty.
GURL AppendQueryParameter(const GURL& url,
                          base::StringPiece name,
                          base::StringPiece value) {
  std::string new_query(url.query());
  if (!new_query.empty()) {
    new_query += "&";
  }
  new_query += base::EscapeQueryParamValue(name, true);
  new_query += "=";
  new_query += base::EscapeQueryParamValue(value, true);

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
82 
// Returns a copy of |url| in which the first query parameter whose key equals
// the escaped form of |name| is:
//   * rewritten as "name=value" when |value| holds a value, or
//   * dropped when |value| is absent.
// If no such parameter exists and |value| holds a value, "name=value" is
// appended to the end of the query. Every other key/value pair is copied
// through verbatim, in its original order.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   base::StringPiece name,
                                   absl::optional<base::StringPiece> value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const bool keep_param = value.has_value();
  const std::string escaped_value =
      keep_param ? base::EscapeQueryParamValue(*value, true) : std::string();

  std::string rebuilt_query;
  // Adds |pair| to |rebuilt_query|, preceded by '&' unless it is the first
  // thing written.
  const auto append_pair = [&rebuilt_query](const std::string& pair) {
    if (!rebuilt_query.empty()) {
      rebuilt_query += "&";
    }
    rebuilt_query += pair;
  };

  const base::StringPiece query = url.query_piece();
  url::Component remaining(0, static_cast<int>(query.size()));
  url::Component key_component;
  url::Component value_component;
  bool already_replaced = false;
  while (url::ExtractQueryKeyValue(query.data(), &remaining, &key_component,
                                   &value_component)) {
    const base::StringPiece key =
        query.substr(key_component.begin, key_component.len);

    // Only the first matching pair is treated specially; later duplicates are
    // copied through like any other pair.
    const bool is_target = !already_replaced && key == escaped_name;
    if (!is_target) {
      append_pair(std::string(
          query.substr(key_component.begin,
                       value_component.end() - key_component.begin)));
      continue;
    }

    already_replaced = true;
    if (keep_param) {
      append_pair(escaped_name + "=" + escaped_value);
    }
  }

  if (keep_param && !already_replaced) {
    append_pair(escaped_name + "=" + escaped_value);
  }

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(rebuilt_query);
  return url.ReplaceComponents(query_replacement);
}
128 
// Returns a copy of |url| whose ref component is set to |ref|.
GURL AppendOrReplaceRef(const GURL& url, const base::StringPiece& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  return url.ReplaceComponents(ref_replacement);
}
134 
QueryIterator(const GURL & url)135 QueryIterator::QueryIterator(const GURL& url)
136     : url_(url), at_end_(!url.is_valid()) {
137   if (!at_end_) {
138     query_ = url.parsed_for_possibly_invalid_spec().query;
139     Advance();
140   }
141 }
142 
143 QueryIterator::~QueryIterator() = default;
144 
GetKey() const145 base::StringPiece QueryIterator::GetKey() const {
146   DCHECK(!at_end_);
147   if (key_.is_nonempty())
148     return base::StringPiece(url_->spec()).substr(key_.begin, key_.len);
149   return base::StringPiece();
150 }
151 
GetValue() const152 base::StringPiece QueryIterator::GetValue() const {
153   DCHECK(!at_end_);
154   if (value_.is_nonempty())
155     return base::StringPiece(url_->spec()).substr(value_.begin, value_.len);
156   return base::StringPiece();
157 }
158 
GetUnescapedValue()159 const std::string& QueryIterator::GetUnescapedValue() {
160   DCHECK(!at_end_);
161   if (value_.is_nonempty() && unescaped_value_.empty()) {
162     unescaped_value_ = base::UnescapeURLComponent(
163         GetValue(),
164         base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
165             base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
166             base::UnescapeRule::REPLACE_PLUS_WITH_SPACE);
167   }
168   return unescaped_value_;
169 }
170 
IsAtEnd() const171 bool QueryIterator::IsAtEnd() const {
172   return at_end_;
173 }
174 
Advance()175 void QueryIterator::Advance() {
176   DCHECK(!at_end_);
177   key_.reset();
178   value_.reset();
179   unescaped_value_.clear();
180   at_end_ =
181       !url::ExtractQueryKeyValue(url_->spec().c_str(), &query_, &key_, &value_);
182 }
183 
GetValueForKeyInQuery(const GURL & url,base::StringPiece search_key,std::string * out_value)184 bool GetValueForKeyInQuery(const GURL& url,
185                            base::StringPiece search_key,
186                            std::string* out_value) {
187   for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
188     if (it.GetKey() == search_key) {
189       *out_value = it.GetUnescapedValue();
190       return true;
191     }
192   }
193   return false;
194 }
195 
ParseHostAndPort(base::StringPiece input,std::string * host,int * port)196 bool ParseHostAndPort(base::StringPiece input, std::string* host, int* port) {
197   if (input.empty())
198     return false;
199 
200   url::Component auth_component(0, input.size());
201   url::Component username_component;
202   url::Component password_component;
203   url::Component hostname_component;
204   url::Component port_component;
205 
206   url::ParseAuthority(input.data(), auth_component, &username_component,
207                       &password_component, &hostname_component,
208                       &port_component);
209 
210   // There shouldn't be a username/password.
211   if (username_component.is_valid() || password_component.is_valid())
212     return false;
213 
214   if (hostname_component.is_empty())
215     return false;  // Failed parsing.
216 
217   int parsed_port_number = -1;
218   if (port_component.is_nonempty()) {
219     parsed_port_number = url::ParsePort(input.data(), port_component);
220 
221     // If parsing failed, port_number will be either PORT_INVALID or
222     // PORT_UNSPECIFIED, both of which are negative.
223     if (parsed_port_number < 0)
224       return false;  // Failed parsing the port number.
225   }
226 
227   if (port_component.len == 0)
228     return false;  // Reject inputs like "foo:"
229 
230   unsigned char tmp_ipv6_addr[16];
231 
232   // If the hostname starts with a bracket, it is either an IPv6 literal or
233   // invalid. If it is an IPv6 literal then strip the brackets.
234   if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
235     if (input[hostname_component.end() - 1] == ']' &&
236         url::IPv6AddressToNumber(input.data(), hostname_component,
237                                  tmp_ipv6_addr)) {
238       // Strip the brackets.
239       hostname_component.begin++;
240       hostname_component.len -= 2;
241     } else {
242       return false;
243     }
244   }
245 
246   // Pass results back to caller.
247   host->assign(input.data() + hostname_component.begin, hostname_component.len);
248   *port = parsed_port_number;
249 
250   return true;  // Success.
251 }
252 
GetHostAndPort(const GURL & url)253 std::string GetHostAndPort(const GURL& url) {
254   // For IPv6 literals, GURL::host() already includes the brackets so it is
255   // safe to just append a colon.
256   return base::StringPrintf("%s:%d", url.host().c_str(),
257                             url.EffectiveIntPort());
258 }
259 
GetHostAndOptionalPort(const GURL & url)260 std::string GetHostAndOptionalPort(const GURL& url) {
261   // For IPv6 literals, GURL::host() already includes the brackets
262   // so it is safe to just append a colon.
263   if (url.has_port())
264     return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
265   return url.host();
266 }
267 
GetHostAndOptionalPort(const url::SchemeHostPort & scheme_host_port)268 NET_EXPORT std::string GetHostAndOptionalPort(
269     const url::SchemeHostPort& scheme_host_port) {
270   int default_port = url::DefaultPortForScheme(
271       scheme_host_port.scheme().data(),
272       static_cast<int>(scheme_host_port.scheme().length()));
273   if (default_port != scheme_host_port.port()) {
274     return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
275                               scheme_host_port.port());
276   }
277   return scheme_host_port.host();
278 }
279 
TrimEndingDot(base::StringPiece host)280 std::string TrimEndingDot(base::StringPiece host) {
281   base::StringPiece host_trimmed = host;
282   size_t len = host_trimmed.length();
283   if (len > 1 && host_trimmed[len - 1] == '.') {
284     host_trimmed.remove_suffix(1);
285   }
286   return std::string(host_trimmed);
287 }
288 
GetHostOrSpecFromURL(const GURL & url)289 std::string GetHostOrSpecFromURL(const GURL& url) {
290   return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
291 }
292 
GetSuperdomain(base::StringPiece domain)293 std::string GetSuperdomain(base::StringPiece domain) {
294   size_t dot_pos = domain.find('.');
295   if (dot_pos == std::string::npos)
296     return "";
297   return std::string(domain.substr(dot_pos + 1));
298 }
299 
IsSubdomainOf(base::StringPiece subdomain,base::StringPiece superdomain)300 bool IsSubdomainOf(base::StringPiece subdomain, base::StringPiece superdomain) {
301   // Subdomain must be identical or have strictly more labels than the
302   // superdomain.
303   if (subdomain.length() <= superdomain.length())
304     return subdomain == superdomain;
305 
306   // Superdomain must be suffix of subdomain, and the last character not
307   // included in the matching substring must be a dot.
308   if (!base::EndsWith(subdomain, superdomain))
309     return false;
310   subdomain.remove_suffix(superdomain.length());
311   return subdomain.back() == '.';
312 }
313 
CanonicalizeHost(base::StringPiece host,url::CanonHostInfo * host_info)314 std::string CanonicalizeHost(base::StringPiece host,
315                              url::CanonHostInfo* host_info) {
316   // Try to canonicalize the host.
317   const url::Component raw_host_component(0, static_cast<int>(host.length()));
318   std::string canon_host;
319   url::StdStringCanonOutput canon_host_output(&canon_host);
320   // A url::StdStringCanonOutput starts off with a zero length buffer. The
321   // first time through Grow() immediately resizes it to 32 bytes, incurring
322   // a malloc. With libcxx a 22 byte or smaller request can be accommodated
323   // within the std::string itself (i.e. no malloc occurs). Start the buffer
324   // off at the max size to avoid a malloc on short strings.
325   // NOTE: To ensure the final size is correctly reflected, it's necessary
326   // to call Complete() which will adjust the size to the actual bytes written.
327   // This is handled below for success cases, while failure cases discard all
328   // the output.
329   const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
330   canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
331   url::CanonicalizeHostVerbose(host.data(), raw_host_component,
332                                &canon_host_output, host_info);
333 
334   if (host_info->out_host.is_nonempty() &&
335       host_info->family != url::CanonHostInfo::BROKEN) {
336     // Success!  Assert that there's no extra garbage.
337     canon_host_output.Complete();
338     DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
339   } else {
340     // Empty host, or canonicalization failed.  We'll return empty.
341     canon_host.clear();
342   }
343 
344   return canon_host;
345 }
346 
IsCanonicalizedHostCompliant(base::StringPiece host)347 bool IsCanonicalizedHostCompliant(base::StringPiece host) {
348   if (host.empty() || host.size() > 254 ||
349       (host.back() != '.' && host.size() == 254)) {
350     return false;
351   }
352 
353   bool in_component = false;
354   bool most_recent_component_started_alphanumeric = false;
355   size_t label_size = 0;
356 
357   for (char c : host) {
358     ++label_size;
359     if (!in_component) {
360       most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
361       if (!most_recent_component_started_alphanumeric && (c != '-') &&
362           (c != '_')) {
363         return false;
364       }
365       in_component = true;
366     } else if (c == '.') {
367       in_component = false;
368       if (label_size > 64 || label_size == 1) {
369         // Label should not be empty or longer than 63 characters (+1 for '.'
370         // character included in `label_size`).
371         return false;
372       } else {
373         label_size = 0;
374       }
375     } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
376       return false;
377     }
378   }
379 
380   // Check for too-long label when not ended with final '.'.
381   if (label_size > 63)
382     return false;
383 
384   return most_recent_component_started_alphanumeric;
385 }
386 
IsHostnameNonUnique(base::StringPiece hostname)387 bool IsHostnameNonUnique(base::StringPiece hostname) {
388   // CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
389   const std::string host_or_ip = hostname.find(':') != std::string::npos
390                                      ? base::StrCat({"[", hostname, "]"})
391                                      : std::string(hostname);
392   url::CanonHostInfo host_info;
393   std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
394 
395   // If canonicalization fails, then the input is truly malformed. However,
396   // to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
397   if (canonical_name.empty())
398     return false;
399 
400   // If |hostname| is an IP address, check to see if it's in an IANA-reserved
401   // range reserved for non-publicly routable networks.
402   if (host_info.IsIPAddress()) {
403     IPAddress host_addr;
404     if (!host_addr.AssignFromIPLiteral(hostname.substr(
405             host_info.out_host.begin, host_info.out_host.len))) {
406       return false;
407     }
408     switch (host_info.family) {
409       case url::CanonHostInfo::IPV4:
410       case url::CanonHostInfo::IPV6:
411         return !host_addr.IsPubliclyRoutable();
412       case url::CanonHostInfo::NEUTRAL:
413       case url::CanonHostInfo::BROKEN:
414         return false;
415     }
416   }
417 
418   // Check for a registry controlled portion of |hostname|, ignoring private
419   // registries, as they already chain to ICANN-administered registries,
420   // and explicitly ignoring unknown registries.
421   //
422   // Note: This means that as new gTLDs are introduced on the Internet, they
423   // will be treated as non-unique until the registry controlled domain list
424   // is updated. However, because gTLDs are expected to provide significant
425   // advance notice to deprecate older versions of this code, this an
426   // acceptable tradeoff.
427   return !registry_controlled_domains::HostHasRegistryControlledDomain(
428       canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
429       registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
430 }
431 
IsLocalhost(const GURL & url)432 bool IsLocalhost(const GURL& url) {
433   return HostStringIsLocalhost(url.HostNoBracketsPiece());
434 }
435 
HostStringIsLocalhost(base::StringPiece host)436 bool HostStringIsLocalhost(base::StringPiece host) {
437   IPAddress ip_address;
438   if (ip_address.AssignFromIPLiteral(host))
439     return ip_address.IsLoopback();
440   return IsLocalHostname(host);
441 }
442 
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // When there is nothing to strip, return the URL as-is and skip the
  // re-canonicalization that ReplaceComponents would perform.
  const bool has_something_to_strip =
      url.has_username() || url.has_password() || url.has_ref();
  if (!has_something_to_strip) {
    return url;
  }

  GURL::Replacements strip_components;
  strip_components.ClearUsername();
  strip_components.ClearPassword();
  strip_components.ClearRef();
  return url.ReplaceComponents(strip_components);
}
454 
// Returns |url| with its scheme changed from wss to https, or from ws to http.
// |url| must have a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());

  const auto http_equivalent =
      url.SchemeIs(url::kWssScheme) ? url::kHttpsScheme : url::kHttpScheme;

  GURL::Replacements scheme_replacement;
  scheme_replacement.SetSchemeStr(http_equivalent);
  return url.ReplaceComponents(scheme_replacement);
}
462 
IsStandardSchemeWithNetworkHost(base::StringPiece scheme)463 bool IsStandardSchemeWithNetworkHost(base::StringPiece scheme) {
464   // file scheme is special. Windows file share origins can have network hosts.
465   if (scheme == url::kFileScheme)
466     return true;
467 
468   url::SchemeType scheme_type;
469   if (!url::GetStandardSchemeType(
470           scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
471     return false;
472   }
473   return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
474          scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
475 }
476 
GetIdentityFromURL(const GURL & url,std::u16string * username,std::u16string * password)477 void GetIdentityFromURL(const GURL& url,
478                         std::u16string* username,
479                         std::u16string* password) {
480   *username = UnescapeIdentityString(url.username());
481   *password = UnescapeIdentityString(url.password());
482 }
483 
HasGoogleHost(const GURL & url)484 bool HasGoogleHost(const GURL& url) {
485   return IsGoogleHost(url.host_piece());
486 }
487 
IsGoogleHost(base::StringPiece host)488 bool IsGoogleHost(base::StringPiece host) {
489   static const char* kGoogleHostSuffixes[] = {
490       ".google.com",
491       ".youtube.com",
492       ".gmail.com",
493       ".doubleclick.net",
494       ".gstatic.com",
495       ".googlevideo.com",
496       ".googleusercontent.com",
497       ".googlesyndication.com",
498       ".google-analytics.com",
499       ".googleadservices.com",
500       ".googleapis.com",
501       ".ytimg.com",
502   };
503   for (const char* suffix : kGoogleHostSuffixes) {
504     // Here it's possible to get away with faster case-sensitive comparisons
505     // because the list above is all lowercase, and a GURL's host name will
506     // always be canonicalized to lowercase as well.
507     if (base::EndsWith(host, suffix))
508       return true;
509   }
510   return false;
511 }
512 
IsGoogleHostWithAlpnH3(base::StringPiece host)513 bool IsGoogleHostWithAlpnH3(base::StringPiece host) {
514   return base::EqualsCaseInsensitiveASCII(host, "google.com") ||
515          base::EqualsCaseInsensitiveASCII(host, "www.google.com");
516 }
517 
IsLocalHostname(base::StringPiece host)518 bool IsLocalHostname(base::StringPiece host) {
519   // Remove any trailing '.'.
520   if (!host.empty() && *host.rbegin() == '.')
521     host.remove_suffix(1);
522 
523   return base::EqualsCaseInsensitiveASCII(host, "localhost") ||
524          IsNormalizedLocalhostTLD(host);
525 }
526 
UnescapePercentEncodedUrl(base::StringPiece input)527 std::string UnescapePercentEncodedUrl(base::StringPiece input) {
528   std::string result(input);
529   // Replace any 0x2B (+) with 0x20 (SP).
530   for (char& c : result) {
531     if (c == '+') {
532       c = ' ';
533     }
534   }
535   // Run UTF-8 decoding without BOM on the percent-decoding.
536   url::RawCanonOutputT<char16_t> canon_output;
537   url::DecodeURLEscapeSequences(result.data(), result.size(),
538                                 url::DecodeURLMode::kUTF8, &canon_output);
539   return base::UTF16ToUTF8(
540       base::StringPiece16(canon_output.data(), canon_output.length()));
541 }
542 
543 }  // namespace net
544