• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/url_util.h"
6 
7 #include "build/build_config.h"
8 
9 #if BUILDFLAG(IS_POSIX)
10 #include <netinet/in.h>
11 #elif BUILDFLAG(IS_WIN)
12 #include <ws2tcpip.h>
13 #endif
14 
15 #include <optional>
16 #include <string_view>
17 
18 #include "base/check_op.h"
19 #include "base/containers/fixed_flat_set.h"
20 #include "base/strings/escape.h"
21 #include "base/strings/strcat.h"
22 #include "base/strings/string_util.h"
23 #include "base/strings/stringprintf.h"
24 #include "base/strings/utf_string_conversions.h"
25 #include "net/base/ip_address.h"
26 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
27 #include "url/gurl.h"
28 #include "url/scheme_host_port.h"
29 #include "url/url_canon.h"
30 #include "url/url_canon_internal.h"
31 #include "url/url_canon_ip.h"
32 #include "url/url_constants.h"
33 #include "url/url_util.h"
34 
35 namespace net {
36 
37 namespace {
38 
// Returns true when |c| is an ASCII digit or a lowercase ASCII letter.
// Uppercase letters are deliberately not matched: by the time this runs,
// uppercase characters have already been normalized away.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = c >= 'a' && c <= 'z';
  const bool is_digit = c >= '0' && c <= '9';
  return is_lowercase_letter || is_digit;
}
44 
IsNormalizedLocalhostTLD(std::string_view host)45 bool IsNormalizedLocalhostTLD(std::string_view host) {
46   return base::EndsWith(host, ".localhost",
47                         base::CompareCase::INSENSITIVE_ASCII);
48 }
49 
50 // Helper function used by GetIdentityFromURL. If |escaped_text| can be "safely
51 // unescaped" to a valid UTF-8 string, return that string, as UTF-16. Otherwise,
52 // convert it as-is to UTF-16. "Safely unescaped" is defined as having no
53 // escaped character between '0x00' and '0x1F', inclusive.
UnescapeIdentityString(std::string_view escaped_text)54 std::u16string UnescapeIdentityString(std::string_view escaped_text) {
55   std::string unescaped_text;
56   if (base::UnescapeBinaryURLComponentSafe(
57           escaped_text, false /* fail_on_path_separators */, &unescaped_text)) {
58     std::u16string result;
59     if (base::UTF8ToUTF16(unescaped_text.data(), unescaped_text.length(),
60                           &result)) {
61       return result;
62     }
63   }
64   return base::UTF8ToUTF16(escaped_text);
65 }
66 
67 }  // namespace
68 
// Returns a copy of |url| whose query has "name=value" appended. Both |name|
// and |value| are passed through base::EscapeQueryParamValue. A '&' separator
// is inserted only when |url| already has a non-empty query.
GURL AppendQueryParameter(const GURL& url,
                          std::string_view name,
                          std::string_view value) {
  std::string new_query(url.query());
  if (!new_query.empty()) {
    new_query += "&";
  }
  new_query += base::EscapeQueryParamValue(name, true);
  new_query += "=";
  new_query += base::EscapeQueryParamValue(value, true);

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
83 
// Returns a copy of |url| with the query parameter |name| set to |value|.
//
// Only the first key/value pair whose key equals the escaped |name| is
// affected; every other pair is copied through verbatim.
//  - If |value| has a value, that first match is rewritten to
//    "name=value" (both escaped). If there is no match, the pair is appended.
//  - If |value| is std::nullopt, that first match is dropped and nothing is
//    appended.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   std::string_view name,
                                   std::optional<std::string_view> value) {
  const bool remove_param = !value.has_value();
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const std::string escaped_value =
      remove_param ? std::string()
                   : base::EscapeQueryParamValue(value.value(), true);

  std::string new_query;
  // Appends |pair| to |new_query|, separating it from earlier pairs with '&'.
  const auto append_pair = [&new_query](std::string_view pair) {
    if (!new_query.empty()) {
      new_query += "&";
    }
    new_query += pair;
  };
  const auto append_new_pair = [&] {
    append_pair(escaped_name + "=" + escaped_value);
  };

  const std::string_view old_query = url.query_piece();
  url::Component remaining(0, old_query.size());
  url::Component key_component;
  url::Component value_component;
  bool handled_first_match = false;
  while (url::ExtractQueryKeyValue(old_query, &remaining, &key_component,
                                   &value_component)) {
    const std::string_view key =
        old_query.substr(key_component.begin, key_component.len);
    const bool is_first_match = !handled_first_match && key == escaped_name;
    if (!is_first_match) {
      // Keep the pair exactly as written, from the key's first character
      // through the end of its value.
      append_pair(old_query.substr(
          key_component.begin, value_component.end() - key_component.begin));
      continue;
    }

    handled_first_match = true;
    if (!remove_param) {
      append_new_pair();
    }
  }

  // The parameter was not present: add it at the end unless it is being
  // removed.
  if (!handled_first_match && !remove_param) {
    append_new_pair();
  }

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
128 
// Returns a copy of |url| whose ref component has been set to |ref|.
GURL AppendOrReplaceRef(const GURL& url, const std::string_view& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  return url.ReplaceComponents(ref_replacement);
}
134 
QueryIterator(const GURL & url)135 QueryIterator::QueryIterator(const GURL& url)
136     : url_(url), at_end_(!url.is_valid()) {
137   if (!at_end_) {
138     query_ = url.parsed_for_possibly_invalid_spec().query;
139     Advance();
140   }
141 }
142 
143 QueryIterator::~QueryIterator() = default;
144 
GetKey() const145 std::string_view QueryIterator::GetKey() const {
146   DCHECK(!at_end_);
147   if (key_.is_nonempty())
148     return std::string_view(url_->spec()).substr(key_.begin, key_.len);
149   return std::string_view();
150 }
151 
GetValue() const152 std::string_view QueryIterator::GetValue() const {
153   DCHECK(!at_end_);
154   if (value_.is_nonempty())
155     return std::string_view(url_->spec()).substr(value_.begin, value_.len);
156   return std::string_view();
157 }
158 
GetUnescapedValue()159 const std::string& QueryIterator::GetUnescapedValue() {
160   DCHECK(!at_end_);
161   if (value_.is_nonempty() && unescaped_value_.empty()) {
162     unescaped_value_ = base::UnescapeURLComponent(
163         GetValue(),
164         base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
165             base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
166             base::UnescapeRule::REPLACE_PLUS_WITH_SPACE);
167   }
168   return unescaped_value_;
169 }
170 
IsAtEnd() const171 bool QueryIterator::IsAtEnd() const {
172   return at_end_;
173 }
174 
Advance()175 void QueryIterator::Advance() {
176   DCHECK(!at_end_);
177   key_.reset();
178   value_.reset();
179   unescaped_value_.clear();
180   at_end_ = !url::ExtractQueryKeyValue(url_->spec(), &query_, &key_, &value_);
181 }
182 
GetValueForKeyInQuery(const GURL & url,std::string_view search_key,std::string * out_value)183 bool GetValueForKeyInQuery(const GURL& url,
184                            std::string_view search_key,
185                            std::string* out_value) {
186   for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
187     if (it.GetKey() == search_key) {
188       *out_value = it.GetUnescapedValue();
189       return true;
190     }
191   }
192   return false;
193 }
194 
ParseHostAndPort(std::string_view input,std::string * host,int * port)195 bool ParseHostAndPort(std::string_view input, std::string* host, int* port) {
196   if (input.empty())
197     return false;
198 
199   url::Component auth_component(0, input.size());
200   url::Component username_component;
201   url::Component password_component;
202   url::Component hostname_component;
203   url::Component port_component;
204 
205   // `input` is not NUL-terminated, so `input.data()` must be accompanied by a
206   // length. In these calls, `url::Component` provides an offset and length.
207   url::ParseAuthority(input.data(), auth_component, &username_component,
208                       &password_component, &hostname_component,
209                       &port_component);
210 
211   // There shouldn't be a username/password.
212   if (username_component.is_valid() || password_component.is_valid())
213     return false;
214 
215   if (hostname_component.is_empty())
216     return false;  // Failed parsing.
217 
218   int parsed_port_number = -1;
219   if (port_component.is_nonempty()) {
220     parsed_port_number = url::ParsePort(input.data(), port_component);
221 
222     // If parsing failed, port_number will be either PORT_INVALID or
223     // PORT_UNSPECIFIED, both of which are negative.
224     if (parsed_port_number < 0)
225       return false;  // Failed parsing the port number.
226   }
227 
228   if (port_component.len == 0)
229     return false;  // Reject inputs like "foo:"
230 
231   unsigned char tmp_ipv6_addr[16];
232 
233   // If the hostname starts with a bracket, it is either an IPv6 literal or
234   // invalid. If it is an IPv6 literal then strip the brackets.
235   if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
236     if (input[hostname_component.end() - 1] == ']' &&
237         url::IPv6AddressToNumber(input.data(), hostname_component,
238                                  tmp_ipv6_addr)) {
239       // Strip the brackets.
240       hostname_component.begin++;
241       hostname_component.len -= 2;
242     } else {
243       return false;
244     }
245   }
246 
247   // Pass results back to caller.
248   *host = std::string(
249       input.substr(hostname_component.begin, hostname_component.len));
250   *port = parsed_port_number;
251 
252   return true;  // Success.
253 }
254 
GetHostAndPort(const GURL & url)255 std::string GetHostAndPort(const GURL& url) {
256   // For IPv6 literals, GURL::host() already includes the brackets so it is
257   // safe to just append a colon.
258   return base::StringPrintf("%s:%d", url.host().c_str(),
259                             url.EffectiveIntPort());
260 }
261 
GetHostAndOptionalPort(const GURL & url)262 std::string GetHostAndOptionalPort(const GURL& url) {
263   // For IPv6 literals, GURL::host() already includes the brackets
264   // so it is safe to just append a colon.
265   if (url.has_port())
266     return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
267   return url.host();
268 }
269 
GetHostAndOptionalPort(const url::SchemeHostPort & scheme_host_port)270 NET_EXPORT std::string GetHostAndOptionalPort(
271     const url::SchemeHostPort& scheme_host_port) {
272   int default_port = url::DefaultPortForScheme(scheme_host_port.scheme());
273   if (default_port != scheme_host_port.port()) {
274     return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
275                               scheme_host_port.port());
276   }
277   return scheme_host_port.host();
278 }
279 
// Returns |host| without a single trailing '.', if it has one. A host that
// consists of only "." is returned unchanged, as is an empty host.
std::string TrimEndingDot(std::string_view host) {
  const bool has_removable_dot = host.length() > 1 && host.back() == '.';
  if (has_removable_dot) {
    return std::string(host.substr(0, host.length() - 1));
  }
  return std::string(host);
}
288 
GetHostOrSpecFromURL(const GURL & url)289 std::string GetHostOrSpecFromURL(const GURL& url) {
290   return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
291 }
292 
// Returns everything after the first '.' in |domain|, or an empty string when
// |domain| contains no '.'.
std::string GetSuperdomain(std::string_view domain) {
  const size_t first_dot = domain.find('.');
  return first_dot == std::string::npos
             ? std::string()
             : std::string(domain.substr(first_dot + 1));
}
299 
// Returns true if |subdomain| is identical to |superdomain|, or if it ends
// with |superdomain| and the character right before that suffix is a '.'.
bool IsSubdomainOf(std::string_view subdomain, std::string_view superdomain) {
  // A subdomain that is not strictly longer can only match by being identical.
  if (subdomain.length() <= superdomain.length()) {
    return subdomain == superdomain;
  }

  // |subdomain| is strictly longer, so |prefix_length| is at least 1 and the
  // index |prefix_length - 1| below is always valid.
  const size_t prefix_length = subdomain.length() - superdomain.length();
  const bool has_superdomain_suffix =
      subdomain.substr(prefix_length) == superdomain;
  return has_superdomain_suffix && subdomain[prefix_length - 1] == '.';
}
314 
315 namespace {
CanonicalizeHost(std::string_view host,bool is_file_scheme,url::CanonHostInfo * host_info)316 std::string CanonicalizeHost(std::string_view host,
317                              bool is_file_scheme,
318                              url::CanonHostInfo* host_info) {
319   // Try to canonicalize the host.
320   const url::Component raw_host_component(0, static_cast<int>(host.length()));
321   std::string canon_host;
322   url::StdStringCanonOutput canon_host_output(&canon_host);
323   // A url::StdStringCanonOutput starts off with a zero length buffer. The
324   // first time through Grow() immediately resizes it to 32 bytes, incurring
325   // a malloc. With libcxx a 22 byte or smaller request can be accommodated
326   // within the std::string itself (i.e. no malloc occurs). Start the buffer
327   // off at the max size to avoid a malloc on short strings.
328   // NOTE: To ensure the final size is correctly reflected, it's necessary
329   // to call Complete() which will adjust the size to the actual bytes written.
330   // This is handled below for success cases, while failure cases discard all
331   // the output.
332   const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
333   canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
334   if (is_file_scheme) {
335     url::CanonicalizeFileHostVerbose(host.data(), raw_host_component,
336                                      canon_host_output, *host_info);
337   } else {
338     url::CanonicalizeSpecialHostVerbose(host.data(), raw_host_component,
339                                         canon_host_output, *host_info);
340   }
341 
342   if (host_info->out_host.is_nonempty() &&
343       host_info->family != url::CanonHostInfo::BROKEN) {
344     // Success!  Assert that there's no extra garbage.
345     canon_host_output.Complete();
346     DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
347   } else {
348     // Empty host, or canonicalization failed.  We'll return empty.
349     canon_host.clear();
350   }
351 
352   return canon_host;
353 }
354 }  // namespace
355 
CanonicalizeHost(std::string_view host,url::CanonHostInfo * host_info)356 std::string CanonicalizeHost(std::string_view host,
357                              url::CanonHostInfo* host_info) {
358   return CanonicalizeHost(host, /*is_file_scheme=*/false, host_info);
359 }
360 
CanonicalizeFileHost(std::string_view host,url::CanonHostInfo * host_info)361 std::string CanonicalizeFileHost(std::string_view host,
362                                  url::CanonHostInfo* host_info) {
363   return CanonicalizeHost(host, /*is_file_scheme=*/true, host_info);
364 }
365 
IsCanonicalizedHostCompliant(std::string_view host)366 bool IsCanonicalizedHostCompliant(std::string_view host) {
367   if (host.empty() || host.size() > 254 ||
368       (host.back() != '.' && host.size() == 254)) {
369     return false;
370   }
371 
372   bool in_component = false;
373   bool most_recent_component_started_alphanumeric = false;
374   size_t label_size = 0;
375 
376   for (char c : host) {
377     ++label_size;
378     if (!in_component) {
379       most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
380       if (!most_recent_component_started_alphanumeric && (c != '-') &&
381           (c != '_')) {
382         return false;
383       }
384       in_component = true;
385     } else if (c == '.') {
386       in_component = false;
387       if (label_size > 64 || label_size == 1) {
388         // Label should not be empty or longer than 63 characters (+1 for '.'
389         // character included in `label_size`).
390         return false;
391       } else {
392         label_size = 0;
393       }
394     } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
395       return false;
396     }
397   }
398 
399   // Check for too-long label when not ended with final '.'.
400   if (label_size > 63)
401     return false;
402 
403   return most_recent_component_started_alphanumeric;
404 }
405 
IsHostnameNonUnique(std::string_view hostname)406 bool IsHostnameNonUnique(std::string_view hostname) {
407   // CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
408   const std::string host_or_ip = hostname.find(':') != std::string::npos
409                                      ? base::StrCat({"[", hostname, "]"})
410                                      : std::string(hostname);
411   url::CanonHostInfo host_info;
412   std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
413 
414   // If canonicalization fails, then the input is truly malformed. However,
415   // to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
416   if (canonical_name.empty())
417     return false;
418 
419   // If |hostname| is an IP address, check to see if it's in an IANA-reserved
420   // range reserved for non-publicly routable networks.
421   if (host_info.IsIPAddress()) {
422     IPAddress host_addr;
423     if (!host_addr.AssignFromIPLiteral(hostname.substr(
424             host_info.out_host.begin, host_info.out_host.len))) {
425       return false;
426     }
427     switch (host_info.family) {
428       case url::CanonHostInfo::IPV4:
429       case url::CanonHostInfo::IPV6:
430         return !host_addr.IsPubliclyRoutable();
431       case url::CanonHostInfo::NEUTRAL:
432       case url::CanonHostInfo::BROKEN:
433         return false;
434     }
435   }
436 
437   // Check for a registry controlled portion of |hostname|, ignoring private
438   // registries, as they already chain to ICANN-administered registries,
439   // and explicitly ignoring unknown registries. Registry identifiers themselves
440   // are also treated as unique, since a TLD is a valid hostname and can host a
441   // web server.
442   //
443   // Note: This means that as new gTLDs are introduced on the Internet, they
444   // will be treated as non-unique until the registry controlled domain list
445   // is updated. However, because gTLDs are expected to provide significant
446   // advance notice to deprecate older versions of this code, this an
447   // acceptable tradeoff.
448   return !registry_controlled_domains::HostHasRegistryControlledDomain(
449              canonical_name,
450              registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
451              registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES) &&
452          !registry_controlled_domains::HostIsRegistryIdentifier(
453              canonical_name,
454              registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
455 }
456 
IsLocalhost(const GURL & url)457 bool IsLocalhost(const GURL& url) {
458   return HostStringIsLocalhost(url.HostNoBracketsPiece());
459 }
460 
HostStringIsLocalhost(std::string_view host)461 bool HostStringIsLocalhost(std::string_view host) {
462   IPAddress ip_address;
463   if (ip_address.AssignFromIPLiteral(host))
464     return ip_address.IsLoopback();
465   return IsLocalHostname(host);
466 }
467 
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path to avoid re-canonicalization via ReplaceComponents.
  const bool has_something_to_strip =
      url.has_username() || url.has_password() || url.has_ref();
  if (!has_something_to_strip) {
    return url;
  }

  GURL::Replacements strip_components;
  strip_components.ClearUsername();
  strip_components.ClearPassword();
  strip_components.ClearRef();
  return url.ReplaceComponents(strip_components);
}
479 
// Returns |url| with its scheme swapped for the HTTP counterpart: "wss"
// becomes "https" and anything else becomes "http". |url| is expected to have
// a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());

  GURL::Replacements scheme_replacement;
  if (url.SchemeIs(url::kWssScheme)) {
    scheme_replacement.SetSchemeStr(url::kHttpsScheme);
  } else {
    scheme_replacement.SetSchemeStr(url::kHttpScheme);
  }
  return url.ReplaceComponents(scheme_replacement);
}
487 
IsStandardSchemeWithNetworkHost(std::string_view scheme)488 bool IsStandardSchemeWithNetworkHost(std::string_view scheme) {
489   // file scheme is special. Windows file share origins can have network hosts.
490   if (scheme == url::kFileScheme)
491     return true;
492 
493   url::SchemeType scheme_type;
494   if (!url::GetStandardSchemeType(
495           scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
496     return false;
497   }
498   return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
499          scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
500 }
501 
GetIdentityFromURL(const GURL & url,std::u16string * username,std::u16string * password)502 void GetIdentityFromURL(const GURL& url,
503                         std::u16string* username,
504                         std::u16string* password) {
505   *username = UnescapeIdentityString(url.username());
506   *password = UnescapeIdentityString(url.password());
507 }
508 
HasGoogleHost(const GURL & url)509 bool HasGoogleHost(const GURL& url) {
510   return IsGoogleHost(url.host_piece());
511 }
512 
// Returns true if |host| ends with one of a fixed list of Google-owned domain
// suffixes. Every suffix starts with a '.', so a bare domain such as
// "google.com" does not match, while "www.google.com" does.
bool IsGoogleHost(std::string_view host) {
  // Stored as string_views so each suffix's length is known at compile time
  // and the table itself is immutable.
  static constexpr std::string_view kGoogleHostSuffixes[] = {
      ".google.com",
      ".youtube.com",
      ".gmail.com",
      ".doubleclick.net",
      ".gstatic.com",
      ".googlevideo.com",
      ".googleusercontent.com",
      ".googlesyndication.com",
      ".google-analytics.com",
      ".googleadservices.com",
      ".googleapis.com",
      ".ytimg.com",
  };
  for (const std::string_view suffix : kGoogleHostSuffixes) {
    // Here it's possible to get away with faster case-sensitive comparisons
    // because the list above is all lowercase, and a GURL's host name will
    // always be canonicalized to lowercase as well.
    if (host.size() >= suffix.size() &&
        host.substr(host.size() - suffix.size()) == suffix) {
      return true;
    }
  }
  return false;
}
538 
IsGoogleHostWithAlpnH3(std::string_view host)539 bool IsGoogleHostWithAlpnH3(std::string_view host) {
540   return base::EqualsCaseInsensitiveASCII(host, "google.com") ||
541          base::EqualsCaseInsensitiveASCII(host, "www.google.com");
542 }
543 
IsLocalHostname(std::string_view host)544 bool IsLocalHostname(std::string_view host) {
545   // Remove any trailing '.'.
546   if (!host.empty() && *host.rbegin() == '.')
547     host.remove_suffix(1);
548 
549   return base::EqualsCaseInsensitiveASCII(host, "localhost") ||
550          IsNormalizedLocalhostTLD(host);
551 }
552 
UnescapePercentEncodedUrl(std::string_view input)553 std::string UnescapePercentEncodedUrl(std::string_view input) {
554   std::string result(input);
555   // Replace any 0x2B (+) with 0x20 (SP).
556   for (char& c : result) {
557     if (c == '+') {
558       c = ' ';
559     }
560   }
561   // Run UTF-8 decoding without BOM on the percent-decoding.
562   url::RawCanonOutputT<char16_t> canon_output;
563   url::DecodeURLEscapeSequences(result, url::DecodeURLMode::kUTF8,
564                                 &canon_output);
565   return base::UTF16ToUTF8(canon_output.view());
566 }
567 
568 }  // namespace net
569