• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/base/url_util.h"
6 
7 #include "build/build_config.h"
8 
9 #if BUILDFLAG(IS_POSIX)
10 #include <netinet/in.h>
11 #elif BUILDFLAG(IS_WIN)
12 #include <ws2tcpip.h>
13 #endif
14 
15 #include <optional>
16 #include <string_view>
17 
18 #include "base/check_op.h"
19 #include "base/containers/fixed_flat_set.h"
20 #include "base/strings/escape.h"
21 #include "base/strings/strcat.h"
22 #include "base/strings/string_util.h"
23 #include "base/strings/stringprintf.h"
24 #include "base/strings/utf_string_conversions.h"
25 #include "net/base/ip_address.h"
26 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
27 #include "url/gurl.h"
28 #include "url/scheme_host_port.h"
29 #include "url/url_canon.h"
30 #include "url/url_canon_internal.h"
31 #include "url/url_canon_ip.h"
32 #include "url/url_constants.h"
33 #include "url/url_util.h"
34 
35 namespace net {
36 
37 namespace {
38 
// Returns true if |c| is a lowercase ASCII letter or an ASCII digit.
// Uppercase letters are intentionally not matched: hosts reaching this helper
// have already had their uppercase characters normalized.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = 'a' <= c && c <= 'z';
  const bool is_digit = '0' <= c && c <= '9';
  return is_lowercase_letter || is_digit;
}
44 
IsNormalizedLocalhostTLD(std::string_view host)45 bool IsNormalizedLocalhostTLD(std::string_view host) {
46   return base::EndsWith(host, ".localhost",
47                         base::CompareCase::INSENSITIVE_ASCII);
48 }
49 
50 // Helper function used by GetIdentityFromURL. If |escaped_text| can be "safely
51 // unescaped" to a valid UTF-8 string, return that string, as UTF-16. Otherwise,
52 // convert it as-is to UTF-16. "Safely unescaped" is defined as having no
53 // escaped character between '0x00' and '0x1F', inclusive.
UnescapeIdentityString(std::string_view escaped_text)54 std::u16string UnescapeIdentityString(std::string_view escaped_text) {
55   std::string unescaped_text;
56   if (base::UnescapeBinaryURLComponentSafe(
57           escaped_text, false /* fail_on_path_separators */, &unescaped_text)) {
58     std::u16string result;
59     if (base::UTF8ToUTF16(unescaped_text.data(), unescaped_text.length(),
60                           &result)) {
61       return result;
62     }
63   }
64   return base::UTF8ToUTF16(escaped_text);
65 }
66 
67 }  // namespace
68 
// Returns a copy of |url| whose query has "<name>=<value>" appended, where
// both parts are escaped with base::EscapeQueryParamValue(). An '&' separator
// is inserted when |url| already has a non-empty query. Existing parameters
// with the same name are left untouched.
GURL AppendQueryParameter(const GURL& url,
                          std::string_view name,
                          std::string_view value) {
  const std::string escaped_pair = base::EscapeQueryParamValue(name, true) +
                                   "=" +
                                   base::EscapeQueryParamValue(value, true);

  std::string new_query(url.query());
  if (!new_query.empty()) {
    new_query += "&";
  }
  new_query += escaped_pair;

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
83 
// Returns a copy of |url| in which the first query parameter whose (escaped)
// key equals the escaped |name| is:
//   * replaced by "<name>=<value>" when |value| has a value, or
//   * removed when |value| is std::nullopt.
// If no such parameter exists and |value| has a value, the pair is appended.
// All other key/value pairs are copied through verbatim, in order.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   std::string_view name,
                                   std::optional<std::string_view> value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const bool keep_param = value.has_value();
  const std::string replacement_pair =
      keep_param
          ? escaped_name + "=" + base::EscapeQueryParamValue(*value, true)
          : std::string();

  std::string new_query;
  // Appends |pair| to |new_query|, '&'-separated from what is already there.
  const auto append_pair = [&new_query](std::string_view pair) {
    if (!new_query.empty()) {
      new_query += "&";
    }
    new_query += pair;
  };

  const std::string_view old_query = url.query_piece();
  url::Component remaining(0, old_query.size());
  url::Component key_range;
  url::Component value_range;
  bool already_handled = false;
  while (url::ExtractQueryKeyValue(old_query.data(), &remaining, &key_range,
                                   &value_range)) {
    const std::string_view key =
        old_query.substr(key_range.begin, key_range.len);

    // Only the first matching pair is replaced or dropped; everything else,
    // including later duplicates of the key, is copied as-is.
    if (already_handled || key != escaped_name) {
      append_pair(old_query.substr(key_range.begin,
                                   value_range.end() - key_range.begin));
      continue;
    }

    already_handled = true;
    if (keep_param) {
      append_pair(replacement_pair);
    }
  }

  if (!already_handled && keep_param) {
    append_pair(replacement_pair);
  }

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
129 
// Returns a copy of |url| with its ref component set to |ref|, whether or not
// |url| previously had one.
GURL AppendOrReplaceRef(const GURL& url, const std::string_view& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  return url.ReplaceComponents(ref_replacement);
}
135 
QueryIterator(const GURL & url)136 QueryIterator::QueryIterator(const GURL& url)
137     : url_(url), at_end_(!url.is_valid()) {
138   if (!at_end_) {
139     query_ = url.parsed_for_possibly_invalid_spec().query;
140     Advance();
141   }
142 }
143 
144 QueryIterator::~QueryIterator() = default;
145 
GetKey() const146 std::string_view QueryIterator::GetKey() const {
147   DCHECK(!at_end_);
148   if (key_.is_nonempty())
149     return std::string_view(url_->spec()).substr(key_.begin, key_.len);
150   return std::string_view();
151 }
152 
GetValue() const153 std::string_view QueryIterator::GetValue() const {
154   DCHECK(!at_end_);
155   if (value_.is_nonempty())
156     return std::string_view(url_->spec()).substr(value_.begin, value_.len);
157   return std::string_view();
158 }
159 
GetUnescapedValue()160 const std::string& QueryIterator::GetUnescapedValue() {
161   DCHECK(!at_end_);
162   if (value_.is_nonempty() && unescaped_value_.empty()) {
163     unescaped_value_ = base::UnescapeURLComponent(
164         GetValue(),
165         base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
166             base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
167             base::UnescapeRule::REPLACE_PLUS_WITH_SPACE);
168   }
169   return unescaped_value_;
170 }
171 
IsAtEnd() const172 bool QueryIterator::IsAtEnd() const {
173   return at_end_;
174 }
175 
Advance()176 void QueryIterator::Advance() {
177   DCHECK(!at_end_);
178   key_.reset();
179   value_.reset();
180   unescaped_value_.clear();
181   at_end_ =
182       !url::ExtractQueryKeyValue(url_->spec().c_str(), &query_, &key_, &value_);
183 }
184 
GetValueForKeyInQuery(const GURL & url,std::string_view search_key,std::string * out_value)185 bool GetValueForKeyInQuery(const GURL& url,
186                            std::string_view search_key,
187                            std::string* out_value) {
188   for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
189     if (it.GetKey() == search_key) {
190       *out_value = it.GetUnescapedValue();
191       return true;
192     }
193   }
194   return false;
195 }
196 
ParseHostAndPort(std::string_view input,std::string * host,int * port)197 bool ParseHostAndPort(std::string_view input, std::string* host, int* port) {
198   if (input.empty())
199     return false;
200 
201   url::Component auth_component(0, input.size());
202   url::Component username_component;
203   url::Component password_component;
204   url::Component hostname_component;
205   url::Component port_component;
206 
207   // `input` is not NUL-terminated, so `input.data()` must be accompanied by a
208   // length. In these calls, `url::Component` provides an offset and length.
209   url::ParseAuthority(input.data(), auth_component, &username_component,
210                       &password_component, &hostname_component,
211                       &port_component);
212 
213   // There shouldn't be a username/password.
214   if (username_component.is_valid() || password_component.is_valid())
215     return false;
216 
217   if (hostname_component.is_empty())
218     return false;  // Failed parsing.
219 
220   int parsed_port_number = -1;
221   if (port_component.is_nonempty()) {
222     parsed_port_number = url::ParsePort(input.data(), port_component);
223 
224     // If parsing failed, port_number will be either PORT_INVALID or
225     // PORT_UNSPECIFIED, both of which are negative.
226     if (parsed_port_number < 0)
227       return false;  // Failed parsing the port number.
228   }
229 
230   if (port_component.len == 0)
231     return false;  // Reject inputs like "foo:"
232 
233   unsigned char tmp_ipv6_addr[16];
234 
235   // If the hostname starts with a bracket, it is either an IPv6 literal or
236   // invalid. If it is an IPv6 literal then strip the brackets.
237   if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
238     if (input[hostname_component.end() - 1] == ']' &&
239         url::IPv6AddressToNumber(input.data(), hostname_component,
240                                  tmp_ipv6_addr)) {
241       // Strip the brackets.
242       hostname_component.begin++;
243       hostname_component.len -= 2;
244     } else {
245       return false;
246     }
247   }
248 
249   // Pass results back to caller.
250   *host = std::string(
251       input.substr(hostname_component.begin, hostname_component.len));
252   *port = parsed_port_number;
253 
254   return true;  // Success.
255 }
256 
GetHostAndPort(const GURL & url)257 std::string GetHostAndPort(const GURL& url) {
258   // For IPv6 literals, GURL::host() already includes the brackets so it is
259   // safe to just append a colon.
260   return base::StringPrintf("%s:%d", url.host().c_str(),
261                             url.EffectiveIntPort());
262 }
263 
GetHostAndOptionalPort(const GURL & url)264 std::string GetHostAndOptionalPort(const GURL& url) {
265   // For IPv6 literals, GURL::host() already includes the brackets
266   // so it is safe to just append a colon.
267   if (url.has_port())
268     return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
269   return url.host();
270 }
271 
GetHostAndOptionalPort(const url::SchemeHostPort & scheme_host_port)272 NET_EXPORT std::string GetHostAndOptionalPort(
273     const url::SchemeHostPort& scheme_host_port) {
274   int default_port = url::DefaultPortForScheme(
275       scheme_host_port.scheme().data(),
276       static_cast<int>(scheme_host_port.scheme().length()));
277   if (default_port != scheme_host_port.port()) {
278     return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
279                               scheme_host_port.port());
280   }
281   return scheme_host_port.host();
282 }
283 
// Returns |host| with one trailing '.' removed, if present. A host that
// consists solely of "." (or is empty) is returned unchanged, and at most one
// dot is ever removed.
std::string TrimEndingDot(std::string_view host) {
  const bool has_trailing_dot = host.length() > 1 && host.back() == '.';
  return std::string(has_trailing_dot ? host.substr(0, host.length() - 1)
                                      : host);
}
292 
GetHostOrSpecFromURL(const GURL & url)293 std::string GetHostOrSpecFromURL(const GURL& url) {
294   return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
295 }
296 
// Returns everything after the first '.' in |domain|, i.e. |domain| with its
// leftmost label removed. Returns an empty string when |domain| contains no
// dot.
std::string GetSuperdomain(std::string_view domain) {
  const size_t first_dot = domain.find('.');
  if (first_dot == std::string_view::npos) {
    return std::string();
  }
  domain.remove_prefix(first_dot + 1);
  return std::string(domain);
}
303 
// Returns true if |subdomain| equals |superdomain|, or if |subdomain| ends
// with |superdomain| and the character immediately before that suffix is a
// '.'. The comparison is plain and case-sensitive.
bool IsSubdomainOf(std::string_view subdomain, std::string_view superdomain) {
  const size_t sub_length = subdomain.length();
  const size_t super_length = superdomain.length();

  // With no room for an extra label, only an exact match qualifies.
  if (sub_length <= super_length) {
    return subdomain == superdomain;
  }

  // |boundary| is where the candidate suffix starts; it is at least 1 here,
  // so the character before it always exists.
  const size_t boundary = sub_length - super_length;
  return subdomain.substr(boundary) == superdomain &&
         subdomain[boundary - 1] == '.';
}
318 
CanonicalizeHost(std::string_view host,url::CanonHostInfo * host_info)319 std::string CanonicalizeHost(std::string_view host,
320                              url::CanonHostInfo* host_info) {
321   // Try to canonicalize the host.
322   const url::Component raw_host_component(0, static_cast<int>(host.length()));
323   std::string canon_host;
324   url::StdStringCanonOutput canon_host_output(&canon_host);
325   // A url::StdStringCanonOutput starts off with a zero length buffer. The
326   // first time through Grow() immediately resizes it to 32 bytes, incurring
327   // a malloc. With libcxx a 22 byte or smaller request can be accommodated
328   // within the std::string itself (i.e. no malloc occurs). Start the buffer
329   // off at the max size to avoid a malloc on short strings.
330   // NOTE: To ensure the final size is correctly reflected, it's necessary
331   // to call Complete() which will adjust the size to the actual bytes written.
332   // This is handled below for success cases, while failure cases discard all
333   // the output.
334   const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
335   canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
336   url::CanonicalizeHostVerbose(host.data(), raw_host_component,
337                                &canon_host_output, host_info);
338 
339   if (host_info->out_host.is_nonempty() &&
340       host_info->family != url::CanonHostInfo::BROKEN) {
341     // Success!  Assert that there's no extra garbage.
342     canon_host_output.Complete();
343     DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
344   } else {
345     // Empty host, or canonicalization failed.  We'll return empty.
346     canon_host.clear();
347   }
348 
349   return canon_host;
350 }
351 
IsCanonicalizedHostCompliant(std::string_view host)352 bool IsCanonicalizedHostCompliant(std::string_view host) {
353   if (host.empty() || host.size() > 254 ||
354       (host.back() != '.' && host.size() == 254)) {
355     return false;
356   }
357 
358   bool in_component = false;
359   bool most_recent_component_started_alphanumeric = false;
360   size_t label_size = 0;
361 
362   for (char c : host) {
363     ++label_size;
364     if (!in_component) {
365       most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
366       if (!most_recent_component_started_alphanumeric && (c != '-') &&
367           (c != '_')) {
368         return false;
369       }
370       in_component = true;
371     } else if (c == '.') {
372       in_component = false;
373       if (label_size > 64 || label_size == 1) {
374         // Label should not be empty or longer than 63 characters (+1 for '.'
375         // character included in `label_size`).
376         return false;
377       } else {
378         label_size = 0;
379       }
380     } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
381       return false;
382     }
383   }
384 
385   // Check for too-long label when not ended with final '.'.
386   if (label_size > 63)
387     return false;
388 
389   return most_recent_component_started_alphanumeric;
390 }
391 
IsHostnameNonUnique(std::string_view hostname)392 bool IsHostnameNonUnique(std::string_view hostname) {
393   // CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
394   const std::string host_or_ip = hostname.find(':') != std::string::npos
395                                      ? base::StrCat({"[", hostname, "]"})
396                                      : std::string(hostname);
397   url::CanonHostInfo host_info;
398   std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
399 
400   // If canonicalization fails, then the input is truly malformed. However,
401   // to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
402   if (canonical_name.empty())
403     return false;
404 
405   // If |hostname| is an IP address, check to see if it's in an IANA-reserved
406   // range reserved for non-publicly routable networks.
407   if (host_info.IsIPAddress()) {
408     IPAddress host_addr;
409     if (!host_addr.AssignFromIPLiteral(hostname.substr(
410             host_info.out_host.begin, host_info.out_host.len))) {
411       return false;
412     }
413     switch (host_info.family) {
414       case url::CanonHostInfo::IPV4:
415       case url::CanonHostInfo::IPV6:
416         return !host_addr.IsPubliclyRoutable();
417       case url::CanonHostInfo::NEUTRAL:
418       case url::CanonHostInfo::BROKEN:
419         return false;
420     }
421   }
422 
423   // Check for a registry controlled portion of |hostname|, ignoring private
424   // registries, as they already chain to ICANN-administered registries,
425   // and explicitly ignoring unknown registries.
426   //
427   // Note: This means that as new gTLDs are introduced on the Internet, they
428   // will be treated as non-unique until the registry controlled domain list
429   // is updated. However, because gTLDs are expected to provide significant
430   // advance notice to deprecate older versions of this code, this an
431   // acceptable tradeoff.
432   return !registry_controlled_domains::HostHasRegistryControlledDomain(
433       canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
434       registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
435 }
436 
IsLocalhost(const GURL & url)437 bool IsLocalhost(const GURL& url) {
438   return HostStringIsLocalhost(url.HostNoBracketsPiece());
439 }
440 
HostStringIsLocalhost(std::string_view host)441 bool HostStringIsLocalhost(std::string_view host) {
442   IPAddress ip_address;
443   if (ip_address.AssignFromIPLiteral(host))
444     return ip_address.IsLoopback();
445   return IsLocalHostname(host);
446 }
447 
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path: with nothing to strip, return the URL as-is and skip the
  // re-canonicalization that ReplaceComponents performs.
  const bool has_strippable_parts =
      url.has_username() || url.has_password() || url.has_ref();
  if (!has_strippable_parts) {
    return url;
  }

  GURL::Replacements strip_identity_and_ref;
  strip_identity_and_ref.ClearUsername();
  strip_identity_and_ref.ClearPassword();
  strip_identity_and_ref.ClearRef();
  return url.ReplaceComponents(strip_identity_and_ref);
}
459 
// Returns |url| with its WebSocket scheme swapped for the HTTP counterpart:
// the wss scheme becomes https, anything else becomes http. |url| must have
// a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());

  const bool is_secure = url.SchemeIs(url::kWssScheme);
  GURL::Replacements scheme_replacement;
  scheme_replacement.SetSchemeStr(is_secure ? url::kHttpsScheme
                                            : url::kHttpScheme);
  return url.ReplaceComponents(scheme_replacement);
}
467 
IsStandardSchemeWithNetworkHost(std::string_view scheme)468 bool IsStandardSchemeWithNetworkHost(std::string_view scheme) {
469   // file scheme is special. Windows file share origins can have network hosts.
470   if (scheme == url::kFileScheme)
471     return true;
472 
473   url::SchemeType scheme_type;
474   if (!url::GetStandardSchemeType(
475           scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
476     return false;
477   }
478   return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
479          scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
480 }
481 
GetIdentityFromURL(const GURL & url,std::u16string * username,std::u16string * password)482 void GetIdentityFromURL(const GURL& url,
483                         std::u16string* username,
484                         std::u16string* password) {
485   *username = UnescapeIdentityString(url.username());
486   *password = UnescapeIdentityString(url.password());
487 }
488 
HasGoogleHost(const GURL & url)489 bool HasGoogleHost(const GURL& url) {
490   return IsGoogleHost(url.host_piece());
491 }
492 
// Returns true if |host| ends with one of a fixed list of Google-owned domain
// suffixes. Every suffix starts with '.', so a bare registrable domain such
// as "google.com" does not match, while "www.google.com" does.
bool IsGoogleHost(std::string_view host) {
  static constexpr std::string_view kGoogleHostSuffixes[] = {
      ".google.com",
      ".youtube.com",
      ".gmail.com",
      ".doubleclick.net",
      ".gstatic.com",
      ".googlevideo.com",
      ".googleusercontent.com",
      ".googlesyndication.com",
      ".google-analytics.com",
      ".googleadservices.com",
      ".googleapis.com",
      ".ytimg.com",
  };

  // A case-sensitive comparison is enough: the list above is all lowercase,
  // and a GURL's host name is always canonicalized to lowercase as well.
  for (const std::string_view suffix : kGoogleHostSuffixes) {
    if (host.size() < suffix.size()) {
      continue;
    }
    if (host.substr(host.size() - suffix.size()) == suffix) {
      return true;
    }
  }
  return false;
}
518 
IsGoogleHostWithAlpnH3(std::string_view host)519 bool IsGoogleHostWithAlpnH3(std::string_view host) {
520   return base::EqualsCaseInsensitiveASCII(host, "google.com") ||
521          base::EqualsCaseInsensitiveASCII(host, "www.google.com");
522 }
523 
IsLocalHostname(std::string_view host)524 bool IsLocalHostname(std::string_view host) {
525   // Remove any trailing '.'.
526   if (!host.empty() && *host.rbegin() == '.')
527     host.remove_suffix(1);
528 
529   return base::EqualsCaseInsensitiveASCII(host, "localhost") ||
530          IsNormalizedLocalhostTLD(host);
531 }
532 
UnescapePercentEncodedUrl(std::string_view input)533 std::string UnescapePercentEncodedUrl(std::string_view input) {
534   std::string result(input);
535   // Replace any 0x2B (+) with 0x20 (SP).
536   for (char& c : result) {
537     if (c == '+') {
538       c = ' ';
539     }
540   }
541   // Run UTF-8 decoding without BOM on the percent-decoding.
542   url::RawCanonOutputT<char16_t> canon_output;
543   url::DecodeURLEscapeSequences(result, url::DecodeURLMode::kUTF8,
544                                 &canon_output);
545   return base::UTF16ToUTF8(canon_output.view());
546 }
547 
548 }  // namespace net
549