1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/base/url_util.h"
6
7 #include "build/build_config.h"
8
9 #if BUILDFLAG(IS_POSIX)
10 #include <netinet/in.h>
11 #elif BUILDFLAG(IS_WIN)
12 #include <ws2tcpip.h>
13 #endif
14
15 #include <optional>
16 #include <string_view>
17
18 #include "base/check_op.h"
19 #include "base/containers/fixed_flat_set.h"
20 #include "base/strings/escape.h"
21 #include "base/strings/strcat.h"
22 #include "base/strings/string_util.h"
23 #include "base/strings/stringprintf.h"
24 #include "base/strings/utf_string_conversions.h"
25 #include "net/base/ip_address.h"
26 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
27 #include "url/gurl.h"
28 #include "url/scheme_host_port.h"
29 #include "url/url_canon.h"
30 #include "url/url_canon_internal.h"
31 #include "url/url_canon_ip.h"
32 #include "url/url_constants.h"
33 #include "url/url_util.h"
34
35 namespace net {
36
37 namespace {
38
// Returns true if |c| is an ASCII lowercase letter or an ASCII digit.
// Uppercase letters are not accepted: uppercase characters have already been
// normalized by the time this is called.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = ('a' <= c) && (c <= 'z');
  const bool is_digit = ('0' <= c) && (c <= '9');
  return is_lowercase_letter || is_digit;
}
44
IsNormalizedLocalhostTLD(std::string_view host)45 bool IsNormalizedLocalhostTLD(std::string_view host) {
46 return base::EndsWith(host, ".localhost",
47 base::CompareCase::INSENSITIVE_ASCII);
48 }
49
50 // Helper function used by GetIdentityFromURL. If |escaped_text| can be "safely
51 // unescaped" to a valid UTF-8 string, return that string, as UTF-16. Otherwise,
52 // convert it as-is to UTF-16. "Safely unescaped" is defined as having no
53 // escaped character between '0x00' and '0x1F', inclusive.
UnescapeIdentityString(std::string_view escaped_text)54 std::u16string UnescapeIdentityString(std::string_view escaped_text) {
55 std::string unescaped_text;
56 if (base::UnescapeBinaryURLComponentSafe(
57 escaped_text, false /* fail_on_path_separators */, &unescaped_text)) {
58 std::u16string result;
59 if (base::UTF8ToUTF16(unescaped_text.data(), unescaped_text.length(),
60 &result)) {
61 return result;
62 }
63 }
64 return base::UTF8ToUTF16(escaped_text);
65 }
66
67 } // namespace
68
// Returns a copy of |url| whose query has "name=value" appended. Both |name|
// and |value| are escaped with base::EscapeQueryParamValue(). An '&' separator
// is inserted when the existing query is non-empty.
GURL AppendQueryParameter(const GURL& url,
                          std::string_view name,
                          std::string_view value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const std::string escaped_value = base::EscapeQueryParamValue(value, true);

  std::string new_query(url.query());
  if (!new_query.empty()) {
    new_query += "&";
  }
  new_query += escaped_name;
  new_query += "=";
  new_query += escaped_value;

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
83
// Returns a copy of |url| in which the first query parameter whose key equals
// the escaped |name| is replaced with "name=value". If no such parameter
// exists, "name=value" is appended. If |value| is std::nullopt, the first
// matching parameter is removed instead and nothing is appended. All other
// parameters are copied through unchanged.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   std::string_view name,
                                   std::optional<std::string_view> value) {
  const bool has_new_value = value.has_value();
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const std::string escaped_value =
      has_new_value ? base::EscapeQueryParamValue(*value, true)
                    : std::string();

  std::string new_query;
  // Appends one "key=value" chunk to |new_query|, preceded by '&' when
  // |new_query| already has content.
  const auto append_pair = [&new_query](std::string_view pair) {
    if (!new_query.empty()) {
      new_query += "&";
    }
    new_query += pair;
  };

  const std::string_view old_query = url.query_piece();
  url::Component remaining(0, old_query.size());
  url::Component key_range;
  url::Component value_range;
  bool already_replaced = false;
  while (url::ExtractQueryKeyValue(old_query, &remaining, &key_range,
                                   &value_range)) {
    // Only the first matching pair is replaced; later duplicates are kept.
    const bool is_target =
        !already_replaced &&
        old_query.substr(key_range.begin, key_range.len) == escaped_name;
    if (!is_target) {
      append_pair(old_query.substr(key_range.begin,
                                   value_range.end() - key_range.begin));
      continue;
    }

    already_replaced = true;
    if (has_new_value) {
      append_pair(escaped_name + "=" + escaped_value);
    }
  }

  if (!already_replaced && has_new_value) {
    append_pair(escaped_name + "=" + escaped_value);
  }

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
128
// Returns a copy of |url| whose ref component is set to |ref|.
GURL AppendOrReplaceRef(const GURL& url, const std::string_view& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  return url.ReplaceComponents(ref_replacement);
}
134
QueryIterator(const GURL & url)135 QueryIterator::QueryIterator(const GURL& url)
136 : url_(url), at_end_(!url.is_valid()) {
137 if (!at_end_) {
138 query_ = url.parsed_for_possibly_invalid_spec().query;
139 Advance();
140 }
141 }
142
143 QueryIterator::~QueryIterator() = default;
144
GetKey() const145 std::string_view QueryIterator::GetKey() const {
146 DCHECK(!at_end_);
147 if (key_.is_nonempty())
148 return std::string_view(url_->spec()).substr(key_.begin, key_.len);
149 return std::string_view();
150 }
151
GetValue() const152 std::string_view QueryIterator::GetValue() const {
153 DCHECK(!at_end_);
154 if (value_.is_nonempty())
155 return std::string_view(url_->spec()).substr(value_.begin, value_.len);
156 return std::string_view();
157 }
158
GetUnescapedValue()159 const std::string& QueryIterator::GetUnescapedValue() {
160 DCHECK(!at_end_);
161 if (value_.is_nonempty() && unescaped_value_.empty()) {
162 unescaped_value_ = base::UnescapeURLComponent(
163 GetValue(),
164 base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
165 base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
166 base::UnescapeRule::REPLACE_PLUS_WITH_SPACE);
167 }
168 return unescaped_value_;
169 }
170
IsAtEnd() const171 bool QueryIterator::IsAtEnd() const {
172 return at_end_;
173 }
174
Advance()175 void QueryIterator::Advance() {
176 DCHECK(!at_end_);
177 key_.reset();
178 value_.reset();
179 unescaped_value_.clear();
180 at_end_ = !url::ExtractQueryKeyValue(url_->spec(), &query_, &key_, &value_);
181 }
182
GetValueForKeyInQuery(const GURL & url,std::string_view search_key,std::string * out_value)183 bool GetValueForKeyInQuery(const GURL& url,
184 std::string_view search_key,
185 std::string* out_value) {
186 for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
187 if (it.GetKey() == search_key) {
188 *out_value = it.GetUnescapedValue();
189 return true;
190 }
191 }
192 return false;
193 }
194
ParseHostAndPort(std::string_view input,std::string * host,int * port)195 bool ParseHostAndPort(std::string_view input, std::string* host, int* port) {
196 if (input.empty())
197 return false;
198
199 url::Component auth_component(0, input.size());
200 url::Component username_component;
201 url::Component password_component;
202 url::Component hostname_component;
203 url::Component port_component;
204
205 // `input` is not NUL-terminated, so `input.data()` must be accompanied by a
206 // length. In these calls, `url::Component` provides an offset and length.
207 url::ParseAuthority(input.data(), auth_component, &username_component,
208 &password_component, &hostname_component,
209 &port_component);
210
211 // There shouldn't be a username/password.
212 if (username_component.is_valid() || password_component.is_valid())
213 return false;
214
215 if (hostname_component.is_empty())
216 return false; // Failed parsing.
217
218 int parsed_port_number = -1;
219 if (port_component.is_nonempty()) {
220 parsed_port_number = url::ParsePort(input.data(), port_component);
221
222 // If parsing failed, port_number will be either PORT_INVALID or
223 // PORT_UNSPECIFIED, both of which are negative.
224 if (parsed_port_number < 0)
225 return false; // Failed parsing the port number.
226 }
227
228 if (port_component.len == 0)
229 return false; // Reject inputs like "foo:"
230
231 unsigned char tmp_ipv6_addr[16];
232
233 // If the hostname starts with a bracket, it is either an IPv6 literal or
234 // invalid. If it is an IPv6 literal then strip the brackets.
235 if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
236 if (input[hostname_component.end() - 1] == ']' &&
237 url::IPv6AddressToNumber(input.data(), hostname_component,
238 tmp_ipv6_addr)) {
239 // Strip the brackets.
240 hostname_component.begin++;
241 hostname_component.len -= 2;
242 } else {
243 return false;
244 }
245 }
246
247 // Pass results back to caller.
248 *host = std::string(
249 input.substr(hostname_component.begin, hostname_component.len));
250 *port = parsed_port_number;
251
252 return true; // Success.
253 }
254
GetHostAndPort(const GURL & url)255 std::string GetHostAndPort(const GURL& url) {
256 // For IPv6 literals, GURL::host() already includes the brackets so it is
257 // safe to just append a colon.
258 return base::StringPrintf("%s:%d", url.host().c_str(),
259 url.EffectiveIntPort());
260 }
261
GetHostAndOptionalPort(const GURL & url)262 std::string GetHostAndOptionalPort(const GURL& url) {
263 // For IPv6 literals, GURL::host() already includes the brackets
264 // so it is safe to just append a colon.
265 if (url.has_port())
266 return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
267 return url.host();
268 }
269
GetHostAndOptionalPort(const url::SchemeHostPort & scheme_host_port)270 NET_EXPORT std::string GetHostAndOptionalPort(
271 const url::SchemeHostPort& scheme_host_port) {
272 int default_port = url::DefaultPortForScheme(scheme_host_port.scheme());
273 if (default_port != scheme_host_port.port()) {
274 return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
275 scheme_host_port.port());
276 }
277 return scheme_host_port.host();
278 }
279
// Returns |host| with a single trailing '.' removed. A host that consists of
// just "." (or is empty) is returned unchanged.
std::string TrimEndingDot(std::string_view host) {
  const bool has_removable_dot = host.size() > 1 && host.back() == '.';
  if (has_removable_dot) {
    return std::string(host.substr(0, host.size() - 1));
  }
  return std::string(host);
}
288
GetHostOrSpecFromURL(const GURL & url)289 std::string GetHostOrSpecFromURL(const GURL& url) {
290 return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
291 }
292
// Returns everything after the first '.' in |domain|, or an empty string when
// |domain| contains no '.'.
std::string GetSuperdomain(std::string_view domain) {
  const size_t first_dot = domain.find('.');
  const bool has_dot = first_dot != std::string_view::npos;
  return has_dot ? std::string(domain.substr(first_dot + 1)) : std::string();
}
299
// Returns true if |subdomain| equals |superdomain|, or if |subdomain| ends
// with |superdomain| and the character right before that suffix is a '.'.
bool IsSubdomainOf(std::string_view subdomain, std::string_view superdomain) {
  // A string that is not strictly longer can only match by being identical.
  if (subdomain.length() <= superdomain.length()) {
    return subdomain == superdomain;
  }

  // |subdomain| is strictly longer, so |prefix_length| is at least 1 and the
  // index below is in range. The prefix must end in a dot and the rest must be
  // exactly |superdomain|.
  const size_t prefix_length = subdomain.length() - superdomain.length();
  return subdomain[prefix_length - 1] == '.' &&
         subdomain.substr(prefix_length) == superdomain;
}
314
315 namespace {
CanonicalizeHost(std::string_view host,bool is_file_scheme,url::CanonHostInfo * host_info)316 std::string CanonicalizeHost(std::string_view host,
317 bool is_file_scheme,
318 url::CanonHostInfo* host_info) {
319 // Try to canonicalize the host.
320 const url::Component raw_host_component(0, static_cast<int>(host.length()));
321 std::string canon_host;
322 url::StdStringCanonOutput canon_host_output(&canon_host);
323 // A url::StdStringCanonOutput starts off with a zero length buffer. The
324 // first time through Grow() immediately resizes it to 32 bytes, incurring
325 // a malloc. With libcxx a 22 byte or smaller request can be accommodated
326 // within the std::string itself (i.e. no malloc occurs). Start the buffer
327 // off at the max size to avoid a malloc on short strings.
328 // NOTE: To ensure the final size is correctly reflected, it's necessary
329 // to call Complete() which will adjust the size to the actual bytes written.
330 // This is handled below for success cases, while failure cases discard all
331 // the output.
332 const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
333 canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
334 if (is_file_scheme) {
335 url::CanonicalizeFileHostVerbose(host.data(), raw_host_component,
336 canon_host_output, *host_info);
337 } else {
338 url::CanonicalizeSpecialHostVerbose(host.data(), raw_host_component,
339 canon_host_output, *host_info);
340 }
341
342 if (host_info->out_host.is_nonempty() &&
343 host_info->family != url::CanonHostInfo::BROKEN) {
344 // Success! Assert that there's no extra garbage.
345 canon_host_output.Complete();
346 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
347 } else {
348 // Empty host, or canonicalization failed. We'll return empty.
349 canon_host.clear();
350 }
351
352 return canon_host;
353 }
354 } // namespace
355
CanonicalizeHost(std::string_view host,url::CanonHostInfo * host_info)356 std::string CanonicalizeHost(std::string_view host,
357 url::CanonHostInfo* host_info) {
358 return CanonicalizeHost(host, /*is_file_scheme=*/false, host_info);
359 }
360
CanonicalizeFileHost(std::string_view host,url::CanonHostInfo * host_info)361 std::string CanonicalizeFileHost(std::string_view host,
362 url::CanonHostInfo* host_info) {
363 return CanonicalizeHost(host, /*is_file_scheme=*/true, host_info);
364 }
365
IsCanonicalizedHostCompliant(std::string_view host)366 bool IsCanonicalizedHostCompliant(std::string_view host) {
367 if (host.empty() || host.size() > 254 ||
368 (host.back() != '.' && host.size() == 254)) {
369 return false;
370 }
371
372 bool in_component = false;
373 bool most_recent_component_started_alphanumeric = false;
374 size_t label_size = 0;
375
376 for (char c : host) {
377 ++label_size;
378 if (!in_component) {
379 most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
380 if (!most_recent_component_started_alphanumeric && (c != '-') &&
381 (c != '_')) {
382 return false;
383 }
384 in_component = true;
385 } else if (c == '.') {
386 in_component = false;
387 if (label_size > 64 || label_size == 1) {
388 // Label should not be empty or longer than 63 characters (+1 for '.'
389 // character included in `label_size`).
390 return false;
391 } else {
392 label_size = 0;
393 }
394 } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
395 return false;
396 }
397 }
398
399 // Check for too-long label when not ended with final '.'.
400 if (label_size > 63)
401 return false;
402
403 return most_recent_component_started_alphanumeric;
404 }
405
IsHostnameNonUnique(std::string_view hostname)406 bool IsHostnameNonUnique(std::string_view hostname) {
407 // CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
408 const std::string host_or_ip = hostname.find(':') != std::string::npos
409 ? base::StrCat({"[", hostname, "]"})
410 : std::string(hostname);
411 url::CanonHostInfo host_info;
412 std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
413
414 // If canonicalization fails, then the input is truly malformed. However,
415 // to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
416 if (canonical_name.empty())
417 return false;
418
419 // If |hostname| is an IP address, check to see if it's in an IANA-reserved
420 // range reserved for non-publicly routable networks.
421 if (host_info.IsIPAddress()) {
422 IPAddress host_addr;
423 if (!host_addr.AssignFromIPLiteral(hostname.substr(
424 host_info.out_host.begin, host_info.out_host.len))) {
425 return false;
426 }
427 switch (host_info.family) {
428 case url::CanonHostInfo::IPV4:
429 case url::CanonHostInfo::IPV6:
430 return !host_addr.IsPubliclyRoutable();
431 case url::CanonHostInfo::NEUTRAL:
432 case url::CanonHostInfo::BROKEN:
433 return false;
434 }
435 }
436
437 // Check for a registry controlled portion of |hostname|, ignoring private
438 // registries, as they already chain to ICANN-administered registries,
439 // and explicitly ignoring unknown registries. Registry identifiers themselves
440 // are also treated as unique, since a TLD is a valid hostname and can host a
441 // web server.
442 //
443 // Note: This means that as new gTLDs are introduced on the Internet, they
444 // will be treated as non-unique until the registry controlled domain list
445 // is updated. However, because gTLDs are expected to provide significant
446 // advance notice to deprecate older versions of this code, this an
447 // acceptable tradeoff.
448 return !registry_controlled_domains::HostHasRegistryControlledDomain(
449 canonical_name,
450 registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
451 registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES) &&
452 !registry_controlled_domains::HostIsRegistryIdentifier(
453 canonical_name,
454 registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
455 }
456
IsLocalhost(const GURL & url)457 bool IsLocalhost(const GURL& url) {
458 return HostStringIsLocalhost(url.HostNoBracketsPiece());
459 }
460
HostStringIsLocalhost(std::string_view host)461 bool HostStringIsLocalhost(std::string_view host) {
462 IPAddress ip_address;
463 if (ip_address.AssignFromIPLiteral(host))
464 return ip_address.IsLoopback();
465 return IsLocalHostname(host);
466 }
467
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path: with nothing to strip, return the URL as-is and avoid
  // re-canonicalization via ReplaceComponents.
  const bool needs_stripping =
      url.has_username() || url.has_password() || url.has_ref();
  if (!needs_stripping) {
    return url;
  }

  GURL::Replacements strip_identity_and_ref;
  strip_identity_and_ref.ClearUsername();
  strip_identity_and_ref.ClearPassword();
  strip_identity_and_ref.ClearRef();
  return url.ReplaceComponents(strip_identity_and_ref);
}
479
// Returns |url| with its scheme replaced: "wss" becomes "https", anything
// else becomes "http". |url| must have a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());
  const bool is_secure = url.SchemeIs(url::kWssScheme);
  GURL::Replacements scheme_replacement;
  scheme_replacement.SetSchemeStr(is_secure ? url::kHttpsScheme
                                            : url::kHttpScheme);
  return url.ReplaceComponents(scheme_replacement);
}
487
IsStandardSchemeWithNetworkHost(std::string_view scheme)488 bool IsStandardSchemeWithNetworkHost(std::string_view scheme) {
489 // file scheme is special. Windows file share origins can have network hosts.
490 if (scheme == url::kFileScheme)
491 return true;
492
493 url::SchemeType scheme_type;
494 if (!url::GetStandardSchemeType(
495 scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
496 return false;
497 }
498 return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
499 scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
500 }
501
GetIdentityFromURL(const GURL & url,std::u16string * username,std::u16string * password)502 void GetIdentityFromURL(const GURL& url,
503 std::u16string* username,
504 std::u16string* password) {
505 *username = UnescapeIdentityString(url.username());
506 *password = UnescapeIdentityString(url.password());
507 }
508
HasGoogleHost(const GURL & url)509 bool HasGoogleHost(const GURL& url) {
510 return IsGoogleHost(url.host_piece());
511 }
512
IsGoogleHost(std::string_view host)513 bool IsGoogleHost(std::string_view host) {
514 static const char* kGoogleHostSuffixes[] = {
515 ".google.com",
516 ".youtube.com",
517 ".gmail.com",
518 ".doubleclick.net",
519 ".gstatic.com",
520 ".googlevideo.com",
521 ".googleusercontent.com",
522 ".googlesyndication.com",
523 ".google-analytics.com",
524 ".googleadservices.com",
525 ".googleapis.com",
526 ".ytimg.com",
527 };
528 for (const char* suffix : kGoogleHostSuffixes) {
529 // Here it's possible to get away with faster case-sensitive comparisons
530 // because the list above is all lowercase, and a GURL's host name will
531 // always be canonicalized to lowercase as well.
532 if (host.ends_with(suffix)) {
533 return true;
534 }
535 }
536 return false;
537 }
538
IsGoogleHostWithAlpnH3(std::string_view host)539 bool IsGoogleHostWithAlpnH3(std::string_view host) {
540 return base::EqualsCaseInsensitiveASCII(host, "google.com") ||
541 base::EqualsCaseInsensitiveASCII(host, "www.google.com");
542 }
543
IsLocalHostname(std::string_view host)544 bool IsLocalHostname(std::string_view host) {
545 // Remove any trailing '.'.
546 if (!host.empty() && *host.rbegin() == '.')
547 host.remove_suffix(1);
548
549 return base::EqualsCaseInsensitiveASCII(host, "localhost") ||
550 IsNormalizedLocalhostTLD(host);
551 }
552
UnescapePercentEncodedUrl(std::string_view input)553 std::string UnescapePercentEncodedUrl(std::string_view input) {
554 std::string result(input);
555 // Replace any 0x2B (+) with 0x20 (SP).
556 for (char& c : result) {
557 if (c == '+') {
558 c = ' ';
559 }
560 }
561 // Run UTF-8 decoding without BOM on the percent-decoding.
562 url::RawCanonOutputT<char16_t> canon_output;
563 url::DecodeURLEscapeSequences(result, url::DecodeURLMode::kUTF8,
564 &canon_output);
565 return base::UTF16ToUTF8(canon_output.view());
566 }
567
568 } // namespace net
569