1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/base/url_util.h"
6
7 #include "build/build_config.h"
8
9 #if BUILDFLAG(IS_POSIX)
10 #include <netinet/in.h>
11 #elif BUILDFLAG(IS_WIN)
12 #include <ws2tcpip.h>
13 #endif
14
15 #include "base/check_op.h"
16 #include "base/containers/fixed_flat_set.h"
17 #include "base/strings/escape.h"
18 #include "base/strings/strcat.h"
19 #include "base/strings/string_piece.h"
20 #include "base/strings/string_util.h"
21 #include "base/strings/stringprintf.h"
22 #include "base/strings/utf_string_conversions.h"
23 #include "net/base/ip_address.h"
24 #include "net/base/registry_controlled_domains/registry_controlled_domain.h"
25 #include "third_party/abseil-cpp/absl/types/optional.h"
26 #include "url/gurl.h"
27 #include "url/scheme_host_port.h"
28 #include "url/url_canon.h"
29 #include "url/url_canon_internal.h"
30 #include "url/url_canon_ip.h"
31 #include "url/url_constants.h"
32 #include "url/url_util.h"
33
34 namespace net {
35
36 namespace {
37
// Returns true when |c| is a lowercase ASCII letter or an ASCII digit.
// Uppercase letters are intentionally not accepted: uppercase characters have
// already been normalized by the time this is called.
bool IsHostCharAlphanumeric(char c) {
  const bool is_lowercase_letter = (c >= 'a') && (c <= 'z');
  const bool is_digit = (c >= '0') && (c <= '9');
  return is_lowercase_letter || is_digit;
}
43
IsNormalizedLocalhostTLD(base::StringPiece host)44 bool IsNormalizedLocalhostTLD(base::StringPiece host) {
45 return base::EndsWith(host, ".localhost",
46 base::CompareCase::INSENSITIVE_ASCII);
47 }
48
49 // Helper function used by GetIdentityFromURL. If |escaped_text| can be "safely
50 // unescaped" to a valid UTF-8 string, return that string, as UTF-16. Otherwise,
51 // convert it as-is to UTF-16. "Safely unescaped" is defined as having no
52 // escaped character between '0x00' and '0x1F', inclusive.
UnescapeIdentityString(base::StringPiece escaped_text)53 std::u16string UnescapeIdentityString(base::StringPiece escaped_text) {
54 std::string unescaped_text;
55 if (base::UnescapeBinaryURLComponentSafe(
56 escaped_text, false /* fail_on_path_separators */, &unescaped_text)) {
57 std::u16string result;
58 if (base::UTF8ToUTF16(unescaped_text.data(), unescaped_text.length(),
59 &result)) {
60 return result;
61 }
62 }
63 return base::UTF8ToUTF16(escaped_text);
64 }
65
66 } // namespace
67
// Returns a copy of |url| with "name=value" appended to its query. Both |name|
// and |value| are escaped with base::EscapeQueryParamValue(). A '&' separator
// is inserted when the existing query is non-empty.
GURL AppendQueryParameter(const GURL& url,
                          base::StringPiece name,
                          base::StringPiece value) {
  const std::string escaped_pair = base::EscapeQueryParamValue(name, true) +
                                   "=" +
                                   base::EscapeQueryParamValue(value, true);

  std::string new_query(url.query());
  if (!new_query.empty())
    new_query += "&";
  new_query += escaped_pair;

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(new_query);
  return url.ReplaceComponents(query_replacement);
}
82
// Returns a copy of |url| in which the first query pair whose key equals the
// escaped form of |name| is rewritten:
//   * if |value| is set, that pair becomes "name=value" (both escaped);
//   * if |value| is unset, that pair is dropped.
// Every other pair, including later pairs with the same key, is copied
// through unchanged. If no pair matches and |value| is set, "name=value" is
// appended at the end.
GURL AppendOrReplaceQueryParameter(const GURL& url,
                                   base::StringPiece name,
                                   absl::optional<base::StringPiece> value) {
  const std::string escaped_name = base::EscapeQueryParamValue(name, true);
  const bool keep_param = value.has_value();
  const std::string escaped_value =
      keep_param ? base::EscapeQueryParamValue(value.value(), true)
                 : std::string();
  // Only consumed when |keep_param| is true.
  const std::string replacement_pair = escaped_name + "=" + escaped_value;

  std::string rebuilt_query;
  // Appends |pair| to |rebuilt_query|, separating it from any earlier content
  // with '&'.
  const auto append_pair = [&rebuilt_query](base::StringPiece pair) {
    if (!rebuilt_query.empty())
      rebuilt_query += "&";
    rebuilt_query.append(pair.data(), pair.size());
  };

  const base::StringPiece query = url.query_piece();
  url::Component remaining(0, query.size());
  url::Component key_component;
  url::Component value_component;
  bool already_replaced = false;
  while (url::ExtractQueryKeyValue(query.data(), &remaining, &key_component,
                                   &value_component)) {
    const base::StringPiece key =
        query.substr(key_component.begin, key_component.len);
    // Only the first matching pair is replaced (or removed).
    const bool is_first_match = !already_replaced && key == escaped_name;
    if (!is_first_match) {
      // Copy the pair verbatim, from the start of its key through the end of
      // its value.
      append_pair(query.substr(key_component.begin,
                               value_component.end() - key_component.begin));
      continue;
    }

    already_replaced = true;
    if (keep_param)
      append_pair(replacement_pair);
  }

  if (!already_replaced && keep_param)
    append_pair(replacement_pair);

  GURL::Replacements query_replacement;
  query_replacement.SetQueryStr(rebuilt_query);
  return url.ReplaceComponents(query_replacement);
}
128
// Returns a copy of |url| whose ref component has been set to |ref|.
GURL AppendOrReplaceRef(const GURL& url, const base::StringPiece& ref) {
  GURL::Replacements ref_replacement;
  ref_replacement.SetRefStr(ref);
  return url.ReplaceComponents(ref_replacement);
}
134
QueryIterator(const GURL & url)135 QueryIterator::QueryIterator(const GURL& url)
136 : url_(url), at_end_(!url.is_valid()) {
137 if (!at_end_) {
138 query_ = url.parsed_for_possibly_invalid_spec().query;
139 Advance();
140 }
141 }
142
// Defaulted destructor, defined out of line in this file.
QueryIterator::~QueryIterator() = default;
144
GetKey() const145 base::StringPiece QueryIterator::GetKey() const {
146 DCHECK(!at_end_);
147 if (key_.is_nonempty())
148 return base::StringPiece(url_->spec()).substr(key_.begin, key_.len);
149 return base::StringPiece();
150 }
151
GetValue() const152 base::StringPiece QueryIterator::GetValue() const {
153 DCHECK(!at_end_);
154 if (value_.is_nonempty())
155 return base::StringPiece(url_->spec()).substr(value_.begin, value_.len);
156 return base::StringPiece();
157 }
158
GetUnescapedValue()159 const std::string& QueryIterator::GetUnescapedValue() {
160 DCHECK(!at_end_);
161 if (value_.is_nonempty() && unescaped_value_.empty()) {
162 unescaped_value_ = base::UnescapeURLComponent(
163 GetValue(),
164 base::UnescapeRule::SPACES | base::UnescapeRule::PATH_SEPARATORS |
165 base::UnescapeRule::URL_SPECIAL_CHARS_EXCEPT_PATH_SEPARATORS |
166 base::UnescapeRule::REPLACE_PLUS_WITH_SPACE);
167 }
168 return unescaped_value_;
169 }
170
// Returns true once there are no more key/value pairs to visit (or the URL
// passed to the constructor was invalid).
bool QueryIterator::IsAtEnd() const {
  return at_end_;
}
174
Advance()175 void QueryIterator::Advance() {
176 DCHECK(!at_end_);
177 key_.reset();
178 value_.reset();
179 unescaped_value_.clear();
180 at_end_ =
181 !url::ExtractQueryKeyValue(url_->spec().c_str(), &query_, &key_, &value_);
182 }
183
GetValueForKeyInQuery(const GURL & url,base::StringPiece search_key,std::string * out_value)184 bool GetValueForKeyInQuery(const GURL& url,
185 base::StringPiece search_key,
186 std::string* out_value) {
187 for (QueryIterator it(url); !it.IsAtEnd(); it.Advance()) {
188 if (it.GetKey() == search_key) {
189 *out_value = it.GetUnescapedValue();
190 return true;
191 }
192 }
193 return false;
194 }
195
ParseHostAndPort(base::StringPiece input,std::string * host,int * port)196 bool ParseHostAndPort(base::StringPiece input, std::string* host, int* port) {
197 if (input.empty())
198 return false;
199
200 url::Component auth_component(0, input.size());
201 url::Component username_component;
202 url::Component password_component;
203 url::Component hostname_component;
204 url::Component port_component;
205
206 url::ParseAuthority(input.data(), auth_component, &username_component,
207 &password_component, &hostname_component,
208 &port_component);
209
210 // There shouldn't be a username/password.
211 if (username_component.is_valid() || password_component.is_valid())
212 return false;
213
214 if (hostname_component.is_empty())
215 return false; // Failed parsing.
216
217 int parsed_port_number = -1;
218 if (port_component.is_nonempty()) {
219 parsed_port_number = url::ParsePort(input.data(), port_component);
220
221 // If parsing failed, port_number will be either PORT_INVALID or
222 // PORT_UNSPECIFIED, both of which are negative.
223 if (parsed_port_number < 0)
224 return false; // Failed parsing the port number.
225 }
226
227 if (port_component.len == 0)
228 return false; // Reject inputs like "foo:"
229
230 unsigned char tmp_ipv6_addr[16];
231
232 // If the hostname starts with a bracket, it is either an IPv6 literal or
233 // invalid. If it is an IPv6 literal then strip the brackets.
234 if (hostname_component.len > 0 && input[hostname_component.begin] == '[') {
235 if (input[hostname_component.end() - 1] == ']' &&
236 url::IPv6AddressToNumber(input.data(), hostname_component,
237 tmp_ipv6_addr)) {
238 // Strip the brackets.
239 hostname_component.begin++;
240 hostname_component.len -= 2;
241 } else {
242 return false;
243 }
244 }
245
246 // Pass results back to caller.
247 host->assign(input.data() + hostname_component.begin, hostname_component.len);
248 *port = parsed_port_number;
249
250 return true; // Success.
251 }
252
GetHostAndPort(const GURL & url)253 std::string GetHostAndPort(const GURL& url) {
254 // For IPv6 literals, GURL::host() already includes the brackets so it is
255 // safe to just append a colon.
256 return base::StringPrintf("%s:%d", url.host().c_str(),
257 url.EffectiveIntPort());
258 }
259
GetHostAndOptionalPort(const GURL & url)260 std::string GetHostAndOptionalPort(const GURL& url) {
261 // For IPv6 literals, GURL::host() already includes the brackets
262 // so it is safe to just append a colon.
263 if (url.has_port())
264 return base::StringPrintf("%s:%s", url.host().c_str(), url.port().c_str());
265 return url.host();
266 }
267
GetHostAndOptionalPort(const url::SchemeHostPort & scheme_host_port)268 NET_EXPORT std::string GetHostAndOptionalPort(
269 const url::SchemeHostPort& scheme_host_port) {
270 int default_port = url::DefaultPortForScheme(
271 scheme_host_port.scheme().data(),
272 static_cast<int>(scheme_host_port.scheme().length()));
273 if (default_port != scheme_host_port.port()) {
274 return base::StringPrintf("%s:%i", scheme_host_port.host().c_str(),
275 scheme_host_port.port());
276 }
277 return scheme_host_port.host();
278 }
279
TrimEndingDot(base::StringPiece host)280 std::string TrimEndingDot(base::StringPiece host) {
281 base::StringPiece host_trimmed = host;
282 size_t len = host_trimmed.length();
283 if (len > 1 && host_trimmed[len - 1] == '.') {
284 host_trimmed.remove_suffix(1);
285 }
286 return std::string(host_trimmed);
287 }
288
GetHostOrSpecFromURL(const GURL & url)289 std::string GetHostOrSpecFromURL(const GURL& url) {
290 return url.has_host() ? TrimEndingDot(url.host_piece()) : url.spec();
291 }
292
GetSuperdomain(base::StringPiece domain)293 std::string GetSuperdomain(base::StringPiece domain) {
294 size_t dot_pos = domain.find('.');
295 if (dot_pos == std::string::npos)
296 return "";
297 return std::string(domain.substr(dot_pos + 1));
298 }
299
IsSubdomainOf(base::StringPiece subdomain,base::StringPiece superdomain)300 bool IsSubdomainOf(base::StringPiece subdomain, base::StringPiece superdomain) {
301 // Subdomain must be identical or have strictly more labels than the
302 // superdomain.
303 if (subdomain.length() <= superdomain.length())
304 return subdomain == superdomain;
305
306 // Superdomain must be suffix of subdomain, and the last character not
307 // included in the matching substring must be a dot.
308 if (!base::EndsWith(subdomain, superdomain))
309 return false;
310 subdomain.remove_suffix(superdomain.length());
311 return subdomain.back() == '.';
312 }
313
CanonicalizeHost(base::StringPiece host,url::CanonHostInfo * host_info)314 std::string CanonicalizeHost(base::StringPiece host,
315 url::CanonHostInfo* host_info) {
316 // Try to canonicalize the host.
317 const url::Component raw_host_component(0, static_cast<int>(host.length()));
318 std::string canon_host;
319 url::StdStringCanonOutput canon_host_output(&canon_host);
320 // A url::StdStringCanonOutput starts off with a zero length buffer. The
321 // first time through Grow() immediately resizes it to 32 bytes, incurring
322 // a malloc. With libcxx a 22 byte or smaller request can be accommodated
323 // within the std::string itself (i.e. no malloc occurs). Start the buffer
324 // off at the max size to avoid a malloc on short strings.
325 // NOTE: To ensure the final size is correctly reflected, it's necessary
326 // to call Complete() which will adjust the size to the actual bytes written.
327 // This is handled below for success cases, while failure cases discard all
328 // the output.
329 const int kCxxMaxStringBufferSizeWithoutMalloc = 22;
330 canon_host_output.Resize(kCxxMaxStringBufferSizeWithoutMalloc);
331 url::CanonicalizeHostVerbose(host.data(), raw_host_component,
332 &canon_host_output, host_info);
333
334 if (host_info->out_host.is_nonempty() &&
335 host_info->family != url::CanonHostInfo::BROKEN) {
336 // Success! Assert that there's no extra garbage.
337 canon_host_output.Complete();
338 DCHECK_EQ(host_info->out_host.len, static_cast<int>(canon_host.length()));
339 } else {
340 // Empty host, or canonicalization failed. We'll return empty.
341 canon_host.clear();
342 }
343
344 return canon_host;
345 }
346
IsCanonicalizedHostCompliant(base::StringPiece host)347 bool IsCanonicalizedHostCompliant(base::StringPiece host) {
348 if (host.empty() || host.size() > 254 ||
349 (host.back() != '.' && host.size() == 254)) {
350 return false;
351 }
352
353 bool in_component = false;
354 bool most_recent_component_started_alphanumeric = false;
355 size_t label_size = 0;
356
357 for (char c : host) {
358 ++label_size;
359 if (!in_component) {
360 most_recent_component_started_alphanumeric = IsHostCharAlphanumeric(c);
361 if (!most_recent_component_started_alphanumeric && (c != '-') &&
362 (c != '_')) {
363 return false;
364 }
365 in_component = true;
366 } else if (c == '.') {
367 in_component = false;
368 if (label_size > 64 || label_size == 1) {
369 // Label should not be empty or longer than 63 characters (+1 for '.'
370 // character included in `label_size`).
371 return false;
372 } else {
373 label_size = 0;
374 }
375 } else if (!IsHostCharAlphanumeric(c) && (c != '-') && (c != '_')) {
376 return false;
377 }
378 }
379
380 // Check for too-long label when not ended with final '.'.
381 if (label_size > 63)
382 return false;
383
384 return most_recent_component_started_alphanumeric;
385 }
386
IsHostnameNonUnique(base::StringPiece hostname)387 bool IsHostnameNonUnique(base::StringPiece hostname) {
388 // CanonicalizeHost requires surrounding brackets to parse an IPv6 address.
389 const std::string host_or_ip = hostname.find(':') != std::string::npos
390 ? base::StrCat({"[", hostname, "]"})
391 : std::string(hostname);
392 url::CanonHostInfo host_info;
393 std::string canonical_name = CanonicalizeHost(host_or_ip, &host_info);
394
395 // If canonicalization fails, then the input is truly malformed. However,
396 // to avoid mis-reporting bad inputs as "non-unique", treat them as unique.
397 if (canonical_name.empty())
398 return false;
399
400 // If |hostname| is an IP address, check to see if it's in an IANA-reserved
401 // range reserved for non-publicly routable networks.
402 if (host_info.IsIPAddress()) {
403 IPAddress host_addr;
404 if (!host_addr.AssignFromIPLiteral(hostname.substr(
405 host_info.out_host.begin, host_info.out_host.len))) {
406 return false;
407 }
408 switch (host_info.family) {
409 case url::CanonHostInfo::IPV4:
410 case url::CanonHostInfo::IPV6:
411 return !host_addr.IsPubliclyRoutable();
412 case url::CanonHostInfo::NEUTRAL:
413 case url::CanonHostInfo::BROKEN:
414 return false;
415 }
416 }
417
418 // Check for a registry controlled portion of |hostname|, ignoring private
419 // registries, as they already chain to ICANN-administered registries,
420 // and explicitly ignoring unknown registries.
421 //
422 // Note: This means that as new gTLDs are introduced on the Internet, they
423 // will be treated as non-unique until the registry controlled domain list
424 // is updated. However, because gTLDs are expected to provide significant
425 // advance notice to deprecate older versions of this code, this an
426 // acceptable tradeoff.
427 return !registry_controlled_domains::HostHasRegistryControlledDomain(
428 canonical_name, registry_controlled_domains::EXCLUDE_UNKNOWN_REGISTRIES,
429 registry_controlled_domains::EXCLUDE_PRIVATE_REGISTRIES);
430 }
431
IsLocalhost(const GURL & url)432 bool IsLocalhost(const GURL& url) {
433 return HostStringIsLocalhost(url.HostNoBracketsPiece());
434 }
435
HostStringIsLocalhost(base::StringPiece host)436 bool HostStringIsLocalhost(base::StringPiece host) {
437 IPAddress ip_address;
438 if (ip_address.AssignFromIPLiteral(host))
439 return ip_address.IsLoopback();
440 return IsLocalHostname(host);
441 }
442
// Returns |url| with its username, password and ref removed. |url| must be
// valid.
GURL SimplifyUrlForRequest(const GURL& url) {
  DCHECK(url.is_valid());

  // Fast path: when there is nothing to strip, return |url| unchanged and
  // avoid re-canonicalization via ReplaceComponents.
  const bool needs_stripping =
      url.has_username() || url.has_password() || url.has_ref();
  if (!needs_stripping)
    return url;

  GURL::Replacements strip_identity_and_ref;
  strip_identity_and_ref.ClearUsername();
  strip_identity_and_ref.ClearPassword();
  strip_identity_and_ref.ClearRef();
  return url.ReplaceComponents(strip_identity_and_ref);
}
454
// Returns |url| with its scheme replaced by "https" when the scheme is wss,
// and by "http" otherwise. |url| must have a ws or wss scheme.
GURL ChangeWebSocketSchemeToHttpScheme(const GURL& url) {
  DCHECK(url.SchemeIsWSOrWSS());

  const char* const http_equivalent =
      url.SchemeIs(url::kWssScheme) ? url::kHttpsScheme : url::kHttpScheme;
  GURL::Replacements scheme_replacement;
  scheme_replacement.SetSchemeStr(http_equivalent);
  return url.ReplaceComponents(scheme_replacement);
}
462
IsStandardSchemeWithNetworkHost(base::StringPiece scheme)463 bool IsStandardSchemeWithNetworkHost(base::StringPiece scheme) {
464 // file scheme is special. Windows file share origins can have network hosts.
465 if (scheme == url::kFileScheme)
466 return true;
467
468 url::SchemeType scheme_type;
469 if (!url::GetStandardSchemeType(
470 scheme.data(), url::Component(0, scheme.length()), &scheme_type)) {
471 return false;
472 }
473 return scheme_type == url::SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
474 scheme_type == url::SCHEME_WITH_HOST_AND_PORT;
475 }
476
GetIdentityFromURL(const GURL & url,std::u16string * username,std::u16string * password)477 void GetIdentityFromURL(const GURL& url,
478 std::u16string* username,
479 std::u16string* password) {
480 *username = UnescapeIdentityString(url.username());
481 *password = UnescapeIdentityString(url.password());
482 }
483
HasGoogleHost(const GURL & url)484 bool HasGoogleHost(const GURL& url) {
485 return IsGoogleHost(url.host_piece());
486 }
487
IsGoogleHost(base::StringPiece host)488 bool IsGoogleHost(base::StringPiece host) {
489 static const char* kGoogleHostSuffixes[] = {
490 ".google.com",
491 ".youtube.com",
492 ".gmail.com",
493 ".doubleclick.net",
494 ".gstatic.com",
495 ".googlevideo.com",
496 ".googleusercontent.com",
497 ".googlesyndication.com",
498 ".google-analytics.com",
499 ".googleadservices.com",
500 ".googleapis.com",
501 ".ytimg.com",
502 };
503 for (const char* suffix : kGoogleHostSuffixes) {
504 // Here it's possible to get away with faster case-sensitive comparisons
505 // because the list above is all lowercase, and a GURL's host name will
506 // always be canonicalized to lowercase as well.
507 if (base::EndsWith(host, suffix))
508 return true;
509 }
510 return false;
511 }
512
IsGoogleHostWithAlpnH3(base::StringPiece host)513 bool IsGoogleHostWithAlpnH3(base::StringPiece host) {
514 return base::EqualsCaseInsensitiveASCII(host, "google.com") ||
515 base::EqualsCaseInsensitiveASCII(host, "www.google.com");
516 }
517
IsLocalHostname(base::StringPiece host)518 bool IsLocalHostname(base::StringPiece host) {
519 // Remove any trailing '.'.
520 if (!host.empty() && *host.rbegin() == '.')
521 host.remove_suffix(1);
522
523 return base::EqualsCaseInsensitiveASCII(host, "localhost") ||
524 IsNormalizedLocalhostTLD(host);
525 }
526
UnescapePercentEncodedUrl(base::StringPiece input)527 std::string UnescapePercentEncodedUrl(base::StringPiece input) {
528 std::string result(input);
529 // Replace any 0x2B (+) with 0x20 (SP).
530 for (char& c : result) {
531 if (c == '+') {
532 c = ' ';
533 }
534 }
535 // Run UTF-8 decoding without BOM on the percent-decoding.
536 url::RawCanonOutputT<char16_t> canon_output;
537 url::DecodeURLEscapeSequences(result.data(), result.size(),
538 url::DecodeURLMode::kUTF8, &canon_output);
539 return base::UTF16ToUTF8(
540 base::StringPiece16(canon_output.data(), canon_output.length()));
541 }
542
543 } // namespace net
544