1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "url/url_util.h"
11
12 #include <stddef.h>
13 #include <string.h>
14
15 #include <atomic>
16 #include <ostream>
17
18 #include "base/check_op.h"
19 #include "base/compiler_specific.h"
20 #include "base/containers/contains.h"
21 #include "base/no_destructor.h"
22 #include "base/strings/string_util.h"
23 #include "url/url_canon_internal.h"
24 #include "url/url_constants.h"
25 #include "url/url_features.h"
26 #include "url/url_file.h"
27 #include "url/url_parse_internal.h"
28 #include "url/url_util_internal.h"
29
30 namespace url {
31
32 namespace {
33
// Pairs a scheme name with the SchemeType it is registered under. This is the
// element type of the standard and referrer scheme lists in SchemeRegistry.
struct SchemeWithType {
  // Scheme name. Registration DCHECKs that it is lower-case ASCII; lookups
  // compare against it case-insensitively.
  std::string scheme;
  // Type handed back to the caller when a lookup matches |scheme|.
  SchemeType type;
};
39
// Pairs a scheme with the custom protocol handler registered for it.
//
// Both strings must be normalized protocol handler parameters as described in
// the Custom Handler specification:
// https://html.spec.whatwg.org/multipage/system-state.html#normalize-protocol-handler-parameters
struct SchemeWithHandler {
  // Scheme name. Registration DCHECKs that it is non-empty and lower-case.
  std::string scheme;
  // Handler associated with |scheme|. Registration DCHECKs that it is
  // non-empty.
  std::string handler;
};
49
// Holds every scheme list consulted by this file, pre-populated with the
// built-in defaults. The lists are extended through the Add*Scheme()
// functions and read through the matching Get*Schemes()/Is*() functions.
struct SchemeRegistry {
  // Standard format schemes (see header for details).
  std::vector<SchemeWithType> standard_schemes = {
      {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
      {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
      // Yes, file URLs can have a hostname, so file URLs should be handled as
      // "standard". File URLs never have a port as specified by the SchemeType
      // field.  Unlike other SCHEME_WITH_HOST schemes, the 'host' in a file
      // URL may be empty, a behavior which is special-cased during
      // canonicalization.
      {kFileScheme, SCHEME_WITH_HOST},
      {kFtpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
      {kWssScheme,
       SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},  // WebSocket secure.
      {kWsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},  // WebSocket.
      {kFileSystemScheme, SCHEME_WITHOUT_AUTHORITY},
  };

  // Schemes that are allowed for referrers.
  //
  // WARNING: Adding (1) a non-"standard" scheme or (2) a scheme whose URLs have
  // opaque origins could lead to surprising behavior in some of the referrer
  // generation logic. In order to avoid surprises, be sure to have adequate
  // test coverage in each of the multiple code locations that compute
  // referrers.
  std::vector<SchemeWithType> referrer_schemes = {
      {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
      {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
  };

  // Schemes that do not trigger mixed content warning.
  std::vector<std::string> secure_schemes = {
      kHttpsScheme,
      kWssScheme,
      kDataScheme,
      kAboutScheme,
  };

  // Schemes that normal pages cannot link to or access (i.e., with the same
  // security rules as those applied to "file" URLs).
  std::vector<std::string> local_schemes = {
      kFileScheme,
  };

  // Schemes that cause pages loaded with them to not have access to pages
  // loaded with any other URL scheme.
  std::vector<std::string> no_access_schemes = {
      kAboutScheme,
      kJavaScriptScheme,
      kDataScheme,
  };

  // Schemes that can be sent CORS requests.
  std::vector<std::string> cors_enabled_schemes = {
      kHttpsScheme,
      kHttpScheme,
      kDataScheme,
  };

  // Schemes that can be used by web to store data (local storage, etc).
  std::vector<std::string> web_storage_schemes = {
      kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, kWssScheme, kWsScheme,
  };

  // Schemes that can bypass the Content-Security-Policy (CSP) checks. Empty by
  // default; entries are added via AddCSPBypassingScheme().
  std::vector<std::string> csp_bypassing_schemes = {};

  // Schemes that are strictly empty documents, allowing them to commit
  // synchronously.
  std::vector<std::string> empty_document_schemes = {
      kAboutScheme,
  };

  // Non-special schemes that should be treated as opaque path URLs for
  // compatibility reasons.
  std::vector<std::string> opaque_non_special_schemes = {
      // See https://crrev.com/c/5465607 for the reason.
      kAndroidScheme,
      // Temporarily opted-out. See https://crrev.com/c/5569365.
      kDrivefsScheme,
      // Temporarily opted-out. See https://crrev.com/c/5568919.
      kChromeosSteamScheme,
      kSteamScheme,
      // Temporarily opted-out. See https://crrev.com/c/5578066.
      kMaterializedViewScheme,
  };

  // Schemes with a predefined default custom handler. Empty by default;
  // entries are added via AddPredefinedHandlerScheme().
  std::vector<SchemeWithHandler> predefined_handler_schemes;

  // Set to true by EnableNonStandardSchemesForAndroidWebView() and reported by
  // AllowNonStandardSchemesForAndroidWebView().
  bool allow_non_standard_schemes = false;
};
143
// Set by LockSchemeRegistries(). While true, the DCHECK in
// DoSchemeModificationPreamble() rejects further scheme registration. See the
// LockSchemeRegistries declaration in the header.
bool scheme_registries_locked = false;

// Ensure that the schemes aren't modified after first use. GetSchemeRegistry()
// sets this flag (only when DCHECK_IS_ON()), and
// DoSchemeModificationPreamble() DCHECKs that it is still false.
static std::atomic<bool> g_scheme_registries_used{false};
149
150 // Gets the scheme registry without locking the schemes. This should *only* be
151 // used for adding schemes to the registry.
GetSchemeRegistryWithoutLocking()152 SchemeRegistry* GetSchemeRegistryWithoutLocking() {
153 static base::NoDestructor<SchemeRegistry> registry;
154 return registry.get();
155 }
156
GetSchemeRegistry()157 const SchemeRegistry& GetSchemeRegistry() {
158 #if DCHECK_IS_ON()
159 g_scheme_registries_used.store(true);
160 #endif
161 return *GetSchemeRegistryWithoutLocking();
162 }
163
// Passed to functions that need to know whether whitespace removal is still
// necessary for their input.
enum WhitespaceRemovalPolicy {
  // The callee should run whitespace removal on the input itself.
  REMOVE_WHITESPACE,
  // The callee should use the input as given.
  DO_NOT_REMOVE_WHITESPACE,
};
170
171 // Given a string and a range inside the string, compares it to the given
172 // lower-case |compare_to| buffer.
173 template<typename CHAR>
DoCompareSchemeComponent(const CHAR * spec,const Component & component,const char * compare_to)174 inline bool DoCompareSchemeComponent(const CHAR* spec,
175 const Component& component,
176 const char* compare_to) {
177 if (component.is_empty())
178 return compare_to[0] == 0; // When component is empty, match empty scheme.
179 return base::EqualsCaseInsensitiveASCII(
180 std::basic_string_view(&spec[component.begin], component.len),
181 compare_to);
182 }
183
184 // Returns true and sets |type| to the SchemeType of the given scheme
185 // identified by |scheme| within |spec| if in |schemes|.
186 template<typename CHAR>
DoIsInSchemes(const CHAR * spec,const Component & scheme,SchemeType * type,const std::vector<SchemeWithType> & schemes)187 bool DoIsInSchemes(const CHAR* spec,
188 const Component& scheme,
189 SchemeType* type,
190 const std::vector<SchemeWithType>& schemes) {
191 if (scheme.is_empty())
192 return false; // Empty or invalid schemes are non-standard.
193
194 for (const SchemeWithType& scheme_with_type : schemes) {
195 if (base::EqualsCaseInsensitiveASCII(
196 std::basic_string_view(&spec[scheme.begin], scheme.len),
197 scheme_with_type.scheme)) {
198 *type = scheme_with_type.type;
199 return true;
200 }
201 }
202 return false;
203 }
204
205 template<typename CHAR>
DoIsStandard(const CHAR * spec,const Component & scheme,SchemeType * type)206 bool DoIsStandard(const CHAR* spec, const Component& scheme, SchemeType* type) {
207 return DoIsInSchemes(spec, scheme, type,
208 GetSchemeRegistry().standard_schemes);
209 }
210
211 template <typename CHAR>
DoIsOpaqueNonSpecial(const CHAR * spec,const Component & scheme)212 bool DoIsOpaqueNonSpecial(const CHAR* spec, const Component& scheme) {
213 if (scheme.is_empty()) {
214 return false;
215 }
216 for (const std::string& s : GetSchemeRegistry().opaque_non_special_schemes) {
217 if (base::EqualsCaseInsensitiveASCII(
218 std::basic_string_view(&spec[scheme.begin], scheme.len), s)) {
219 return true;
220 }
221 }
222 return false;
223 }
224
225 template<typename CHAR>
DoFindAndCompareScheme(const CHAR * str,int str_len,const char * compare,Component * found_scheme)226 bool DoFindAndCompareScheme(const CHAR* str,
227 int str_len,
228 const char* compare,
229 Component* found_scheme) {
230 // Before extracting scheme, canonicalize the URL to remove any whitespace.
231 // This matches the canonicalization done in DoCanonicalize function.
232 STACK_UNINITIALIZED RawCanonOutputT<CHAR> whitespace_buffer;
233 int spec_len;
234 const CHAR* spec =
235 RemoveURLWhitespace(str, str_len, &whitespace_buffer, &spec_len, nullptr);
236
237 Component our_scheme;
238 if (!ExtractScheme(spec, spec_len, &our_scheme)) {
239 // No scheme.
240 if (found_scheme)
241 *found_scheme = Component();
242 return false;
243 }
244 if (found_scheme)
245 *found_scheme = our_scheme;
246 return DoCompareSchemeComponent(spec, our_scheme, compare);
247 }
248
249 template <typename CHAR>
DoCanonicalize(const CHAR * spec,int spec_len,bool trim_path_end,WhitespaceRemovalPolicy whitespace_policy,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)250 bool DoCanonicalize(const CHAR* spec,
251 int spec_len,
252 bool trim_path_end,
253 WhitespaceRemovalPolicy whitespace_policy,
254 CharsetConverter* charset_converter,
255 CanonOutput* output,
256 Parsed* output_parsed) {
257 // Trim leading C0 control characters and spaces.
258 int begin = 0;
259 TrimURL(spec, &begin, &spec_len, trim_path_end);
260 DCHECK(0 <= begin && begin <= spec_len);
261 spec += begin;
262 spec_len -= begin;
263
264 output->ReserveSizeIfNeeded(spec_len);
265
266 // Remove any whitespace from the middle of the relative URL if necessary.
267 // Possibly this will result in copying to the new buffer.
268 STACK_UNINITIALIZED RawCanonOutputT<CHAR> whitespace_buffer;
269 if (whitespace_policy == REMOVE_WHITESPACE) {
270 spec = RemoveURLWhitespace(spec, spec_len, &whitespace_buffer, &spec_len,
271 &output_parsed->potentially_dangling_markup);
272 }
273
274 #ifdef WIN32
275 // For Windows, we allow things that look like absolute Windows paths to be
276 // fixed up magically to file URLs. This is done for IE compatibility. For
277 // example, this will change "c:/foo" into a file URL rather than treating
278 // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
279 // There is similar logic in url_canon_relative.cc for
280 //
281 // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
282 // has no meaning as an absolute path name. This is because browsers on Mac
283 // & Unix don't generally do this, so there is no compatibility reason for
284 // doing so.
285 if (DoesBeginUNCPath(spec, 0, spec_len, false) ||
286 DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
287 return CanonicalizeFileURL(
288 spec, spec_len, ParseFileURL(std::basic_string_view(spec, spec_len)),
289 charset_converter, output, output_parsed);
290 }
291 #endif
292
293 Component scheme;
294 if (!ExtractScheme(spec, spec_len, &scheme))
295 return false;
296
297 // This is the parsed version of the input URL, we have to canonicalize it
298 // before storing it in our object.
299 bool success;
300 SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
301 if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
302 // File URLs are special.
303 success = CanonicalizeFileURL(
304 spec, spec_len, ParseFileURL(std::basic_string_view(spec, spec_len)),
305 charset_converter, output, output_parsed);
306 } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) {
307 // Filesystem URLs are special.
308 success = CanonicalizeFileSystemURL(
309 spec, ParseFileSystemURL(std::basic_string_view(spec, spec_len)),
310 charset_converter, output, output_parsed);
311
312 } else if (DoIsStandard(spec, scheme, &scheme_type)) {
313 // All "normal" URLs.
314 success = CanonicalizeStandardURL(
315 spec, ParseStandardURL(std::basic_string_view(spec, spec_len)),
316 scheme_type, charset_converter, output, output_parsed);
317
318 } else if (!url::IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
319 DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
320 // Mailto URLs are treated like standard URLs, with only a scheme, path,
321 // and query.
322 //
323 // TODO(crbug.com/40063064): Remove the special handling of 'mailto:" scheme
324 // URLs. "mailto:" is simply one of non-special URLs.
325 success = CanonicalizeMailtoURL(
326 spec, spec_len, ParseMailtoURL(std::basic_string_view(spec, spec_len)),
327 output, output_parsed);
328
329 } else {
330 // Non-special scheme URLs like data: and javascript:.
331 if (url::IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
332 !DoIsOpaqueNonSpecial(spec, scheme)) {
333 success = CanonicalizeNonSpecialURL(
334 spec, spec_len,
335 ParseNonSpecialURLInternal(std::basic_string_view(spec, spec_len),
336 trim_path_end),
337 charset_converter, *output, *output_parsed);
338 } else {
339 success = CanonicalizePathURL(
340 spec, spec_len,
341 ParsePathURL(std::basic_string_view(spec, spec_len), trim_path_end),
342 output, output_parsed);
343 }
344 }
345 return success;
346 }
347
348 template<typename CHAR>
DoResolveRelative(const char * base_spec,int base_spec_len,const Parsed & base_parsed,const CHAR * in_relative,int in_relative_length,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)349 bool DoResolveRelative(const char* base_spec,
350 int base_spec_len,
351 const Parsed& base_parsed,
352 const CHAR* in_relative,
353 int in_relative_length,
354 CharsetConverter* charset_converter,
355 CanonOutput* output,
356 Parsed* output_parsed) {
357 // Remove any whitespace from the middle of the relative URL, possibly
358 // copying to the new buffer.
359 STACK_UNINITIALIZED RawCanonOutputT<CHAR> whitespace_buffer;
360 int relative_length;
361 const CHAR* relative = RemoveURLWhitespace(
362 in_relative, in_relative_length, &whitespace_buffer, &relative_length,
363 &output_parsed->potentially_dangling_markup);
364
365 bool base_is_authority_based = false;
366 bool base_is_hierarchical = false;
367 if (base_spec &&
368 base_parsed.scheme.is_nonempty()) {
369 int after_scheme = base_parsed.scheme.end() + 1; // Skip past the colon.
370 int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme,
371 base_spec_len);
372 base_is_authority_based = num_slashes > 1;
373 base_is_hierarchical = num_slashes > 0;
374 }
375
376 bool is_hierarchical_base;
377
378 if (url::IsUsingStandardCompliantNonSpecialSchemeURLParsing()) {
379 is_hierarchical_base =
380 base_parsed.scheme.is_nonempty() && !base_parsed.has_opaque_path;
381 } else {
382 SchemeType unused_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
383 is_hierarchical_base =
384 base_parsed.scheme.is_nonempty() &&
385 DoIsStandard(base_spec, base_parsed.scheme, &unused_scheme_type);
386 }
387
388 bool is_relative;
389 Component relative_component;
390 if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length,
391 (base_is_hierarchical || is_hierarchical_base),
392 &is_relative, &relative_component)) {
393 // Error resolving.
394 return false;
395 }
396
397 // Don't reserve buffer space here. Instead, reserve in DoCanonicalize and
398 // ReserveRelativeURL, to enable more accurate buffer sizes.
399
400 // Pretend for a moment that |base_spec| is a standard URL. Normally
401 // non-standard URLs are treated as PathURLs, but if the base has an
402 // authority we would like to preserve it.
403 if (is_relative && base_is_authority_based && !is_hierarchical_base) {
404 Parsed base_parsed_authority =
405 ParseStandardURL(std::string_view(base_spec, base_spec_len));
406 if (base_parsed_authority.host.is_nonempty()) {
407 STACK_UNINITIALIZED RawCanonOutputT<char> temporary_output;
408 bool did_resolve_succeed =
409 ResolveRelativeURL(base_spec, base_parsed_authority, false, relative,
410 relative_component, charset_converter,
411 &temporary_output, output_parsed);
412 // The output_parsed is incorrect at this point (because it was built
413 // based on base_parsed_authority instead of base_parsed) and needs to be
414 // re-created.
415 DoCanonicalize(temporary_output.data(), temporary_output.length(), true,
416 REMOVE_WHITESPACE, charset_converter, output,
417 output_parsed);
418 return did_resolve_succeed;
419 }
420 } else if (is_relative) {
421 // Relative, resolve and canonicalize.
422 bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
423 DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
424 return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme,
425 relative, relative_component, charset_converter,
426 output, output_parsed);
427 }
428
429 // Not relative, canonicalize the input.
430 return DoCanonicalize(relative, relative_length, true,
431 DO_NOT_REMOVE_WHITESPACE, charset_converter, output,
432 output_parsed);
433 }
434
435 template<typename CHAR>
DoReplaceComponents(const char * spec,int spec_len,const Parsed & parsed,const Replacements<CHAR> & replacements,CharsetConverter * charset_converter,CanonOutput * output,Parsed * out_parsed)436 bool DoReplaceComponents(const char* spec,
437 int spec_len,
438 const Parsed& parsed,
439 const Replacements<CHAR>& replacements,
440 CharsetConverter* charset_converter,
441 CanonOutput* output,
442 Parsed* out_parsed) {
443 // If the scheme is overridden, just do a simple string substitution and
444 // re-parse the whole thing. There are lots of edge cases that we really don't
445 // want to deal with. Like what happens if I replace "http://e:8080/foo"
446 // with a file. Does it become "file:///E:/8080/foo" where the port number
447 // becomes part of the path? Parsing that string as a file URL says "yes"
448 // but almost no sane rule for dealing with the components individually would
449 // come up with that.
450 //
451 // Why allow these crazy cases at all? Programatically, there is almost no
452 // case for replacing the scheme. The most common case for hitting this is
453 // in JS when building up a URL using the location object. In this case, the
454 // JS code expects the string substitution behavior:
455 // http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
456 if (replacements.IsSchemeOverridden()) {
457 // Canonicalize the new scheme so it is 8-bit and can be concatenated with
458 // the existing spec.
459 STACK_UNINITIALIZED RawCanonOutput<128> scheme_replaced;
460 Component scheme_replaced_parsed;
461 CanonicalizeScheme(replacements.sources().scheme,
462 replacements.components().scheme,
463 &scheme_replaced, &scheme_replaced_parsed);
464
465 // We can assume that the input is canonicalized, which means it always has
466 // a colon after the scheme (or where the scheme would be).
467 int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
468 : 1;
469 if (spec_len - spec_after_colon > 0) {
470 scheme_replaced.Append(&spec[spec_after_colon],
471 spec_len - spec_after_colon);
472 }
473
474 // We now need to completely re-parse the resulting string since its meaning
475 // may have changed with the different scheme.
476 STACK_UNINITIALIZED RawCanonOutput<128> recanonicalized;
477 Parsed recanonicalized_parsed;
478 DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
479 REMOVE_WHITESPACE, charset_converter, &recanonicalized,
480 &recanonicalized_parsed);
481
482 // Recurse using the version with the scheme already replaced. This will now
483 // use the replacement rules for the new scheme.
484 //
485 // Warning: this code assumes that ReplaceComponents will re-check all
486 // components for validity. This is because we can't fail if DoCanonicalize
487 // failed above since theoretically the thing making it fail could be
488 // getting replaced here. If ReplaceComponents didn't re-check everything,
489 // we wouldn't know if something *not* getting replaced is a problem.
490 // If the scheme-specific replacers are made more intelligent so they don't
491 // re-check everything, we should instead re-canonicalize the whole thing
492 // after this call to check validity (this assumes replacing the scheme is
493 // much much less common than other types of replacements, like clearing the
494 // ref).
495 Replacements<CHAR> replacements_no_scheme = replacements;
496 replacements_no_scheme.SetScheme(NULL, Component());
497 // If the input URL has potentially dangling markup, set the flag on the
498 // output too. Note that in some cases the replacement gets rid of the
499 // potentially dangling markup, but this ok since the check will fail
500 // closed.
501 if (parsed.potentially_dangling_markup) {
502 out_parsed->potentially_dangling_markup = true;
503 }
504 return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
505 recanonicalized_parsed, replacements_no_scheme,
506 charset_converter, output, out_parsed);
507 }
508
509 // TODO(csharrison): We could be smarter about size to reserve if this is done
510 // in callers below, and the code checks to see which components are being
511 // replaced, and with what length. If this ends up being a hot spot it should
512 // be changed.
513 output->ReserveSizeIfNeeded(spec_len);
514
515 // If we get here, then we know the scheme doesn't need to be replaced, so can
516 // just key off the scheme in the spec to know how to do the replacements.
517 if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) {
518 return ReplaceFileURL(spec, parsed, replacements, charset_converter, output,
519 out_parsed);
520 }
521 if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) {
522 return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
523 output, out_parsed);
524 }
525 SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
526 if (DoIsStandard(spec, parsed.scheme, &scheme_type)) {
527 return ReplaceStandardURL(spec, parsed, replacements, scheme_type,
528 charset_converter, output, out_parsed);
529 }
530 if (!IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
531 DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) {
532 return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed);
533 }
534
535 if (IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
536 !DoIsOpaqueNonSpecial(spec, parsed.scheme)) {
537 return ReplaceNonSpecialURL(spec, parsed, replacements, charset_converter,
538 *output, *out_parsed);
539 }
540 return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
541 }
542
// Common precondition check run by every function that modifies the scheme
// registry. It only contains DCHECKs.
void DoSchemeModificationPreamble() {
  // If this assert triggers, it means you've called Add*Scheme after
  // the SchemeRegistry has been used.
  //
  // This normally means you're trying to set up a new scheme too late or using
  // the SchemeRegistry too early in your application's init process.
  DCHECK(!g_scheme_registries_used.load())
      << "Trying to add a scheme after the lists have been used. "
         "Make sure that you haven't added any static GURL initializers in tests.";

  // If this assert triggers, it means you've called Add*Scheme after
  // LockSchemeRegistries has been called (see the header file for
  // LockSchemeRegistries for more).
  //
  // This normally means you're trying to set up a new scheme too late in your
  // application's init process. Locate where your app does this initialization
  // and calls LockSchemeRegistries, and add your new scheme there.
  DCHECK(!scheme_registries_locked)
      << "Trying to add a scheme after the lists have been locked.";
}
563
DoAddSchemeWithHandler(const char * new_scheme,const char * handler,std::vector<SchemeWithHandler> * schemes)564 void DoAddSchemeWithHandler(const char* new_scheme,
565 const char* handler,
566 std::vector<SchemeWithHandler>* schemes) {
567 DoSchemeModificationPreamble();
568 DCHECK(schemes);
569 DCHECK(strlen(new_scheme) > 0);
570 DCHECK(strlen(handler) > 0);
571 DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme);
572 DCHECK(!base::Contains(*schemes, new_scheme, &SchemeWithHandler::scheme));
573 schemes->push_back({new_scheme, handler});
574 }
575
DoAddScheme(const char * new_scheme,std::vector<std::string> * schemes)576 void DoAddScheme(const char* new_scheme, std::vector<std::string>* schemes) {
577 DoSchemeModificationPreamble();
578 DCHECK(schemes);
579 DCHECK(strlen(new_scheme) > 0);
580 DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme);
581 DCHECK(!base::Contains(*schemes, new_scheme));
582 schemes->push_back(new_scheme);
583 }
584
DoAddSchemeWithType(const char * new_scheme,SchemeType type,std::vector<SchemeWithType> * schemes)585 void DoAddSchemeWithType(const char* new_scheme,
586 SchemeType type,
587 std::vector<SchemeWithType>* schemes) {
588 DoSchemeModificationPreamble();
589 DCHECK(schemes);
590 DCHECK(strlen(new_scheme) > 0);
591 DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme);
592 DCHECK(!base::Contains(*schemes, new_scheme, &SchemeWithType::scheme));
593 schemes->push_back({new_scheme, type});
594 }
595
596 } // namespace
597
ClearSchemesForTests()598 void ClearSchemesForTests() {
599 DCHECK(!g_scheme_registries_used.load())
600 << "Schemes already used "
601 << "(use ScopedSchemeRegistryForTests to relax for tests).";
602 DCHECK(!scheme_registries_locked)
603 << "Schemes already locked "
604 << "(use ScopedSchemeRegistryForTests to relax for tests).";
605 *GetSchemeRegistryWithoutLocking() = SchemeRegistry();
606 }
607
608 class ScopedSchemeRegistryInternal {
609 public:
ScopedSchemeRegistryInternal()610 ScopedSchemeRegistryInternal()
611 : registry_(std::make_unique<SchemeRegistry>(
612 *GetSchemeRegistryWithoutLocking())) {
613 g_scheme_registries_used.store(false);
614 scheme_registries_locked = false;
615 }
~ScopedSchemeRegistryInternal()616 ~ScopedSchemeRegistryInternal() {
617 *GetSchemeRegistryWithoutLocking() = *registry_;
618 g_scheme_registries_used.store(true);
619 scheme_registries_locked = true;
620 }
621
622 private:
623 std::unique_ptr<SchemeRegistry> registry_;
624 };
625
// Constructing the object creates a ScopedSchemeRegistryInternal, which
// snapshots the registry and reopens it for modification.
ScopedSchemeRegistryForTests::ScopedSchemeRegistryForTests()
    : internal_(std::make_unique<ScopedSchemeRegistryInternal>()) {}

// Destroying |internal_| restores the snapshot taken at construction.
ScopedSchemeRegistryForTests::~ScopedSchemeRegistryForTests() = default;
630
EnableNonStandardSchemesForAndroidWebView()631 void EnableNonStandardSchemesForAndroidWebView() {
632 DoSchemeModificationPreamble();
633 GetSchemeRegistryWithoutLocking()->allow_non_standard_schemes = true;
634 }
635
AllowNonStandardSchemesForAndroidWebView()636 bool AllowNonStandardSchemesForAndroidWebView() {
637 return GetSchemeRegistry().allow_non_standard_schemes;
638 }
639
AddStandardScheme(const char * new_scheme,SchemeType type)640 void AddStandardScheme(const char* new_scheme, SchemeType type) {
641 DoAddSchemeWithType(new_scheme, type,
642 &GetSchemeRegistryWithoutLocking()->standard_schemes);
643 }
644
GetStandardSchemes()645 std::vector<std::string> GetStandardSchemes() {
646 std::vector<std::string> result;
647 result.reserve(GetSchemeRegistry().standard_schemes.size());
648 for (const auto& entry : GetSchemeRegistry().standard_schemes) {
649 result.push_back(entry.scheme);
650 }
651 return result;
652 }
653
AddReferrerScheme(const char * new_scheme,SchemeType type)654 void AddReferrerScheme(const char* new_scheme, SchemeType type) {
655 DoAddSchemeWithType(new_scheme, type,
656 &GetSchemeRegistryWithoutLocking()->referrer_schemes);
657 }
658
AddSecureScheme(const char * new_scheme)659 void AddSecureScheme(const char* new_scheme) {
660 DoAddScheme(new_scheme, &GetSchemeRegistryWithoutLocking()->secure_schemes);
661 }
662
GetSecureSchemes()663 const std::vector<std::string>& GetSecureSchemes() {
664 return GetSchemeRegistry().secure_schemes;
665 }
666
AddLocalScheme(const char * new_scheme)667 void AddLocalScheme(const char* new_scheme) {
668 DoAddScheme(new_scheme, &GetSchemeRegistryWithoutLocking()->local_schemes);
669 }
670
GetLocalSchemes()671 const std::vector<std::string>& GetLocalSchemes() {
672 return GetSchemeRegistry().local_schemes;
673 }
674
AddNoAccessScheme(const char * new_scheme)675 void AddNoAccessScheme(const char* new_scheme) {
676 DoAddScheme(new_scheme,
677 &GetSchemeRegistryWithoutLocking()->no_access_schemes);
678 }
679
GetNoAccessSchemes()680 const std::vector<std::string>& GetNoAccessSchemes() {
681 return GetSchemeRegistry().no_access_schemes;
682 }
683
AddCorsEnabledScheme(const char * new_scheme)684 void AddCorsEnabledScheme(const char* new_scheme) {
685 DoAddScheme(new_scheme,
686 &GetSchemeRegistryWithoutLocking()->cors_enabled_schemes);
687 }
688
GetCorsEnabledSchemes()689 const std::vector<std::string>& GetCorsEnabledSchemes() {
690 return GetSchemeRegistry().cors_enabled_schemes;
691 }
692
AddWebStorageScheme(const char * new_scheme)693 void AddWebStorageScheme(const char* new_scheme) {
694 DoAddScheme(new_scheme,
695 &GetSchemeRegistryWithoutLocking()->web_storage_schemes);
696 }
697
GetWebStorageSchemes()698 const std::vector<std::string>& GetWebStorageSchemes() {
699 return GetSchemeRegistry().web_storage_schemes;
700 }
701
AddCSPBypassingScheme(const char * new_scheme)702 void AddCSPBypassingScheme(const char* new_scheme) {
703 DoAddScheme(new_scheme,
704 &GetSchemeRegistryWithoutLocking()->csp_bypassing_schemes);
705 }
706
GetCSPBypassingSchemes()707 const std::vector<std::string>& GetCSPBypassingSchemes() {
708 return GetSchemeRegistry().csp_bypassing_schemes;
709 }
710
AddEmptyDocumentScheme(const char * new_scheme)711 void AddEmptyDocumentScheme(const char* new_scheme) {
712 DoAddScheme(new_scheme,
713 &GetSchemeRegistryWithoutLocking()->empty_document_schemes);
714 }
715
GetEmptyDocumentSchemes()716 const std::vector<std::string>& GetEmptyDocumentSchemes() {
717 return GetSchemeRegistry().empty_document_schemes;
718 }
719
AddPredefinedHandlerScheme(const char * new_scheme,const char * handler)720 void AddPredefinedHandlerScheme(const char* new_scheme, const char* handler) {
721 DoAddSchemeWithHandler(
722 new_scheme, handler,
723 &GetSchemeRegistryWithoutLocking()->predefined_handler_schemes);
724 }
725
GetPredefinedHandlerSchemes()726 std::vector<std::pair<std::string, std::string>> GetPredefinedHandlerSchemes() {
727 std::vector<std::pair<std::string, std::string>> result;
728 result.reserve(GetSchemeRegistry().predefined_handler_schemes.size());
729 for (const SchemeWithHandler& entry :
730 GetSchemeRegistry().predefined_handler_schemes) {
731 result.emplace_back(entry.scheme, entry.handler);
732 }
733 return result;
734 }
735
// Marks the scheme registries as locked. After this call the DCHECK in
// DoSchemeModificationPreamble() fires on any further scheme registration.
void LockSchemeRegistries() {
  scheme_registries_locked = true;
}
739
IsStandard(const char * spec,const Component & scheme)740 bool IsStandard(const char* spec, const Component& scheme) {
741 SchemeType unused_scheme_type;
742 return DoIsStandard(spec, scheme, &unused_scheme_type);
743 }
744
IsStandardScheme(std::string_view scheme)745 bool IsStandardScheme(std::string_view scheme) {
746 return IsStandard(scheme.data(),
747 Component(0, base::checked_cast<int>(scheme.size())));
748 }
749
GetStandardSchemeType(const char * spec,const Component & scheme,SchemeType * type)750 bool GetStandardSchemeType(const char* spec,
751 const Component& scheme,
752 SchemeType* type) {
753 return DoIsStandard(spec, scheme, type);
754 }
755
GetStandardSchemeType(const char16_t * spec,const Component & scheme,SchemeType * type)756 bool GetStandardSchemeType(const char16_t* spec,
757 const Component& scheme,
758 SchemeType* type) {
759 return DoIsStandard(spec, scheme, type);
760 }
761
IsStandard(const char16_t * spec,const Component & scheme)762 bool IsStandard(const char16_t* spec, const Component& scheme) {
763 SchemeType unused_scheme_type;
764 return DoIsStandard(spec, scheme, &unused_scheme_type);
765 }
766
IsReferrerScheme(const char * spec,const Component & scheme)767 bool IsReferrerScheme(const char* spec, const Component& scheme) {
768 SchemeType unused_scheme_type;
769 return DoIsInSchemes(spec, scheme, &unused_scheme_type,
770 GetSchemeRegistry().referrer_schemes);
771 }
772
FindAndCompareScheme(const char * str,int str_len,const char * compare,Component * found_scheme)773 bool FindAndCompareScheme(const char* str,
774 int str_len,
775 const char* compare,
776 Component* found_scheme) {
777 return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
778 }
779
FindAndCompareScheme(const char16_t * str,int str_len,const char * compare,Component * found_scheme)780 bool FindAndCompareScheme(const char16_t* str,
781 int str_len,
782 const char* compare,
783 Component* found_scheme) {
784 return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
785 }
786
// Returns true when |canonical_host| is |canonical_domain| itself or a
// subdomain of it. Both inputs are compared byte-for-byte as given; empty
// input on either side never matches.
//
// A single trailing dot on the host is ignored when the domain has none, so
// "google.com." matches "google.com".
bool DomainIs(std::string_view canonical_host,
              std::string_view canonical_domain) {
  if (canonical_host.empty() || canonical_domain.empty()) {
    return false;
  }

  // Drop the host's trailing dot only when the domain was written without one.
  std::string_view host = canonical_host;
  if (host.back() == '.' && canonical_domain.back() != '.') {
    host.remove_suffix(1);
  }

  if (host.size() < canonical_domain.size()) {
    return false;
  }

  // The domain has to be a suffix of the (possibly trimmed) host.
  const size_t suffix_start = host.size() - canonical_domain.size();
  if (host.substr(suffix_start) != canonical_domain) {
    return false;
  }

  // An exact match, or a domain that itself begins with a dot, needs no
  // further boundary check.
  if (suffix_start == 0 || canonical_domain.front() == '.') {
    return true;
  }

  // Otherwise the suffix must start right after a label separator:
  // www.google.com has domain "google.com", but www.iamnotgoogle.com does not.
  return host[suffix_start - 1] == '.';
}
822
HostIsIPAddress(std::string_view host)823 bool HostIsIPAddress(std::string_view host) {
824 STACK_UNINITIALIZED url::RawCanonOutputT<char, 128> ignored_output;
825 url::CanonHostInfo host_info;
826 url::CanonicalizeIPAddress(host.data(), Component(0, host.length()),
827 &ignored_output, &host_info);
828 return host_info.IsIPAddress();
829 }
830
Canonicalize(const char * spec,int spec_len,bool trim_path_end,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)831 bool Canonicalize(const char* spec,
832 int spec_len,
833 bool trim_path_end,
834 CharsetConverter* charset_converter,
835 CanonOutput* output,
836 Parsed* output_parsed) {
837 return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE,
838 charset_converter, output, output_parsed);
839 }
840
Canonicalize(const char16_t * spec,int spec_len,bool trim_path_end,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)841 bool Canonicalize(const char16_t* spec,
842 int spec_len,
843 bool trim_path_end,
844 CharsetConverter* charset_converter,
845 CanonOutput* output,
846 Parsed* output_parsed) {
847 return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE,
848 charset_converter, output, output_parsed);
849 }
850
ResolveRelative(const char * base_spec,int base_spec_len,const Parsed & base_parsed,const char * relative,int relative_length,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)851 bool ResolveRelative(const char* base_spec,
852 int base_spec_len,
853 const Parsed& base_parsed,
854 const char* relative,
855 int relative_length,
856 CharsetConverter* charset_converter,
857 CanonOutput* output,
858 Parsed* output_parsed) {
859 return DoResolveRelative(base_spec, base_spec_len, base_parsed,
860 relative, relative_length,
861 charset_converter, output, output_parsed);
862 }
863
ResolveRelative(const char * base_spec,int base_spec_len,const Parsed & base_parsed,const char16_t * relative,int relative_length,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)864 bool ResolveRelative(const char* base_spec,
865 int base_spec_len,
866 const Parsed& base_parsed,
867 const char16_t* relative,
868 int relative_length,
869 CharsetConverter* charset_converter,
870 CanonOutput* output,
871 Parsed* output_parsed) {
872 return DoResolveRelative(base_spec, base_spec_len, base_parsed,
873 relative, relative_length,
874 charset_converter, output, output_parsed);
875 }
876
ReplaceComponents(const char * spec,int spec_len,const Parsed & parsed,const Replacements<char> & replacements,CharsetConverter * charset_converter,CanonOutput * output,Parsed * out_parsed)877 bool ReplaceComponents(const char* spec,
878 int spec_len,
879 const Parsed& parsed,
880 const Replacements<char>& replacements,
881 CharsetConverter* charset_converter,
882 CanonOutput* output,
883 Parsed* out_parsed) {
884 return DoReplaceComponents(spec, spec_len, parsed, replacements,
885 charset_converter, output, out_parsed);
886 }
887
ReplaceComponents(const char * spec,int spec_len,const Parsed & parsed,const Replacements<char16_t> & replacements,CharsetConverter * charset_converter,CanonOutput * output,Parsed * out_parsed)888 bool ReplaceComponents(const char* spec,
889 int spec_len,
890 const Parsed& parsed,
891 const Replacements<char16_t>& replacements,
892 CharsetConverter* charset_converter,
893 CanonOutput* output,
894 Parsed* out_parsed) {
895 return DoReplaceComponents(spec, spec_len, parsed, replacements,
896 charset_converter, output, out_parsed);
897 }
898
DecodeURLEscapeSequences(std::string_view input,DecodeURLMode mode,CanonOutputW * output)899 void DecodeURLEscapeSequences(std::string_view input,
900 DecodeURLMode mode,
901 CanonOutputW* output) {
902 if (input.empty()) {
903 return;
904 }
905
906 STACK_UNINITIALIZED RawCanonOutputT<char> unescaped_chars;
907 for (size_t i = 0; i < input.length(); i++) {
908 if (input[i] == '%') {
909 unsigned char ch;
910 if (DecodeEscaped(input.data(), &i, input.length(), &ch)) {
911 unescaped_chars.push_back(ch);
912 } else {
913 // Invalid escape sequence, copy the percent literal.
914 unescaped_chars.push_back('%');
915 }
916 } else {
917 // Regular non-escaped 8-bit character.
918 unescaped_chars.push_back(input[i]);
919 }
920 }
921
922 int output_initial_length = output->length();
923 // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
924 // JavaScript URLs, but Firefox and Safari do.
925 size_t unescaped_length = unescaped_chars.length();
926 for (size_t i = 0; i < unescaped_length; i++) {
927 unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
928 if (uch < 0x80) {
929 // Non-UTF-8, just append directly
930 output->push_back(uch);
931 } else {
932 // next_ch will point to the last character of the decoded
933 // character.
934 size_t next_character = i;
935 base_icu::UChar32 code_point;
936 if (ReadUTFCharLossy(unescaped_chars.data(), &next_character,
937 unescaped_length, &code_point)) {
938 // Valid UTF-8 character, convert to UTF-16.
939 AppendUTF16Value(code_point, output);
940 i = next_character;
941 } else if (mode == DecodeURLMode::kUTF8) {
942 DCHECK_EQ(code_point, 0xFFFD);
943 AppendUTF16Value(code_point, output);
944 i = next_character;
945 } else {
946 // If there are any sequences that are not valid UTF-8, we
947 // revert |output| changes, and promote any bytes to UTF-16. We
948 // copy all characters from the beginning to the end of the
949 // identified sequence.
950 output->set_length(output_initial_length);
951 for (size_t j = 0; j < unescaped_chars.length(); ++j)
952 output->push_back(static_cast<unsigned char>(unescaped_chars.at(j)));
953 break;
954 }
955 }
956 }
957 }
958
EncodeURIComponent(std::string_view input,CanonOutput * output)959 void EncodeURIComponent(std::string_view input, CanonOutput* output) {
960 for (unsigned char c : input) {
961 if (IsComponentChar(c)) {
962 output->push_back(c);
963 } else {
964 AppendEscapedChar(c, output);
965 }
966 }
967 }
968
IsURIComponentChar(char c)969 bool IsURIComponentChar(char c) {
970 return IsComponentChar(c);
971 }
972
CompareSchemeComponent(const char * spec,const Component & component,const char * compare_to)973 bool CompareSchemeComponent(const char* spec,
974 const Component& component,
975 const char* compare_to) {
976 return DoCompareSchemeComponent(spec, component, compare_to);
977 }
978
CompareSchemeComponent(const char16_t * spec,const Component & component,const char * compare_to)979 bool CompareSchemeComponent(const char16_t* spec,
980 const Component& component,
981 const char* compare_to) {
982 return DoCompareSchemeComponent(spec, component, compare_to);
983 }
984
HasInvalidURLEscapeSequences(std::string_view input)985 bool HasInvalidURLEscapeSequences(std::string_view input) {
986 for (size_t i = 0; i < input.size(); i++) {
987 if (input[i] == '%') {
988 unsigned char ch;
989 if (!DecodeEscaped(input.data(), &i, input.size(), &ch)) {
990 return true;
991 }
992 }
993 }
994 return false;
995 }
996
IsAndroidWebViewHackEnabledScheme(std::string_view scheme)997 bool IsAndroidWebViewHackEnabledScheme(std::string_view scheme) {
998 return AllowNonStandardSchemesForAndroidWebView() &&
999 !IsStandardScheme(scheme);
1000 }
1001
1002 } // namespace url
1003