• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "url/url_util.h"
11 
12 #include <stddef.h>
13 #include <string.h>
14 
15 #include <atomic>
16 #include <ostream>
17 
18 #include "base/check_op.h"
19 #include "base/compiler_specific.h"
20 #include "base/containers/contains.h"
21 #include "base/no_destructor.h"
22 #include "base/strings/string_util.h"
23 #include "url/url_canon_internal.h"
24 #include "url/url_constants.h"
25 #include "url/url_features.h"
26 #include "url/url_file.h"
27 #include "url/url_parse_internal.h"
28 #include "url/url_util_internal.h"
29 
30 namespace url {
31 
32 namespace {
33 
34 // A pair for representing a standard scheme name and the SchemeType for it.
35 struct SchemeWithType {
36   std::string scheme;
37   SchemeType type;
38 };
39 
// Couples a scheme with the custom protocol handler registered for it.
//
// Both strings must be normalized protocol handler parameters, as described
// in the Custom Handler specification:
// https://html.spec.whatwg.org/multipage/system-state.html#normalize-protocol-handler-parameters
struct SchemeWithHandler {
  // The scheme name.
  std::string scheme;
  // The handler associated with |scheme|.
  std::string handler;
};
49 
50 // List of currently registered schemes and associated properties.
51 struct SchemeRegistry {
52   // Standard format schemes (see header for details).
53   std::vector<SchemeWithType> standard_schemes = {
54       {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
55       {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
56       // Yes, file URLs can have a hostname, so file URLs should be handled as
57       // "standard". File URLs never have a port as specified by the SchemeType
58       // field.  Unlike other SCHEME_WITH_HOST schemes, the 'host' in a file
59       // URL may be empty, a behavior which is special-cased during
60       // canonicalization.
61       {kFileScheme, SCHEME_WITH_HOST},
62       {kFtpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
63       {kWssScheme,
64        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},  // WebSocket secure.
65       {kWsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},  // WebSocket.
66       {kFileSystemScheme, SCHEME_WITHOUT_AUTHORITY},
67   };
68 
69   // Schemes that are allowed for referrers.
70   //
71   // WARNING: Adding (1) a non-"standard" scheme or (2) a scheme whose URLs have
72   // opaque origins could lead to surprising behavior in some of the referrer
73   // generation logic. In order to avoid surprises, be sure to have adequate
74   // test coverage in each of the multiple code locations that compute
75   // referrers.
76   std::vector<SchemeWithType> referrer_schemes = {
77       {kHttpsScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
78       {kHttpScheme, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION},
79   };
80 
81   // Schemes that do not trigger mixed content warning.
82   std::vector<std::string> secure_schemes = {
83       kHttpsScheme,
84       kWssScheme,
85       kDataScheme,
86       kAboutScheme,
87   };
88 
89   // Schemes that normal pages cannot link to or access (i.e., with the same
90   // security rules as those applied to "file" URLs).
91   std::vector<std::string> local_schemes = {
92       kFileScheme,
93   };
94 
95   // Schemes that cause pages loaded with them to not have access to pages
96   // loaded with any other URL scheme.
97   std::vector<std::string> no_access_schemes = {
98       kAboutScheme,
99       kJavaScriptScheme,
100       kDataScheme,
101   };
102 
103   // Schemes that can be sent CORS requests.
104   std::vector<std::string> cors_enabled_schemes = {
105       kHttpsScheme,
106       kHttpScheme,
107       kDataScheme,
108   };
109 
110   // Schemes that can be used by web to store data (local storage, etc).
111   std::vector<std::string> web_storage_schemes = {
112       kHttpsScheme, kHttpScheme, kFileScheme, kFtpScheme, kWssScheme, kWsScheme,
113   };
114 
115   // Schemes that can bypass the Content-Security-Policy (CSP) checks.
116   std::vector<std::string> csp_bypassing_schemes = {};
117 
118   // Schemes that are strictly empty documents, allowing them to commit
119   // synchronously.
120   std::vector<std::string> empty_document_schemes = {
121       kAboutScheme,
122   };
123 
124   // Non-special schemes that should be treated as opaque path URLs for
125   // compatibility reasons.
126   std::vector<std::string> opaque_non_special_schemes = {
127       // See https://crrev.com/c/5465607 for the reason.
128       kAndroidScheme,
129       // Temporarily opted-out. See https://crrev.com/c/5569365.
130       kDrivefsScheme,
131       // Temporarily opted-out. See https://crrev.com/c/5568919.
132       kChromeosSteamScheme,
133       kSteamScheme,
134       // Temporarily opted-out. See https://crrev.com/c/5578066.
135       kMaterializedViewScheme,
136   };
137 
138   // Schemes with a predefined default custom handler.
139   std::vector<SchemeWithHandler> predefined_handler_schemes;
140 
141   bool allow_non_standard_schemes = false;
142 };
143 
// Lock flag that is checked before any scheme modification. See the
// LockSchemeRegistries declaration in the header.
bool scheme_registries_locked = false;

// Records that the registry has been used, so that attempts to modify the
// schemes after first use can be caught.
std::atomic<bool> g_scheme_registries_used{false};
149 
150 // Gets the scheme registry without locking the schemes. This should *only* be
151 // used for adding schemes to the registry.
GetSchemeRegistryWithoutLocking()152 SchemeRegistry* GetSchemeRegistryWithoutLocking() {
153   static base::NoDestructor<SchemeRegistry> registry;
154   return registry.get();
155 }
156 
GetSchemeRegistry()157 const SchemeRegistry& GetSchemeRegistry() {
158 #if DCHECK_IS_ON()
159   g_scheme_registries_used.store(true);
160 #endif
161   return *GetSchemeRegistryWithoutLocking();
162 }
163 
// Tells a method whether it still has to remove whitespace from its input.
enum WhitespaceRemovalPolicy {
  // Whitespace removal is necessary.
  REMOVE_WHITESPACE,
  // Whitespace removal is not necessary.
  DO_NOT_REMOVE_WHITESPACE,
};
170 
171 // Given a string and a range inside the string, compares it to the given
172 // lower-case |compare_to| buffer.
173 template<typename CHAR>
DoCompareSchemeComponent(const CHAR * spec,const Component & component,const char * compare_to)174 inline bool DoCompareSchemeComponent(const CHAR* spec,
175                                      const Component& component,
176                                      const char* compare_to) {
177   if (component.is_empty())
178     return compare_to[0] == 0;  // When component is empty, match empty scheme.
179   return base::EqualsCaseInsensitiveASCII(
180       std::basic_string_view(&spec[component.begin], component.len),
181       compare_to);
182 }
183 
184 // Returns true and sets |type| to the SchemeType of the given scheme
185 // identified by |scheme| within |spec| if in |schemes|.
186 template<typename CHAR>
DoIsInSchemes(const CHAR * spec,const Component & scheme,SchemeType * type,const std::vector<SchemeWithType> & schemes)187 bool DoIsInSchemes(const CHAR* spec,
188                    const Component& scheme,
189                    SchemeType* type,
190                    const std::vector<SchemeWithType>& schemes) {
191   if (scheme.is_empty())
192     return false;  // Empty or invalid schemes are non-standard.
193 
194   for (const SchemeWithType& scheme_with_type : schemes) {
195     if (base::EqualsCaseInsensitiveASCII(
196             std::basic_string_view(&spec[scheme.begin], scheme.len),
197             scheme_with_type.scheme)) {
198       *type = scheme_with_type.type;
199       return true;
200     }
201   }
202   return false;
203 }
204 
205 template<typename CHAR>
DoIsStandard(const CHAR * spec,const Component & scheme,SchemeType * type)206 bool DoIsStandard(const CHAR* spec, const Component& scheme, SchemeType* type) {
207   return DoIsInSchemes(spec, scheme, type,
208                        GetSchemeRegistry().standard_schemes);
209 }
210 
211 template <typename CHAR>
DoIsOpaqueNonSpecial(const CHAR * spec,const Component & scheme)212 bool DoIsOpaqueNonSpecial(const CHAR* spec, const Component& scheme) {
213   if (scheme.is_empty()) {
214     return false;
215   }
216   for (const std::string& s : GetSchemeRegistry().opaque_non_special_schemes) {
217     if (base::EqualsCaseInsensitiveASCII(
218             std::basic_string_view(&spec[scheme.begin], scheme.len), s)) {
219       return true;
220     }
221   }
222   return false;
223 }
224 
225 template<typename CHAR>
DoFindAndCompareScheme(const CHAR * str,int str_len,const char * compare,Component * found_scheme)226 bool DoFindAndCompareScheme(const CHAR* str,
227                             int str_len,
228                             const char* compare,
229                             Component* found_scheme) {
230   // Before extracting scheme, canonicalize the URL to remove any whitespace.
231   // This matches the canonicalization done in DoCanonicalize function.
232   STACK_UNINITIALIZED RawCanonOutputT<CHAR> whitespace_buffer;
233   int spec_len;
234   const CHAR* spec =
235       RemoveURLWhitespace(str, str_len, &whitespace_buffer, &spec_len, nullptr);
236 
237   Component our_scheme;
238   if (!ExtractScheme(spec, spec_len, &our_scheme)) {
239     // No scheme.
240     if (found_scheme)
241       *found_scheme = Component();
242     return false;
243   }
244   if (found_scheme)
245     *found_scheme = our_scheme;
246   return DoCompareSchemeComponent(spec, our_scheme, compare);
247 }
248 
249 template <typename CHAR>
DoCanonicalize(const CHAR * spec,int spec_len,bool trim_path_end,WhitespaceRemovalPolicy whitespace_policy,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)250 bool DoCanonicalize(const CHAR* spec,
251                     int spec_len,
252                     bool trim_path_end,
253                     WhitespaceRemovalPolicy whitespace_policy,
254                     CharsetConverter* charset_converter,
255                     CanonOutput* output,
256                     Parsed* output_parsed) {
257   // Trim leading C0 control characters and spaces.
258   int begin = 0;
259   TrimURL(spec, &begin, &spec_len, trim_path_end);
260   DCHECK(0 <= begin && begin <= spec_len);
261   spec += begin;
262   spec_len -= begin;
263 
264   output->ReserveSizeIfNeeded(spec_len);
265 
266   // Remove any whitespace from the middle of the relative URL if necessary.
267   // Possibly this will result in copying to the new buffer.
268   STACK_UNINITIALIZED RawCanonOutputT<CHAR> whitespace_buffer;
269   if (whitespace_policy == REMOVE_WHITESPACE) {
270     spec = RemoveURLWhitespace(spec, spec_len, &whitespace_buffer, &spec_len,
271                                &output_parsed->potentially_dangling_markup);
272   }
273 
274 #ifdef WIN32
275   // For Windows, we allow things that look like absolute Windows paths to be
276   // fixed up magically to file URLs. This is done for IE compatibility. For
277   // example, this will change "c:/foo" into a file URL rather than treating
278   // it as a URL with the protocol "c". It also works for UNC ("\\foo\bar.txt").
279   // There is similar logic in url_canon_relative.cc for
280   //
281   // For Max & Unix, we don't do this (the equivalent would be "/foo/bar" which
282   // has no meaning as an absolute path name. This is because browsers on Mac
283   // & Unix don't generally do this, so there is no compatibility reason for
284   // doing so.
285   if (DoesBeginUNCPath(spec, 0, spec_len, false) ||
286       DoesBeginWindowsDriveSpec(spec, 0, spec_len)) {
287     return CanonicalizeFileURL(
288         spec, spec_len, ParseFileURL(std::basic_string_view(spec, spec_len)),
289         charset_converter, output, output_parsed);
290   }
291 #endif
292 
293   Component scheme;
294   if (!ExtractScheme(spec, spec_len, &scheme))
295     return false;
296 
297   // This is the parsed version of the input URL, we have to canonicalize it
298   // before storing it in our object.
299   bool success;
300   SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
301   if (DoCompareSchemeComponent(spec, scheme, url::kFileScheme)) {
302     // File URLs are special.
303     success = CanonicalizeFileURL(
304         spec, spec_len, ParseFileURL(std::basic_string_view(spec, spec_len)),
305         charset_converter, output, output_parsed);
306   } else if (DoCompareSchemeComponent(spec, scheme, url::kFileSystemScheme)) {
307     // Filesystem URLs are special.
308     success = CanonicalizeFileSystemURL(
309         spec, ParseFileSystemURL(std::basic_string_view(spec, spec_len)),
310         charset_converter, output, output_parsed);
311 
312   } else if (DoIsStandard(spec, scheme, &scheme_type)) {
313     // All "normal" URLs.
314     success = CanonicalizeStandardURL(
315         spec, ParseStandardURL(std::basic_string_view(spec, spec_len)),
316         scheme_type, charset_converter, output, output_parsed);
317 
318   } else if (!url::IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
319              DoCompareSchemeComponent(spec, scheme, url::kMailToScheme)) {
320     // Mailto URLs are treated like standard URLs, with only a scheme, path,
321     // and query.
322     //
323     // TODO(crbug.com/40063064): Remove the special handling of 'mailto:" scheme
324     // URLs. "mailto:" is simply one of non-special URLs.
325     success = CanonicalizeMailtoURL(
326         spec, spec_len, ParseMailtoURL(std::basic_string_view(spec, spec_len)),
327         output, output_parsed);
328 
329   } else {
330     // Non-special scheme URLs like data: and javascript:.
331     if (url::IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
332         !DoIsOpaqueNonSpecial(spec, scheme)) {
333       success = CanonicalizeNonSpecialURL(
334           spec, spec_len,
335           ParseNonSpecialURLInternal(std::basic_string_view(spec, spec_len),
336                                      trim_path_end),
337           charset_converter, *output, *output_parsed);
338     } else {
339       success = CanonicalizePathURL(
340           spec, spec_len,
341           ParsePathURL(std::basic_string_view(spec, spec_len), trim_path_end),
342           output, output_parsed);
343     }
344   }
345   return success;
346 }
347 
348 template<typename CHAR>
DoResolveRelative(const char * base_spec,int base_spec_len,const Parsed & base_parsed,const CHAR * in_relative,int in_relative_length,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)349 bool DoResolveRelative(const char* base_spec,
350                        int base_spec_len,
351                        const Parsed& base_parsed,
352                        const CHAR* in_relative,
353                        int in_relative_length,
354                        CharsetConverter* charset_converter,
355                        CanonOutput* output,
356                        Parsed* output_parsed) {
357   // Remove any whitespace from the middle of the relative URL, possibly
358   // copying to the new buffer.
359   STACK_UNINITIALIZED RawCanonOutputT<CHAR> whitespace_buffer;
360   int relative_length;
361   const CHAR* relative = RemoveURLWhitespace(
362       in_relative, in_relative_length, &whitespace_buffer, &relative_length,
363       &output_parsed->potentially_dangling_markup);
364 
365   bool base_is_authority_based = false;
366   bool base_is_hierarchical = false;
367   if (base_spec &&
368       base_parsed.scheme.is_nonempty()) {
369     int after_scheme = base_parsed.scheme.end() + 1;  // Skip past the colon.
370     int num_slashes = CountConsecutiveSlashes(base_spec, after_scheme,
371                                               base_spec_len);
372     base_is_authority_based = num_slashes > 1;
373     base_is_hierarchical = num_slashes > 0;
374   }
375 
376   bool is_hierarchical_base;
377 
378   if (url::IsUsingStandardCompliantNonSpecialSchemeURLParsing()) {
379     is_hierarchical_base =
380         base_parsed.scheme.is_nonempty() && !base_parsed.has_opaque_path;
381   } else {
382     SchemeType unused_scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
383     is_hierarchical_base =
384         base_parsed.scheme.is_nonempty() &&
385         DoIsStandard(base_spec, base_parsed.scheme, &unused_scheme_type);
386   }
387 
388   bool is_relative;
389   Component relative_component;
390   if (!IsRelativeURL(base_spec, base_parsed, relative, relative_length,
391                      (base_is_hierarchical || is_hierarchical_base),
392                      &is_relative, &relative_component)) {
393     // Error resolving.
394     return false;
395   }
396 
397   // Don't reserve buffer space here. Instead, reserve in DoCanonicalize and
398   // ReserveRelativeURL, to enable more accurate buffer sizes.
399 
400   // Pretend for a moment that |base_spec| is a standard URL. Normally
401   // non-standard URLs are treated as PathURLs, but if the base has an
402   // authority we would like to preserve it.
403   if (is_relative && base_is_authority_based && !is_hierarchical_base) {
404     Parsed base_parsed_authority =
405         ParseStandardURL(std::string_view(base_spec, base_spec_len));
406     if (base_parsed_authority.host.is_nonempty()) {
407       STACK_UNINITIALIZED RawCanonOutputT<char> temporary_output;
408       bool did_resolve_succeed =
409           ResolveRelativeURL(base_spec, base_parsed_authority, false, relative,
410                              relative_component, charset_converter,
411                              &temporary_output, output_parsed);
412       // The output_parsed is incorrect at this point (because it was built
413       // based on base_parsed_authority instead of base_parsed) and needs to be
414       // re-created.
415       DoCanonicalize(temporary_output.data(), temporary_output.length(), true,
416                      REMOVE_WHITESPACE, charset_converter, output,
417                      output_parsed);
418       return did_resolve_succeed;
419     }
420   } else if (is_relative) {
421     // Relative, resolve and canonicalize.
422     bool file_base_scheme = base_parsed.scheme.is_nonempty() &&
423         DoCompareSchemeComponent(base_spec, base_parsed.scheme, kFileScheme);
424     return ResolveRelativeURL(base_spec, base_parsed, file_base_scheme,
425                               relative, relative_component, charset_converter,
426                               output, output_parsed);
427   }
428 
429   // Not relative, canonicalize the input.
430   return DoCanonicalize(relative, relative_length, true,
431                         DO_NOT_REMOVE_WHITESPACE, charset_converter, output,
432                         output_parsed);
433 }
434 
435 template<typename CHAR>
DoReplaceComponents(const char * spec,int spec_len,const Parsed & parsed,const Replacements<CHAR> & replacements,CharsetConverter * charset_converter,CanonOutput * output,Parsed * out_parsed)436 bool DoReplaceComponents(const char* spec,
437                          int spec_len,
438                          const Parsed& parsed,
439                          const Replacements<CHAR>& replacements,
440                          CharsetConverter* charset_converter,
441                          CanonOutput* output,
442                          Parsed* out_parsed) {
443   // If the scheme is overridden, just do a simple string substitution and
444   // re-parse the whole thing. There are lots of edge cases that we really don't
445   // want to deal with. Like what happens if I replace "http://e:8080/foo"
446   // with a file. Does it become "file:///E:/8080/foo" where the port number
447   // becomes part of the path? Parsing that string as a file URL says "yes"
448   // but almost no sane rule for dealing with the components individually would
449   // come up with that.
450   //
451   // Why allow these crazy cases at all? Programatically, there is almost no
452   // case for replacing the scheme. The most common case for hitting this is
453   // in JS when building up a URL using the location object. In this case, the
454   // JS code expects the string substitution behavior:
455   //   http://www.w3.org/TR/2008/WD-html5-20080610/structured.html#common3
456   if (replacements.IsSchemeOverridden()) {
457     // Canonicalize the new scheme so it is 8-bit and can be concatenated with
458     // the existing spec.
459     STACK_UNINITIALIZED RawCanonOutput<128> scheme_replaced;
460     Component scheme_replaced_parsed;
461     CanonicalizeScheme(replacements.sources().scheme,
462                        replacements.components().scheme,
463                        &scheme_replaced, &scheme_replaced_parsed);
464 
465     // We can assume that the input is canonicalized, which means it always has
466     // a colon after the scheme (or where the scheme would be).
467     int spec_after_colon = parsed.scheme.is_valid() ? parsed.scheme.end() + 1
468                                                     : 1;
469     if (spec_len - spec_after_colon > 0) {
470       scheme_replaced.Append(&spec[spec_after_colon],
471                              spec_len - spec_after_colon);
472     }
473 
474     // We now need to completely re-parse the resulting string since its meaning
475     // may have changed with the different scheme.
476     STACK_UNINITIALIZED RawCanonOutput<128> recanonicalized;
477     Parsed recanonicalized_parsed;
478     DoCanonicalize(scheme_replaced.data(), scheme_replaced.length(), true,
479                    REMOVE_WHITESPACE, charset_converter, &recanonicalized,
480                    &recanonicalized_parsed);
481 
482     // Recurse using the version with the scheme already replaced. This will now
483     // use the replacement rules for the new scheme.
484     //
485     // Warning: this code assumes that ReplaceComponents will re-check all
486     // components for validity. This is because we can't fail if DoCanonicalize
487     // failed above since theoretically the thing making it fail could be
488     // getting replaced here. If ReplaceComponents didn't re-check everything,
489     // we wouldn't know if something *not* getting replaced is a problem.
490     // If the scheme-specific replacers are made more intelligent so they don't
491     // re-check everything, we should instead re-canonicalize the whole thing
492     // after this call to check validity (this assumes replacing the scheme is
493     // much much less common than other types of replacements, like clearing the
494     // ref).
495     Replacements<CHAR> replacements_no_scheme = replacements;
496     replacements_no_scheme.SetScheme(NULL, Component());
497     // If the input URL has potentially dangling markup, set the flag on the
498     // output too. Note that in some cases the replacement gets rid of the
499     // potentially dangling markup, but this ok since the check will fail
500     // closed.
501     if (parsed.potentially_dangling_markup) {
502       out_parsed->potentially_dangling_markup = true;
503     }
504     return DoReplaceComponents(recanonicalized.data(), recanonicalized.length(),
505                                recanonicalized_parsed, replacements_no_scheme,
506                                charset_converter, output, out_parsed);
507   }
508 
509   // TODO(csharrison): We could be smarter about size to reserve if this is done
510   // in callers below, and the code checks to see which components are being
511   // replaced, and with what length. If this ends up being a hot spot it should
512   // be changed.
513   output->ReserveSizeIfNeeded(spec_len);
514 
515   // If we get here, then we know the scheme doesn't need to be replaced, so can
516   // just key off the scheme in the spec to know how to do the replacements.
517   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileScheme)) {
518     return ReplaceFileURL(spec, parsed, replacements, charset_converter, output,
519                           out_parsed);
520   }
521   if (DoCompareSchemeComponent(spec, parsed.scheme, url::kFileSystemScheme)) {
522     return ReplaceFileSystemURL(spec, parsed, replacements, charset_converter,
523                                 output, out_parsed);
524   }
525   SchemeType scheme_type = SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION;
526   if (DoIsStandard(spec, parsed.scheme, &scheme_type)) {
527     return ReplaceStandardURL(spec, parsed, replacements, scheme_type,
528                               charset_converter, output, out_parsed);
529   }
530   if (!IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
531       DoCompareSchemeComponent(spec, parsed.scheme, url::kMailToScheme)) {
532     return ReplaceMailtoURL(spec, parsed, replacements, output, out_parsed);
533   }
534 
535   if (IsUsingStandardCompliantNonSpecialSchemeURLParsing() &&
536       !DoIsOpaqueNonSpecial(spec, parsed.scheme)) {
537     return ReplaceNonSpecialURL(spec, parsed, replacements, charset_converter,
538                                 *output, *out_parsed);
539   }
540   return ReplacePathURL(spec, parsed, replacements, output, out_parsed);
541 }
542 
DoSchemeModificationPreamble()543 void DoSchemeModificationPreamble() {
544   // If this assert triggers, it means you've called Add*Scheme after
545   // the SchemeRegistry has been used.
546   //
547   // This normally means you're trying to set up a new scheme too late or using
548   // the SchemeRegistry too early in your application's init process.
549   DCHECK(!g_scheme_registries_used.load())
550       << "Trying to add a scheme after the lists have been used. "
551          "Make sure that you haven't added any static GURL initializers in tests.";
552 
553   // If this assert triggers, it means you've called Add*Scheme after
554   // LockSchemeRegistries has been called (see the header file for
555   // LockSchemeRegistries for more).
556   //
557   // This normally means you're trying to set up a new scheme too late in your
558   // application's init process. Locate where your app does this initialization
559   // and calls LockSchemeRegistries, and add your new scheme there.
560   DCHECK(!scheme_registries_locked)
561       << "Trying to add a scheme after the lists have been locked.";
562 }
563 
DoAddSchemeWithHandler(const char * new_scheme,const char * handler,std::vector<SchemeWithHandler> * schemes)564 void DoAddSchemeWithHandler(const char* new_scheme,
565                             const char* handler,
566                             std::vector<SchemeWithHandler>* schemes) {
567   DoSchemeModificationPreamble();
568   DCHECK(schemes);
569   DCHECK(strlen(new_scheme) > 0);
570   DCHECK(strlen(handler) > 0);
571   DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme);
572   DCHECK(!base::Contains(*schemes, new_scheme, &SchemeWithHandler::scheme));
573   schemes->push_back({new_scheme, handler});
574 }
575 
DoAddScheme(const char * new_scheme,std::vector<std::string> * schemes)576 void DoAddScheme(const char* new_scheme, std::vector<std::string>* schemes) {
577   DoSchemeModificationPreamble();
578   DCHECK(schemes);
579   DCHECK(strlen(new_scheme) > 0);
580   DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme);
581   DCHECK(!base::Contains(*schemes, new_scheme));
582   schemes->push_back(new_scheme);
583 }
584 
DoAddSchemeWithType(const char * new_scheme,SchemeType type,std::vector<SchemeWithType> * schemes)585 void DoAddSchemeWithType(const char* new_scheme,
586                          SchemeType type,
587                          std::vector<SchemeWithType>* schemes) {
588   DoSchemeModificationPreamble();
589   DCHECK(schemes);
590   DCHECK(strlen(new_scheme) > 0);
591   DCHECK_EQ(base::ToLowerASCII(new_scheme), new_scheme);
592   DCHECK(!base::Contains(*schemes, new_scheme, &SchemeWithType::scheme));
593   schemes->push_back({new_scheme, type});
594 }
595 
596 }  // namespace
597 
ClearSchemesForTests()598 void ClearSchemesForTests() {
599   DCHECK(!g_scheme_registries_used.load())
600       << "Schemes already used "
601       << "(use ScopedSchemeRegistryForTests to relax for tests).";
602   DCHECK(!scheme_registries_locked)
603       << "Schemes already locked "
604       << "(use ScopedSchemeRegistryForTests to relax for tests).";
605   *GetSchemeRegistryWithoutLocking() = SchemeRegistry();
606 }
607 
608 class ScopedSchemeRegistryInternal {
609  public:
ScopedSchemeRegistryInternal()610   ScopedSchemeRegistryInternal()
611       : registry_(std::make_unique<SchemeRegistry>(
612             *GetSchemeRegistryWithoutLocking())) {
613     g_scheme_registries_used.store(false);
614     scheme_registries_locked = false;
615   }
~ScopedSchemeRegistryInternal()616   ~ScopedSchemeRegistryInternal() {
617     *GetSchemeRegistryWithoutLocking() = *registry_;
618     g_scheme_registries_used.store(true);
619     scheme_registries_locked = true;
620   }
621 
622  private:
623   std::unique_ptr<SchemeRegistry> registry_;
624 };
625 
ScopedSchemeRegistryForTests()626 ScopedSchemeRegistryForTests::ScopedSchemeRegistryForTests()
627     : internal_(std::make_unique<ScopedSchemeRegistryInternal>()) {}
628 
629 ScopedSchemeRegistryForTests::~ScopedSchemeRegistryForTests() = default;
630 
EnableNonStandardSchemesForAndroidWebView()631 void EnableNonStandardSchemesForAndroidWebView() {
632   DoSchemeModificationPreamble();
633   GetSchemeRegistryWithoutLocking()->allow_non_standard_schemes = true;
634 }
635 
AllowNonStandardSchemesForAndroidWebView()636 bool AllowNonStandardSchemesForAndroidWebView() {
637   return GetSchemeRegistry().allow_non_standard_schemes;
638 }
639 
AddStandardScheme(const char * new_scheme,SchemeType type)640 void AddStandardScheme(const char* new_scheme, SchemeType type) {
641   DoAddSchemeWithType(new_scheme, type,
642                       &GetSchemeRegistryWithoutLocking()->standard_schemes);
643 }
644 
GetStandardSchemes()645 std::vector<std::string> GetStandardSchemes() {
646   std::vector<std::string> result;
647   result.reserve(GetSchemeRegistry().standard_schemes.size());
648   for (const auto& entry : GetSchemeRegistry().standard_schemes) {
649     result.push_back(entry.scheme);
650   }
651   return result;
652 }
653 
AddReferrerScheme(const char * new_scheme,SchemeType type)654 void AddReferrerScheme(const char* new_scheme, SchemeType type) {
655   DoAddSchemeWithType(new_scheme, type,
656                       &GetSchemeRegistryWithoutLocking()->referrer_schemes);
657 }
658 
AddSecureScheme(const char * new_scheme)659 void AddSecureScheme(const char* new_scheme) {
660   DoAddScheme(new_scheme, &GetSchemeRegistryWithoutLocking()->secure_schemes);
661 }
662 
GetSecureSchemes()663 const std::vector<std::string>& GetSecureSchemes() {
664   return GetSchemeRegistry().secure_schemes;
665 }
666 
AddLocalScheme(const char * new_scheme)667 void AddLocalScheme(const char* new_scheme) {
668   DoAddScheme(new_scheme, &GetSchemeRegistryWithoutLocking()->local_schemes);
669 }
670 
GetLocalSchemes()671 const std::vector<std::string>& GetLocalSchemes() {
672   return GetSchemeRegistry().local_schemes;
673 }
674 
AddNoAccessScheme(const char * new_scheme)675 void AddNoAccessScheme(const char* new_scheme) {
676   DoAddScheme(new_scheme,
677               &GetSchemeRegistryWithoutLocking()->no_access_schemes);
678 }
679 
GetNoAccessSchemes()680 const std::vector<std::string>& GetNoAccessSchemes() {
681   return GetSchemeRegistry().no_access_schemes;
682 }
683 
AddCorsEnabledScheme(const char * new_scheme)684 void AddCorsEnabledScheme(const char* new_scheme) {
685   DoAddScheme(new_scheme,
686               &GetSchemeRegistryWithoutLocking()->cors_enabled_schemes);
687 }
688 
GetCorsEnabledSchemes()689 const std::vector<std::string>& GetCorsEnabledSchemes() {
690   return GetSchemeRegistry().cors_enabled_schemes;
691 }
692 
AddWebStorageScheme(const char * new_scheme)693 void AddWebStorageScheme(const char* new_scheme) {
694   DoAddScheme(new_scheme,
695               &GetSchemeRegistryWithoutLocking()->web_storage_schemes);
696 }
697 
GetWebStorageSchemes()698 const std::vector<std::string>& GetWebStorageSchemes() {
699   return GetSchemeRegistry().web_storage_schemes;
700 }
701 
AddCSPBypassingScheme(const char * new_scheme)702 void AddCSPBypassingScheme(const char* new_scheme) {
703   DoAddScheme(new_scheme,
704               &GetSchemeRegistryWithoutLocking()->csp_bypassing_schemes);
705 }
706 
GetCSPBypassingSchemes()707 const std::vector<std::string>& GetCSPBypassingSchemes() {
708   return GetSchemeRegistry().csp_bypassing_schemes;
709 }
710 
AddEmptyDocumentScheme(const char * new_scheme)711 void AddEmptyDocumentScheme(const char* new_scheme) {
712   DoAddScheme(new_scheme,
713               &GetSchemeRegistryWithoutLocking()->empty_document_schemes);
714 }
715 
GetEmptyDocumentSchemes()716 const std::vector<std::string>& GetEmptyDocumentSchemes() {
717   return GetSchemeRegistry().empty_document_schemes;
718 }
719 
AddPredefinedHandlerScheme(const char * new_scheme,const char * handler)720 void AddPredefinedHandlerScheme(const char* new_scheme, const char* handler) {
721   DoAddSchemeWithHandler(
722       new_scheme, handler,
723       &GetSchemeRegistryWithoutLocking()->predefined_handler_schemes);
724 }
725 
GetPredefinedHandlerSchemes()726 std::vector<std::pair<std::string, std::string>> GetPredefinedHandlerSchemes() {
727   std::vector<std::pair<std::string, std::string>> result;
728   result.reserve(GetSchemeRegistry().predefined_handler_schemes.size());
729   for (const SchemeWithHandler& entry :
730        GetSchemeRegistry().predefined_handler_schemes) {
731     result.emplace_back(entry.scheme, entry.handler);
732   }
733   return result;
734 }
735 
LockSchemeRegistries()736 void LockSchemeRegistries() {
737   scheme_registries_locked = true;
738 }
739 
IsStandard(const char * spec,const Component & scheme)740 bool IsStandard(const char* spec, const Component& scheme) {
741   SchemeType unused_scheme_type;
742   return DoIsStandard(spec, scheme, &unused_scheme_type);
743 }
744 
IsStandardScheme(std::string_view scheme)745 bool IsStandardScheme(std::string_view scheme) {
746   return IsStandard(scheme.data(),
747                     Component(0, base::checked_cast<int>(scheme.size())));
748 }
749 
GetStandardSchemeType(const char * spec,const Component & scheme,SchemeType * type)750 bool GetStandardSchemeType(const char* spec,
751                            const Component& scheme,
752                            SchemeType* type) {
753   return DoIsStandard(spec, scheme, type);
754 }
755 
GetStandardSchemeType(const char16_t * spec,const Component & scheme,SchemeType * type)756 bool GetStandardSchemeType(const char16_t* spec,
757                            const Component& scheme,
758                            SchemeType* type) {
759   return DoIsStandard(spec, scheme, type);
760 }
761 
IsStandard(const char16_t * spec,const Component & scheme)762 bool IsStandard(const char16_t* spec, const Component& scheme) {
763   SchemeType unused_scheme_type;
764   return DoIsStandard(spec, scheme, &unused_scheme_type);
765 }
766 
IsReferrerScheme(const char * spec,const Component & scheme)767 bool IsReferrerScheme(const char* spec, const Component& scheme) {
768   SchemeType unused_scheme_type;
769   return DoIsInSchemes(spec, scheme, &unused_scheme_type,
770                        GetSchemeRegistry().referrer_schemes);
771 }
772 
FindAndCompareScheme(const char * str,int str_len,const char * compare,Component * found_scheme)773 bool FindAndCompareScheme(const char* str,
774                           int str_len,
775                           const char* compare,
776                           Component* found_scheme) {
777   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
778 }
779 
FindAndCompareScheme(const char16_t * str,int str_len,const char * compare,Component * found_scheme)780 bool FindAndCompareScheme(const char16_t* str,
781                           int str_len,
782                           const char* compare,
783                           Component* found_scheme) {
784   return DoFindAndCompareScheme(str, str_len, compare, found_scheme);
785 }
786 
// Returns true if |canonical_host| is |canonical_domain| itself or a
// subdomain of it. The comparison is an exact, case-sensitive suffix match.
//
//  * Returns false if either argument is empty.
//  * A single trailing dot on the host is ignored when the domain does not
//    end with a dot as well.
//  * Unless the domain itself begins with a dot, a host that is longer than
//    the domain must have a '.' immediately before the matched suffix. For
//    example, www.google.com has domain "google.com", but
//    www.iamnotgoogle.com does not.
bool DomainIs(std::string_view canonical_host,
              std::string_view canonical_domain) {
  if (canonical_host.empty() || canonical_domain.empty()) {
    return false;
  }

  // Work on a view of the host with the ignorable trailing dot removed.
  std::string_view host = canonical_host;
  if (host.back() == '.' && canonical_domain.back() != '.') {
    host.remove_suffix(1);
  }

  const size_t domain_length = canonical_domain.length();
  if (host.length() < domain_length) {
    return false;
  }

  // |suffix_start| is the offset in |host| where the compared part begins.
  const size_t suffix_start = host.length() - domain_length;
  if (host.substr(suffix_start) != canonical_domain) {
    return false;
  }

  // The suffix matched. When there are extra characters in front of it, they
  // must end at a label boundary, unless the domain already starts with a dot.
  const bool boundary_required =
      canonical_domain.front() != '.' && suffix_start > 0;
  return !boundary_required || host[suffix_start - 1] == '.';
}
822 
HostIsIPAddress(std::string_view host)823 bool HostIsIPAddress(std::string_view host) {
824   STACK_UNINITIALIZED url::RawCanonOutputT<char, 128> ignored_output;
825   url::CanonHostInfo host_info;
826   url::CanonicalizeIPAddress(host.data(), Component(0, host.length()),
827                              &ignored_output, &host_info);
828   return host_info.IsIPAddress();
829 }
830 
Canonicalize(const char * spec,int spec_len,bool trim_path_end,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)831 bool Canonicalize(const char* spec,
832                   int spec_len,
833                   bool trim_path_end,
834                   CharsetConverter* charset_converter,
835                   CanonOutput* output,
836                   Parsed* output_parsed) {
837   return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE,
838                         charset_converter, output, output_parsed);
839 }
840 
Canonicalize(const char16_t * spec,int spec_len,bool trim_path_end,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)841 bool Canonicalize(const char16_t* spec,
842                   int spec_len,
843                   bool trim_path_end,
844                   CharsetConverter* charset_converter,
845                   CanonOutput* output,
846                   Parsed* output_parsed) {
847   return DoCanonicalize(spec, spec_len, trim_path_end, REMOVE_WHITESPACE,
848                         charset_converter, output, output_parsed);
849 }
850 
ResolveRelative(const char * base_spec,int base_spec_len,const Parsed & base_parsed,const char * relative,int relative_length,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)851 bool ResolveRelative(const char* base_spec,
852                      int base_spec_len,
853                      const Parsed& base_parsed,
854                      const char* relative,
855                      int relative_length,
856                      CharsetConverter* charset_converter,
857                      CanonOutput* output,
858                      Parsed* output_parsed) {
859   return DoResolveRelative(base_spec, base_spec_len, base_parsed,
860                            relative, relative_length,
861                            charset_converter, output, output_parsed);
862 }
863 
ResolveRelative(const char * base_spec,int base_spec_len,const Parsed & base_parsed,const char16_t * relative,int relative_length,CharsetConverter * charset_converter,CanonOutput * output,Parsed * output_parsed)864 bool ResolveRelative(const char* base_spec,
865                      int base_spec_len,
866                      const Parsed& base_parsed,
867                      const char16_t* relative,
868                      int relative_length,
869                      CharsetConverter* charset_converter,
870                      CanonOutput* output,
871                      Parsed* output_parsed) {
872   return DoResolveRelative(base_spec, base_spec_len, base_parsed,
873                            relative, relative_length,
874                            charset_converter, output, output_parsed);
875 }
876 
ReplaceComponents(const char * spec,int spec_len,const Parsed & parsed,const Replacements<char> & replacements,CharsetConverter * charset_converter,CanonOutput * output,Parsed * out_parsed)877 bool ReplaceComponents(const char* spec,
878                        int spec_len,
879                        const Parsed& parsed,
880                        const Replacements<char>& replacements,
881                        CharsetConverter* charset_converter,
882                        CanonOutput* output,
883                        Parsed* out_parsed) {
884   return DoReplaceComponents(spec, spec_len, parsed, replacements,
885                              charset_converter, output, out_parsed);
886 }
887 
ReplaceComponents(const char * spec,int spec_len,const Parsed & parsed,const Replacements<char16_t> & replacements,CharsetConverter * charset_converter,CanonOutput * output,Parsed * out_parsed)888 bool ReplaceComponents(const char* spec,
889                        int spec_len,
890                        const Parsed& parsed,
891                        const Replacements<char16_t>& replacements,
892                        CharsetConverter* charset_converter,
893                        CanonOutput* output,
894                        Parsed* out_parsed) {
895   return DoReplaceComponents(spec, spec_len, parsed, replacements,
896                              charset_converter, output, out_parsed);
897 }
898 
DecodeURLEscapeSequences(std::string_view input,DecodeURLMode mode,CanonOutputW * output)899 void DecodeURLEscapeSequences(std::string_view input,
900                               DecodeURLMode mode,
901                               CanonOutputW* output) {
902   if (input.empty()) {
903     return;
904   }
905 
906   STACK_UNINITIALIZED RawCanonOutputT<char> unescaped_chars;
907   for (size_t i = 0; i < input.length(); i++) {
908     if (input[i] == '%') {
909       unsigned char ch;
910       if (DecodeEscaped(input.data(), &i, input.length(), &ch)) {
911         unescaped_chars.push_back(ch);
912       } else {
913         // Invalid escape sequence, copy the percent literal.
914         unescaped_chars.push_back('%');
915       }
916     } else {
917       // Regular non-escaped 8-bit character.
918       unescaped_chars.push_back(input[i]);
919     }
920   }
921 
922   int output_initial_length = output->length();
923   // Convert that 8-bit to UTF-16. It's not clear IE does this at all to
924   // JavaScript URLs, but Firefox and Safari do.
925   size_t unescaped_length = unescaped_chars.length();
926   for (size_t i = 0; i < unescaped_length; i++) {
927     unsigned char uch = static_cast<unsigned char>(unescaped_chars.at(i));
928     if (uch < 0x80) {
929       // Non-UTF-8, just append directly
930       output->push_back(uch);
931     } else {
932       // next_ch will point to the last character of the decoded
933       // character.
934       size_t next_character = i;
935       base_icu::UChar32 code_point;
936       if (ReadUTFCharLossy(unescaped_chars.data(), &next_character,
937                            unescaped_length, &code_point)) {
938         // Valid UTF-8 character, convert to UTF-16.
939         AppendUTF16Value(code_point, output);
940         i = next_character;
941       } else if (mode == DecodeURLMode::kUTF8) {
942         DCHECK_EQ(code_point, 0xFFFD);
943         AppendUTF16Value(code_point, output);
944         i = next_character;
945       } else {
946         // If there are any sequences that are not valid UTF-8, we
947         // revert |output| changes, and promote any bytes to UTF-16. We
948         // copy all characters from the beginning to the end of the
949         // identified sequence.
950         output->set_length(output_initial_length);
951         for (size_t j = 0; j < unescaped_chars.length(); ++j)
952           output->push_back(static_cast<unsigned char>(unescaped_chars.at(j)));
953         break;
954       }
955     }
956   }
957 }
958 
EncodeURIComponent(std::string_view input,CanonOutput * output)959 void EncodeURIComponent(std::string_view input, CanonOutput* output) {
960   for (unsigned char c : input) {
961     if (IsComponentChar(c)) {
962       output->push_back(c);
963     } else {
964       AppendEscapedChar(c, output);
965     }
966   }
967 }
968 
IsURIComponentChar(char c)969 bool IsURIComponentChar(char c) {
970   return IsComponentChar(c);
971 }
972 
CompareSchemeComponent(const char * spec,const Component & component,const char * compare_to)973 bool CompareSchemeComponent(const char* spec,
974                             const Component& component,
975                             const char* compare_to) {
976   return DoCompareSchemeComponent(spec, component, compare_to);
977 }
978 
CompareSchemeComponent(const char16_t * spec,const Component & component,const char * compare_to)979 bool CompareSchemeComponent(const char16_t* spec,
980                             const Component& component,
981                             const char* compare_to) {
982   return DoCompareSchemeComponent(spec, component, compare_to);
983 }
984 
HasInvalidURLEscapeSequences(std::string_view input)985 bool HasInvalidURLEscapeSequences(std::string_view input) {
986   for (size_t i = 0; i < input.size(); i++) {
987     if (input[i] == '%') {
988       unsigned char ch;
989       if (!DecodeEscaped(input.data(), &i, input.size(), &ch)) {
990         return true;
991       }
992     }
993   }
994   return false;
995 }
996 
IsAndroidWebViewHackEnabledScheme(std::string_view scheme)997 bool IsAndroidWebViewHackEnabledScheme(std::string_view scheme) {
998   return AllowNonStandardSchemesForAndroidWebView() &&
999          !IsStandardScheme(scheme);
1000 }
1001 
1002 }  // namespace url
1003