• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 // Functions to canonicalize "standard" URLs, which are ones that have an
11 // authority section including a host name.
12 
13 #include "url/url_canon.h"
14 #include "url/url_canon_internal.h"
15 #include "url/url_constants.h"
16 
17 namespace url {
18 
19 namespace {
20 
21 template <typename CHAR>
DoCanonicalizeStandardURL(const URLComponentSource<CHAR> & source,const Parsed & parsed,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)22 bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
23                                const Parsed& parsed,
24                                SchemeType scheme_type,
25                                CharsetConverter* query_converter,
26                                CanonOutput* output,
27                                Parsed* new_parsed) {
28   DCHECK(!parsed.has_opaque_path);
29 
30   // Scheme: this will append the colon.
31   bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
32                                     output, &new_parsed->scheme);
33 
34   bool scheme_supports_user_info =
35       (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION);
36   bool scheme_supports_ports =
37       (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
38        scheme_type == SCHEME_WITH_HOST_AND_PORT);
39 
40   // Authority (username, password, host, port)
41   bool have_authority;
42   if ((scheme_supports_user_info &&
43        (parsed.username.is_valid() || parsed.password.is_valid())) ||
44       parsed.host.is_nonempty() ||
45       (scheme_supports_ports && parsed.port.is_valid())) {
46     have_authority = true;
47 
48     // Only write the authority separators when we have a scheme.
49     if (parsed.scheme.is_valid()) {
50       output->push_back('/');
51       output->push_back('/');
52     }
53 
54     // User info: the canonicalizer will handle the : and @.
55     if (scheme_supports_user_info) {
56       success &= CanonicalizeUserInfo(
57           source.username, parsed.username, source.password, parsed.password,
58           output, &new_parsed->username, &new_parsed->password);
59     } else {
60       new_parsed->username.reset();
61       new_parsed->password.reset();
62     }
63 
64     success &= CanonicalizeHost(source.host, parsed.host,
65                                 output, &new_parsed->host);
66 
67     // Host must not be empty for standard URLs.
68     if (parsed.host.is_empty())
69       success = false;
70 
71     // Port: the port canonicalizer will handle the colon.
72     if (scheme_supports_ports) {
73       int default_port = DefaultPortForScheme(std::string_view(
74           &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len));
75       success &= CanonicalizePort(source.port, parsed.port, default_port,
76                                   output, &new_parsed->port);
77     } else {
78       new_parsed->port.reset();
79     }
80   } else {
81     // No authority, clear the components.
82     have_authority = false;
83     new_parsed->host.reset();
84     new_parsed->username.reset();
85     new_parsed->password.reset();
86     new_parsed->port.reset();
87     success = false;  // Standard URLs must have an authority.
88   }
89 
90   // Path
91   if (parsed.path.is_valid()) {
92     success &= CanonicalizePath(source.path, parsed.path,
93                                 output, &new_parsed->path);
94   } else if (have_authority ||
95              parsed.query.is_valid() || parsed.ref.is_valid()) {
96     // When we have an empty path, make up a path when we have an authority
97     // or something following the path. The only time we allow an empty
98     // output path is when there is nothing else.
99     new_parsed->path = Component(output->length(), 1);
100     output->push_back('/');
101   } else {
102     // No path at all
103     new_parsed->path.reset();
104   }
105 
106   // Query
107   CanonicalizeQuery(source.query, parsed.query, query_converter,
108                     output, &new_parsed->query);
109 
110   // Ref: ignore failure for this, since the page can probably still be loaded.
111   CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
112 
113   // Carry over the flag for potentially dangling markup:
114   if (parsed.potentially_dangling_markup)
115     new_parsed->potentially_dangling_markup = true;
116 
117   return success;
118 }
119 
120 }  // namespace
121 
122 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
123 // if the scheme is unknown.
124 //
125 // Please keep blink::DefaultPortForProtocol and url::DefaultPortForProtocol in
126 // sync.
DefaultPortForScheme(std::string_view scheme)127 int DefaultPortForScheme(std::string_view scheme) {
128   switch (scheme.length()) {
129     case 4:
130       if (scheme == kHttpScheme) {
131         return 80;
132       }
133       break;
134     case 5:
135       if (scheme == kHttpsScheme) {
136         return 443;
137       }
138       break;
139     case 3:
140       if (scheme == kFtpScheme) {
141         return 21;
142       } else if (scheme == kWssScheme) {
143         return 443;
144       }
145       break;
146     case 2:
147       if (scheme == kWsScheme) {
148         return 80;
149       }
150       break;
151   }
152   return PORT_UNSPECIFIED;
153 }
154 
CanonicalizeStandardURL(const char * spec,const Parsed & parsed,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)155 bool CanonicalizeStandardURL(const char* spec,
156                              const Parsed& parsed,
157                              SchemeType scheme_type,
158                              CharsetConverter* query_converter,
159                              CanonOutput* output,
160                              Parsed* new_parsed) {
161   return DoCanonicalizeStandardURL(URLComponentSource(spec), parsed,
162                                    scheme_type, query_converter, output,
163                                    new_parsed);
164 }
165 
CanonicalizeStandardURL(const char16_t * spec,const Parsed & parsed,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)166 bool CanonicalizeStandardURL(const char16_t* spec,
167                              const Parsed& parsed,
168                              SchemeType scheme_type,
169                              CharsetConverter* query_converter,
170                              CanonOutput* output,
171                              Parsed* new_parsed) {
172   return DoCanonicalizeStandardURL(URLComponentSource(spec), parsed,
173                                    scheme_type, query_converter, output,
174                                    new_parsed);
175 }
176 
177 // It might be nice in the future to optimize this so unchanged components don't
178 // need to be recanonicalized. This is especially true since the common case for
179 // ReplaceComponents is removing things we don't want, like reference fragments
180 // and usernames. These cases can become more efficient if we can assume the
181 // rest of the URL is OK with these removed (or only the modified parts
182 // recanonicalized). This would be much more complex to implement, however.
183 //
184 // You would also need to update DoReplaceComponents in url_util.cc which
185 // relies on this re-checking everything (see the comment there for why).
ReplaceStandardURL(const char * base,const Parsed & base_parsed,const Replacements<char> & replacements,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)186 bool ReplaceStandardURL(const char* base,
187                         const Parsed& base_parsed,
188                         const Replacements<char>& replacements,
189                         SchemeType scheme_type,
190                         CharsetConverter* query_converter,
191                         CanonOutput* output,
192                         Parsed* new_parsed) {
193   URLComponentSource<char> source(base);
194   Parsed parsed(base_parsed);
195   SetupOverrideComponents(base, replacements, &source, &parsed);
196   return DoCanonicalizeStandardURL(source, parsed, scheme_type, query_converter,
197                                    output, new_parsed);
198 }
199 
200 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
201 // regular code path can be used.
ReplaceStandardURL(const char * base,const Parsed & base_parsed,const Replacements<char16_t> & replacements,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)202 bool ReplaceStandardURL(const char* base,
203                         const Parsed& base_parsed,
204                         const Replacements<char16_t>& replacements,
205                         SchemeType scheme_type,
206                         CharsetConverter* query_converter,
207                         CanonOutput* output,
208                         Parsed* new_parsed) {
209   RawCanonOutput<1024> utf8;
210   URLComponentSource<char> source(base);
211   Parsed parsed(base_parsed);
212   SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
213   return DoCanonicalizeStandardURL(source, parsed, scheme_type, query_converter,
214                                    output, new_parsed);
215 }
216 
217 }  // namespace url
218