1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 // Functions to canonicalize "standard" URLs, which are ones that have an
11 // authority section including a host name.
12
13 #include "url/url_canon.h"
14 #include "url/url_canon_internal.h"
15 #include "url/url_constants.h"
16
17 namespace url {
18
19 namespace {
20
21 template <typename CHAR>
DoCanonicalizeStandardURL(const URLComponentSource<CHAR> & source,const Parsed & parsed,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)22 bool DoCanonicalizeStandardURL(const URLComponentSource<CHAR>& source,
23 const Parsed& parsed,
24 SchemeType scheme_type,
25 CharsetConverter* query_converter,
26 CanonOutput* output,
27 Parsed* new_parsed) {
28 DCHECK(!parsed.has_opaque_path);
29
30 // Scheme: this will append the colon.
31 bool success = CanonicalizeScheme(source.scheme, parsed.scheme,
32 output, &new_parsed->scheme);
33
34 bool scheme_supports_user_info =
35 (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION);
36 bool scheme_supports_ports =
37 (scheme_type == SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION ||
38 scheme_type == SCHEME_WITH_HOST_AND_PORT);
39
40 // Authority (username, password, host, port)
41 bool have_authority;
42 if ((scheme_supports_user_info &&
43 (parsed.username.is_valid() || parsed.password.is_valid())) ||
44 parsed.host.is_nonempty() ||
45 (scheme_supports_ports && parsed.port.is_valid())) {
46 have_authority = true;
47
48 // Only write the authority separators when we have a scheme.
49 if (parsed.scheme.is_valid()) {
50 output->push_back('/');
51 output->push_back('/');
52 }
53
54 // User info: the canonicalizer will handle the : and @.
55 if (scheme_supports_user_info) {
56 success &= CanonicalizeUserInfo(
57 source.username, parsed.username, source.password, parsed.password,
58 output, &new_parsed->username, &new_parsed->password);
59 } else {
60 new_parsed->username.reset();
61 new_parsed->password.reset();
62 }
63
64 success &= CanonicalizeHost(source.host, parsed.host,
65 output, &new_parsed->host);
66
67 // Host must not be empty for standard URLs.
68 if (parsed.host.is_empty())
69 success = false;
70
71 // Port: the port canonicalizer will handle the colon.
72 if (scheme_supports_ports) {
73 int default_port = DefaultPortForScheme(std::string_view(
74 &output->data()[new_parsed->scheme.begin], new_parsed->scheme.len));
75 success &= CanonicalizePort(source.port, parsed.port, default_port,
76 output, &new_parsed->port);
77 } else {
78 new_parsed->port.reset();
79 }
80 } else {
81 // No authority, clear the components.
82 have_authority = false;
83 new_parsed->host.reset();
84 new_parsed->username.reset();
85 new_parsed->password.reset();
86 new_parsed->port.reset();
87 success = false; // Standard URLs must have an authority.
88 }
89
90 // Path
91 if (parsed.path.is_valid()) {
92 success &= CanonicalizePath(source.path, parsed.path,
93 output, &new_parsed->path);
94 } else if (have_authority ||
95 parsed.query.is_valid() || parsed.ref.is_valid()) {
96 // When we have an empty path, make up a path when we have an authority
97 // or something following the path. The only time we allow an empty
98 // output path is when there is nothing else.
99 new_parsed->path = Component(output->length(), 1);
100 output->push_back('/');
101 } else {
102 // No path at all
103 new_parsed->path.reset();
104 }
105
106 // Query
107 CanonicalizeQuery(source.query, parsed.query, query_converter,
108 output, &new_parsed->query);
109
110 // Ref: ignore failure for this, since the page can probably still be loaded.
111 CanonicalizeRef(source.ref, parsed.ref, output, &new_parsed->ref);
112
113 // Carry over the flag for potentially dangling markup:
114 if (parsed.potentially_dangling_markup)
115 new_parsed->potentially_dangling_markup = true;
116
117 return success;
118 }
119
120 } // namespace
121
122 // Returns the default port for the given canonical scheme, or PORT_UNSPECIFIED
123 // if the scheme is unknown.
124 //
125 // Please keep blink::DefaultPortForProtocol and url::DefaultPortForProtocol in
126 // sync.
DefaultPortForScheme(std::string_view scheme)127 int DefaultPortForScheme(std::string_view scheme) {
128 switch (scheme.length()) {
129 case 4:
130 if (scheme == kHttpScheme) {
131 return 80;
132 }
133 break;
134 case 5:
135 if (scheme == kHttpsScheme) {
136 return 443;
137 }
138 break;
139 case 3:
140 if (scheme == kFtpScheme) {
141 return 21;
142 } else if (scheme == kWssScheme) {
143 return 443;
144 }
145 break;
146 case 2:
147 if (scheme == kWsScheme) {
148 return 80;
149 }
150 break;
151 }
152 return PORT_UNSPECIFIED;
153 }
154
CanonicalizeStandardURL(const char * spec,const Parsed & parsed,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)155 bool CanonicalizeStandardURL(const char* spec,
156 const Parsed& parsed,
157 SchemeType scheme_type,
158 CharsetConverter* query_converter,
159 CanonOutput* output,
160 Parsed* new_parsed) {
161 return DoCanonicalizeStandardURL(URLComponentSource(spec), parsed,
162 scheme_type, query_converter, output,
163 new_parsed);
164 }
165
CanonicalizeStandardURL(const char16_t * spec,const Parsed & parsed,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)166 bool CanonicalizeStandardURL(const char16_t* spec,
167 const Parsed& parsed,
168 SchemeType scheme_type,
169 CharsetConverter* query_converter,
170 CanonOutput* output,
171 Parsed* new_parsed) {
172 return DoCanonicalizeStandardURL(URLComponentSource(spec), parsed,
173 scheme_type, query_converter, output,
174 new_parsed);
175 }
176
177 // It might be nice in the future to optimize this so unchanged components don't
178 // need to be recanonicalized. This is especially true since the common case for
179 // ReplaceComponents is removing things we don't want, like reference fragments
180 // and usernames. These cases can become more efficient if we can assume the
181 // rest of the URL is OK with these removed (or only the modified parts
182 // recanonicalized). This would be much more complex to implement, however.
183 //
184 // You would also need to update DoReplaceComponents in url_util.cc which
185 // relies on this re-checking everything (see the comment there for why).
ReplaceStandardURL(const char * base,const Parsed & base_parsed,const Replacements<char> & replacements,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)186 bool ReplaceStandardURL(const char* base,
187 const Parsed& base_parsed,
188 const Replacements<char>& replacements,
189 SchemeType scheme_type,
190 CharsetConverter* query_converter,
191 CanonOutput* output,
192 Parsed* new_parsed) {
193 URLComponentSource<char> source(base);
194 Parsed parsed(base_parsed);
195 SetupOverrideComponents(base, replacements, &source, &parsed);
196 return DoCanonicalizeStandardURL(source, parsed, scheme_type, query_converter,
197 output, new_parsed);
198 }
199
200 // For 16-bit replacements, we turn all the replacements into UTF-8 so the
201 // regular code path can be used.
ReplaceStandardURL(const char * base,const Parsed & base_parsed,const Replacements<char16_t> & replacements,SchemeType scheme_type,CharsetConverter * query_converter,CanonOutput * output,Parsed * new_parsed)202 bool ReplaceStandardURL(const char* base,
203 const Parsed& base_parsed,
204 const Replacements<char16_t>& replacements,
205 SchemeType scheme_type,
206 CharsetConverter* query_converter,
207 CanonOutput* output,
208 Parsed* new_parsed) {
209 RawCanonOutput<1024> utf8;
210 URLComponentSource<char> source(base);
211 Parsed parsed(base_parsed);
212 SetupUTF16OverrideComponents(base, replacements, &utf8, &source, &parsed);
213 return DoCanonicalizeStandardURL(source, parsed, scheme_type, query_converter,
214 output, new_parsed);
215 }
216
217 } // namespace url
218