1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 // Canonicalizers for random bits that aren't big enough for their own files.
11
12 #include <string.h>
13
14 #include "url/url_canon.h"
15 #include "url/url_canon_internal.h"
16
17 namespace url {
18
19 namespace {
20
// Reports whether `ch` is one of the three characters (carriage return, line
// feed, horizontal tab) that are stripped from anywhere inside a URL.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\r':
    case '\n':
    case '\t':
      return true;
    default:
      return false;
  }
}
26
27 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
28 // It sucks that we have to do this, since this takes about 13% of the total URL
29 // canonicalization time.
30 template <typename CHAR>
DoRemoveURLWhitespace(const CHAR * input,int input_len,CanonOutputT<CHAR> * buffer,int * output_len,bool * potentially_dangling_markup)31 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
32 int input_len,
33 CanonOutputT<CHAR>* buffer,
34 int* output_len,
35 bool* potentially_dangling_markup) {
36 // Fast verification that there's nothing that needs removal. This is the 99%
37 // case, so we want it to be fast and don't care about impacting the speed
38 // when we do find whitespace.
39 bool found_whitespace = false;
40 if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
41 // For large strings, memchr is much faster than any scalar code we can
42 // write, even if we need to run it three times. (If this turns out to still
43 // be a bottleneck, we could write our own vector code, but given that
44 // memchr is so fast, it's unlikely to be relevant.)
45 found_whitespace = memchr(input, '\n', input_len) != nullptr ||
46 memchr(input, '\r', input_len) != nullptr ||
47 memchr(input, '\t', input_len) != nullptr;
48 } else {
49 for (int i = 0; i < input_len; i++) {
50 if (!IsRemovableURLWhitespace(input[i]))
51 continue;
52 found_whitespace = true;
53 break;
54 }
55 }
56
57 if (!found_whitespace) {
58 // Didn't find any whitespace, we don't need to do anything. We can just
59 // return the input as the output.
60 *output_len = input_len;
61 return input;
62 }
63
64 // Skip whitespace removal for `data:` URLs.
65 //
66 // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
67 // that turns out to be difficult to do correctly given this function's
68 // character type templating.
69 if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
70 input[3] == 'a' && input[4] == ':') {
71 *output_len = input_len;
72 return input;
73 }
74
75 // Remove the whitespace into the new buffer and return it.
76 for (int i = 0; i < input_len; i++) {
77 if (!IsRemovableURLWhitespace(input[i])) {
78 if (potentially_dangling_markup && input[i] == 0x3C)
79 *potentially_dangling_markup = true;
80 buffer->push_back(input[i]);
81 }
82 }
83 *output_len = buffer->length();
84 return buffer->data();
85 }
86
// Lookup table, indexed by 7-bit ASCII code, giving the character to emit when
// canonicalizing a scheme. Letters map to their lower-case form; digits, '+',
// '-' and '.' map to themselves. An entry of 0 means the character is not
// allowed in a scheme.
// clang-format off
const char kSchemeCanonical[0x80] = {
    // 0x00-0x0f: control characters, never valid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x10-0x1f: control characters, never valid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    // 0x20-0x2f: only '+' (0x2b), '-' (0x2d) and '.' (0x2e) are valid.
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '+', 0, '-', '.', 0,
    // 0x30-0x3f: digits are valid; ':' through '?' are not.
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 0, 0, 0, 0, 0, 0,
    // 0x40-0x4f: '@' is invalid; 'A'-'O' are lower-cased.
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x5f: 'P'-'Z' are lower-cased; '[' through '_' are invalid.
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
    // 0x60-0x6f: backtick is invalid; 'a'-'o' map to themselves.
    0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x7f: 'p'-'z' map to themselves; '{' through DEL are invalid.
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 0, 0, 0, 0, 0,
};
// clang-format on
108
// Reports whether `c` is an ASCII letter, the check applied to the first
// character of a scheme. This could be folded into the lookup table by setting
// the high bit on each valid entry, but it only runs once per URL and keeping
// it separate leaves the table easier to read.
inline bool IsSchemeFirstChar(unsigned char c) {
  const bool is_lower = 'a' <= c && c <= 'z';
  const bool is_upper = 'A' <= c && c <= 'Z';
  return is_lower || is_upper;
}
115
116 template <typename CHAR, typename UCHAR>
DoScheme(const CHAR * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)117 bool DoScheme(const CHAR* spec,
118 const Component& scheme,
119 CanonOutput* output,
120 Component* out_scheme) {
121 if (scheme.is_empty()) {
122 // Scheme is unspecified or empty, convert to empty by appending a colon.
123 *out_scheme = Component(output->length(), 0);
124 output->push_back(':');
125 return false;
126 }
127
128 // The output scheme starts from the current position.
129 out_scheme->begin = output->length();
130
131 // Danger: it's important that this code does not strip any characters;
132 // it only emits the canonical version (be it valid or escaped) for each
133 // of the input characters. Stripping would put it out of sync with
134 // FindAndCompareScheme, which could cause some security checks on
135 // schemes to be incorrect.
136 bool success = true;
137 size_t begin = static_cast<size_t>(scheme.begin);
138 size_t end = static_cast<size_t>(scheme.end());
139 for (size_t i = begin; i < end; i++) {
140 UCHAR ch = static_cast<UCHAR>(spec[i]);
141 char replacement = 0;
142 if (ch < 0x80) {
143 if (i == begin) {
144 // Need to do a special check for the first letter of the scheme.
145 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
146 replacement = kSchemeCanonical[ch];
147 } else {
148 replacement = kSchemeCanonical[ch];
149 }
150 }
151
152 if (replacement) {
153 output->push_back(replacement);
154 } else if (ch == '%') {
155 // Canonicalizing the scheme multiple times should lead to the same
156 // result. Since invalid characters will be escaped, we need to preserve
157 // the percent to avoid multiple escaping. The scheme will be invalid.
158 success = false;
159 output->push_back('%');
160 } else {
161 // Invalid character, store it but mark this scheme as invalid.
162 success = false;
163
164 // This will escape the output and also handle encoding issues.
165 // Ignore the return value since we already failed.
166 AppendUTF8EscapedChar(spec, &i, end, output);
167 }
168 }
169
170 // The output scheme ends with the the current position, before appending
171 // the colon.
172 out_scheme->len = output->length() - out_scheme->begin;
173 output->push_back(':');
174 return success;
175 }
176
177 // The username and password components reference ranges in the corresponding
178 // *_spec strings. Typically, these specs will be the same (we're
179 // canonicalizing a single source string), but may be different when
180 // replacing components.
181 template <typename CHAR, typename UCHAR>
DoUserInfo(const CHAR * username_spec,const Component & username,const CHAR * password_spec,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)182 bool DoUserInfo(const CHAR* username_spec,
183 const Component& username,
184 const CHAR* password_spec,
185 const Component& password,
186 CanonOutput* output,
187 Component* out_username,
188 Component* out_password) {
189 if (username.is_empty() && password.is_empty()) {
190 // Common case: no user info. We strip empty username/passwords.
191 *out_username = Component();
192 *out_password = Component();
193 return true;
194 }
195
196 // Write the username.
197 out_username->begin = output->length();
198 if (username.is_nonempty()) {
199 // This will escape characters not valid for the username.
200 AppendStringOfType(&username_spec[username.begin],
201 static_cast<size_t>(username.len), CHAR_USERINFO,
202 output);
203 }
204 out_username->len = output->length() - out_username->begin;
205
206 // When there is a password, we need the separator. Note that we strip
207 // empty but specified passwords.
208 if (password.is_nonempty()) {
209 output->push_back(':');
210 out_password->begin = output->length();
211 AppendStringOfType(&password_spec[password.begin],
212 static_cast<size_t>(password.len), CHAR_USERINFO,
213 output);
214 out_password->len = output->length() - out_password->begin;
215 } else {
216 *out_password = Component();
217 }
218
219 output->push_back('@');
220 return true;
221 }
222
223 // Helper functions for converting port integers to strings.
WritePortInt(char * output,int output_len,int port)224 inline void WritePortInt(char* output, int output_len, int port) {
225 _itoa_s(port, output, output_len, 10);
226 }
227
228 // This function will prepend the colon if there will be a port.
229 template <typename CHAR, typename UCHAR>
DoPort(const CHAR * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)230 bool DoPort(const CHAR* spec,
231 const Component& port,
232 int default_port_for_scheme,
233 CanonOutput* output,
234 Component* out_port) {
235 int port_num = ParsePort(spec, port);
236 if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
237 *out_port = Component();
238 return true; // Leave port empty.
239 }
240
241 if (port_num == PORT_INVALID) {
242 // Invalid port: We'll copy the text from the input so the user can see
243 // what the error was, and mark the URL as invalid by returning false.
244 output->push_back(':');
245 out_port->begin = output->length();
246 AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
247 static_cast<size_t>(port.end()), output);
248 out_port->len = output->length() - out_port->begin;
249 return false;
250 }
251
252 // Convert port number back to an integer. Max port value is 5 digits, and
253 // the Parsed::ExtractPort will have made sure the integer is in range.
254 const int buf_size = 6;
255 char buf[buf_size];
256 WritePortInt(buf, buf_size, port_num);
257
258 // Append the port number to the output, preceded by a colon.
259 output->push_back(':');
260 out_port->begin = output->length();
261 for (int i = 0; i < buf_size && buf[i]; i++)
262 output->push_back(buf[i]);
263
264 out_port->len = output->length() - out_port->begin;
265 return true;
266 }
267
// clang-format off
// Indexed by 7-bit ASCII code: true when the character must be percent-escaped
// inside a fragment, i.e. when it belongs to the fragment percent-encode set.
// https://url.spec.whatwg.org/#fragment-percent-encode-set
const bool kShouldEscapeCharInFragment[0x80] = {
    // 0x00-0x1f: control characters, all escaped.
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    // 0x20-0x27: space and '"' are escaped.
    true,  false, true,  false, false, false, false, false,
    // 0x28-0x2f
    false, false, false, false, false, false, false, false,
    // 0x30-0x37
    false, false, false, false, false, false, false, false,
    // 0x38-0x3f: '<' and '>' are escaped.
    false, false, false, false, true,  false, true,  false,
    // 0x40-0x47
    false, false, false, false, false, false, false, false,
    // 0x48-0x4f
    false, false, false, false, false, false, false, false,
    // 0x50-0x57
    false, false, false, false, false, false, false, false,
    // 0x58-0x5f
    false, false, false, false, false, false, false, false,
    // 0x60-0x67: backtick is escaped.
    true,  false, false, false, false, false, false, false,
    // 0x68-0x6f
    false, false, false, false, false, false, false, false,
    // 0x70-0x77
    false, false, false, false, false, false, false, false,
    // 0x78-0x7f: DELETE (0x7f) is escaped.
    false, false, false, false, false, false, false, true,
};
// clang-format on
303
304 template <typename CHAR, typename UCHAR>
DoCanonicalizeRef(const CHAR * spec,const Component & ref,CanonOutput * output,Component * out_ref)305 void DoCanonicalizeRef(const CHAR* spec,
306 const Component& ref,
307 CanonOutput* output,
308 Component* out_ref) {
309 if (!ref.is_valid()) {
310 // Common case of no ref.
311 *out_ref = Component();
312 return;
313 }
314
315 // Append the ref separator. Note that we need to do this even when the ref
316 // is empty but present.
317 output->push_back('#');
318 out_ref->begin = output->length();
319
320 // Now iterate through all the characters, converting to UTF-8 and validating.
321 size_t end = static_cast<size_t>(ref.end());
322 for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
323 UCHAR current_char = static_cast<UCHAR>(spec[i]);
324 if (current_char < 0x80) {
325 if (kShouldEscapeCharInFragment[current_char])
326 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
327 else
328 output->push_back(static_cast<char>(spec[i]));
329 } else {
330 AppendUTF8EscapedChar(spec, &i, end, output);
331 }
332 }
333
334 out_ref->len = output->length() - out_ref->begin;
335 }
336
337 } // namespace
338
RemoveURLWhitespace(const char * input,int input_len,CanonOutputT<char> * buffer,int * output_len,bool * potentially_dangling_markup)339 const char* RemoveURLWhitespace(const char* input,
340 int input_len,
341 CanonOutputT<char>* buffer,
342 int* output_len,
343 bool* potentially_dangling_markup) {
344 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
345 potentially_dangling_markup);
346 }
347
RemoveURLWhitespace(const char16_t * input,int input_len,CanonOutputT<char16_t> * buffer,int * output_len,bool * potentially_dangling_markup)348 const char16_t* RemoveURLWhitespace(const char16_t* input,
349 int input_len,
350 CanonOutputT<char16_t>* buffer,
351 int* output_len,
352 bool* potentially_dangling_markup) {
353 return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
354 potentially_dangling_markup);
355 }
356
CanonicalSchemeChar(char16_t ch)357 char CanonicalSchemeChar(char16_t ch) {
358 if (ch >= 0x80)
359 return 0; // Non-ASCII is not supported by schemes.
360 return kSchemeCanonical[ch];
361 }
362
CanonicalizeScheme(const char * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)363 bool CanonicalizeScheme(const char* spec,
364 const Component& scheme,
365 CanonOutput* output,
366 Component* out_scheme) {
367 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
368 }
369
CanonicalizeScheme(const char16_t * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)370 bool CanonicalizeScheme(const char16_t* spec,
371 const Component& scheme,
372 CanonOutput* output,
373 Component* out_scheme) {
374 return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
375 }
376
CanonicalizeUserInfo(const char * username_source,const Component & username,const char * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)377 bool CanonicalizeUserInfo(const char* username_source,
378 const Component& username,
379 const char* password_source,
380 const Component& password,
381 CanonOutput* output,
382 Component* out_username,
383 Component* out_password) {
384 return DoUserInfo<char, unsigned char>(username_source, username,
385 password_source, password, output,
386 out_username, out_password);
387 }
388
CanonicalizeUserInfo(const char16_t * username_source,const Component & username,const char16_t * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)389 bool CanonicalizeUserInfo(const char16_t* username_source,
390 const Component& username,
391 const char16_t* password_source,
392 const Component& password,
393 CanonOutput* output,
394 Component* out_username,
395 Component* out_password) {
396 return DoUserInfo<char16_t, char16_t>(username_source, username,
397 password_source, password, output,
398 out_username, out_password);
399 }
400
CanonicalizePort(const char * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)401 bool CanonicalizePort(const char* spec,
402 const Component& port,
403 int default_port_for_scheme,
404 CanonOutput* output,
405 Component* out_port) {
406 return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
407 output, out_port);
408 }
409
CanonicalizePort(const char16_t * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)410 bool CanonicalizePort(const char16_t* spec,
411 const Component& port,
412 int default_port_for_scheme,
413 CanonOutput* output,
414 Component* out_port) {
415 return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
416 out_port);
417 }
418
CanonicalizeRef(const char * spec,const Component & ref,CanonOutput * output,Component * out_ref)419 void CanonicalizeRef(const char* spec,
420 const Component& ref,
421 CanonOutput* output,
422 Component* out_ref) {
423 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
424 }
425
CanonicalizeRef(const char16_t * spec,const Component & ref,CanonOutput * output,Component * out_ref)426 void CanonicalizeRef(const char16_t* spec,
427 const Component& ref,
428 CanonOutput* output,
429 Component* out_ref) {
430 DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);
431 }
432
433 } // namespace url
434