// Copyright 2013 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 // Canonicalizers for random bits that aren't big enough for their own files.
11 
12 #include <string.h>
13 
14 #include "url/url_canon.h"
15 #include "url/url_canon_internal.h"
16 
17 namespace url {
18 
19 namespace {
20 
// Reports whether `ch` is a carriage return, line feed, or tab -- the
// characters that are stripped from a URL wherever they occur in it.
inline bool IsRemovableURLWhitespace(int ch) {
  switch (ch) {
    case '\t':
    case '\n':
    case '\r':
      return true;
    default:
      return false;
  }
}
26 
27 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
28 // It sucks that we have to do this, since this takes about 13% of the total URL
29 // canonicalization time.
30 template <typename CHAR>
DoRemoveURLWhitespace(const CHAR * input,int input_len,CanonOutputT<CHAR> * buffer,int * output_len,bool * potentially_dangling_markup)31 const CHAR* DoRemoveURLWhitespace(const CHAR* input,
32                                   int input_len,
33                                   CanonOutputT<CHAR>* buffer,
34                                   int* output_len,
35                                   bool* potentially_dangling_markup) {
36   // Fast verification that there's nothing that needs removal. This is the 99%
37   // case, so we want it to be fast and don't care about impacting the speed
38   // when we do find whitespace.
39   bool found_whitespace = false;
40   if (sizeof(*input) == 1 && input_len >= kMinimumLengthForSIMD) {
41     // For large strings, memchr is much faster than any scalar code we can
42     // write, even if we need to run it three times. (If this turns out to still
43     // be a bottleneck, we could write our own vector code, but given that
44     // memchr is so fast, it's unlikely to be relevant.)
45     found_whitespace = memchr(input, '\n', input_len) != nullptr ||
46                        memchr(input, '\r', input_len) != nullptr ||
47                        memchr(input, '\t', input_len) != nullptr;
48   } else {
49     for (int i = 0; i < input_len; i++) {
50       if (!IsRemovableURLWhitespace(input[i]))
51         continue;
52       found_whitespace = true;
53       break;
54     }
55   }
56 
57   if (!found_whitespace) {
58     // Didn't find any whitespace, we don't need to do anything. We can just
59     // return the input as the output.
60     *output_len = input_len;
61     return input;
62   }
63 
64   // Skip whitespace removal for `data:` URLs.
65   //
66   // TODO(mkwst): Ideally, this would use something like `base::StartsWith`, but
67   // that turns out to be difficult to do correctly given this function's
68   // character type templating.
69   if (input_len > 5 && input[0] == 'd' && input[1] == 'a' && input[2] == 't' &&
70       input[3] == 'a' && input[4] == ':') {
71     *output_len = input_len;
72     return input;
73   }
74 
75   // Remove the whitespace into the new buffer and return it.
76   for (int i = 0; i < input_len; i++) {
77     if (!IsRemovableURLWhitespace(input[i])) {
78       if (potentially_dangling_markup && input[i] == 0x3C)
79         *potentially_dangling_markup = true;
80       buffer->push_back(input[i]);
81     }
82   }
83   *output_len = buffer->length();
84   return buffer->data();
85 }
86 
// Lookup table, indexed by 7-bit ASCII code, giving the character to emit for
// that code inside a canonical scheme. Letters map to their lower-case form;
// digits, '+', '-' and '.' map to themselves; every other code maps to 0,
// meaning the character is not allowed in a scheme.
// clang-format off
const char kSchemeCanonical[0x80] = {
    // 0x00-0x1F: control characters, none allowed.
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0,
    0,   0,   0,   0,   0,   0,   0,   0,
    // 0x20-0x27: ' '  !  "  #  $  %  &  '
    0,   0,   0,   0,   0,   0,   0,   0,
    // 0x28-0x2F: (  )  *  +  ,  -  .  /
    0,   0,   0,   '+', 0,   '-', '.', 0,
    // 0x30-0x37: 0  1  2  3  4  5  6  7
    '0', '1', '2', '3', '4', '5', '6', '7',
    // 0x38-0x3F: 8  9  :  ;  <  =  >  ?
    '8', '9', 0,   0,   0,   0,   0,   0,
    // 0x40-0x47: @  A  B  C  D  E  F  G
    0,   'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x48-0x4F: H  I  J  K  L  M  N  O
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x50-0x57: P  Q  R  S  T  U  V  W
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x58-0x5F: X  Y  Z  [  \  ]  ^  _
    'x', 'y', 'z', 0,   0,   0,   0,   0,
    // 0x60-0x67: `  a  b  c  d  e  f  g
    0,   'a', 'b', 'c', 'd', 'e', 'f', 'g',
    // 0x68-0x6F: h  i  j  k  l  m  n  o
    'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
    // 0x70-0x77: p  q  r  s  t  u  v  w
    'p', 'q', 'r', 's', 't', 'u', 'v', 'w',
    // 0x78-0x7F: x  y  z  {  |  }  ~  DEL
    'x', 'y', 'z', 0,   0,   0,   0,   0,
};
// clang-format on
108 
// Reports whether `c` is an ASCII letter, the only kind of character accepted
// in the first position of a scheme. This is kept as a plain comparison
// instead of being folded into the lookup table: it runs once per URL, and
// the table stays easier to read without extra flag bits.
inline bool IsSchemeFirstChar(unsigned char c) {
  // Setting bit 0x20 maps 'A'-'Z' onto 'a'-'z' and leaves 'a'-'z' unchanged;
  // no other byte value lands in the lower-case range.
  const unsigned char folded = static_cast<unsigned char>(c | 0x20);
  return folded >= 'a' && folded <= 'z';
}
115 
116 template <typename CHAR, typename UCHAR>
DoScheme(const CHAR * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)117 bool DoScheme(const CHAR* spec,
118               const Component& scheme,
119               CanonOutput* output,
120               Component* out_scheme) {
121   if (scheme.is_empty()) {
122     // Scheme is unspecified or empty, convert to empty by appending a colon.
123     *out_scheme = Component(output->length(), 0);
124     output->push_back(':');
125     return false;
126   }
127 
128   // The output scheme starts from the current position.
129   out_scheme->begin = output->length();
130 
131   // Danger: it's important that this code does not strip any characters;
132   // it only emits the canonical version (be it valid or escaped) for each
133   // of the input characters. Stripping would put it out of sync with
134   // FindAndCompareScheme, which could cause some security checks on
135   // schemes to be incorrect.
136   bool success = true;
137   size_t begin = static_cast<size_t>(scheme.begin);
138   size_t end = static_cast<size_t>(scheme.end());
139   for (size_t i = begin; i < end; i++) {
140     UCHAR ch = static_cast<UCHAR>(spec[i]);
141     char replacement = 0;
142     if (ch < 0x80) {
143       if (i == begin) {
144         // Need to do a special check for the first letter of the scheme.
145         if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
146           replacement = kSchemeCanonical[ch];
147       } else {
148         replacement = kSchemeCanonical[ch];
149       }
150     }
151 
152     if (replacement) {
153       output->push_back(replacement);
154     } else if (ch == '%') {
155       // Canonicalizing the scheme multiple times should lead to the same
156       // result. Since invalid characters will be escaped, we need to preserve
157       // the percent to avoid multiple escaping. The scheme will be invalid.
158       success = false;
159       output->push_back('%');
160     } else {
161       // Invalid character, store it but mark this scheme as invalid.
162       success = false;
163 
164       // This will escape the output and also handle encoding issues.
165       // Ignore the return value since we already failed.
166       AppendUTF8EscapedChar(spec, &i, end, output);
167     }
168   }
169 
170   // The output scheme ends with the the current position, before appending
171   // the colon.
172   out_scheme->len = output->length() - out_scheme->begin;
173   output->push_back(':');
174   return success;
175 }
176 
177 // The username and password components reference ranges in the corresponding
178 // *_spec strings. Typically, these specs will be the same (we're
179 // canonicalizing a single source string), but may be different when
180 // replacing components.
181 template <typename CHAR, typename UCHAR>
DoUserInfo(const CHAR * username_spec,const Component & username,const CHAR * password_spec,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)182 bool DoUserInfo(const CHAR* username_spec,
183                 const Component& username,
184                 const CHAR* password_spec,
185                 const Component& password,
186                 CanonOutput* output,
187                 Component* out_username,
188                 Component* out_password) {
189   if (username.is_empty() && password.is_empty()) {
190     // Common case: no user info. We strip empty username/passwords.
191     *out_username = Component();
192     *out_password = Component();
193     return true;
194   }
195 
196   // Write the username.
197   out_username->begin = output->length();
198   if (username.is_nonempty()) {
199     // This will escape characters not valid for the username.
200     AppendStringOfType(&username_spec[username.begin],
201                        static_cast<size_t>(username.len), CHAR_USERINFO,
202                        output);
203   }
204   out_username->len = output->length() - out_username->begin;
205 
206   // When there is a password, we need the separator. Note that we strip
207   // empty but specified passwords.
208   if (password.is_nonempty()) {
209     output->push_back(':');
210     out_password->begin = output->length();
211     AppendStringOfType(&password_spec[password.begin],
212                        static_cast<size_t>(password.len), CHAR_USERINFO,
213                        output);
214     out_password->len = output->length() - out_password->begin;
215   } else {
216     *out_password = Component();
217   }
218 
219   output->push_back('@');
220   return true;
221 }
222 
223 // Helper functions for converting port integers to strings.
WritePortInt(char * output,int output_len,int port)224 inline void WritePortInt(char* output, int output_len, int port) {
225   _itoa_s(port, output, output_len, 10);
226 }
227 
228 // This function will prepend the colon if there will be a port.
229 template <typename CHAR, typename UCHAR>
DoPort(const CHAR * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)230 bool DoPort(const CHAR* spec,
231             const Component& port,
232             int default_port_for_scheme,
233             CanonOutput* output,
234             Component* out_port) {
235   int port_num = ParsePort(spec, port);
236   if (port_num == PORT_UNSPECIFIED || port_num == default_port_for_scheme) {
237     *out_port = Component();
238     return true;  // Leave port empty.
239   }
240 
241   if (port_num == PORT_INVALID) {
242     // Invalid port: We'll copy the text from the input so the user can see
243     // what the error was, and mark the URL as invalid by returning false.
244     output->push_back(':');
245     out_port->begin = output->length();
246     AppendInvalidNarrowString(spec, static_cast<size_t>(port.begin),
247                               static_cast<size_t>(port.end()), output);
248     out_port->len = output->length() - out_port->begin;
249     return false;
250   }
251 
252   // Convert port number back to an integer. Max port value is 5 digits, and
253   // the Parsed::ExtractPort will have made sure the integer is in range.
254   const int buf_size = 6;
255   char buf[buf_size];
256   WritePortInt(buf, buf_size, port_num);
257 
258   // Append the port number to the output, preceded by a colon.
259   output->push_back(':');
260   out_port->begin = output->length();
261   for (int i = 0; i < buf_size && buf[i]; i++)
262     output->push_back(buf[i]);
263 
264   out_port->len = output->length() - out_port->begin;
265   return true;
266 }
267 
// clang-format off
// Lookup table, indexed by 7-bit ASCII code, that is true for every character
// which must be percent-escaped inside a fragment. It follows the fragment
// percent-encode set:
//   https://url.spec.whatwg.org/#fragment-percent-encode-set
const bool kShouldEscapeCharInFragment[0x80] = {
    // 0x00-0x1F: control characters, all escaped.
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    true,  true,  true,  true,  true,  true,  true,  true,
    // 0x20-0x27: ' '  !  "  #  $  %  &  '
    true,  false, true,  false, false, false, false, false,
    // 0x28-0x2F: (  )  *  +  ,  -  .  /
    false, false, false, false, false, false, false, false,
    // 0x30-0x37: 0  1  2  3  4  5  6  7
    false, false, false, false, false, false, false, false,
    // 0x38-0x3F: 8  9  :  ;  <  =  >  ?
    false, false, false, false, true,  false, true,  false,
    // 0x40-0x47: @  A  B  C  D  E  F  G
    false, false, false, false, false, false, false, false,
    // 0x48-0x4F: H  I  J  K  L  M  N  O
    false, false, false, false, false, false, false, false,
    // 0x50-0x57: P  Q  R  S  T  U  V  W
    false, false, false, false, false, false, false, false,
    // 0x58-0x5F: X  Y  Z  [  \  ]  ^  _
    false, false, false, false, false, false, false, false,
    // 0x60-0x67: `  a  b  c  d  e  f  g
    true,  false, false, false, false, false, false, false,
    // 0x68-0x6F: h  i  j  k  l  m  n  o
    false, false, false, false, false, false, false, false,
    // 0x70-0x77: p  q  r  s  t  u  v  w
    false, false, false, false, false, false, false, false,
    // 0x78-0x7F: x  y  z  {  |  }  ~  DEL
    false, false, false, false, false, false, false, true,
};
// clang-format on
303 
304 template <typename CHAR, typename UCHAR>
DoCanonicalizeRef(const CHAR * spec,const Component & ref,CanonOutput * output,Component * out_ref)305 void DoCanonicalizeRef(const CHAR* spec,
306                        const Component& ref,
307                        CanonOutput* output,
308                        Component* out_ref) {
309   if (!ref.is_valid()) {
310     // Common case of no ref.
311     *out_ref = Component();
312     return;
313   }
314 
315   // Append the ref separator. Note that we need to do this even when the ref
316   // is empty but present.
317   output->push_back('#');
318   out_ref->begin = output->length();
319 
320   // Now iterate through all the characters, converting to UTF-8 and validating.
321   size_t end = static_cast<size_t>(ref.end());
322   for (size_t i = static_cast<size_t>(ref.begin); i < end; i++) {
323     UCHAR current_char = static_cast<UCHAR>(spec[i]);
324     if (current_char < 0x80) {
325       if (kShouldEscapeCharInFragment[current_char])
326         AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
327       else
328         output->push_back(static_cast<char>(spec[i]));
329     } else {
330       AppendUTF8EscapedChar(spec, &i, end, output);
331     }
332   }
333 
334   out_ref->len = output->length() - out_ref->begin;
335 }
336 
337 }  // namespace
338 
RemoveURLWhitespace(const char * input,int input_len,CanonOutputT<char> * buffer,int * output_len,bool * potentially_dangling_markup)339 const char* RemoveURLWhitespace(const char* input,
340                                 int input_len,
341                                 CanonOutputT<char>* buffer,
342                                 int* output_len,
343                                 bool* potentially_dangling_markup) {
344   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
345                                potentially_dangling_markup);
346 }
347 
RemoveURLWhitespace(const char16_t * input,int input_len,CanonOutputT<char16_t> * buffer,int * output_len,bool * potentially_dangling_markup)348 const char16_t* RemoveURLWhitespace(const char16_t* input,
349                                     int input_len,
350                                     CanonOutputT<char16_t>* buffer,
351                                     int* output_len,
352                                     bool* potentially_dangling_markup) {
353   return DoRemoveURLWhitespace(input, input_len, buffer, output_len,
354                                potentially_dangling_markup);
355 }
356 
CanonicalSchemeChar(char16_t ch)357 char CanonicalSchemeChar(char16_t ch) {
358   if (ch >= 0x80)
359     return 0;  // Non-ASCII is not supported by schemes.
360   return kSchemeCanonical[ch];
361 }
362 
CanonicalizeScheme(const char * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)363 bool CanonicalizeScheme(const char* spec,
364                         const Component& scheme,
365                         CanonOutput* output,
366                         Component* out_scheme) {
367   return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
368 }
369 
CanonicalizeScheme(const char16_t * spec,const Component & scheme,CanonOutput * output,Component * out_scheme)370 bool CanonicalizeScheme(const char16_t* spec,
371                         const Component& scheme,
372                         CanonOutput* output,
373                         Component* out_scheme) {
374   return DoScheme<char16_t, char16_t>(spec, scheme, output, out_scheme);
375 }
376 
CanonicalizeUserInfo(const char * username_source,const Component & username,const char * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)377 bool CanonicalizeUserInfo(const char* username_source,
378                           const Component& username,
379                           const char* password_source,
380                           const Component& password,
381                           CanonOutput* output,
382                           Component* out_username,
383                           Component* out_password) {
384   return DoUserInfo<char, unsigned char>(username_source, username,
385                                          password_source, password, output,
386                                          out_username, out_password);
387 }
388 
CanonicalizeUserInfo(const char16_t * username_source,const Component & username,const char16_t * password_source,const Component & password,CanonOutput * output,Component * out_username,Component * out_password)389 bool CanonicalizeUserInfo(const char16_t* username_source,
390                           const Component& username,
391                           const char16_t* password_source,
392                           const Component& password,
393                           CanonOutput* output,
394                           Component* out_username,
395                           Component* out_password) {
396   return DoUserInfo<char16_t, char16_t>(username_source, username,
397                                         password_source, password, output,
398                                         out_username, out_password);
399 }
400 
CanonicalizePort(const char * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)401 bool CanonicalizePort(const char* spec,
402                       const Component& port,
403                       int default_port_for_scheme,
404                       CanonOutput* output,
405                       Component* out_port) {
406   return DoPort<char, unsigned char>(spec, port, default_port_for_scheme,
407                                      output, out_port);
408 }
409 
CanonicalizePort(const char16_t * spec,const Component & port,int default_port_for_scheme,CanonOutput * output,Component * out_port)410 bool CanonicalizePort(const char16_t* spec,
411                       const Component& port,
412                       int default_port_for_scheme,
413                       CanonOutput* output,
414                       Component* out_port) {
415   return DoPort<char16_t, char16_t>(spec, port, default_port_for_scheme, output,
416                                     out_port);
417 }
418 
CanonicalizeRef(const char * spec,const Component & ref,CanonOutput * output,Component * out_ref)419 void CanonicalizeRef(const char* spec,
420                      const Component& ref,
421                      CanonOutput* output,
422                      Component* out_ref) {
423   DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
424 }
425 
CanonicalizeRef(const char16_t * spec,const Component & ref,CanonOutput * output,Component * out_ref)426 void CanonicalizeRef(const char16_t* spec,
427                      const Component& ref,
428                      CanonOutput* output,
429                      Component* out_ref) {
430   DoCanonicalizeRef<char16_t, char16_t>(spec, ref, output, out_ref);
431 }
432 
433 }  // namespace url
434