1 // Copyright 2007, Google Inc.
2 // All rights reserved.
3 //
4 // Redistribution and use in source and binary forms, with or without
5 // modification, are permitted provided that the following conditions are
6 // met:
7 //
8 // * Redistributions of source code must retain the above copyright
9 // notice, this list of conditions and the following disclaimer.
10 // * Redistributions in binary form must reproduce the above
11 // copyright notice, this list of conditions and the following disclaimer
12 // in the documentation and/or other materials provided with the
13 // distribution.
14 // * Neither the name of Google Inc. nor the names of its
15 // contributors may be used to endorse or promote products derived from
16 // this software without specific prior written permission.
17 //
18 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
19 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
20 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
21 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
22 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
23 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
24 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
25 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
26 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
27 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29
30 // Canonicalizers for random bits that aren't big enough for their own files.
31
#include <stdio.h>
#include <string.h>

#include "googleurl/src/url_canon.h"
#include "googleurl/src/url_canon_internal.h"
36
37 namespace url_canon {
38
39 namespace {
40
// Returns true if the given character should be removed from the middle of a
// URL. The removable characters are carriage return, line feed and tab.
inline bool IsRemovableURLWhitespace(int ch) {
  return ch == '\r' || ch == '\n' || ch == '\t';
}
46
47 // Backend for RemoveURLWhitespace (see declaration in url_canon.h).
48 // It sucks that we have to do this, since this takes about 13% of the total URL
49 // canonicalization time.
50 template<typename CHAR>
DoRemoveURLWhitespace(const CHAR * input,int input_len,CanonOutputT<CHAR> * buffer,int * output_len)51 const CHAR* DoRemoveURLWhitespace(const CHAR* input, int input_len,
52 CanonOutputT<CHAR>* buffer,
53 int* output_len) {
54 // Fast verification that there's nothing that needs removal. This is the 99%
55 // case, so we want it to be fast and don't care about impacting the speed
56 // when we do find whitespace.
57 int found_whitespace = false;
58 for (int i = 0; i < input_len; i++) {
59 if (!IsRemovableURLWhitespace(input[i]))
60 continue;
61 found_whitespace = true;
62 break;
63 }
64
65 if (!found_whitespace) {
66 // Didn't find any whitespace, we don't need to do anything. We can just
67 // return the input as the output.
68 *output_len = input_len;
69 return input;
70 }
71
72 // Remove the whitespace into the new buffer and return it.
73 for (int i = 0; i < input_len; i++) {
74 if (!IsRemovableURLWhitespace(input[i]))
75 buffer->push_back(input[i]);
76 }
77 *output_len = buffer->length();
78 return buffer->data();
79 }
80
// Contains the canonical version of each possible input letter in the scheme
// (basically, lower-cased). The corresponding entry will be 0 if the letter
// is not allowed in a scheme. The table is indexed by the 7-bit ASCII code of
// the input character, so callers must check that the value is below 0x80
// before indexing.
const char kSchemeCanonical[0x80] = {
// 00-1f: all are invalid
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
// ' '   !    "    #    $    %    &    '    (    )    *    +    ,    -    .    /
     0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0, '+',   0, '-', '.',   0,
//  0    1    2    3    4    5    6    7    8    9    :    ;    <    =    >    ?
   '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',   0,   0,   0,   0,   0,   0,
//  @    A    B    C    D    E    F    G    H    I    J    K    L    M    N    O
     0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//  P    Q    R    S    T    U    V    W    X    Y    Z    [    \    ]    ^    _
   'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',   0,   0,   0,   0,   0,
//  `    a    b    c    d    e    f    g    h    i    j    k    l    m    n    o
     0, 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o',
//  p    q    r    s    t    u    v    w    x    y    z    {    |    }    ~   DEL
   'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',   0,   0,   0,   0,   0,
};
100
// Returns true if |c| is an ASCII letter (a-z or A-Z).
//
// This could be a table lookup as well by setting the high bit for each
// valid character, but it's only called once per URL, and it makes the lookup
// table easier to read not having extra stuff in it.
inline bool IsSchemeFirstChar(unsigned char c) {
  return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
}
107
108 template<typename CHAR, typename UCHAR>
DoScheme(const CHAR * spec,const url_parse::Component & scheme,CanonOutput * output,url_parse::Component * out_scheme)109 bool DoScheme(const CHAR* spec,
110 const url_parse::Component& scheme,
111 CanonOutput* output,
112 url_parse::Component* out_scheme) {
113 if (scheme.len <= 0) {
114 // Scheme is unspecified or empty, convert to empty by appending a colon.
115 *out_scheme = url_parse::Component(output->length(), 0);
116 output->push_back(':');
117 return true;
118 }
119
120 // The output scheme starts from the current position.
121 out_scheme->begin = output->length();
122
123 // Danger: it's important that this code does not strip any characters: it
124 // only emits the canonical version (be it valid or escaped) of each of
125 // the input characters. Stripping would put it out of sync with
126 // url_util::FindAndCompareScheme, which could cause some security checks on
127 // schemes to be incorrect.
128 bool success = true;
129 int end = scheme.end();
130 for (int i = scheme.begin; i < end; i++) {
131 UCHAR ch = static_cast<UCHAR>(spec[i]);
132 char replacement = 0;
133 if (ch < 0x80) {
134 if (i == scheme.begin) {
135 // Need to do a special check for the first letter of the scheme.
136 if (IsSchemeFirstChar(static_cast<unsigned char>(ch)))
137 replacement = kSchemeCanonical[ch];
138 } else {
139 replacement = kSchemeCanonical[ch];
140 }
141 }
142
143 if (replacement) {
144 output->push_back(replacement);
145 } else if (ch == '%') {
146 // Canonicalizing the scheme multiple times should lead to the same
147 // result. Since invalid characters will be escaped, we need to preserve
148 // the percent to avoid multiple escaping. The scheme will be invalid.
149 success = false;
150 output->push_back('%');
151 } else {
152 // Invalid character, store it but mark this scheme as invalid.
153 success = false;
154
155 // This will escape the output and also handle encoding issues.
156 // Ignore the return value since we already failed.
157 AppendUTF8EscapedChar(spec, &i, end, output);
158 }
159 }
160
161 // The output scheme ends with the the current position, before appending
162 // the colon.
163 out_scheme->len = output->length() - out_scheme->begin;
164 output->push_back(':');
165 return success;
166 }
167
168 // The username and password components reference ranges in the corresponding
169 // *_spec strings. Typically, these specs will be the same (we're
170 // canonicalizing a single source string), but may be different when
171 // replacing components.
172 template<typename CHAR, typename UCHAR>
DoUserInfo(const CHAR * username_spec,const url_parse::Component & username,const CHAR * password_spec,const url_parse::Component & password,CanonOutput * output,url_parse::Component * out_username,url_parse::Component * out_password)173 bool DoUserInfo(const CHAR* username_spec,
174 const url_parse::Component& username,
175 const CHAR* password_spec,
176 const url_parse::Component& password,
177 CanonOutput* output,
178 url_parse::Component* out_username,
179 url_parse::Component* out_password) {
180 if (username.len <= 0 && password.len <= 0) {
181 // Common case: no user info. We strip empty username/passwords.
182 *out_username = url_parse::Component();
183 *out_password = url_parse::Component();
184 return true;
185 }
186
187 // Write the username.
188 out_username->begin = output->length();
189 if (username.len > 0) {
190 // This will escape characters not valid for the username.
191 AppendStringOfType(&username_spec[username.begin], username.len,
192 CHAR_USERINFO, output);
193 }
194 out_username->len = output->length() - out_username->begin;
195
196 // When there is a password, we need the separator. Note that we strip
197 // empty but specified passwords.
198 if (password.len > 0) {
199 output->push_back(':');
200 out_password->begin = output->length();
201 AppendStringOfType(&password_spec[password.begin], password.len,
202 CHAR_USERINFO, output);
203 out_password->len = output->length() - out_password->begin;
204 } else {
205 *out_password = url_parse::Component();
206 }
207
208 output->push_back('@');
209 return true;
210 }
211
// Helper function for converting port integers to strings. Writes the decimal
// representation of |port| followed by a terminating NUL into |output|, which
// has room for |output_len| characters.
//
// _itoa_s is a Microsoft CRT extension, so it is only used when building with
// MSVC. Other toolchains use snprintf, which produces the same text for any
// value that fits in the buffer.
inline void WritePortInt(char* output, int output_len, int port) {
#if defined(_MSC_VER)
  _itoa_s(port, output, output_len, 10);
#else
  snprintf(output, output_len, "%d", port);
#endif
}
216
217 // This function will prepend the colon if there will be a port.
218 template<typename CHAR, typename UCHAR>
DoPort(const CHAR * spec,const url_parse::Component & port,int default_port_for_scheme,CanonOutput * output,url_parse::Component * out_port)219 bool DoPort(const CHAR* spec,
220 const url_parse::Component& port,
221 int default_port_for_scheme,
222 CanonOutput* output,
223 url_parse::Component* out_port) {
224 int port_num = url_parse::ParsePort(spec, port);
225 if (port_num == url_parse::PORT_UNSPECIFIED ||
226 port_num == default_port_for_scheme) {
227 *out_port = url_parse::Component();
228 return true; // Leave port empty.
229 }
230
231 if (port_num == url_parse::PORT_INVALID) {
232 // Invalid port: We'll copy the text from the input so the user can see
233 // what the error was, and mark the URL as invalid by returning false.
234 output->push_back(':');
235 out_port->begin = output->length();
236 AppendInvalidNarrowString(spec, port.begin, port.end(), output);
237 out_port->len = output->length() - out_port->begin;
238 return false;
239 }
240
241 // Convert port number back to an integer. Max port value is 5 digits, and
242 // the Parsed::ExtractPort will have made sure the integer is in range.
243 const int buf_size = 6;
244 char buf[buf_size];
245 WritePortInt(buf, buf_size, port_num);
246
247 // Append the port number to the output, preceeded by a colon.
248 output->push_back(':');
249 out_port->begin = output->length();
250 for (int i = 0; i < buf_size && buf[i]; i++)
251 output->push_back(buf[i]);
252
253 out_port->len = output->length() - out_port->begin;
254 return true;
255 }
256
257 template<typename CHAR, typename UCHAR>
DoCanonicalizeRef(const CHAR * spec,const url_parse::Component & ref,CanonOutput * output,url_parse::Component * out_ref)258 void DoCanonicalizeRef(const CHAR* spec,
259 const url_parse::Component& ref,
260 CanonOutput* output,
261 url_parse::Component* out_ref) {
262 if (ref.len < 0) {
263 // Common case of no ref.
264 *out_ref = url_parse::Component();
265 return;
266 }
267
268 // Append the ref separator. Note that we need to do this even when the ref
269 // is empty but present.
270 output->push_back('#');
271 out_ref->begin = output->length();
272
273 // Now iterate through all the characters, converting to UTF-8 and validating.
274 int end = ref.end();
275 for (int i = ref.begin; i < end; i++) {
276 if (spec[i] == 0) {
277 // IE just strips NULLs, so we do too.
278 continue;
279 } else if (static_cast<UCHAR>(spec[i]) < 0x20) {
280 // Unline IE seems to, we escape control characters. This will probably
281 // make the reference fragment unusable on a web page, but people
282 // shouldn't be using control characters in their anchor names.
283 AppendEscapedChar(static_cast<unsigned char>(spec[i]), output);
284 } else if (static_cast<UCHAR>(spec[i]) < 0x80) {
285 // Normal ASCII characters are just appended.
286 output->push_back(static_cast<char>(spec[i]));
287 } else {
288 // Non-ASCII characters are appended unescaped, but only when they are
289 // valid. Invalid Unicode characters are replaced with the "invalid
290 // character" as IE seems to (ReadUTFChar puts the unicode replacement
291 // character in the output on failure for us).
292 unsigned code_point;
293 ReadUTFChar(spec, &i, end, &code_point);
294 AppendUTF8Value(code_point, output);
295 }
296 }
297
298 out_ref->len = output->length() - out_ref->begin;
299 }
300
301 } // namespace
302
RemoveURLWhitespace(const char * input,int input_len,CanonOutputT<char> * buffer,int * output_len)303 const char* RemoveURLWhitespace(const char* input, int input_len,
304 CanonOutputT<char>* buffer,
305 int* output_len) {
306 return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
307 }
308
// 16-bit overload of RemoveURLWhitespace; forwards to the shared
// DoRemoveURLWhitespace template.
const char16* RemoveURLWhitespace(const char16* input, int input_len,
                                  CanonOutputT<char16>* buffer,
                                  int* output_len) {
  return DoRemoveURLWhitespace(input, input_len, buffer, output_len);
}
314
CanonicalSchemeChar(char16 ch)315 char CanonicalSchemeChar(char16 ch) {
316 if (ch >= 0x80)
317 return 0; // Non-ASCII is not supported by schemes.
318 return kSchemeCanonical[ch];
319 }
320
CanonicalizeScheme(const char * spec,const url_parse::Component & scheme,CanonOutput * output,url_parse::Component * out_scheme)321 bool CanonicalizeScheme(const char* spec,
322 const url_parse::Component& scheme,
323 CanonOutput* output,
324 url_parse::Component* out_scheme) {
325 return DoScheme<char, unsigned char>(spec, scheme, output, out_scheme);
326 }
327
CanonicalizeScheme(const char16 * spec,const url_parse::Component & scheme,CanonOutput * output,url_parse::Component * out_scheme)328 bool CanonicalizeScheme(const char16* spec,
329 const url_parse::Component& scheme,
330 CanonOutput* output,
331 url_parse::Component* out_scheme) {
332 return DoScheme<char16, char16>(spec, scheme, output, out_scheme);
333 }
334
CanonicalizeUserInfo(const char * username_source,const url_parse::Component & username,const char * password_source,const url_parse::Component & password,CanonOutput * output,url_parse::Component * out_username,url_parse::Component * out_password)335 bool CanonicalizeUserInfo(const char* username_source,
336 const url_parse::Component& username,
337 const char* password_source,
338 const url_parse::Component& password,
339 CanonOutput* output,
340 url_parse::Component* out_username,
341 url_parse::Component* out_password) {
342 return DoUserInfo<char, unsigned char>(
343 username_source, username, password_source, password,
344 output, out_username, out_password);
345 }
346
CanonicalizeUserInfo(const char16 * username_source,const url_parse::Component & username,const char16 * password_source,const url_parse::Component & password,CanonOutput * output,url_parse::Component * out_username,url_parse::Component * out_password)347 bool CanonicalizeUserInfo(const char16* username_source,
348 const url_parse::Component& username,
349 const char16* password_source,
350 const url_parse::Component& password,
351 CanonOutput* output,
352 url_parse::Component* out_username,
353 url_parse::Component* out_password) {
354 return DoUserInfo<char16, char16>(
355 username_source, username, password_source, password,
356 output, out_username, out_password);
357 }
358
CanonicalizePort(const char * spec,const url_parse::Component & port,int default_port_for_scheme,CanonOutput * output,url_parse::Component * out_port)359 bool CanonicalizePort(const char* spec,
360 const url_parse::Component& port,
361 int default_port_for_scheme,
362 CanonOutput* output,
363 url_parse::Component* out_port) {
364 return DoPort<char, unsigned char>(spec, port,
365 default_port_for_scheme,
366 output, out_port);
367 }
368
CanonicalizePort(const char16 * spec,const url_parse::Component & port,int default_port_for_scheme,CanonOutput * output,url_parse::Component * out_port)369 bool CanonicalizePort(const char16* spec,
370 const url_parse::Component& port,
371 int default_port_for_scheme,
372 CanonOutput* output,
373 url_parse::Component* out_port) {
374 return DoPort<char16, char16>(spec, port, default_port_for_scheme,
375 output, out_port);
376 }
377
CanonicalizeRef(const char * spec,const url_parse::Component & ref,CanonOutput * output,url_parse::Component * out_ref)378 void CanonicalizeRef(const char* spec,
379 const url_parse::Component& ref,
380 CanonOutput* output,
381 url_parse::Component* out_ref) {
382 DoCanonicalizeRef<char, unsigned char>(spec, ref, output, out_ref);
383 }
384
CanonicalizeRef(const char16 * spec,const url_parse::Component & ref,CanonOutput * output,url_parse::Component * out_ref)385 void CanonicalizeRef(const char16* spec,
386 const url_parse::Component& ref,
387 CanonOutput* output,
388 url_parse::Component* out_ref) {
389 DoCanonicalizeRef<char16, char16>(spec, ref, output, out_ref);
390 }
391
392 } // namespace url_canon
393