1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include "url/url_canon.h"

#include <errno.h>
#include <stddef.h>

#include <string>

#include "base/strings/string_piece.h"
#include "base/strings/utf_string_conversions.h"
#include "base/test/gtest_util.h"
#include "base/test/scoped_feature_list.h"
#include "testing/gtest/include/gtest/gtest.h"
#include "url/third_party/mozilla/url_parse.h"
#include "url/url_canon_internal.h"
#include "url/url_canon_stdstring.h"
#include "url/url_features.h"
#include "url/url_test_utils.h"
20
21 namespace url {
22
23 namespace {
24
25 struct ComponentCase {
26 const char* input;
27 const char* expected;
28 Component expected_component;
29 bool expected_success;
30 };
31
32 // ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
33 // treat each input as optional, and will only try processing if non-NULL.
34 // The output is always 8-bit.
35 struct DualComponentCase {
36 const char* input8;
37 const wchar_t* input16;
38 const char* expected;
39 Component expected_component;
40 bool expected_success;
41 };
42
43 // Test cases for CanonicalizeIPAddress(). The inputs are identical to
44 // DualComponentCase, but the output has extra CanonHostInfo fields.
45 struct IPAddressCase {
46 const char* input8;
47 const wchar_t* input16;
48 const char* expected;
49 Component expected_component;
50
51 // CanonHostInfo fields, for verbose output.
52 CanonHostInfo::Family expected_family;
53 int expected_num_ipv4_components;
54 const char* expected_address_hex; // Two hex chars per IP address byte.
55 };
56
BytesToHexString(unsigned char bytes[16],int length)57 std::string BytesToHexString(unsigned char bytes[16], int length) {
58 EXPECT_TRUE(length == 0 || length == 4 || length == 16)
59 << "Bad IP address length: " << length;
60 std::string result;
61 for (int i = 0; i < length; ++i) {
62 result.push_back(kHexCharLookup[(bytes[i] >> 4) & 0xf]);
63 result.push_back(kHexCharLookup[bytes[i] & 0xf]);
64 }
65 return result;
66 }
67
// One table-driven test case for component replacement: a base URL, one
// optional replacement string per URL component, and the expected result.
//
// Every member defaults to null so that a default-initialized case never
// carries indeterminate pointers; brace-initialized tables are unaffected.
struct ReplaceCase {
  // URL the replacements are applied to.
  const char* base = nullptr;
  // Per-component replacement strings.
  const char* scheme = nullptr;
  const char* username = nullptr;
  const char* password = nullptr;
  const char* host = nullptr;
  const char* port = nullptr;
  const char* path = nullptr;
  const char* query = nullptr;
  const char* ref = nullptr;
  // URL text the test expects after the replacements.
  const char* expected = nullptr;
};
80
// Sentinel replacement string: when a replacement begins with this character,
// SetupReplComp calls the component's clear function instead of setting it.
constexpr char kDeleteComp[] = "|";
84
85 // Sets up a replacement for a single component. This is given pointers to
86 // the set and clear function for the component being replaced, and will
87 // either set the component (if it exists) or clear it (if the replacement
88 // string matches kDeleteComp).
89 //
90 // This template is currently used only for the 8-bit case, and the strlen
91 // causes it to fail in other cases. It is left a template in case we have
92 // tests for wide replacements.
93 template<typename CHAR>
SetupReplComp(void (Replacements<CHAR>::* set)(const CHAR *,const Component &),void (Replacements<CHAR>::* clear)(),Replacements<CHAR> * rep,const CHAR * str)94 void SetupReplComp(
95 void (Replacements<CHAR>::*set)(const CHAR*, const Component&),
96 void (Replacements<CHAR>::*clear)(),
97 Replacements<CHAR>* rep,
98 const CHAR* str) {
99 if (str && str[0] == kDeleteComp[0]) {
100 (rep->*clear)();
101 } else if (str) {
102 (rep->*set)(str, Component(0, static_cast<int>(strlen(str))));
103 }
104 }
105
106 } // namespace
107
// Checks that AppendUTF8Value() writes the expected UTF-8 byte sequence for
// valid code points of every encoded length (one to four bytes).
TEST(URLCanonTest, DoAppendUTF8) {
  struct UTF8Case {
    unsigned input;      // Code point to encode.
    const char* output;  // Expected UTF-8 bytes.
  };
  const UTF8Case kCases[] = {
      // Valid code points.
      {0x24, "\x24"},
      {0xA2, "\xC2\xA2"},
      {0x20AC, "\xE2\x82\xAC"},
      {0x24B62, "\xF0\xA4\xAD\xA2"},
      {0x10FFFF, "\xF4\x8F\xBF\xBF"},
  };

  for (const UTF8Case& test_case : kCases) {
    std::string encoded;
    StdStringCanonOutput canon_output(&encoded);
    AppendUTF8Value(test_case.input, &canon_output);
    canon_output.Complete();
    EXPECT_EQ(test_case.output, encoded);
  }
}
129
// Checks that AppendUTF8Value() hits a DCHECK when handed a value that is too
// large to be a code point.
TEST(URLCanonTest, DoAppendUTF8Invalid) {
  std::string sink;
  StdStringCanonOutput canon_output(&sink);
  // Invalid code point (too large).
  EXPECT_DCHECK_DEATH({
    AppendUTF8Value(0x110000, &canon_output);
    canon_output.Complete();
  });
}
139
// Low-level test that we handle reading, canonicalization, and writing
// UTF-8/UTF-16 strings properly. Each case is run through
// AppendUTF8EscapedChar() for whichever of its 8-bit / 16-bit inputs is
// non-null, and valid cases that provide both inputs are additionally checked
// to be the same text in both encodings.
TEST(URLCanonTest, UTF) {
  struct UTFCase {
    const char* input8;      // UTF-8 input, or null to skip.
    const wchar_t* input16;  // UTF-16 input, or null to skip.
    bool expected_success;   // Whether every character should be valid.
    const char* output;      // Expected escaped output.
  };
  const UTFCase kCases[] = {
      // Valid canonical input should get passed through & escaped.
      {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
      // Test a character that takes > 16 bits (U+10300 = old italic letter A)
      {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
      // Non-shortest-form UTF-8 characters are invalid. The bad bytes should
      // each be replaced with the invalid character (EF BF BD in UTF-8).
      {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false,
       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"},
      // Invalid UTF-8 sequences should be marked as invalid (the first
      // sequence is truncated).
      {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
      // Character going off the end.
      {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
      // ...same with low surrogates with no high surrogate.
      {nullptr, L"\xdc00", false, "%EF%BF%BD"},
      // Test a UTF-8 encoded surrogate value is marked as invalid.
      // ED A0 80 = U+D800
      {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
      // ...even when paired.
      {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
       "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
  };

  // Clears |escaped|, feeds every character of |input| (8-bit or 16-bit)
  // through AppendUTF8EscapedChar() into it, and returns whether all of the
  // calls succeeded. Shared by the 8-bit and 16-bit variants below.
  const auto escape_all = [](const auto* input, size_t input_len,
                             std::string* escaped) {
    escaped->clear();
    StdStringCanonOutput output(escaped);
    bool success = true;
    for (size_t ch = 0; ch < input_len; ch++) {
      success &= AppendUTF8EscapedChar(input, &ch, input_len, &output);
    }
    output.Complete();
    return success;
  };

  std::string out_str;
  for (const UTFCase& test_case : kCases) {
    // Identify the failing table row in any failure output below.
    SCOPED_TRACE(std::string("expected output: ") + test_case.output);

    if (test_case.input8) {
      const bool success =
          escape_all(test_case.input8, strlen(test_case.input8), &out_str);
      EXPECT_EQ(test_case.expected_success, success);
      EXPECT_EQ(std::string(test_case.output), out_str);
    }

    if (test_case.input16) {
      const std::u16string input_str(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      const bool success =
          escape_all(input_str.c_str(), input_str.length(), &out_str);
      EXPECT_EQ(test_case.expected_success, success);
      EXPECT_EQ(std::string(test_case.output), out_str);
    }

    if (test_case.input8 && test_case.input16 && test_case.expected_success) {
      // Check that the UTF-8 and UTF-16 inputs are equivalent.

      // UTF-16 -> UTF-8
      const std::string input8_str(test_case.input8);
      const std::u16string input16_str(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      EXPECT_EQ(input8_str, base::UTF16ToUTF8(input16_str));

      // UTF-8 -> UTF-16
      EXPECT_EQ(input16_str, base::UTF8ToUTF16(input8_str));
    }
  }
}
220
// Exercises CanonicalizeScheme() on 8-bit and 16-bit input.
//
// Here, we're mostly testing that unusual characters are handled properly.
// The canonicalizer doesn't do any parsing or whitespace detection. It will
// also do its best on error, and will escape funny sequences (these won't be
// valid schemes and it will return error).
//
// Note that the canonicalizer will append a colon to the output to separate
// out the rest of the URL, which is not present in the input. We check,
// however, that the output range includes everything but the colon.
TEST(URLCanonTest, Scheme) {
  const ComponentCase kSchemeCases[] = {
      {"http", "http:", Component(0, 4), true},
      {"HTTP", "http:", Component(0, 4), true},
      {" HTTP ", "%20http%20:", Component(0, 10), false},
      {"htt: ", "htt%3A%20:", Component(0, 9), false},
      {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:",
       Component(0, 22), false},
      // Don't re-escape something already escaped. Note that it will
      // "canonicalize" the 'A' to 'a', but that's OK.
      {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
      {"", ":", Component(0, 0), false},
  };

  std::string canonical;

  for (const ComponentCase& test_case : kSchemeCases) {
    const std::string expected_text(test_case.expected);
    Component in_comp(0, static_cast<int>(strlen(test_case.input)));
    Component out_comp;

    // 8-bit input.
    canonical.clear();
    StdStringCanonOutput narrow_output(&canonical);
    bool success = CanonicalizeScheme(test_case.input, in_comp,
                                      &narrow_output, &out_comp);
    narrow_output.Complete();

    EXPECT_EQ(test_case.expected_success, success);
    EXPECT_EQ(expected_text, canonical);
    EXPECT_EQ(test_case.expected_component.begin, out_comp.begin);
    EXPECT_EQ(test_case.expected_component.len, out_comp.len);

    // Now try the wide version.
    canonical.clear();
    StdStringCanonOutput wide_output(&canonical);

    const std::u16string wide_input(base::UTF8ToUTF16(test_case.input));
    in_comp.len = static_cast<int>(wide_input.length());
    success = CanonicalizeScheme(wide_input.c_str(), in_comp, &wide_output,
                                 &out_comp);
    wide_output.Complete();

    EXPECT_EQ(test_case.expected_success, success);
    EXPECT_EQ(expected_text, canonical);
    EXPECT_EQ(test_case.expected_component.begin, out_comp.begin);
    EXPECT_EQ(test_case.expected_component.len, out_comp.len);
  }

  // Test the case where the scheme is declared nonexistent, it should be
  // converted into an empty scheme.
  Component missing_out_comp;
  canonical.clear();
  StdStringCanonOutput missing_output(&canonical);

  EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &missing_output,
                                  &missing_out_comp));
  missing_output.Complete();

  EXPECT_EQ(std::string(":"), canonical);
  EXPECT_EQ(0, missing_out_comp.begin);
  EXPECT_EQ(0, missing_out_comp.len);
}
289
// IDNA mode to use in CanonHost tests; URLCanonHostTest is parameterized on
// it and toggles the kUseIDNA2008NonTransitional feature accordingly.
enum class IDNAMode {
  kTransitional,
  kNonTransitional,
};
292
293 class URLCanonHostTest
294 : public ::testing::Test,
295 public ::testing::WithParamInterface<IDNAMode> {
296 public:
URLCanonHostTest()297 URLCanonHostTest() {
298 if (GetParam() == IDNAMode::kNonTransitional) {
299 scoped_feature_list_.InitAndEnableFeature(kUseIDNA2008NonTransitional);
300 } else {
301 scoped_feature_list_.InitAndDisableFeature(kUseIDNA2008NonTransitional);
302 }
303 }
304
305 private:
306 base::test::ScopedFeatureList scoped_feature_list_;
307 };
308
309 INSTANTIATE_TEST_SUITE_P(All,
310 URLCanonHostTest,
311 ::testing::Values(IDNAMode::kTransitional,
312 IDNAMode::kNonTransitional));
313
// Runs a table of host names through CanonicalizeHost() and
// CanonicalizeHostVerbose(), in both 8-bit and 16-bit form (whichever inputs a
// row provides). A row whose expected family is BROKEN is expected to make
// CanonicalizeHost() return false. Rows whose result depends on the IDNA mode
// select their expectation from IsUsingIDNA2008NonTransitional().
TEST_P(URLCanonHostTest, Host) {
  const bool use_idna_non_transitional = IsUsingIDNA2008NonTransitional();

  const IPAddressCase host_cases[] = {
      // Basic canonicalization, uppercase should be converted to lowercase.
      {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Spaces and some other characters should be escaped.
      {"Goo%20 goo%7C|.com", L"Goo%20 goo%7C|.com", "goo%20%20goo%7C%7C.com",
       Component(0, 22), CanonHostInfo::NEUTRAL, -1, ""},
      // Exciting different types of spaces!
      {nullptr, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com",
       Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
      // Other types of space (no-break, zero-width, zero-width-no-break) are
      // name-prepped away to nothing.
      {nullptr, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com",
       Component(0, 10), CanonHostInfo::NEUTRAL, -1, ""},
      // Ideographic full stop (full-width period for Chinese, etc.) should be
      // treated as a dot.
      {nullptr,
       L"www.foo\x3002"
       L"bar.com",
       "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
      // Invalid unicode characters should fail...
      // ...In wide input, ICU will barf and we'll end up with the input as
      // escaped UTF-8 (the invalid character should be replaced with the
      // replacement character).
      {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%BF%BDzyx.com",
       Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
      // ...This is the same as the previous case but with the character
      // escaped.
      {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%BF%BDzyx.com",
       Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
      // Test name prepping, fullwidth input should be converted to ASCII and
      // NOT IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
      {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com",
       Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
      // Test that fullwidth escaped values are properly name-prepped,
      // then converted or rejected.
      // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
      {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
      {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com",
       "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com",
       "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      // ICU will convert weird percents into ASCII percents, but not unescape
      // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
      // "small percent". At this point we should be within our rights to mark
      // anything as invalid since the URL is corrupt or malicious. The code
      // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
      // and kept as valid, so we validate that behavior here, but this level
      // of fixing the input shouldn't be seen as required. "%81" is invalid.
      {"\xef\xb9\xaa"
       "41.com",
       L"\xfe6a"
       L"41.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      {"%ef%b9%aa"
       "41.com",
       L"\xfe6a"
       L"41.com",
       "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
      {"\xef\xb9\xaa"
       "81.com",
       L"\xfe6a"
       L"81.com",
       "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      {"%ef%b9%aa"
       "81.com",
       L"\xfe6a"
       L"81.com",
       "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
      // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
      {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
       L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // See http://unicode.org/cldr/utility/idna.jsp for other
      // examples/experiments and http://goo.gl/7yG11o
      // for the full list of characters handled differently by
      // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.

      // 4 Deviation characters are mapped/ignored in UTS 46 transitional
      // mechanism. UTS 46, table 4 row (g).
      // Sharp-s is mapped to 'ss' in IDNA 2003, not in IDNA 2008 or UTS 46
      // after transitional period.
      // Previously, it'd be "fussball.de".
      {"fu\xc3\x9f"
       "ball.de",
       L"fu\x00df"
       L"ball.de",
       use_idna_non_transitional ? "xn--fuball-cta.de" : "fussball.de",
       use_idna_non_transitional ? Component(0, 17) : Component(0, 11),
       CanonHostInfo::NEUTRAL, -1, ""},

      // Final-sigma (U+03C2) was mapped to regular sigma (U+03C3).
      // Previously, it'd be "xn--wxaikc9b".
      {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82",
       L"\x3c3\x3cc\x3bb\x3bf\x3c2",
       use_idna_non_transitional ? "xn--wxaijb9b" : "xn--wxaikc6b",
       Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},

      // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
      // handling as well as in IDNA 2003, but not thereafter.
      {"a\xe2\x80\x8c"
       "b\xe2\x80\x8d"
       "c",
       L"a\x200c"
       L"b\x200d"
       L"c",
       use_idna_non_transitional ? "xn--abc-9m0ag" : "abc",
       use_idna_non_transitional ? Component(0, 13) : Component(0, 3),
       CanonHostInfo::NEUTRAL, -1, ""},

      // ZWJ between Devanagari characters was still mapped away in UTS 46
      // transitional handling. IDNA 2008 gives xn--11bo0mv54g.
      // Previously "xn--11bo0m".
      {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
       L"\x915\x94d\x200d\x91c",
       use_idna_non_transitional ? "xn--11bo0mv54g" : "xn--11bo0m",
       use_idna_non_transitional ? Component(0, 14) : Component(0, 10),
       CanonHostInfo::NEUTRAL, -1, ""},

      // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
      // However, we do allow this at the moment because we don't use
      // STD3 rules and canonicalize full-width ASCII to ASCII.
      {"wow\xef\xbc\x81", L"wow\xff01", "wow%21", Component(0, 6),
       CanonHostInfo::NEUTRAL, -1, ""},
      // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
      // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
      {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", Component(0, 11),
       CanonHostInfo::BROKEN, -1, ""},
      // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
      // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
      {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
       "%F0%AF%A1%A8%E5%A7%BB.cn", Component(0, 24), CanonHostInfo::BROKEN, -1,
       ""},
      // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
      {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // An already-IDNA host is not modified.
      {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Symbol/punctuations are allowed in IDNA 2003/UTS46.
      // Not allowed in IDNA 2008. UTS 46 table 4 row (f).
      {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", Component(0, 13),
       CanonHostInfo::NEUTRAL, -1, ""},
      // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
      // We used to allow it because we passed through unassigned code points.
      {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
       Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
      // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
      // Used to be allowed in IDNA 2003.
      {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", Component(0, 9),
       CanonHostInfo::BROKEN, -1, ""},
      // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
      // on Unicode 3.2). We did allow it in the past because we let unassigned
      // code point pass. We continue to allow it even though it's a
      // "punctuation and symbol" blocked in IDNA 2008.
      // UTS 46 table 4, row (j)
      {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", Component(0, 11),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Maps uppercase letters to lower case letters.
      // In IDNA 2003, it's allowed without case-folding
      // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
      // (added in Unicode 4.1). UTS 46 table 4 row (k)
      {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", Component(0, 15),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Maps U+FF43 (Full Width Small Letter C) to 'c'.
      {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", Component(0, 7),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Maps U+1D68C (Math Monospace Small C) to 'c'.
      // U+1D68C = \xD835\xDE8C in UTF-16
      {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
       Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
      // BiDi check test
      // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
      // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
      {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
       L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", Component(0, 13),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Disallowed in both IDNA 2003 and 2008 with BiDi check.
      // Labels starting with a RTL character cannot end with a LTR character.
      {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
       "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), CanonHostInfo::BROKEN, -1,
       ""},
      // Labels starting with a RTL character can end with BC=EN (European
      // number). Disallowed in IDNA 2003 but now allowed.
      {"\xd8\xac\xd8\xa7\xd8\xb1"
       "2",
       L"\x62c\x627\x631"
       L"2",
       "xn--2-ymcov", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
      // Labels starting with a RTL character cannot have "L" characters
      // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
      {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
       "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), CanonHostInfo::BROKEN, -1,
       ""},
      // Labels starting with a RTL character can end with BC=AN (Arabic number)
      // Disallowed in IDNA 2003, but now allowed.
      {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
       "xn--mgbjq0r", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
      // Labels starting with a RTL character cannot have "L" characters
      // even if it ends with an BC=AN (Arabic number).
      // Disallowed in both IDNA 2003/2008.
      {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
       "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), CanonHostInfo::BROKEN,
       -1, ""},
      // Labels starting with a RTL character cannot mix BC=EN and BC=AN
      {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
       "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), CanonHostInfo::BROKEN,
       -1, ""},
      // As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
      {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", Component(0, 13),
       CanonHostInfo::BROKEN, -1, ""},
      // U+0080 is not allowed.
      {"\xc2\x80.com", L"\x80.com", "%C2%80.com", Component(0, 10),
       CanonHostInfo::BROKEN, -1, ""},
      // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
      // UTF-8 (wide case). The output should be equivalent to the true wide
      // character input above).
      {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
       L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Invalid escaped characters should fail and the percents should be
      // escaped.
      {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
       CanonHostInfo::BROKEN, -1, ""},
      // If we get an invalid character that has been escaped.
      {"%25", L"%25", "%25", Component(0, 3), CanonHostInfo::BROKEN, -1, ""},
      {"hello%00", L"hello%00", "hello%00", Component(0, 8),
       CanonHostInfo::BROKEN, -1, ""},
      // Escaped numbers should be treated like IP addresses if they are.
      {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
       "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
      {"%30%78%63%30%2e%30%32%35%30.01%2e",
       L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", Component(0, 11),
       CanonHostInfo::IPV4, 3, "C0A80001"},
      // Invalid escaping should trigger the regular host error handling.
      {"%3g%78%63%30%2e%30%32%35%30%2E.01",
       L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01",
       Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
      // Something that isn't exactly an IP should get treated as a host and
      // spaces escaped.
      {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello",
       Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
      // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
      // These are "0Xc0.0250.01" in fullwidth.
      {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%"
       "8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%"
       "8E\xef\xbc\x90\xef\xbc\x91",
       L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10"
       L"\xff11",
       "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
      // Broken IP addresses get marked as such.
      {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
       CanonHostInfo::BROKEN, -1, ""},
      {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12),
       CanonHostInfo::BROKEN, -1, ""},
      // Cyrillic letter followed by '(' should return punycode for '(' escaped
      // before punycode string was created. I.e.
      // if '(' is escaped after punycode is created we would get xn--%28-8tb
      // (incorrect).
      {"\xd1\x82(", L"\x0442(", "xn--%28-7ed", Component(0, 11),
       CanonHostInfo::NEUTRAL, -1, ""},
      // Address with all hexadecimal characters with leading number of 1<<32
      // or greater and should return NEUTRAL rather than BROKEN if not all
      // components are numbers.
      {"12345678912345.de", L"12345678912345.de", "12345678912345.de",
       Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
      {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de",
       Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
      {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de",
       "12345678912345.12345678912345.de", Component(0, 32),
       CanonHostInfo::NEUTRAL, -1, ""},
      {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de",
       Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
      {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde",
       Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
      // A label that starts with "xn--" but contains non-ASCII characters
      // should be an error. Escape the invalid characters.
      {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen",
       Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
  };

  // CanonicalizeHost() non-verbose.
  std::string out_str;
  for (const IPAddressCase& host_case : host_cases) {
    // Identify the table row in every failure below, including the wide
    // variant, which has no streamable input to report.
    SCOPED_TRACE(std::string("CanonicalizeHost, expected host: ") +
                 host_case.expected);
    // Only BROKEN hosts are expected to make CanonicalizeHost() fail.
    const bool expect_success =
        host_case.expected_family != CanonHostInfo::BROKEN;

    // Narrow version.
    if (host_case.input8) {
      const int host_len = static_cast<int>(strlen(host_case.input8));
      const Component in_comp(0, host_len);
      Component out_comp;

      out_str.clear();
      StdStringCanonOutput output(&out_str);

      const bool success =
          CanonicalizeHost(host_case.input8, in_comp, &output, &out_comp);
      output.Complete();

      EXPECT_EQ(expect_success, success)
          << "for input: " << host_case.input8;
      EXPECT_EQ(std::string(host_case.expected), out_str)
          << "for input: " << host_case.input8;
      EXPECT_EQ(host_case.expected_component.begin, out_comp.begin)
          << "for input: " << host_case.input8;
      EXPECT_EQ(host_case.expected_component.len, out_comp.len)
          << "for input: " << host_case.input8;
    }

    // Wide version.
    if (host_case.input16) {
      const std::u16string input16(
          test_utils::TruncateWStringToUTF16(host_case.input16));
      const int host_len = static_cast<int>(input16.length());
      const Component in_comp(0, host_len);
      Component out_comp;

      out_str.clear();
      StdStringCanonOutput output(&out_str);

      const bool success =
          CanonicalizeHost(input16.c_str(), in_comp, &output, &out_comp);
      output.Complete();

      EXPECT_EQ(expect_success, success);
      EXPECT_EQ(std::string(host_case.expected), out_str);
      EXPECT_EQ(host_case.expected_component.begin, out_comp.begin);
      EXPECT_EQ(host_case.expected_component.len, out_comp.len);
    }
  }

  // CanonicalizeHostVerbose()
  for (const IPAddressCase& host_case : host_cases) {
    SCOPED_TRACE(std::string("CanonicalizeHostVerbose, expected host: ") +
                 host_case.expected);

    // Narrow version.
    if (host_case.input8) {
      const int host_len = static_cast<int>(strlen(host_case.input8));
      const Component in_comp(0, host_len);

      out_str.clear();
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;

      CanonicalizeHostVerbose(host_case.input8, in_comp, &output, &host_info);
      output.Complete();

      EXPECT_EQ(host_case.expected_family, host_info.family);
      EXPECT_EQ(std::string(host_case.expected), out_str);
      EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
      EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
      EXPECT_EQ(std::string(host_case.expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      // The component count is only compared for IPv4 results.
      if (host_case.expected_family == CanonHostInfo::IPV4) {
        EXPECT_EQ(host_case.expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }

    // Wide version.
    if (host_case.input16) {
      const std::u16string input16(
          test_utils::TruncateWStringToUTF16(host_case.input16));
      const int host_len = static_cast<int>(input16.length());
      const Component in_comp(0, host_len);

      out_str.clear();
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;

      CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
      output.Complete();

      EXPECT_EQ(host_case.expected_family, host_info.family);
      EXPECT_EQ(std::string(host_case.expected), out_str);
      EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
      EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
      EXPECT_EQ(std::string(host_case.expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      // The component count is only compared for IPv4 results.
      if (host_case.expected_family == CanonHostInfo::IPV4) {
        EXPECT_EQ(host_case.expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }
  }
}
708
// Checks CanonicalizeIPAddress() against IPv4-style host strings. Every case
// is run through both the 8-bit and the 16-bit entry point, and the resulting
// family and raw address bytes are always compared. The canonical text, output
// component and IPv4 component count are only compared when the result is
// reported as IPV4.
TEST(URLCanonTest, IPv4) {
  // clang-format off
  const IPAddressCase cases[] = {
    // Empty is not an IP address.
    {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Regular IP addresses in different bases.
    {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    // Non-IP addresses due to invalid characters.
    {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hostnames with a numeric final component but other components that don't
    // parse as numbers should be considered broken.
    {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"19a.168.0.1.", L"19a.168.0.1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0308.0250.00.01.", L"0308.0250.00.01.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0xCG.0xA8.0x0.0x1.", L"0xCG.0xA8.0x0.0x1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // A non-numeric terminal component means the host is not an IPv4
    // hostname, but it is still valid.
    {"19.168.0.1a", L"19.168.0.1a", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0xC.0xA8.0x0.0x1G", L"0xC.0xA8.0x0.0x1G", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hostnames that would be considered broken IPv4 hostnames should be
    // considered valid non-IPv4 hostnames if they end with two dots instead of
    // 0 or 1.
    {"19a.168.0.1..", L"19a.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0308.0250.00.01..", L"0308.0250.00.01..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0xCG.0xA8.0x0.0x1..", L"0xCG.0xA8.0x0.0x1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hosts with components that aren't considered valid IPv4 numbers but are
    // entirely numeric should be considered invalid.
    {"1.2.3.08", L"1.2.3.08", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.3.08.", L"1.2.3.08.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If there are not enough components, the last one should fill them out.
    {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
    {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
    {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
    {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
    {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
    // Hostnames with too many components but a numeric final component are
    // invalid.
    {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // We allow a single trailing dot.
    {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
    {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Hosts with two dots in a row with a final numeric component are
    // considered invalid.
    {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168..1.", L"192.168..1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Any numerical overflow should be marked as BROKEN.
    {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Repeat the previous tests, minus 1, to verify boundaries.
    {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
    {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
    {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
    {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
    {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
    {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
    {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
    {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
    {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
    // Old truncation tests. They're all "BROKEN" now.
    {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Too many components should be rejected, in valid ranges or not.
    {"255.255.255.255.255", L"255.255.255.255.255", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"256.256.256.256.256", L"256.256.256.256.256", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Spaces should be rejected.
    {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Very large numbers.
    {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
    {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // A number has no length limit, but long numbers can still overflow.
    {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
    {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If a long component is non-numeric, it's a hostname, *not* a broken IP.
    {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Truncation of all zeros should still result in 0.
    {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
    // Non-ASCII characters in final component should return NEUTRAL.
    {"1.2.3.\xF0\x9F\x92\xA9", L"1.2.3.\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.3.4\xF0\x9F\x92\xA9", L"1.2.3.4\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.3.0x\xF0\x9F\x92\xA9", L"1.2.3.0x\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    {"1.2.3.0\xF0\x9F\x92\xA9", L"1.2.3.0\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Non-ASCII characters in other components should result in broken IPs
    // when final component is numeric.
    {"1.2.\xF0\x9F\x92\xA9.4", L"1.2.\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.3\xF0\x9F\x92\xA9.4", L"1.2.3\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.0x\xF0\x9F\x92\xA9.4", L"1.2.0x\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"1.2.0\xF0\x9F\x92\xA9.4", L"1.2.0\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"\xF0\x9F\x92\xA9.2.3.4", L"\xD83D\xDCA9.2.3.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
  };
  // clang-format on

  for (const auto& test_case : cases) {
    SCOPED_TRACE(test_case.input8);

    // 8-bit version.
    {
      const Component component(0,
                                static_cast<int>(strlen(test_case.input8)));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(test_case.input8, component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(test_case.expected_family, host_info.family);
      EXPECT_EQ(std::string(test_case.expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV4) {
        EXPECT_STREQ(test_case.expected, out_str.c_str());
        EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
        EXPECT_EQ(test_case.expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }

    // 16-bit version. A fresh CanonHostInfo is used so that values left over
    // from the 8-bit pass cannot satisfy the expectations below.
    {
      const std::u16string input16(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      const Component component(0, static_cast<int>(input16.length()));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(input16.c_str(), component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(test_case.expected_family, host_info.family);
      EXPECT_EQ(std::string(test_case.expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV4) {
        EXPECT_STREQ(test_case.expected, out_str.c_str());
        EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
        EXPECT_EQ(test_case.expected_num_ipv4_components,
                  host_info.num_ipv4_components);
      }
    }
  }
}
859
860 class URLCanonIPv6Test
861 : public ::testing::Test,
862 public ::testing::WithParamInterface<bool> {
863 public:
URLCanonIPv6Test()864 URLCanonIPv6Test() {
865 if (GetParam()) {
866 scoped_feature_list_.InitAndEnableFeature(kStrictIPv4EmbeddedIPv6AddressParsing);
867 } else {
868 scoped_feature_list_.InitAndDisableFeature(kStrictIPv4EmbeddedIPv6AddressParsing);
869 }
870 }
871
872 private:
873 base::test::ScopedFeatureList scoped_feature_list_;
874 };
875
876 INSTANTIATE_TEST_SUITE_P(All,
877 URLCanonIPv6Test,
878 ::testing::Bool());
879
// Checks CanonicalizeIPAddress() against IPv6-style host strings. Every case
// is run through both the 8-bit and the 16-bit entry point, and the resulting
// family and raw address bytes are always compared. The canonical text and
// output component are only compared when the result is reported as IPV6.
// Expectations for the shortened embedded-IPv4 forms depend on whether
// kStrictIPv4EmbeddedIPv6AddressParsing is enabled.
TEST_P(URLCanonIPv6Test, IPv6) {
  const bool strict_ipv4_embedded_ipv6_parsing =
      base::FeatureList::IsEnabled(url::kStrictIPv4EmbeddedIPv6AddressParsing);

  // clang-format off
  const IPAddressCase cases[] = {
    // Empty is not an IP address.
    {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
    // Non-IPs with [:] characters are marked BROKEN.
    {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Regular IP address is invalid without bounding '[' and ']'.
    {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Regular IP addresses.
    {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1, "00000000000000000000000000000000"},
    {"[::1]", L"[::1]", "[::1]", Component(0, 5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000001"},
    {"[1::]", L"[1::]", "[1::]", Component(0, 5), CanonHostInfo::IPV6, -1, "00010000000000000000000000000000"},

    // Leading zeros should be stripped.
    {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]", "[0:1:2:3:4:5:6:7]", Component(0, 17), CanonHostInfo::IPV6, -1, "00000001000200030004000500060007"},

    // Upper case letters should be lowercased.
    {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]", Component(0, 20), CanonHostInfo::IPV6, -1, "000A000B000C00DE00FF0000000100AC"},

    // The same address can be written with different contractions, but should
    // get canonicalized to the same thing.
    {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
    {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0, 14), CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},

    // Addresses with embedded IPv4.
    {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0, 10), CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
    {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},
    {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000EEEEC0A80001"},
    {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]", Component(0, 14), CanonHostInfo::IPV6, -1, "200100000000000000000000C0A80001"},
    {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // Embedded IPv4 with fewer than four parts is accepted only when strict
    // parsing is disabled; with more than four parts it is always BROKEN.
    {"[::ffff:192.1.2]",
     L"[::ffff:192.1.2]",
     "[::ffff:c001:2]",
     strict_ipv4_embedded_ipv6_parsing ? Component() : Component(0, 15),
     strict_ipv4_embedded_ipv6_parsing ? CanonHostInfo::BROKEN : CanonHostInfo::IPV6,
     -1,
     (strict_ipv4_embedded_ipv6_parsing ? "" : "00000000000000000000FFFFC0010002")},
    {"[::ffff:192.1]",
     L"[::ffff:192.1]",
     "[::ffff:c000:1]",
     strict_ipv4_embedded_ipv6_parsing ? Component() : Component(0, 15),
     strict_ipv4_embedded_ipv6_parsing ? CanonHostInfo::BROKEN : CanonHostInfo::IPV6,
     -1,
     (strict_ipv4_embedded_ipv6_parsing ? "" : "00000000000000000000FFFFC0000001")},
    {"[::ffff:192.1.2.3.4]",
     L"[::ffff:192.1.2.3.4]",
     "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // IPv4 using hex.
    // TODO(eroman): Should this format be disallowed?
    {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]", "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1, "00000000000000000000FFFFC0A80001"},

    // There may be zeros surrounding the "::" contraction.
    {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0, 5), CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},

    {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0, 13), CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},

    // Can only have one "::" contraction in an IPv6 string literal.
    {"[2001::db8::1]", L"[2001::db8::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // No more than 2 consecutive ':'s.
    {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Non-IP addresses due to invalid characters.
    {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // If there are not enough components, the last one should fill them out.
    // ... omitted at this time ...
    // Too many components means not an IP address. Similarly, with too few
    // if using IPv4 compat or mapped addresses.
    {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Too many bits (even though 8 components, the last one holds 32 bits).
    {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // Too many bits specified -- the contraction would have to be zero-length
    // to not exceed 128 bits.
    {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // The contraction is for 16 bits of zero.
    {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]", Component(0, 17), CanonHostInfo::IPV6, -1, "00010002000300040005000600000008"},

    // Cannot have a trailing colon.
    {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // Cannot have negative numbers.
    {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // Scope IDs (a trailing "%" <scope_id> section) are not allowed, whether
    // the ID is numeric, named, empty, or otherwise malformed.
    {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // Don't allow leading or trailing colons.
    {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},

    // We allow a single trailing dot.
    // ... omitted at this time ...
    // Two dots in a row means not an IP address.
    {"[::192.168..1]", L"[::192.168..1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
    // Any non-first components get truncated to one byte.
    // ... omitted at this time ...
    // Spaces should be rejected.
    {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
  };
  // clang-format on

  for (const auto& test_case : cases) {
    // Names the failing input for every expectation in this iteration.
    SCOPED_TRACE(test_case.input8);

    // 8-bit version.
    {
      const Component component(0,
                                static_cast<int>(strlen(test_case.input8)));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(test_case.input8, component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(test_case.expected_family, host_info.family);
      EXPECT_EQ(std::string(test_case.expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV6) {
        EXPECT_STREQ(test_case.expected, out_str.c_str());
        EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
      }
    }

    // 16-bit version. A fresh CanonHostInfo is used so that values left over
    // from the 8-bit pass cannot satisfy the expectations below.
    {
      const std::u16string input16(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      const Component component(0, static_cast<int>(input16.length()));

      std::string out_str;
      StdStringCanonOutput output(&out_str);
      CanonHostInfo host_info;
      CanonicalizeIPAddress(input16.c_str(), component, &output, &host_info);
      output.Complete();

      EXPECT_EQ(test_case.expected_family, host_info.family);
      EXPECT_EQ(std::string(test_case.expected_address_hex),
                BytesToHexString(host_info.address, host_info.AddressLength()));
      if (host_info.family == CanonHostInfo::IPV6) {
        EXPECT_STREQ(test_case.expected, out_str.c_str());
        EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
        EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
      }
    }
  }
}
1049
// Checks that CanonicalizeIPAddress() does not report an IP address when the
// host component passed to it is a default-constructed Component or a
// zero-length one, even though the underlying buffer holds a valid address.
TEST(URLCanonTest, IPEmpty) {
  const char kAddressText[] = "192.168.0.1";

  std::string canon_text;
  StdStringCanonOutput canon_output(&canon_text);
  CanonHostInfo info;

  // Default-constructed component.
  const Component default_component;
  CanonicalizeIPAddress(kAddressText, default_component, &canon_output, &info);
  EXPECT_FALSE(info.IsIPAddress());

  // Zero-length component at offset 0.
  const Component zero_length_component(0, 0);
  CanonicalizeIPAddress(kAddressText, zero_length_component, &canon_output,
                        &info);
  EXPECT_FALSE(info.IsIPAddress());
}
1063
1064 // Verifies that CanonicalizeHostSubstring produces the expected output and
1065 // does not "fix" IP addresses. Because this code is a subset of
1066 // CanonicalizeHost, the shared functionality is not tested.
TEST(URLCanonTest,CanonicalizeHostSubstring)1067 TEST(URLCanonTest, CanonicalizeHostSubstring) {
1068 // Basic sanity check.
1069 {
1070 std::string out_str;
1071 StdStringCanonOutput output(&out_str);
1072 EXPECT_TRUE(CanonicalizeHostSubstring("M\xc3\x9cNCHEN.com",
1073 Component(0, 12), &output));
1074 output.Complete();
1075 EXPECT_EQ("xn--mnchen-3ya.com", out_str);
1076 }
1077
1078 // Failure case.
1079 {
1080 std::string out_str;
1081 StdStringCanonOutput output(&out_str);
1082 EXPECT_FALSE(CanonicalizeHostSubstring(
1083 test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(),
1084 Component(0, 8), &output));
1085 output.Complete();
1086 EXPECT_EQ("%EF%BF%BDzyx.com", out_str);
1087 }
1088
1089 // Should return true for empty input strings.
1090 {
1091 std::string out_str;
1092 StdStringCanonOutput output(&out_str);
1093 EXPECT_TRUE(CanonicalizeHostSubstring("", Component(0, 0), &output));
1094 output.Complete();
1095 EXPECT_EQ(std::string(), out_str);
1096 }
1097
1098 // Numbers that look like IP addresses should not be changed.
1099 {
1100 std::string out_str;
1101 StdStringCanonOutput output(&out_str);
1102 EXPECT_TRUE(
1103 CanonicalizeHostSubstring("01.02.03.04", Component(0, 11), &output));
1104 output.Complete();
1105 EXPECT_EQ("01.02.03.04", out_str);
1106 }
1107 }
1108
// Checks CanonicalizeUserInfo() through both the 8-bit and the 16-bit entry
// point. Each case is a full URL that is parsed with ParseStandardURL() to
// obtain the username/password components, which are then canonicalized.
TEST(URLCanonTest, UserInfo) {
  // Note that the canonicalizer should escape and treat empty components as
  // not being there.

  // We actually parse a full input URL so we can get the initial components.
  struct UserComponentCase {
    const char* input;
    const char* expected;
    Component expected_username;
    Component expected_password;
    bool expected_success;
  };
  // clang-format off
  const UserComponentCase user_info_cases[] = {
    {"http://user:pass@host.com/", "user:pass@", Component(0, 4), Component(5, 4), true},
    {"http://@host.com/", "", Component(0, -1), Component(0, -1), true},
    {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true},
    {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true},
    {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3), true},
    {"http://^ :$\t@host.com/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true},
    {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
    {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true},

    // IE7 compatibility: old versions allowed backslashes in usernames, but
    // IE7 does not. We disallow it as well.
    {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true},
  };
  // clang-format on

  for (const auto& test_case : user_info_cases) {
    // Names the failing input for every expectation in this iteration.
    SCOPED_TRACE(test_case.input);

    const int url_len = static_cast<int>(strlen(test_case.input));
    Parsed parsed;
    ParseStandardURL(test_case.input, url_len, &parsed);

    // 8-bit version.
    {
      Component out_user;
      Component out_pass;
      std::string out_str;
      StdStringCanonOutput output(&out_str);

      const bool success = CanonicalizeUserInfo(
          test_case.input, parsed.username, test_case.input, parsed.password,
          &output, &out_user, &out_pass);
      output.Complete();

      EXPECT_EQ(test_case.expected_success, success);
      EXPECT_EQ(std::string(test_case.expected), out_str);
      EXPECT_EQ(test_case.expected_username.begin, out_user.begin);
      EXPECT_EQ(test_case.expected_username.len, out_user.len);
      EXPECT_EQ(test_case.expected_password.begin, out_pass.begin);
      EXPECT_EQ(test_case.expected_password.len, out_pass.len);
    }

    // 16-bit version. Every input above is ASCII, so the components parsed
    // from the 8-bit string are reused for the UTF-16 copy. Fresh output
    // objects are used so that values left over from the 8-bit pass cannot
    // satisfy the expectations below.
    {
      Component out_user;
      Component out_pass;
      std::string out_str;
      StdStringCanonOutput output(&out_str);
      const std::u16string wide_input(base::UTF8ToUTF16(test_case.input));

      const bool success = CanonicalizeUserInfo(
          wide_input.c_str(), parsed.username, wide_input.c_str(),
          parsed.password, &output, &out_user, &out_pass);
      output.Complete();

      EXPECT_EQ(test_case.expected_success, success);
      EXPECT_EQ(std::string(test_case.expected), out_str);
      EXPECT_EQ(test_case.expected_username.begin, out_user.begin);
      EXPECT_EQ(test_case.expected_username.len, out_user.len);
      EXPECT_EQ(test_case.expected_password.begin, out_pass.begin);
      EXPECT_EQ(test_case.expected_password.len, out_pass.len);
    }
  }
}
1180
// Checks CanonicalizePort() through both the 8-bit and the 16-bit entry
// point.
TEST(URLCanonTest, Port) {
  // We only need to test that the number gets properly put into the output
  // buffer. The parser unit tests will test scanning the number correctly.
  //
  // Note that whenever CanonicalizePort emits a port it writes a leading
  // colon first: the expected strings below start with ':' and the expected
  // component begins at offset 1, after that colon. Presumably this separates
  // the port from the host that precedes it in a complete URL.
  struct PortCase {
    const char* input;
    int default_port;
    const char* expected;
    Component expected_component;
    bool expected_success;
  };
  const PortCase port_cases[] = {
      // Invalid input should be copied w/ failure.
      {"as df", 80, ":as%20df", Component(1, 7), false},
      {"-2", 80, ":-2", Component(1, 2), false},
      // Default port should be omitted.
      {"80", 80, "", Component(0, -1), true},
      {"8080", 80, ":8080", Component(1, 4), true},
      // PORT_UNSPECIFIED should mean always keep the port.
      {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true},
  };

  for (const auto& test_case : port_cases) {
    // Names the failing input for every expectation in this iteration.
    SCOPED_TRACE(test_case.input);

    const int url_len = static_cast<int>(strlen(test_case.input));
    const Component in_comp(0, url_len);

    // 8-bit version.
    {
      Component out_comp;
      std::string out_str;
      StdStringCanonOutput output(&out_str);
      const bool success =
          CanonicalizePort(test_case.input, in_comp, test_case.default_port,
                           &output, &out_comp);
      output.Complete();

      EXPECT_EQ(test_case.expected_success, success);
      EXPECT_EQ(std::string(test_case.expected), out_str);
      EXPECT_EQ(test_case.expected_component.begin, out_comp.begin);
      EXPECT_EQ(test_case.expected_component.len, out_comp.len);
    }

    // 16-bit version. Every input above is ASCII, so |in_comp| is reused for
    // the UTF-16 copy. Fresh output objects are used so that values left over
    // from the 8-bit pass cannot satisfy the expectations below.
    {
      Component out_comp;
      std::string out_str;
      StdStringCanonOutput output(&out_str);
      const std::u16string wide_input(base::UTF8ToUTF16(test_case.input));
      const bool success =
          CanonicalizePort(wide_input.c_str(), in_comp, test_case.default_port,
                           &output, &out_comp);
      output.Complete();

      EXPECT_EQ(test_case.expected_success, success);
      EXPECT_EQ(std::string(test_case.expected), out_str);
      EXPECT_EQ(test_case.expected_component.begin, out_comp.begin);
      EXPECT_EQ(test_case.expected_component.len, out_comp.len);
    }
  }
}
1239
1240 DualComponentCase kCommonPathCases[] = {
1241 // ----- path collapsing tests -----
1242 {"/././foo", L"/././foo", "/foo", Component(0, 4), true},
1243 {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true},
1244 {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true},
1245 {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true},
1246 // double dots followed by a slash or the end of the string count
1247 {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true},
1248 {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true},
1249 // don't count double dots when they aren't followed by a slash
1250 {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true},
1251 // some in the middle
1252 {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true},
1253 {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a",
1254 Component(0, 2), true},
1255 // we should not be able to go above the root
1256 {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true},
1257 {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true},
1258 // escaped dots should be unescaped and treated the same as dots
1259 {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true},
1260 {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true},
1261 {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar",
1262 "/..bar", Component(0, 6), true},
1263 // Multiple slashes in a row should be preserved and treated like empty
1264 // directory names.
1265 {"////../..", L"////../..", "//", Component(0, 2), true},
1266
1267 // ----- escaping tests -----
1268 {"/foo", L"/foo", "/foo", Component(0, 4), true},
1269 // Valid escape sequence
1270 {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true},
1271 // Invalid escape sequence we should pass through unchanged.
1272 {"/foo%", L"/foo%", "/foo%", Component(0, 5), true},
1273 {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true},
1274 // Invalid escape sequence: bad characters should be treated the same as
1275 // the surrounding text, not as escaped (in this case, UTF-8).
1276 {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true},
1277 {"/foo%2\xc2\xa9zbar", nullptr, "/foo%2%C2%A9zbar", Component(0, 16), true},
1278 {nullptr, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22),
1279 true},
1280 // Regular characters that are escaped should be unescaped
1281 {"/foo%41%7a", L"/foo%41%7a", "/fooAz", Component(0, 6), true},
1282 // Funny characters that are unescaped should be escaped
1283 {"/foo\x09\x91%91", nullptr, "/foo%09%91%91", Component(0, 13), true},
1284 {nullptr, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true},
1285 // Invalid characters that are escaped should cause a failure.
1286 {"/foo%00%51", L"/foo%00%51", "/foo%00Q", Component(0, 8), false},
1287 // Some characters should be passed through unchanged regardless of esc.
1288 {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13),
1289 true},
1290 // Characters that are properly escaped should not have the case changed
1291 // of hex letters.
1292 {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13),
1293 true},
1294 // Funny characters that are unescaped should be escaped
1295 {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true},
1296 // Backslashes should get converted to forward slashes
1297 {"\\foo\\bar", L"\\foo\\bar", "/foo/bar", Component(0, 8), true},
1298 // Hashes found in paths (possibly only when the caller explicitly sets
1299 // the path on an already-parsed URL) should be escaped.
1300 {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true},
1301 // %7f should be allowed and %3D should not be unescaped (these were wrong
1302 // in a previous version).
1303 {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd",
1304 "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true},
1305 // @ should be passed through unchanged (escaped or unescaped).
1306 {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true},
1307 // Nested escape sequences should result in escaping the leading '%' if
1308 // unescaping would result in a new escape sequence.
1309 {"/%A%42", L"/%A%42", "/%25AB", Component(0, 6), true},
1310 {"/%%41B", L"/%%41B", "/%25AB", Component(0, 6), true},
1311 {"/%%41%42", L"/%%41%42", "/%25AB", Component(0, 6), true},
1312 // Make sure truncated "nested" escapes don't result in reading off the
1313 // string end.
1314 {"/%%41", L"/%%41", "/%A", Component(0, 3), true},
1315 // Don't unescape the leading '%' if unescaping doesn't result in a valid
1316 // new escape sequence.
1317 {"/%%470", L"/%%470", "/%G0", Component(0, 4), true},
1318 {"/%%2D%41", L"/%%2D%41", "/%-A", Component(0, 4), true},
1319 // Don't erroneously downcast a UTF-16 character in a way that makes it
1320 // look like part of an escape sequence.
1321 {nullptr, L"/%%41\x0130", "/%A%C4%B0", Component(0, 9), true},
1322
1323 // ----- encoding tests -----
1324 // Basic conversions
1325 {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
1326 L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD",
1327 Component(0, 37), true},
1328 // Invalid unicode characters should fail. We only do validation on
1329 // UTF-16 input, so this doesn't happen on 8-bit.
1330 {"/\xef\xb7\x90zyx", nullptr, "/%EF%B7%90zyx", Component(0, 13), true},
1331 {nullptr, L"/\xfdd0zyx", "/%EF%BF%BDzyx", Component(0, 13), false},
1332 };
1333
1334 typedef bool (*CanonFunc8Bit)(const char*,
1335 const Component&,
1336 CanonOutput*,
1337 Component*);
1338 typedef bool (*CanonFunc16Bit)(const char16_t*,
1339 const Component&,
1340 CanonOutput*,
1341 Component*);
1342
DoPathTest(const DualComponentCase * path_cases,size_t num_cases,CanonFunc8Bit canon_func_8,CanonFunc16Bit canon_func_16)1343 void DoPathTest(const DualComponentCase* path_cases,
1344 size_t num_cases,
1345 CanonFunc8Bit canon_func_8,
1346 CanonFunc16Bit canon_func_16) {
1347 for (size_t i = 0; i < num_cases; i++) {
1348 testing::Message scope_message;
1349 scope_message << path_cases[i].input8 << "," << path_cases[i].input16;
1350 SCOPED_TRACE(scope_message);
1351 if (path_cases[i].input8) {
1352 int len = static_cast<int>(strlen(path_cases[i].input8));
1353 Component in_comp(0, len);
1354 Component out_comp;
1355 std::string out_str;
1356 StdStringCanonOutput output(&out_str);
1357 bool success =
1358 canon_func_8(path_cases[i].input8, in_comp, &output, &out_comp);
1359 output.Complete();
1360
1361 EXPECT_EQ(path_cases[i].expected_success, success);
1362 EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1363 EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1364 EXPECT_EQ(path_cases[i].expected, out_str);
1365 }
1366
1367 if (path_cases[i].input16) {
1368 std::u16string input16(
1369 test_utils::TruncateWStringToUTF16(path_cases[i].input16));
1370 int len = static_cast<int>(input16.length());
1371 Component in_comp(0, len);
1372 Component out_comp;
1373 std::string out_str;
1374 StdStringCanonOutput output(&out_str);
1375
1376 bool success =
1377 canon_func_16(input16.c_str(), in_comp, &output, &out_comp);
1378 output.Complete();
1379
1380 EXPECT_EQ(path_cases[i].expected_success, success);
1381 EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1382 EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1383 EXPECT_EQ(path_cases[i].expected, out_str);
1384 }
1385 }
1386 }
1387
// Runs the shared path table through CanonicalizePath() (both character
// widths), then checks a path containing an embedded NUL by hand.
TEST(URLCanonTest, Path) {
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases), CanonicalizePath,
             CanonicalizePath);

  // Hand-written case: an embedded NUL must come out escaped, and the
  // canonicalizer must report failure.
  const char kPathWithNul[] = "/ab\0c";
  // The array is one longer than the path because of the implicit terminator.
  const Component whole_path(0, static_cast<int>(std::size(kPathWithNul) - 1));
  Component produced;

  std::string canonical;
  StdStringCanonOutput sink(&canonical);
  const bool ok = CanonicalizePath(kPathWithNul, whole_path, &sink, &produced);
  sink.Complete();

  EXPECT_FALSE(ok);
  EXPECT_EQ("/ab%00c", canonical);
}
1405
// CanonicalizePartialPath() is run against the shared path table first and
// then against a couple of inputs that do not start with a slash.
TEST(URLCanonTest, PartialPath) {
  DualComponentCase slashless_cases[] = {
      {".html", L".html", ".html", Component(0, 5), true},
      {"", L"", "", Component(0, 0), true},
  };

  // Shared table first.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizePartialPath, CanonicalizePartialPath);
  // Then the inputs specific to partial paths.
  DoPathTest(slashless_cases, std::size(slashless_cases),
             CanonicalizePartialPath, CanonicalizePartialPath);
}
1417
// Checks CanonicalizeQuery() output for 8-bit and 16-bit inputs, with no
// charset converter supplied, plus one input containing an embedded NUL.
TEST(URLCanonTest, Query) {
  struct QueryCase {
    const char* input8;
    const wchar_t* input16;
    const char* expected;
  } query_cases[] = {
      // Plain ASCII.
      {"foo=bar", L"foo=bar", "?foo=bar"},
      // Question marks inside the query are left unescaped.
      {"as?df", L"as?df", "?as?df"},
      // '#' is always escaped, since it would otherwise start the ref.
      {"as#df", L"as#df", "?as%23df"},
      // Questionable 8-bit characters get escaped; nothing gets unescaped.
      {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"},
      {"%40%41123", L"%40%41123", "?%40%41123"},
      // Chinese input/output.
      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"},
      // Invalid UTF-8/16 input is replaced with the replacement character.
      {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"},
      // '<' and '>' are escaped because they are sometimes used for XSS when
      // the URL is echoed in content. Firefox does this, IE doesn't.
      {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"},
      // Double quotes in the query are escaped.
      {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"},
  };

  for (const QueryCase& test_case : query_cases) {
    if (test_case.input8 != nullptr) {
      const Component whole_input(
          0, static_cast<int>(strlen(test_case.input8)));
      Component produced;
      std::string canonical;

      StdStringCanonOutput sink(&canonical);
      CanonicalizeQuery(test_case.input8, whole_input, nullptr, &sink,
                        &produced);
      sink.Complete();

      EXPECT_EQ(test_case.expected, canonical);
    }

    if (test_case.input16 != nullptr) {
      const std::u16string wide_input(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      const Component whole_input(0, static_cast<int>(wide_input.length()));
      Component produced;
      std::string canonical;

      StdStringCanonOutput sink(&canonical);
      CanonicalizeQuery(wide_input.c_str(), whole_input, nullptr, &sink,
                        &produced);
      sink.Complete();

      EXPECT_EQ(test_case.expected, canonical);
    }
  }

  // One more input, this time with an embedded NUL. The length is passed
  // explicitly so the canonicalizer reads past the NUL.
  Component nul_out_comp;
  std::string nul_out_str;
  StdStringCanonOutput nul_output(&nul_out_str);
  CanonicalizeQuery("a \x00z\x01", Component(0, 5), nullptr, &nul_output,
                    &nul_out_comp);
  nul_output.Complete();
  EXPECT_EQ("?a%20%00z%01", nul_out_str);
}
1483
// Checks CanonicalizeRef() output and the resulting component for 8-bit and
// 16-bit inputs, plus one input containing an embedded NUL.
TEST(URLCanonTest, Ref) {
  // Ref canonicalization is simple; these cases mostly exercise the encoding.
  DualComponentCase ref_cases[] = {
      {"hello!", L"hello!", "#hello!", Component(1, 6), true},
      // Spaces, double quotes, angle brackets and backticks get escaped.
      {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14),
       true},
      {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14),
       true},
      {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14),
       true},
      {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14),
       true},
      {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14),
       true},
      // UTF-8/wide input comes out percent-encoded as UTF-8.
      {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true},
      // A character that takes > 16 bits (U+10300 = old italic letter A).
      {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss",
       Component(1, 14), true},
      // Existing escapes are kept as-is, even invalid ones.
      {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true},
      // Invalid UTF-8/16 input is replaced so that the output is valid.
      {"\xc2", nullptr, "#%EF%BF%BD", Component(1, 9), true},
      {nullptr, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true},
      // A Unicode invalid character.
      {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%BF%BD", Component(1, 10), true},
      // '#' signs inside a ref are preserved.
      {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true},
      {"#asdf", L"#asdf", "##asdf", Component(1, 5), true},
  };

  for (const DualComponentCase& test_case : ref_cases) {
    // Narrow (8-bit) input.
    if (test_case.input8 != nullptr) {
      const Component whole_input(
          0, static_cast<int>(strlen(test_case.input8)));
      Component produced;

      std::string canonical;
      StdStringCanonOutput sink(&canonical);
      CanonicalizeRef(test_case.input8, whole_input, &sink, &produced);
      sink.Complete();

      EXPECT_EQ(test_case.expected_component.begin, produced.begin);
      EXPECT_EQ(test_case.expected_component.len, produced.len);
      EXPECT_EQ(test_case.expected, canonical);
    }

    // Wide (16-bit) input.
    if (test_case.input16 != nullptr) {
      const std::u16string wide_input(
          test_utils::TruncateWStringToUTF16(test_case.input16));
      const Component whole_input(0, static_cast<int>(wide_input.length()));
      Component produced;

      std::string canonical;
      StdStringCanonOutput sink(&canonical);
      CanonicalizeRef(wide_input.c_str(), whole_input, &sink, &produced);
      sink.Complete();

      EXPECT_EQ(test_case.expected_component.begin, produced.begin);
      EXPECT_EQ(test_case.expected_component.len, produced.len);
      EXPECT_EQ(test_case.expected, canonical);
    }
  }

  // Input with an embedded NUL. The expectations below require the NUL to be
  // emitted as "%00" (it is escaped, not dropped).
  const char null_input[5] = "ab\x00z";
  const Component null_input_component(0, 4);
  Component null_out_comp;

  std::string null_out_str;
  StdStringCanonOutput null_output(&null_out_str);
  CanonicalizeRef(null_input, null_input_component, &null_output,
                  &null_out_comp);
  null_output.Complete();

  EXPECT_EQ(1, null_out_comp.begin);
  EXPECT_EQ(6, null_out_comp.len);
  EXPECT_EQ("#ab%00z", null_out_str);
}
1566
// Parses each input with ParseStandardURL(), canonicalizes it with
// CanonicalizeStandardURL() and compares the success flag and output string.
TEST(URLCanonTest, CanonicalizeStandardURL) {
  // Per-component behaviour is covered by the individual component tests.
  // This table only needs to show that the pieces are included or left out
  // correctly and are joined with the right separators.
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
  } cases[] = {
      {"http://www.google.com/foo?bar=baz#",
       "http://www.google.com/foo?bar=baz#", true},
      {"http://[www.google.com]/", "http://[www.google.com]/", false},
      {"ht\ttp:@www.google.com:80/;p?#", "ht%09tp://www.google.com:80/;p?#",
       false},
      {"http:////////user:@google.com:99?foo", "http://user@google.com:99/?foo",
       true},
      {"www.google.com", ":www.google.com/", false},
      {"http://192.0x00A80001", "http://192.168.0.1/", true},
      {"http://www/foo%2Ehtml", "http://www/foo.html", true},
      {"http://user:pass@/", "http://user:pass@/", false},
      {"http://%25DOMAIN:foobar@foodomain.com/",
       "http://%25DOMAIN:foobar@foodomain.com/", true},

      // Backslashes become forward slashes.
      {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},

      // A busted ref does not make the whole URL fail.
      {"http://www.google.com/asdf#\xc2",
       "http://www.google.com/asdf#%EF%BF%BD", true},

      // Basic port tests.
      {"http://foo:80/", "http://foo/", true},
      {"http://foo:81/", "http://foo:81/", true},
      {"httpa://foo:80/", "httpa://foo:80/", true},
      {"http://foo:-80/", "http://foo:-80/", false},

      {"https://foo:443/", "https://foo/", true},
      {"https://foo:80/", "https://foo:80/", true},
      {"ftp://foo:21/", "ftp://foo/", true},
      {"ftp://foo:80/", "ftp://foo:80/", true},
      {"gopher://foo:70/", "gopher://foo:70/", true},
      {"gopher://foo:443/", "gopher://foo:443/", true},
      {"ws://foo:80/", "ws://foo/", true},
      {"ws://foo:81/", "ws://foo:81/", true},
      {"ws://foo:443/", "ws://foo:443/", true},
      {"ws://foo:815/", "ws://foo:815/", true},
      {"wss://foo:80/", "wss://foo:80/", true},
      {"wss://foo:81/", "wss://foo:81/", true},
      {"wss://foo:443/", "wss://foo/", true},
      {"wss://foo:815/", "wss://foo:815/", true},

      // This code path ends up "backing up" to replace an invalid host that
      // ICU generated with an escaped version. Running it inside a full URL
      // makes sure the backing up doesn't disturb the non-host parts.
      // "EF B9 AA" is U+FE6A, a kind of percent sign that ICU converts to an
      // ASCII one, generating "%81". The literal is split so that "81" is not
      // absorbed into the preceding hex escape.
      {"ws:)W\x1eW\xef\xb9\xaa"
       "81:80/",
       "ws://%29w%1ew%81/", false},
      // Regression test for the last_invalid_percent_index bug described in
      // https://crbug.com/1080890#c10.
      {R"(HTTP:S/5%\../>%41)", "http://s/%3EA", true},
  };

  for (const URLCase& test_case : cases) {
    const int input_len = static_cast<int>(strlen(test_case.input));
    Parsed input_parsed;
    ParseStandardURL(test_case.input, input_len, &input_parsed);

    Parsed canonical_parsed;
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    const bool ok = CanonicalizeStandardURL(
        test_case.input, input_len, input_parsed,
        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr, &sink,
        &canonical_parsed);
    sink.Complete();

    EXPECT_EQ(test_case.expected_success, ok);
    EXPECT_EQ(test_case.expected, canonical);
  }
}
1648
1649 // The codepath here is the same as for regular canonicalization, so we just
1650 // need to test that things are replaced or not correctly.
TEST(URLCanonTest,ReplaceStandardURL)1651 TEST(URLCanonTest, ReplaceStandardURL) {
1652 ReplaceCase replace_cases[] = {
1653 // Common case of truncating the path.
1654 {"http://www.google.com/foo?bar=baz#ref", nullptr, nullptr, nullptr,
1655 nullptr, nullptr, "/", kDeleteComp, kDeleteComp,
1656 "http://www.google.com/"},
1657 // Replace everything
1658 {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw",
1659 "host.com", "99", "/path", "query", "ref",
1660 "https://me:pw@host.com:99/path?query#ref"},
1661 // Replace nothing
1662 {"http://a:b@google.com:22/foo?baz@cat", nullptr, nullptr, nullptr,
1663 nullptr, nullptr, nullptr, nullptr, nullptr,
1664 "http://a:b@google.com:22/foo?baz@cat"},
1665 // Replace scheme with filesystem. The result is garbage, but you asked
1666 // for it.
1667 {"http://a:b@google.com:22/foo?baz@cat", "filesystem", nullptr, nullptr,
1668 nullptr, nullptr, nullptr, nullptr, nullptr,
1669 "filesystem://a:b@google.com:22/foo?baz@cat"},
1670 };
1671
1672 for (size_t i = 0; i < std::size(replace_cases); i++) {
1673 const ReplaceCase& cur = replace_cases[i];
1674 int base_len = static_cast<int>(strlen(cur.base));
1675 Parsed parsed;
1676 ParseStandardURL(cur.base, base_len, &parsed);
1677
1678 Replacements<char> r;
1679 typedef Replacements<char> R; // Clean up syntax.
1680
1681 // Note that for the scheme we pass in a different clear function since
1682 // there is no function to clear the scheme.
1683 SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
1684 SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
1685 SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
1686 SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
1687 SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
1688 SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
1689 SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
1690 SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
1691
1692 std::string out_str;
1693 StdStringCanonOutput output(&out_str);
1694 Parsed out_parsed;
1695 ReplaceStandardURL(replace_cases[i].base, parsed, r,
1696 SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL,
1697 &output, &out_parsed);
1698 output.Complete();
1699
1700 EXPECT_EQ(replace_cases[i].expected, out_str);
1701 }
1702
1703 // The path pointer should be ignored if the address is invalid.
1704 {
1705 const char src[] = "http://www.google.com/here_is_the_path";
1706 int src_len = static_cast<int>(strlen(src));
1707
1708 Parsed parsed;
1709 ParseStandardURL(src, src_len, &parsed);
1710
1711 // Replace the path to 0 length string. By using 1 as the string address,
1712 // the test should get an access violation if it tries to dereference it.
1713 Replacements<char> r;
1714 r.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0));
1715 std::string out_str1;
1716 StdStringCanonOutput output1(&out_str1);
1717 Parsed new_parsed;
1718 ReplaceStandardURL(src, parsed, r,
1719 SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL,
1720 &output1, &new_parsed);
1721 output1.Complete();
1722 EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
1723
1724 // Same with an "invalid" path.
1725 r.SetPath(reinterpret_cast<char*>(0x00000001), Component());
1726 std::string out_str2;
1727 StdStringCanonOutput output2(&out_str2);
1728 ReplaceStandardURL(src, parsed, r,
1729 SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, NULL,
1730 &output2, &new_parsed);
1731 output2.Complete();
1732 EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
1733 }
1734 }
1735
// Parses each base with ParseFileURL(), applies the listed replacements with
// ReplaceFileURL() and compares the resulting string.
TEST(URLCanonTest, ReplaceFileURL) {
  ReplaceCase replace_cases[] = {
      // Replace everything.
      {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, "filer", nullptr,
       "/foo", "b", "c", "file://filer/foo?b#c"},
      // Replace nothing.
      {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
      {"file:///Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y:"},
      {"file:///Y:/", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y:/"},
      {"file:///./Y", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y"},
      {"file:///./Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "file:///Y:"},
      // Clear the non-path components (the common case).
      {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
      // A replacement path without a leading slash should have one added.
      {"file:///C:/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
       "interesting/", nullptr, nullptr, "file:///interesting/"},
      {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, "filer",
       nullptr, "/foo", "b", "c", "file://filer/foo?b#c"},
      {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, "file:///home/gaba?query#ref"},
      {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, kDeleteComp, kDeleteComp, "file:///home/gaba"},
      {"file:///home/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
       "interesting/", nullptr, nullptr, "file:///interesting/"},
      // Replacing the scheme should have no effect.
      {"file:///C:/gaba?query#ref", "http", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
  };

  for (const ReplaceCase& replace_case : replace_cases) {
    SCOPED_TRACE(replace_case.base);
    const int base_len = static_cast<int>(strlen(replace_case.base));
    Parsed base_parsed;
    ParseFileURL(replace_case.base, base_len, &base_parsed);

    using R = Replacements<char>;  // Shorthand for the member pointers below.
    R replacements;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &replacements,
                  replace_case.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &replacements,
                  replace_case.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &replacements,
                  replace_case.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &replacements,
                  replace_case.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &replacements,
                  replace_case.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &replacements,
                  replace_case.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &replacements,
                  replace_case.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &replacements, replace_case.ref);

    std::string replaced;
    StdStringCanonOutput sink(&replaced);
    Parsed replaced_parsed;
    ReplaceFileURL(replace_case.base, base_parsed, replacements, nullptr,
                   &sink, &replaced_parsed);
    sink.Complete();

    EXPECT_EQ(replace_case.expected, replaced);
  }
}
1799
// Parses each base with ParseFileSystemURL(), applies the listed replacements
// with ReplaceFileSystemURL() and compares the resulting string.
TEST(URLCanonTest, ReplaceFileSystemURL) {
  ReplaceCase replace_cases[] = {
      // Replace everything in the outer URL.
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, "/foo", "b", "c",
       "filesystem:file:///temporary/foo?b#c"},
      // Replace nothing.
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:file:///temporary/gaba?query#ref"},
      // Clear the non-path components (the common case).
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, kDeleteComp, kDeleteComp,
       "filesystem:file:///temporary/gaba"},
      // A replacement path without a leading slash should have one added.
      {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
       nullptr, nullptr, "interesting/", nullptr, nullptr,
       "filesystem:file:///temporary/interesting/?query#ref"},
      // Scheme replacement: nothing happens beyond canonicalization.
      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Username replacement: nothing happens beyond canonicalization.
      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, "u2", nullptr,
       nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Password replacement: nothing happens beyond canonicalization.
      {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, nullptr,
       "pw2", nullptr, nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Host replacement: nothing happens beyond canonicalization.
      {"filesystem:http://u:p@bar.com:80/t/gaba?query#ref", nullptr, nullptr,
       nullptr, "foo.com", nullptr, nullptr, nullptr, nullptr,
       "filesystem:http://bar.com/t/gaba?query#ref"},
      // Port replacement: nothing happens beyond canonicalization.
      {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", nullptr, nullptr,
       nullptr, nullptr, "41", nullptr, nullptr, nullptr,
       "filesystem:http://bar.com:40/t/gaba?query#ref"},
  };

  for (const ReplaceCase& replace_case : replace_cases) {
    const int base_len = static_cast<int>(strlen(replace_case.base));
    Parsed base_parsed;
    ParseFileSystemURL(replace_case.base, base_len, &base_parsed);

    using R = Replacements<char>;  // Shorthand for the member pointers below.
    R replacements;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &replacements,
                  replace_case.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &replacements,
                  replace_case.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &replacements,
                  replace_case.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &replacements,
                  replace_case.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &replacements,
                  replace_case.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &replacements,
                  replace_case.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &replacements,
                  replace_case.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &replacements, replace_case.ref);

    std::string replaced;
    StdStringCanonOutput sink(&replaced);
    Parsed replaced_parsed;
    ReplaceFileSystemURL(replace_case.base, base_parsed, replacements, nullptr,
                         &sink, &replaced_parsed);
    sink.Complete();

    EXPECT_EQ(replace_case.expected, replaced);
  }
}
1867
// Parses each base with ParsePathURL(), applies the listed replacements with
// ReplacePathURL() and compares the resulting string.
TEST(URLCanonTest, ReplacePathURL) {
  ReplaceCase replace_cases[] = {
      // Replace everything.
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr,
       "alert('foo?');", nullptr, nullptr, "javascript:alert('foo?');"},
      // Replace nothing.
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "data:foo"},
      // Replace only the scheme, or only the path.
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "javascript:foo"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, "bar", nullptr,
       nullptr, "data:bar"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, kDeleteComp,
       nullptr, nullptr, "data:"},
  };

  for (const ReplaceCase& replace_case : replace_cases) {
    const int base_len = static_cast<int>(strlen(replace_case.base));
    Parsed base_parsed;
    ParsePathURL(replace_case.base, base_len, false, &base_parsed);

    using R = Replacements<char>;  // Shorthand for the member pointers below.
    R replacements;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &replacements,
                  replace_case.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &replacements,
                  replace_case.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &replacements,
                  replace_case.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &replacements,
                  replace_case.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &replacements,
                  replace_case.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &replacements,
                  replace_case.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &replacements,
                  replace_case.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &replacements, replace_case.ref);

    std::string replaced;
    StdStringCanonOutput sink(&replaced);
    Parsed replaced_parsed;
    ReplacePathURL(replace_case.base, base_parsed, replacements, &sink,
                   &replaced_parsed);
    sink.Complete();

    EXPECT_EQ(replace_case.expected, replaced);
  }
}
1911
// Parses each base with ParseMailtoURL(), applies the listed replacements
// with ReplaceMailtoURL() and compares the resulting string.
TEST(URLCanonTest, ReplaceMailtoURL) {
  ReplaceCase replace_cases[] = {
      // Replace everything.
      {"mailto:jon@foo.com?body=sup", "mailto", nullptr, nullptr, nullptr,
       nullptr, "addr1", "to=tony", nullptr, "mailto:addr1?to=tony"},
      // Replace nothing.
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, nullptr, nullptr, "mailto:jon@foo.com?body=sup"},
      // Replace the path.
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, "jason", nullptr, nullptr, "mailto:jason?body=sup"},
      // Replace the query.
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "custom=1", nullptr, "mailto:jon@foo.com?custom=1"},
      // Replace the path and the query.
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, "jason", "custom=1", nullptr, "mailto:jason?custom=1"},
      // Set the query to empty (a trailing question mark should remain).
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "", nullptr, "mailto:jon@foo.com?"},
      // Clear the query.
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "|", nullptr, "mailto:jon@foo.com"},
      // Clear the path.
      {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
       nullptr, "|", nullptr, nullptr, "mailto:?body=sup"},
      // Clear the path and the query.
      {"mailto:", nullptr, nullptr, nullptr, nullptr, nullptr, "|", "|",
       nullptr, "mailto:"},
      // Setting the ref should have no effect.
      {"mailto:addr1", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, "BLAH", "mailto:addr1"},
  };

  for (const ReplaceCase& replace_case : replace_cases) {
    const int base_len = static_cast<int>(strlen(replace_case.base));
    Parsed base_parsed;
    ParseMailtoURL(replace_case.base, base_len, &base_parsed);

    using R = Replacements<char>;  // Shorthand for the member pointers below.
    R replacements;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &replacements,
                  replace_case.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &replacements,
                  replace_case.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &replacements,
                  replace_case.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &replacements,
                  replace_case.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &replacements,
                  replace_case.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &replacements,
                  replace_case.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &replacements,
                  replace_case.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &replacements, replace_case.ref);

    std::string replaced;
    StdStringCanonOutput sink(&replaced);
    Parsed replaced_parsed;
    ReplaceMailtoURL(replace_case.base, base_parsed, replacements, &sink,
                     &replaced_parsed);
    sink.Complete();

    EXPECT_EQ(replace_case.expected, replaced);
  }
}
1962
// Parses each input with ParseFileURL(), canonicalizes it with
// CanonicalizeFileURL() and checks the success flag, the output string and
// the scheme/host/path components of the output. The table differs between
// Windows (_WIN32) and other platforms.
TEST(URLCanonTest, CanonicalizeFileURL) {
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
    Component expected_host;
    Component expected_path;
  } cases[] = {
#ifdef _WIN32
      // Windows-style paths.
      {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(),
       Component(7, 16)},
      {"  File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true,
       Component(), Component(7, 19)},
      {"file:", "file:///", true, Component(), Component(7, 1)},
      {"file:UNChost/path", "file://unchost/path", true, Component(7, 7),
       Component(14, 5)},
      // CanonicalizeFileURL accepts absolute Windows-style paths for IE
      // compatibility. The caller has to decide on its own that the input is
      // a file URL before calling the file canonicalizer; that normally
      // happens automatically during relative URL resolution.
      {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(),
       Component(7, 11)},
      {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
      {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(),
       Component(7, 11)},
      {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(),
       Component(7, 11)},
      {"//server/file", "file://server/file", true, Component(7, 6),
       Component(13, 5)},
      {"\\\\server\\file", "file://server/file", true, Component(7, 6),
       Component(13, 5)},
      {"/\\server/file", "file://server/file", true, Component(7, 6),
       Component(13, 5)},
      // The number of slashes after the colon is preserved for IE
      // compatibility, except that one is added when there is none.
      {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(),
       Component(7, 16)},
      {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true,
       Component(), Component(7, 19)},
      // Three slashes mean non-UNC even without a drive spec (IE does this,
      // which makes the resulting request invalid).
      {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(),
       Component(7, 12)},
      // TODO(brettw) we should probably fail for invalid host names, which
      // would change the expected result on this test. A colon is also
      // currently allowed even though it's probably invalid, because that is
      // the "natural" result of the way the canonicalizer is written. There
      // doesn't seem to be a strong argument for why allowing it here would
      // be bad, so it is tolerated and the load will fail later.
      {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false,
       Component(7, 2), Component(9, 16)},
      {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5),
       Component(12, 8)},
      // Relative paths must not be able to climb above the "C:".
      {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true,
       Component(), Component(7, 12)},
      // A busted ref does not make the whole URL fail.
      {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(),
       Component(7, 8)},
      {"file:///./s:", "file:///S:", true, Component(), Component(7, 3)},
#else
      // Unix-style paths.
      {"file:///home/me", "file:///home/me", true, Component(),
       Component(7, 8)},
      // Windows-looking inputs are still treated as Unix-style.
      {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(),
       Component(7, 16)},
      {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true,
       Component(), Component(7, 19)},
      {"file:///./s:", "file:///s:", true, Component(), Component(7, 3)},
      // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html).
      {"//", "file:///", true, Component(), Component(7, 1)},
      {"///", "file:///", true, Component(), Component(7, 1)},
      {"///test", "file:///test", true, Component(), Component(7, 5)},
      {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)},
      {"file://localhost", "file://localhost/", true, Component(7, 9),
       Component(16, 1)},
      {"file://localhost/", "file://localhost/", true, Component(7, 9),
       Component(16, 1)},
      {"file://localhost/test", "file://localhost/test", true, Component(7, 9),
       Component(16, 5)},
#endif  // _WIN32
  };

  for (const URLCase& test_case : cases) {
    const int input_len = static_cast<int>(strlen(test_case.input));
    Parsed input_parsed;
    ParseFileURL(test_case.input, input_len, &input_parsed);

    Parsed canonical_parsed;
    std::string canonical;
    StdStringCanonOutput sink(&canonical);
    const bool ok = CanonicalizeFileURL(test_case.input, input_len,
                                        input_parsed, nullptr, &sink,
                                        &canonical_parsed);
    sink.Complete();

    EXPECT_EQ(test_case.expected_success, ok);
    EXPECT_EQ(test_case.expected, canonical);

    // The file canonicalizer writes the spec with its own code, so confirm
    // the scheme ("file", 4 characters at offset 0) was identified properly.
    EXPECT_EQ(0, canonical_parsed.scheme.begin);
    EXPECT_EQ(4, canonical_parsed.scheme.len);

    EXPECT_EQ(test_case.expected_host.begin, canonical_parsed.host.begin);
    EXPECT_EQ(test_case.expected_host.len, canonical_parsed.host.len);

    EXPECT_EQ(test_case.expected_path.begin, canonical_parsed.path.begin);
    EXPECT_EQ(test_case.expected_path.len, canonical_parsed.path.len);
  }
}
2076
// Runs a table of filesystem: URL specs through ParseFileSystemURL() and
// CanonicalizeFileSystemURL(), checking the reported success flag, the
// canonical string, and the location of the outer scheme in the output.
TEST(URLCanonTest, CanonicalizeFileSystemURL) {
  struct URLCase {
    const char* input;      // Spec handed to the parser and canonicalizer.
    const char* expected;   // Expected canonical output string.
    bool expected_success;  // Expected return value of the canonicalizer.
  };
  const URLCase kCases[] = {
      {"Filesystem:htTp://www.Foo.com:80/tempoRary",
       "filesystem:http://www.foo.com/tempoRary/", true},
      {"filesystem:httpS://www.foo.com/temporary/",
       "filesystem:https://www.foo.com/temporary/", true},
      {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//",
       false},
      {"filesystem:http://www.foo.com/persistent/bob?query#ref",
       "filesystem:http://www.foo.com/persistent/bob?query#ref", true},
      {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true},
      {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true},
      {"filesystem:File:///temporary/Bob?qUery#reF",
       "filesystem:file:///temporary/Bob?qUery#reF", true},
      {"FilEsysteM:htTp:E=/.", "filesystem:http://e%3D//", false},
  };

  for (const URLCase& test_case : kCases) {
    const int input_len = static_cast<int>(strlen(test_case.input));
    Parsed input_parsed;
    ParseFileSystemURL(test_case.input, input_len, &input_parsed);

    std::string canonical;
    Parsed canonical_parsed;
    StdStringCanonOutput canon_output(&canonical);
    const bool ok =
        CanonicalizeFileSystemURL(test_case.input, input_len, input_parsed,
                                  nullptr, &canon_output, &canonical_parsed);
    canon_output.Complete();

    EXPECT_EQ(test_case.expected_success, ok);
    EXPECT_EQ(test_case.expected, canonical);

    // The filesystem canonicalizer writes the spec with its own code path, so
    // confirm the outer scheme ("filesystem", 10 characters) was recorded at
    // the start of the output.
    EXPECT_EQ(0, canonical_parsed.scheme.begin);
    EXPECT_EQ(10, canonical_parsed.scheme.len);

    // Successful canonicalization must always yield a non-empty path.
    if (ok) {
      EXPECT_GT(canonical_parsed.path.len, 0);
    }
  }
}
2121
// Path URLs should get canonicalized schemes but nothing else. Each case is
// parsed with ParsePathURL() and canonicalized with CanonicalizePathURL();
// every case is expected to succeed.
TEST(URLCanonTest, CanonicalizePathURL) {
  struct PathCase {
    const char* input;     // Spec to canonicalize.
    const char* expected;  // Expected canonical output.
  };
  const PathCase kPathCases[] = {
      {"javascript:", "javascript:"},
      {"JavaScript:Foo", "javascript:Foo"},
      {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"},

      // Validation errors should not cause failure. See
      // https://crbug.com/925614.
      {"javascript:\uFFFF", "javascript:%EF%BF%BD"},
  };

  for (const PathCase& path_case : kPathCases) {
    const char* const spec = path_case.input;
    const int spec_len = static_cast<int>(strlen(spec));
    Parsed spec_parsed;
    ParsePathURL(spec, spec_len, true, &spec_parsed);

    std::string canonical;
    Parsed canonical_parsed;
    StdStringCanonOutput canon_output(&canonical);
    const bool ok = CanonicalizePathURL(spec, spec_len, spec_parsed,
                                        &canon_output, &canonical_parsed);
    canon_output.Complete();

    EXPECT_TRUE(ok);
    EXPECT_EQ(path_case.expected, canonical);

    // The output must report an unset host component (begin 0, len -1).
    EXPECT_EQ(0, canonical_parsed.host.begin);
    EXPECT_EQ(-1, canonical_parsed.host.len);

    // An input that ends right after the scheme's colon has no content, so
    // the content component must be reported as unset as well.
    const bool ends_with_colon = spec[spec_len - 1] == ':';
    if (ends_with_colon) {
      EXPECT_EQ(0, canonical_parsed.GetContent().begin);
      EXPECT_EQ(-1, canonical_parsed.GetContent().len);
    }
  }
}
2162
// Feeds the same path through the 8-bit and the 16-bit overloads of
// CanonicalizePathURLPath() and checks that both produce the expected output
// string and an output component that starts at 0 and spans all of it.
TEST(URLCanonTest, CanonicalizePathURLPath) {
  struct PathCase {
    std::string input;     // 8-bit form of the path.
    std::wstring input16;  // Wide form; truncated to UTF-16 before use.
    std::string expected;  // Expected output for both forms.
  };
  const PathCase kPathCases[] = {
      {"Foo", L"Foo", "Foo"},
      {"\":This /is interesting;?#", L"\":This /is interesting;?#",
       "\":This /is interesting;?#"},
      {"\uFFFF", L"\uFFFF", "%EF%BF%BD"},
  };

  for (const PathCase& path_case : kPathCases) {
    const size_t expected_len = path_case.expected.size();

    // 8-bit string input.
    std::string canonical8;
    StdStringCanonOutput sink8(&canonical8);
    url::Component written8;
    CanonicalizePathURLPath(path_case.input.data(),
                            Component(0, path_case.input.size()), &sink8,
                            &written8);
    sink8.Complete();

    EXPECT_EQ(path_case.expected, canonical8);
    EXPECT_EQ(0, written8.begin);
    EXPECT_EQ(expected_len, static_cast<size_t>(written8.len));

    // 16-bit string input. The component length is taken from the wide
    // string in the table, as before.
    std::string canonical16;
    StdStringCanonOutput sink16(&canonical16);
    url::Component written16;
    const std::u16string utf16_path(
        test_utils::TruncateWStringToUTF16(path_case.input16.data()));
    CanonicalizePathURLPath(utf16_path.c_str(),
                            Component(0, path_case.input16.size()), &sink16,
                            &written16);
    sink16.Complete();

    EXPECT_EQ(path_case.expected, canonical16);
    EXPECT_EQ(0, written16.begin);
    EXPECT_EQ(expected_len, static_cast<size_t>(written16.len));
  }
}
2209
// Table-driven check of ParseMailtoURL() + CanonicalizeMailtoURL(): verifies
// the success flag, the canonical string, and the scheme/path/query
// components reported for the output.
TEST(URLCanonTest, CanonicalizeMailtoURL) {
  struct URLCase {
    const char* input;         // Spec to canonicalize.
    const char* expected;      // Expected canonical output.
    bool expected_success;     // Expected canonicalizer return value.
    Component expected_path;   // Expected path component of the output.
    Component expected_query;  // Expected query component of the output.
  };
  const URLCase kCases[] = {
      // Null character should be escaped to %00.
      // Keep this test first in the list as it is handled specially below.
      {"mailto:addr1\0addr2?foo", "mailto:addr1%00addr2?foo", true,
       Component(7, 13), Component(21, 3)},
      {"mailto:addr1", "mailto:addr1", true, Component(7, 5), Component()},
      {"mailto:addr1@foo.com", "mailto:addr1@foo.com", true, Component(7, 13),
       Component()},
      // Trailing whitespace is stripped.
      {"MaIlTo:addr1 \t ", "mailto:addr1", true, Component(7, 5), Component()},
      {"MaIlTo:addr1?to=jon", "mailto:addr1?to=jon", true, Component(7, 5),
       Component(13, 6)},
      {"mailto:addr1,addr2", "mailto:addr1,addr2", true, Component(7, 11),
       Component()},
      // Embedded spaces must be encoded.
      {"mailto:addr1, addr2", "mailto:addr1,%20addr2", true, Component(7, 14),
       Component()},
      {"mailto:addr1, addr2?subject=one two ",
       "mailto:addr1,%20addr2?subject=one%20two", true, Component(7, 14),
       Component(22, 17)},
      {"mailto:addr1%2caddr2", "mailto:addr1%2caddr2", true, Component(7, 13),
       Component()},
      {"mailto:\xF0\x90\x8C\x80", "mailto:%F0%90%8C%80", true,
       Component(7, 12), Component()},
      // Invalid -- UTF-8 encoded surrogate value.
      {"mailto:\xed\xa0\x80", "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD", false,
       Component(7, 27), Component()},
      {"mailto:addr1?", "mailto:addr1?", true, Component(7, 5),
       Component(13, 0)},
      // Certain characters have special meanings and must be encoded.
      {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~",
       "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~",
       true, Component(7, 53), Component(61, 47)},
  };

  // Deliberately shared across iterations to catch bugs where components
  // aren't reset between calls.
  Parsed parsed;
  Parsed out_parsed;

  for (size_t i = 0; i < std::size(kCases); ++i) {
    const URLCase& test_case = kCases[i];

    // The first case purposely embeds a '\0', so strlen() would stop early;
    // its real length (22) is supplied by hand instead.
    const int spec_len =
        (i == 0) ? 22 : static_cast<int>(strlen(test_case.input));
    ParseMailtoURL(test_case.input, spec_len, &parsed);

    std::string canonical;
    StdStringCanonOutput canon_output(&canonical);
    const bool ok = CanonicalizeMailtoURL(test_case.input, spec_len, parsed,
                                          &canon_output, &out_parsed);
    canon_output.Complete();

    EXPECT_EQ(test_case.expected_success, ok);
    EXPECT_EQ(test_case.expected, canonical);

    // The scheme ("mailto", 6 characters) must be found at the start.
    EXPECT_EQ(0, out_parsed.scheme.begin);
    EXPECT_EQ(6, out_parsed.scheme.len);

    const Component& want_path = test_case.expected_path;
    EXPECT_EQ(want_path.begin, out_parsed.path.begin);
    EXPECT_EQ(want_path.len, out_parsed.path.len);

    const Component& want_query = test_case.expected_query;
    EXPECT_EQ(want_query.begin, out_parsed.query.begin);
    EXPECT_EQ(want_query.len, out_parsed.query.len);
  }
}
2298
2299 #ifndef WIN32
2300
// Exercises both _itoa_s overloads: the four-argument form that takes an
// explicit size, and the template form that sees the whole array.
TEST(URLCanonTest, _itoa_s) {
  // |buf| holds one more char than the size reported to the four-argument
  // overload, so its last element must never be written by those calls. The
  // buffer is refilled with 0xff before most calls so that proper null
  // termination can be observed.
  char buf[6];
  const size_t reported_size = sizeof(buf) - 1;
  const auto refill = [&buf] { memset(buf, 0xff, sizeof(buf)); };

  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, reported_size, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  // Edge case: the digits plus terminator exactly fill the reported size.
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, reported_size, 10));
  EXPECT_STREQ("1234", buf);
  EXPECT_EQ('\xFF', buf[5]);

  // Edge case: one digit too many for the reported size.
  refill();
  EXPECT_EQ(EINVAL, _itoa_s(12345, buf, reported_size, 10));
  EXPECT_EQ('\xFF', buf[5]);  // should never write to this location

  // Template overload (note that this will see the full buffer).
  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  refill();
  EXPECT_EQ(0, _itoa_s(12345, buf, 10));
  EXPECT_STREQ("12345", buf);

  EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10));

  // Radix 16 is supported as well.
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, reported_size, 16));
  EXPECT_STREQ("4d2", buf);
  EXPECT_EQ('\xFF', buf[5]);
}
2339
// Exercises both _itow_s overloads: the four-argument form that takes an
// explicit element count, and the template form that sees the whole array.
TEST(URLCanonTest, _itow_s) {
  // We fill the buffer with 0xff bytes before every successful-path section to
  // ensure that it's getting properly null-terminated. We also allocate one
  // element more than what we tell the four-argument _itow_s about, and ensure
  // that the extra element is untouched.
  char16_t buf[6];
  const char fill_mem = 0xff;
  const char16_t fill_char = 0xffff;  // What each element holds after memset.
  const size_t reported_size = std::size(buf) - 1;

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, reported_size, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  // Test the edge cases - exactly the buffer size and one over.
  // Refill first so this check does not run against the previous call's
  // leftovers (matches the other sections and the _itoa_s test).
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(1234, buf, reported_size, 10));
  EXPECT_EQ(u"1234", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[5]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(EINVAL, _itow_s(12345, buf, reported_size, 10));
  EXPECT_EQ(fill_char, buf[5]);  // should never write to this location

  // Test the template overload (note that this will see the full buffer)
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12345, buf, 10));
  EXPECT_EQ(u"12345", std::u16string(buf));

  // Six digits do not fit even in the full buffer.
  EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10));
}
2373
2374 #endif // !WIN32
2375
2376 // Returns true if the given two structures are the same.
ParsedIsEqual(const Parsed & a,const Parsed & b)2377 static bool ParsedIsEqual(const Parsed& a, const Parsed& b) {
2378 return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len &&
2379 a.username.begin == b.username.begin && a.username.len == b.username.len &&
2380 a.password.begin == b.password.begin && a.password.len == b.password.len &&
2381 a.host.begin == b.host.begin && a.host.len == b.host.len &&
2382 a.port.begin == b.port.begin && a.port.len == b.port.len &&
2383 a.path.begin == b.path.begin && a.path.len == b.path.len &&
2384 a.query.begin == b.query.begin && a.query.len == b.query.len &&
2385 a.ref.begin == b.ref.begin && a.ref.len == b.ref.len;
2386 }
2387
// For every case: parses the base URL, asks IsRelativeURL() whether the test
// input is relative to it, and, when it is, resolves it with
// ResolveRelativeURL() and compares both the resulting string and the
// resulting Parsed structure against expectations.
TEST(URLCanonTest, ResolveRelativeURL) {
  struct RelativeCase {
    const char* base;       // Input base URL: MUST BE CANONICAL
    bool is_base_hier;      // Is the base URL hierarchical
    bool is_base_file;      // Tells us if the base is a file URL.
    const char* test;       // Input URL to test against.
    bool succeed_relative;  // Whether we expect IsRelativeURL to succeed
    bool is_rel;            // Whether we expect |test| to be relative or not.
    bool succeed_resolve;   // Whether we expect ResolveRelativeURL to succeed.
    const char* resolved;   // What we expect in the result when resolving.
  };
  const RelativeCase kRelCases[] = {
      // Basic absolute input.
      {"http://host/a", true, false, "http://another/", true, false, false, nullptr},
      {"http://host/a", true, false, "http:////another/", true, false, false, nullptr},
      // Empty relative URLs should only remove the ref part of the URL,
      // leaving the rest unchanged.
      {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
      {"http://foo/bar#ref", true, false, "", true, true, true, "http://foo/bar"},
      {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
      // Spaces at the ends of the relative path should be ignored.
      {"http://foo/bar", true, false, " another ", true, true, true, "http://foo/another"},
      {"http://foo/bar", true, false, " . ", true, true, true, "http://foo/"},
      {"http://foo/bar", true, false, " \t ", true, true, true, "http://foo/bar"},
      // Matching schemes without two slashes are treated as relative.
      {"http://host/a", true, false, "http:path", true, true, true, "http://host/path"},
      {"http://host/a/", true, false, "http:path", true, true, true, "http://host/a/path"},
      {"http://host/a", true, false, "http:/path", true, true, true, "http://host/path"},
      {"http://host/a", true, false, "HTTP:/path", true, true, true, "http://host/path"},
      // Nonmatching schemes are absolute.
      {"http://host/a", true, false, "https:host2", true, false, false, nullptr},
      {"http://host/a", true, false, "htto:/host2", true, false, false, nullptr},
      // Absolute path input
      {"http://host/a", true, false, "/b/c/d", true, true, true, "http://host/b/c/d"},
      {"http://host/a", true, false, "\\b\\c\\d", true, true, true, "http://host/b/c/d"},
      {"http://host/a", true, false, "/b/../c", true, true, true, "http://host/c"},
      {"http://host/a?b#c", true, false, "/b/../c", true, true, true, "http://host/c"},
      {"http://host/a", true, false, "\\b/../c?x#y", true, true, true, "http://host/c?x#y"},
      {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true, "http://host/c?x#y"},
      // Relative path input
      {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
      {"http://host/a", true, false, "bc/de", true, true, true, "http://host/bc/de"},
      {"http://host/a/", true, false, "bc/de?query#ref", true, true, true, "http://host/a/bc/de?query#ref"},
      {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
      {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
      {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
      {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
      {"http://host/a/", true, false, "././.", true, true, true, "http://host/a/"},
      {"http://host/a?query#ref", true, false, "../../../foo", true, true, true, "http://host/foo"},
      // Query input
      {"http://host/a", true, false, "?foo=bar", true, true, true, "http://host/a?foo=bar"},
      {"http://host/a?x=y#z", true, false, "?", true, true, true, "http://host/a?"},
      {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true, "http://host/a?foo=bar#com"},
      // Ref input
      {"http://host/a", true, false, "#ref", true, true, true, "http://host/a#ref"},
      {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
      {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true, "http://host/a?foo=bar#bye"},
      // Non-hierarchical base: no relative handling. Relative input should
      // error, and if a scheme is present, it should be treated as absolute.
      {"data:foobar", false, false, "baz.html", false, false, false, nullptr},
      {"data:foobar", false, false, "data:baz", true, false, false, nullptr},
      {"data:foobar", false, false, "data:/base", true, false, false, nullptr},
      // Non-hierarchical base: absolute input should succeed.
      {"data:foobar", false, false, "http://host/", true, false, false, nullptr},
      {"data:foobar", false, false, "http:host", true, false, false, nullptr},
      // Non-hierarchical base: empty URL should give error.
      {"data:foobar", false, false, "", false, false, false, nullptr},
      // Invalid schemes should be treated as relative.
      {"http://foo/bar", true, false, "./asd:fgh", true, true, true, "http://foo/asd:fgh"},
      {"http://foo/bar", true, false, ":foo", true, true, true, "http://foo/:foo"},
      {"http://foo/bar", true, false, " hello world", true, true, true, "http://foo/hello%20world"},
      {"data:asdf", false, false, ":foo", false, false, false, nullptr},
      {"data:asdf", false, false, "bad(':foo')", false, false, false, nullptr},
      // We should treat semicolons like any other character in URL resolving
      {"http://host/a", true, false, ";foo", true, true, true, "http://host/;foo"},
      {"http://host/a;", true, false, ";foo", true, true, true, "http://host/;foo"},
      {"http://host/a", true, false, ";/../bar", true, true, true, "http://host/bar"},
      // Relative URLs can also be written as "//foo/bar" which is relative to
      // the scheme. In this case, it would take the old scheme, so for http
      // the example would resolve to "http://foo/bar".
      {"http://host/a", true, false, "//another", true, true, true, "http://another/"},
      {"http://host/a", true, false, "//another/path?query#ref", true, true, true, "http://another/path?query#ref"},
      {"http://host/a", true, false, "///another/path", true, true, true, "http://another/path"},
      {"http://host/a", true, false, "//Another\\path", true, true, true, "http://another/path"},
      {"http://host/a", true, false, "//", true, true, false, "http:"},
      // IE will also allow one or the other to be a backslash to get the same
      // behavior.
      {"http://host/a", true, false, "\\/another/path", true, true, true, "http://another/path"},
      {"http://host/a", true, false, "/\\Another\\path", true, true, true, "http://another/path"},
#ifdef WIN32
      // Resolving against Windows file base URLs.
      {"file:///C:/foo", true, true, "http://host/", true, false, false, nullptr},
      {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
      {"file:///C:/foo", true, true, "../../../bar.html", true, true, true, "file:///C:/bar.html"},
      {"file:///C:/foo", true, true, "/../bar.html", true, true, true, "file:///C:/bar.html"},
      // But two backslashes on Windows should be UNC so should be treated
      // as absolute.
      {"http://host/a", true, false, "\\\\another\\path", true, false, false, nullptr},
      // IE doesn't support drive specs starting with two slashes. It fails
      // immediately and doesn't even try to load. We fix it up to either
      // an absolute path or UNC depending on what it looks like.
      {"file:///C:/something", true, true, "//c:/foo", true, true, true, "file:///C:/foo"},
      {"file:///C:/something", true, true, "//localhost/c:/foo", true, true, true, "file:///C:/foo"},
      // Windows drive specs should be allowed and treated as absolute.
      {"file:///C:/foo", true, true, "c:", true, false, false, nullptr},
      {"file:///C:/foo", true, true, "c:/foo", true, false, false, nullptr},
      {"http://host/a", true, false, "c:\\foo", true, false, false, nullptr},
      // Relative paths with drive letters should be allowed when the base is
      // also a file.
      {"file:///C:/foo", true, true, "/z:/bar", true, true, true, "file:///Z:/bar"},
      // Treat absolute paths as being off of the drive.
      {"file:///C:/foo", true, true, "/bar", true, true, true, "file:///C:/bar"},
      {"file://localhost/C:/foo", true, true, "/bar", true, true, true, "file://localhost/C:/bar"},
      {"file:///C:/foo/com/", true, true, "/bar", true, true, true, "file:///C:/bar"},
      // On Windows, two slashes without a drive letter when the base is a file
      // means that the path is UNC.
      {"file:///C:/something", true, true, "//somehost/path", true, true, true, "file://somehost/path"},
      {"file:///C:/something", true, true, "/\\//somehost/path", true, true, true, "file://somehost/path"},
#else
      // On Unix we fall back to relative behavior since there's nothing else
      // reasonable to do.
      {"http://host/a", true, false, "\\\\Another\\path", true, true, true, "http://another/path"},
#endif
      // Even on Windows, we don't allow relative drive specs when the base
      // is not file.
      {"http://host/a", true, false, "/c:\\foo", true, true, true, "http://host/c:/foo"},
      {"http://host/a", true, false, "//c:\\foo", true, true, true, "http://c/foo"},
      // Cross-platform relative file: resolution behavior.
      {"file://host/a", true, true, "/", true, true, true, "file://host/"},
      {"file://host/a", true, true, "//", true, true, true, "file:///"},
      {"file://host/a", true, true, "/b", true, true, true, "file://host/b"},
      {"file://host/a", true, true, "//b", true, true, true, "file://b/"},
      // Ensure that ports aren't allowed for hosts relative to a file url.
      // Although the result string shows a host:port portion, the call to
      // resolve the relative URL returns false, indicating parse failure,
      // which is what is required.
      {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false, "file://host:80/bar.txt"},
      // Filesystem URL tests; filesystem URLs are only valid and relative if
      // they have no scheme, e.g. "./index.html". There's no valid equivalent
      // to http:index.html.
      {"filesystem:http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, nullptr},
      {"filesystem:http://host/t/path", true, false, "filesystem:https://host/t/path2", true, false, false, nullptr},
      {"filesystem:http://host/t/path", true, false, "http://host/t/path2", true, false, false, nullptr},
      {"http://host/t/path", true, false, "filesystem:http://host/t/path2", true, false, false, nullptr},
      {"filesystem:http://host/t/path", true, false, "./path2", true, true, true, "filesystem:http://host/t/path2"},
      {"filesystem:http://host/t/path/", true, false, "path2", true, true, true, "filesystem:http://host/t/path/path2"},
      {"filesystem:http://host/t/path", true, false, "filesystem:http:path2", true, false, false, nullptr},
      // Absolute URLs are still not relative to a non-standard base URL.
      {"about:blank", false, false, "http://X/A", true, false, true, ""},
      {"about:blank", false, false, "content://content.Provider/", true, false, true, ""},
  };

  for (const RelativeCase& cur_case : kRelCases) {
    // Parses |spec| with the parser that matches this case's base URL kind:
    // file, hierarchical (standard), or path.
    const auto parse_like_base = [&cur_case](const char* spec, int spec_len,
                                             Parsed* out) {
      if (cur_case.is_base_file) {
        ParseFileURL(spec, spec_len, out);
      } else if (cur_case.is_base_hier) {
        ParseStandardURL(spec, spec_len, out);
      } else {
        ParsePathURL(spec, spec_len, false, out);
      }
    };

    Parsed base_parsed;
    parse_like_base(cur_case.base, static_cast<int>(strlen(cur_case.base)),
                    &base_parsed);

    // First see if it is relative.
    const int test_len = static_cast<int>(strlen(cur_case.test));
    bool is_relative;
    Component relative_component;
    const bool succeed_is_rel = IsRelativeURL(
        cur_case.base, base_parsed, cur_case.test, test_len,
        cur_case.is_base_hier, &is_relative, &relative_component);

    EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel)
        << "succeed is rel failure on " << cur_case.test;
    EXPECT_EQ(cur_case.is_rel, is_relative)
        << "is rel failure on " << cur_case.test;

    // Resolution is only attempted when the input was detected as relative
    // and the case expects it to be.
    if (!succeed_is_rel || !is_relative || !cur_case.is_rel) {
      continue;
    }

    // Now resolve it.
    std::string resolved;
    StdStringCanonOutput output(&resolved);
    Parsed resolved_parsed;
    const bool succeed_resolve = ResolveRelativeURL(
        cur_case.base, base_parsed, cur_case.is_base_file, cur_case.test,
        relative_component, nullptr, &output, &resolved_parsed);
    output.Complete();

    EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
    EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;

    // Verify that the output parsed structure is the same as the one obtained
    // by parsing the resolved URL freshly.
    Parsed ref_parsed;
    parse_like_base(resolved.c_str(), static_cast<int>(resolved.size()),
                    &ref_parsed);
    EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
  }
}
2592
2593 // It used to be the case that when we did a replacement with a long buffer of
2594 // UTF-16 characters, we would get invalid data in the URL. This is because the
2595 // buffer that it used to hold the UTF-8 data was resized, while some pointers
2596 // were still kept to the old buffer that was removed.
TEST(URLCanonTest, ReplacementOverflow) {
  const char src[] = "file:///C:/foo/bar";
  Parsed parsed;
  ParseFileURL(src, static_cast<int>(strlen(src)), &parsed);

  // Override two components: the path with something short, and the query
  // with something long enough (4800 'a' characters) to trigger the bug.
  const std::u16string long_query(4800, u'a');
  const std::u16string short_path(test_utils::TruncateWStringToUTF16(L"/foo"));

  Replacements<char16_t> repl;
  repl.SetPath(short_path.c_str(), Component(0, 4));
  repl.SetQuery(long_query.c_str(),
                Component(0, static_cast<int>(long_query.length())));

  // Call ReplaceComponents on the string. It doesn't matter if we call it for
  // standard URLs, file URLs, etc, since they will go to the same replacement
  // function that was buggy.
  std::string repl_str;
  Parsed repl_parsed;
  StdStringCanonOutput repl_output(&repl_str);
  ReplaceFileURL(src, parsed, repl, nullptr, &repl_output, &repl_parsed);
  repl_output.Complete();

  // Build the expected string: the new path, then '?' and one 'a' per
  // character of the replacement query.
  std::string expected("file:///foo?");
  expected.append(long_query.length(), 'a');
  EXPECT_TRUE(expected == repl_str);
}
2630
// Checks the port DefaultPortForScheme() reports for a list of schemes. The
// upper-case spellings in the table are expected to yield PORT_UNSPECIFIED,
// just like an unknown scheme.
TEST(URLCanonTest, DefaultPortForScheme) {
  struct TestCases {
    const char* scheme;
    const int expected_port;
  };
  const TestCases kCases[] = {
      {"http", 80},
      {"https", 443},
      {"ftp", 21},
      {"ws", 80},
      {"wss", 443},
      {"fake-scheme", PORT_UNSPECIFIED},
      {"HTTP", PORT_UNSPECIFIED},
      {"HTTPS", PORT_UNSPECIFIED},
      {"FTP", PORT_UNSPECIFIED},
      {"WS", PORT_UNSPECIFIED},
      {"WSS", PORT_UNSPECIFIED},
  };

  for (size_t i = 0; i < std::size(kCases); ++i) {
    const char* const scheme = kCases[i].scheme;
    // Tag any failure below with the scheme being checked.
    SCOPED_TRACE(scheme);
    EXPECT_EQ(kCases[i].expected_port,
              DefaultPortForScheme(scheme, strlen(scheme)));
  }
}
2655
// Checks the position FindWindowsDriveLetter() reports for the [begin, end)
// range of each spec, using both the 8-bit and the UTF-16 form of the spec.
TEST(URLCanonTest, FindWindowsDriveLetter) {
  struct TestCase {
    base::StringPiece spec;
    int begin;
    int end;  // -1 for end of spec
    int expected_drive_letter_pos;
  };
  const TestCase kCases[] = {
      {"/", 0, -1, -1},

      {"c:/foo", 0, -1, 0},
      {"/c:/foo", 0, -1, 1},
      {"//c:/foo", 0, -1, -1},  // "//" does not canonicalize to "/"
      {"\\C|\\foo", 0, -1, 1},
      {"/cd:/foo", 0, -1, -1},  // "/c" does not canonicalize to "/"
      {"/./c:/foo", 0, -1, 3},
      {"/.//c:/foo", 0, -1, -1},  // "/.//" does not canonicalize to "/"
      {"/././c:/foo", 0, -1, 5},
      {"/abc/c:/foo", 0, -1, -1},  // "/abc/" does not canonicalize to "/"
      {"/abc/./../c:/foo", 0, -1, 10},

      {"/c:/c:/foo", 3, -1, 4},  // actual input is "/c:/foo"
      {"/c:/foo", 3, -1, -1},    // actual input is "/foo"
      {"/c:/foo", 0, 1, -1},     // actual input is "/"
  };

  for (const TestCase& test_case : kCases) {
    // An |end| of -1 in the table stands for "up to the end of the spec".
    const int end = (test_case.end == -1)
                        ? static_cast<int>(test_case.spec.size())
                        : test_case.end;
    const int want = test_case.expected_drive_letter_pos;

    EXPECT_EQ(want,
              FindWindowsDriveLetter(test_case.spec.data(), test_case.begin,
                                     end))
        << "for " << test_case.spec << "[" << test_case.begin << ":" << end
        << "] (UTF-8)";

    const std::u16string spec16 = base::ASCIIToUTF16(test_case.spec);
    EXPECT_EQ(want,
              FindWindowsDriveLetter(spec16.data(), test_case.begin, end))
        << "for " << test_case.spec << "[" << test_case.begin << ":" << end
        << "] (UTF-16)";
  }
}
2696
// Exercises IDNToASCII() on ASCII, mixed and non-ASCII labels, and on labels
// that already carry the "xn--" prefix. The output buffer is reset to length
// 0 after every call so each conversion starts from an empty output.
TEST(URLCanonTest, IDNToASCII) {
  RawCanonOutputW<1024> output;

  // Reads back exactly |output.length()| characters. Using the explicit
  // length instead of scanning for a terminator avoids depending on the
  // buffer happening to be NUL-terminated, and also checks the length the
  // conversion recorded.
  const auto written = [&output]() {
    return std::u16string(output.data(),
                          static_cast<size_t>(output.length()));
  };

  // Basic ASCII test.
  std::u16string str = u"hello";
  EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
  EXPECT_EQ(u"hello", written());
  output.set_length(0);

  // Mixed ASCII/non-ASCII.
  str = u"hellö";
  EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
  EXPECT_EQ(u"xn--hell-8qa", written());
  output.set_length(0);

  // All non-ASCII.
  str = u"你好";
  EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
  EXPECT_EQ(u"xn--6qq79v", written());
  output.set_length(0);

  // Characters that need mapping (the resulting Punycode is the encoding for
  // "1⁄4").
  str = u"¼";
  EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
  EXPECT_EQ(u"xn--14-c6t", written());
  output.set_length(0);

  // String to encode already starts with "xn--", and all ASCII. Should not
  // modify the string.
  str = u"xn--hell-8qa";
  EXPECT_TRUE(IDNToASCII(str.data(), str.length(), &output));
  EXPECT_EQ(u"xn--hell-8qa", written());
  output.set_length(0);

  // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
  // Should fail, due to a special case: if the label starts with "xn--", it
  // should be parsed as Punycode, which must be all ASCII.
  str = u"xn--hellö";
  EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output));
  output.set_length(0);

  // String to encode already starts with "xn--", and mixed ASCII/non-ASCII.
  // This tests that there is still an error for the character '⁄' (U+2044),
  // which would be a valid ASCII character, U+0044, if the high byte were
  // ignored.
  str = u"xn--1⁄4";
  EXPECT_FALSE(IDNToASCII(str.data(), str.length(), &output));
  output.set_length(0);
}
2747
2748 } // namespace url
2749