• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/350788890): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "url/url_canon.h"
11 
12 #include <errno.h>
13 #include <stddef.h>
14 #include <string_view>
15 
16 #include "base/strings/string_number_conversions.h"
17 #include "base/strings/utf_string_conversions.h"
18 #include "base/test/gtest_util.h"
19 #include "base/test/scoped_feature_list.h"
20 #include "testing/gtest/include/gtest/gtest.h"
21 #include "url/third_party/mozilla/url_parse.h"
22 #include "url/url_canon_internal.h"
23 #include "url/url_canon_stdstring.h"
24 #include "url/url_features.h"
25 #include "url/url_test_utils.h"
26 
27 namespace url {
28 
29 namespace {
30 
31 struct ComponentCase {
32   const char* input;
33   const char* expected;
34   Component expected_component;
35   bool expected_success;
36 };
37 
38 // ComponentCase but with dual 8-bit/16-bit input. Generally, the unit tests
39 // treat each input as optional, and will only try processing if non-NULL.
40 // The output is always 8-bit.
41 struct DualComponentCase {
42   const char* input8;
43   const wchar_t* input16;
44   const char* expected;
45   Component expected_component;
46   bool expected_success;
47 };
48 
49 // Test cases for CanonicalizeIPAddress(). The inputs are identical to
50 // DualComponentCase, but the output has extra CanonHostInfo fields.
51 struct IPAddressCase {
52   const char* input8;
53   const wchar_t* input16;
54   const char* expected;
55   Component expected_component;
56 
57   // CanonHostInfo fields, for verbose output.
58   CanonHostInfo::Family expected_family;
59   int expected_num_ipv4_components;
60   const char* expected_address_hex;  // Two hex chars per IP address byte.
61 };
62 
// One component-replacement test case: a base URL, one replacement string per
// URL component, and the expected result.
// NOTE(review): the code consuming these cases is outside this part of the
// file; presumably each component string is passed to SetupReplComp(), so a
// null pointer means "leave as is" and kDeleteComp means "clear" -- confirm
// against the tests that use this struct.
struct ReplaceCase {
  const char* base;
  const char* scheme;
  const char* username;
  const char* password;
  const char* host;
  const char* port;
  const char* path;
  const char* query;
  const char* ref;
  const char* expected;
};
75 
// Magic string used in the replacements code that tells SetupReplComp to
// call the clear function.
constexpr char kDeleteComp[] = "|";
79 
80 // Sets up a replacement for a single component. This is given pointers to
81 // the set and clear function for the component being replaced, and will
82 // either set the component (if it exists) or clear it (if the replacement
83 // string matches kDeleteComp).
84 //
85 // This template is currently used only for the 8-bit case, and the strlen
86 // causes it to fail in other cases. It is left a template in case we have
87 // tests for wide replacements.
88 template<typename CHAR>
SetupReplComp(void (Replacements<CHAR>::* set)(const CHAR *,const Component &),void (Replacements<CHAR>::* clear)(),Replacements<CHAR> * rep,const CHAR * str)89 void SetupReplComp(
90     void (Replacements<CHAR>::*set)(const CHAR*, const Component&),
91     void (Replacements<CHAR>::*clear)(),
92     Replacements<CHAR>* rep,
93     const CHAR* str) {
94   if (str && str[0] == kDeleteComp[0]) {
95     (rep->*clear)();
96   } else if (str) {
97     (rep->*set)(str, Component(0, static_cast<int>(strlen(str))));
98   }
99 }
100 
CanonicalizeSpecialPath(const char * spec,const Component & path,CanonOutput * output,Component * out_path)101 bool CanonicalizeSpecialPath(const char* spec,
102                              const Component& path,
103                              CanonOutput* output,
104                              Component* out_path) {
105   return CanonicalizePath(spec, path, CanonMode::kSpecialURL, output, out_path);
106 }
107 
CanonicalizeSpecialPath(const char16_t * spec,const Component & path,CanonOutput * output,Component * out_path)108 bool CanonicalizeSpecialPath(const char16_t* spec,
109                              const Component& path,
110                              CanonOutput* output,
111                              Component* out_path) {
112   return CanonicalizePath(spec, path, CanonMode::kSpecialURL, output, out_path);
113 }
114 
CanonicalizeNonSpecialPath(const char * spec,const Component & path,CanonOutput * output,Component * out_path)115 bool CanonicalizeNonSpecialPath(const char* spec,
116                                 const Component& path,
117                                 CanonOutput* output,
118                                 Component* out_path) {
119   return CanonicalizePath(spec, path, CanonMode::kNonSpecialURL, output,
120                           out_path);
121 }
122 
CanonicalizeNonSpecialPath(const char16_t * spec,const Component & path,CanonOutput * output,Component * out_path)123 bool CanonicalizeNonSpecialPath(const char16_t* spec,
124                                 const Component& path,
125                                 CanonOutput* output,
126                                 Component* out_path) {
127   return CanonicalizePath(spec, path, CanonMode::kNonSpecialURL, output,
128                           out_path);
129 }
130 
131 }  // namespace
132 
133 class URLCanonTest : public ::testing::Test {
134  public:
URLCanonTest()135   URLCanonTest() {
136     scoped_feature_list_.InitAndEnableFeature(
137         url::kDisallowSpaceCharacterInURLHostParsing);
138   }
139 
140  private:
141   base::test::ScopedFeatureList scoped_feature_list_;
142 };
143 
// Checks that AppendUTF8Value() writes the expected UTF-8 byte sequence for
// valid code points of every encoded length (one to four bytes).
TEST_F(URLCanonTest, DoAppendUTF8) {
  struct UTF8Case {
    unsigned input;      // Code point to encode.
    const char* output;  // Expected UTF-8 bytes.
  } utf_cases[] = {
      // Valid code points.
      {0x24, "\x24"},
      {0xA2, "\xC2\xA2"},
      {0x20AC, "\xE2\x82\xAC"},
      {0x24B62, "\xF0\xA4\xAD\xA2"},
      {0x10FFFF, "\xF4\x8F\xBF\xBF"},
  };

  std::string out_str;
  for (const auto& utf_case : utf_cases) {
    out_str.clear();
    StdStringCanonOutput output(&out_str);
    AppendUTF8Value(utf_case.input, &output);
    output.Complete();
    EXPECT_EQ(utf_case.output, out_str)
        << "for code point (decimal): " << utf_case.input;
  }
}
165 
// Appending a value beyond the last Unicode code point (U+10FFFF) is expected
// to hit a DCHECK.
TEST_F(URLCanonTest, DoAppendUTF8Invalid) {
  std::string out_str;
  StdStringCanonOutput output(&out_str);
  // Invalid code point (too large).
  EXPECT_DCHECK_DEATH({
    AppendUTF8Value(0x110000, &output);
    output.Complete();
  });
}
175 
TEST_F(URLCanonTest,UTF)176 TEST_F(URLCanonTest, UTF) {
177   // Low-level test that we handle reading, canonicalization, and writing
178   // UTF-8/UTF-16 strings properly.
179   struct UTFCase {
180     const char* input8;
181     const wchar_t* input16;
182     bool expected_success;
183     const char* output;
184   } utf_cases[] = {
185       // Valid canonical input should get passed through & escaped.
186       {"\xe4\xbd\xa0\xe5\xa5\xbd", L"\x4f60\x597d", true, "%E4%BD%A0%E5%A5%BD"},
187       // Test a character that takes > 16 bits (U+10300 = old italic letter A)
188       {"\xF0\x90\x8C\x80", L"\xd800\xdf00", true, "%F0%90%8C%80"},
189       // Non-shortest-form UTF-8 characters are invalid. The bad bytes should
190       // each be replaced with the invalid character (EF BF DB in UTF-8).
191       {"\xf0\x84\xbd\xa0\xe5\xa5\xbd", nullptr, false,
192        "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%E5%A5%BD"},
193       // Invalid UTF-8 sequences should be marked as invalid (the first
194       // sequence is truncated).
195       {"\xe4\xa0\xe5\xa5\xbd", L"\xd800\x597d", false, "%EF%BF%BD%E5%A5%BD"},
196       // Character going off the end.
197       {"\xe4\xbd\xa0\xe5\xa5", L"\x4f60\xd800", false, "%E4%BD%A0%EF%BF%BD"},
198       // ...same with low surrogates with no high surrogate.
199       {nullptr, L"\xdc00", false, "%EF%BF%BD"},
200       // Test a UTF-8 encoded surrogate value is marked as invalid.
201       // ED A0 80 = U+D800
202       {"\xed\xa0\x80", nullptr, false, "%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
203       // ...even when paired.
204       {"\xed\xa0\x80\xed\xb0\x80", nullptr, false,
205        "%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD%EF%BF%BD"},
206   };
207 
208   std::string out_str;
209   for (const auto& utf_case : utf_cases) {
210     if (utf_case.input8) {
211       out_str.clear();
212       StdStringCanonOutput output(&out_str);
213 
214       size_t input_len = strlen(utf_case.input8);
215       bool success = true;
216       for (size_t ch = 0; ch < input_len; ch++) {
217         success &=
218             AppendUTF8EscapedChar(utf_case.input8, &ch, input_len, &output);
219       }
220       output.Complete();
221       EXPECT_EQ(utf_case.expected_success, success);
222       EXPECT_EQ(utf_case.output, out_str);
223     }
224     if (utf_case.input16) {
225       out_str.clear();
226       StdStringCanonOutput output(&out_str);
227 
228       std::u16string input_str(
229           test_utils::TruncateWStringToUTF16(utf_case.input16));
230       size_t input_len = input_str.length();
231       bool success = true;
232       for (size_t ch = 0; ch < input_len; ch++) {
233         success &= AppendUTF8EscapedChar(input_str.c_str(), &ch, input_len,
234                                          &output);
235       }
236       output.Complete();
237       EXPECT_EQ(utf_case.expected_success, success);
238       EXPECT_EQ(utf_case.output, out_str);
239     }
240 
241     if (utf_case.input8 && utf_case.input16 && utf_case.expected_success) {
242       // Check that the UTF-8 and UTF-16 inputs are equivalent.
243 
244       // UTF-16 -> UTF-8
245       std::string input8_str(utf_case.input8);
246       std::u16string input16_str(
247           test_utils::TruncateWStringToUTF16(utf_case.input16));
248       EXPECT_EQ(input8_str, base::UTF16ToUTF8(input16_str));
249 
250       // UTF-8 -> UTF-16
251       EXPECT_EQ(input16_str, base::UTF8ToUTF16(input8_str));
252     }
253   }
254 }
255 
TEST_F(URLCanonTest,Scheme)256 TEST_F(URLCanonTest, Scheme) {
257   // Here, we're mostly testing that unusual characters are handled properly.
258   // The canonicalizer doesn't do any parsing or whitespace detection. It will
259   // also do its best on error, and will escape funny sequences (these won't be
260   // valid schemes and it will return error).
261   //
262   // Note that the canonicalizer will append a colon to the output to separate
263   // out the rest of the URL, which is not present in the input. We check,
264   // however, that the output range includes everything but the colon.
265   ComponentCase scheme_cases[] = {
266     {"http", "http:", Component(0, 4), true},
267     {"HTTP", "http:", Component(0, 4), true},
268     {" HTTP ", "%20http%20:", Component(0, 10), false},
269     {"htt: ", "htt%3A%20:", Component(0, 9), false},
270     {"\xe4\xbd\xa0\xe5\xa5\xbdhttp", "%E4%BD%A0%E5%A5%BDhttp:", Component(0, 22), false},
271       // Don't re-escape something already escaped. Note that it will
272       // "canonicalize" the 'A' to 'a', but that's OK.
273     {"ht%3Atp", "ht%3atp:", Component(0, 7), false},
274     {"", ":", Component(0, 0), false},
275   };
276 
277   std::string out_str;
278 
279   for (const auto& scheme_case : scheme_cases) {
280     int url_len = static_cast<int>(strlen(scheme_case.input));
281     Component in_comp(0, url_len);
282     Component out_comp;
283 
284     out_str.clear();
285     StdStringCanonOutput output1(&out_str);
286     bool success =
287         CanonicalizeScheme(scheme_case.input, in_comp, &output1, &out_comp);
288     output1.Complete();
289 
290     EXPECT_EQ(scheme_case.expected_success, success);
291     EXPECT_EQ(scheme_case.expected, out_str);
292     EXPECT_EQ(scheme_case.expected_component.begin, out_comp.begin);
293     EXPECT_EQ(scheme_case.expected_component.len, out_comp.len);
294 
295     // Now try the wide version.
296     out_str.clear();
297     StdStringCanonOutput output2(&out_str);
298 
299     std::u16string wide_input(base::UTF8ToUTF16(scheme_case.input));
300     in_comp.len = static_cast<int>(wide_input.length());
301     success = CanonicalizeScheme(wide_input.c_str(), in_comp, &output2,
302                                  &out_comp);
303     output2.Complete();
304 
305     EXPECT_EQ(scheme_case.expected_success, success);
306     EXPECT_EQ(scheme_case.expected, out_str);
307     EXPECT_EQ(scheme_case.expected_component.begin, out_comp.begin);
308     EXPECT_EQ(scheme_case.expected_component.len, out_comp.len);
309   }
310 
311   // Test the case where the scheme is declared nonexistent, it should be
312   // converted into an empty scheme.
313   Component out_comp;
314   out_str.clear();
315   StdStringCanonOutput output(&out_str);
316 
317   EXPECT_FALSE(CanonicalizeScheme("", Component(0, -1), &output, &out_comp));
318   output.Complete();
319 
320   EXPECT_EQ(":", out_str);
321   EXPECT_EQ(0, out_comp.begin);
322   EXPECT_EQ(0, out_comp.len);
323 }
324 
// IDNA mode to use in CanonHost tests. URLCanonHostTest is instantiated once
// per value and toggles the kUseIDNA2008NonTransitional feature accordingly.
enum class IDNAMode { kTransitional, kNonTransitional };
327 
328 class URLCanonHostTest
329     : public ::testing::Test,
330       public ::testing::WithParamInterface<IDNAMode> {
331  public:
URLCanonHostTest()332   URLCanonHostTest() {
333     std::vector<base::test::FeatureRef> enabled_features;
334     std::vector<base::test::FeatureRef> disabled_features;
335     if (GetParam() == IDNAMode::kNonTransitional) {
336       enabled_features.push_back(kUseIDNA2008NonTransitional);
337     } else {
338       disabled_features.push_back(kUseIDNA2008NonTransitional);
339     }
340 
341     enabled_features.push_back(url::kDisallowSpaceCharacterInURLHostParsing);
342     scoped_feature_list_.InitWithFeatures(enabled_features, disabled_features);
343   }
344 
345  private:
346   base::test::ScopedFeatureList scoped_feature_list_;
347 };
348 
349 INSTANTIATE_TEST_SUITE_P(All,
350                          URLCanonHostTest,
351                          ::testing::Values(IDNAMode::kTransitional,
352                                            IDNAMode::kNonTransitional));
353 
TEST_P(URLCanonHostTest,Host)354 TEST_P(URLCanonHostTest, Host) {
355   bool use_idna_non_transitional = IsUsingIDNA2008NonTransitional();
356 
357   // clang-format off
358   IPAddressCase host_cases[] = {
359       // Basic canonicalization, uppercase should be converted to lowercase.
360       {"GoOgLe.CoM", L"GoOgLe.CoM", "google.com", Component(0, 10),
361        CanonHostInfo::NEUTRAL, -1, ""},
362       {"Goo%20 goo.com", L"Goo%20 goo.com", "goo%20%20goo.com",
363        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
364       // TODO(crbug.com/40256677): Update the test after ASTERISK is
365       // correctly handled.
366       {"Goo%2a*goo.com", L"Goo%2a*goo.com", "goo%2A%2Agoo.com",
367        Component(0, 16), CanonHostInfo::NEUTRAL, -1, ""},
368       // Exciting different types of spaces!
369       {nullptr, L"GOO\x00a0\x3000goo.com", "goo%20%20goo.com", Component(0, 16),
370        CanonHostInfo::BROKEN, -1, ""},
371       // Other types of space (no-break, zero-width, zero-width-no-break) are
372       // name-prepped away to nothing.
373       {nullptr, L"GOO\x200b\x2060\xfeffgoo.com", "googoo.com", Component(0, 10),
374        CanonHostInfo::NEUTRAL, -1, ""},
375       // Ideographic full stop (full-width period for Chinese, etc.) should be
376       // treated as a dot.
377       {nullptr,
378        L"www.foo\x3002"
379        L"bar.com",
380        "www.foo.bar.com", Component(0, 15), CanonHostInfo::NEUTRAL, -1, ""},
381       // Invalid unicode characters should fail...
382       {"\xef\xb7\x90zyx.com", L"\xfdd0zyx.com", "%EF%B7%90zyx.com",
383        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
384       // ...This is the same as the previous case, but percent-escaped.
385       {"%ef%b7%90zyx.com", L"%ef%b7%90zyx.com", "%EF%B7%90zyx.com",
386        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
387       // Test name prepping, fullwidth input should be converted to ASCII and
388       // NOT
389       // IDN-ized. This is "Go" in fullwidth UTF-8/UTF-16.
390       {"\xef\xbc\xa7\xef\xbd\x8f.com", L"\xff27\xff4f.com", "go.com",
391        Component(0, 6), CanonHostInfo::NEUTRAL, -1, ""},
392       // Test that fullwidth escaped values are properly name-prepped,
393       // then converted or rejected.
394       // ...%41 in fullwidth = 'A' (also as escaped UTF-8 input)
395       {"\xef\xbc\x85\xef\xbc\x94\xef\xbc\x91.com", L"\xff05\xff14\xff11.com",
396        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
397       {"%ef%bc%85%ef%bc%94%ef%bc%91.com", L"%ef%bc%85%ef%bc%94%ef%bc%91.com",
398        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
399       // ...%00 in fullwidth should fail (also as escaped UTF-8 input)
400       {"\xef\xbc\x85\xef\xbc\x90\xef\xbc\x90.com", L"\xff05\xff10\xff10.com",
401        "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
402       {"%ef%bc%85%ef%bc%90%ef%bc%90.com", L"%ef%bc%85%ef%bc%90%ef%bc%90.com",
403        "%00.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
404       // ICU will convert weird percents into ASCII percents, but not unescape
405       // further. A weird percent is U+FE6A (EF B9 AA in UTF-8) which is a
406       // "small percent". At this point we should be within our rights to mark
407       // anything as invalid since the URL is corrupt or malicious. The code
408       // happens to allow ASCII characters (%41 = "A" -> 'a') to be unescaped
409       // and kept as valid, so we validate that behavior here, but this level
410       // of fixing the input shouldn't be seen as required. "%81" is invalid.
411       {"\xef\xb9\xaa"
412        "41.com",
413        L"\xfe6a"
414        L"41.com",
415        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
416       {"%ef%b9%aa"
417        "41.com",
418        L"\xfe6a"
419        L"41.com",
420        "a.com", Component(0, 5), CanonHostInfo::NEUTRAL, -1, ""},
421       {"\xef\xb9\xaa"
422        "81.com",
423        L"\xfe6a"
424        L"81.com",
425        "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
426       {"%ef%b9%aa"
427        "81.com",
428        L"\xfe6a"
429        L"81.com",
430        "%81.com", Component(0, 7), CanonHostInfo::BROKEN, -1, ""},
431       // Basic IDN support, UTF-8 and UTF-16 input should be converted to IDN
432       {"\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
433        L"\x4f60\x597d\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
434        CanonHostInfo::NEUTRAL, -1, ""},
435       // See http://unicode.org/cldr/utility/idna.jsp for other
436       // examples/experiments and http://goo.gl/7yG11o
437       // for the full list of characters handled differently by
438       // IDNA 2003, UTS 46 (http://unicode.org/reports/tr46/ ) and IDNA 2008.
439 
440       // 4 Deviation characters are mapped/ignored in UTS 46 transitional
441       // mechanism. UTS 46, table 4 row (g).
442       // Sharp-s is mapped to 'ss' in IDNA 2003, not in IDNA 2008 or UTS 46
443       // after transitional period.
444       // Previously, it'd be "fussball.de".
445       {"fu\xc3\x9f"
446        "ball.de",
447        L"fu\x00df"
448        L"ball.de",
449        use_idna_non_transitional ? "xn--fuball-cta.de" : "fussball.de",
450        use_idna_non_transitional ? Component(0, 17) : Component(0, 11),
451        CanonHostInfo::NEUTRAL, -1, ""},
452 
453       // Final-sigma (U+03C2) was mapped to regular sigma (U+03C3).
454       // Previously, it'd be "xn--wxaikc9b".
455       {"\xcf\x83\xcf\x8c\xce\xbb\xce\xbf\xcf\x82", L"\x3c3\x3cc\x3bb\x3bf\x3c2",
456        use_idna_non_transitional ? "xn--wxaijb9b" : "xn--wxaikc6b",
457        Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
458 
459       // ZWNJ (U+200C) and ZWJ (U+200D) are mapped away in UTS 46 transitional
460       // handling as well as in IDNA 2003, but not thereafter.
461       {"a\xe2\x80\x8c"
462        "b\xe2\x80\x8d"
463        "c",
464        L"a\x200c"
465        L"b\x200d"
466        L"c",
467        use_idna_non_transitional ? "xn--abc-9m0ag" : "abc",
468        use_idna_non_transitional ? Component(0, 13) : Component(0, 3),
469        CanonHostInfo::NEUTRAL, -1, ""},
470 
471       // ZWJ between Devanagari characters was still mapped away in UTS 46
472       // transitional handling. IDNA 2008 gives xn--11bo0mv54g.
473       // Previously "xn--11bo0m".
474       {"\xe0\xa4\x95\xe0\xa5\x8d\xe2\x80\x8d\xe0\xa4\x9c",
475        L"\x915\x94d\x200d\x91c",
476        use_idna_non_transitional ? "xn--11bo0mv54g" : "xn--11bo0m",
477        use_idna_non_transitional ? Component(0, 14) : Component(0, 10),
478        CanonHostInfo::NEUTRAL, -1, ""},
479 
480       // Fullwidth exclamation mark is disallowed. UTS 46, table 4, row (b)
481       // However, we do allow this at the moment because we don't use
482       // STD3 rules and canonicalize full-width ASCII to ASCII.
483       {"wow\xef\xbc\x81", L"wow\xff01", "wow!", Component(0, 4),
484        CanonHostInfo::NEUTRAL, -1, ""},
485       // U+2132 (turned capital F) is disallowed. UTS 46, table 4, row (c)
486       // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
487       {"\xe2\x84\xb2oo", L"\x2132oo", "%E2%84%B2oo", Component(0, 11),
488        CanonHostInfo::BROKEN, -1, ""},
489       // U+2F868 (CJK Comp) is disallowed. UTS 46, table 4, row (d)
490       // Allowed in IDNA 2003, but the mapping changed after Unicode 3.2
491       {"\xf0\xaf\xa1\xa8\xe5\xa7\xbb.cn", L"\xd87e\xdc68\x59fb.cn",
492        "%F0%AF%A1%A8%E5%A7%BB.cn", Component(0, 24), CanonHostInfo::BROKEN, -1,
493        ""},
494       // Maps uppercase letters to lower case letters. UTS 46 table 4 row (e)
495       {"M\xc3\x9cNCHEN", L"M\xdcNCHEN", "xn--mnchen-3ya", Component(0, 14),
496        CanonHostInfo::NEUTRAL, -1, ""},
497       // An already-IDNA host is not modified.
498       {"xn--mnchen-3ya", L"xn--mnchen-3ya", "xn--mnchen-3ya", Component(0, 14),
499        CanonHostInfo::NEUTRAL, -1, ""},
500       // Symbol/punctuations are allowed in IDNA 2003/UTS46.
501       // Not allowed in IDNA 2008. UTS 46 table 4 row (f).
502       {"\xe2\x99\xa5ny.us", L"\x2665ny.us", "xn--ny-s0x.us", Component(0, 13),
503        CanonHostInfo::NEUTRAL, -1, ""},
504       // U+11013 is new in Unicode 6.0 and is allowed. UTS 46 table 4, row (h)
505       // We used to allow it because we passed through unassigned code points.
506       {"\xf0\x91\x80\x93.com", L"\xd804\xdc13.com", "xn--n00d.com",
507        Component(0, 12), CanonHostInfo::NEUTRAL, -1, ""},
508       // U+0602 is disallowed in UTS46/IDNA 2008. UTS 46 table 4, row(i)
509       // Used to be allowed in IDNA 2003.
510       {"\xd8\x82.eg", L"\x602.eg", "%D8%82.eg", Component(0, 9),
511        CanonHostInfo::BROKEN, -1, ""},
512       // U+20B7 is new in Unicode 5.2 (not a part of IDNA 2003 based
513       // on Unicode 3.2). We did allow it in the past because we let unassigned
514       // code point pass. We continue to allow it even though it's a
515       // "punctuation and symbol" blocked in IDNA 2008.
516       // UTS 46 table 4, row (j)
517       {"\xe2\x82\xb7.com", L"\x20b7.com", "xn--wzg.com", Component(0, 11),
518        CanonHostInfo::NEUTRAL, -1, ""},
519       // Maps uppercase letters to lower case letters.
520       // In IDNA 2003, it's allowed without case-folding
521       // ( xn--bc-7cb.com ) because it's not defined in Unicode 3.2
522       // (added in Unicode 4.1). UTS 46 table 4 row (k)
523       {"bc\xc8\xba.com", L"bc\x23a.com", "xn--bc-is1a.com", Component(0, 15),
524        CanonHostInfo::NEUTRAL, -1, ""},
525       // Maps U+FF43 (Full Width Small Letter C) to 'c'.
526       {"ab\xef\xbd\x83.xyz", L"ab\xff43.xyz", "abc.xyz", Component(0, 7),
527        CanonHostInfo::NEUTRAL, -1, ""},
528       // Maps U+1D68C (Math Monospace Small C) to 'c'.
529       // U+1D68C = \xD835\xDE8C in UTF-16
530       {"ab\xf0\x9d\x9a\x8c.xyz", L"ab\xd835\xde8c.xyz", "abc.xyz",
531        Component(0, 7), CanonHostInfo::NEUTRAL, -1, ""},
532       // BiDi check test
533       // "Divehi" in Divehi (Thaana script) ends with BidiClass=NSM.
534       // Disallowed in IDNA 2003 but now allowed in UTS 46/IDNA 2008.
535       {"\xde\x8b\xde\xa8\xde\x88\xde\xac\xde\x80\xde\xa8",
536        L"\x78b\x7a8\x788\x7ac\x780\x7a8", "xn--hqbpi0jcw", Component(0, 13),
537        CanonHostInfo::NEUTRAL, -1, ""},
538       // Disallowed in both IDNA 2003 and 2008 with BiDi check.
539       // Labels starting with a RTL character cannot end with a LTR character.
540       {"\xd8\xac\xd8\xa7\xd8\xb1xyz", L"\x62c\x627\x631xyz",
541        "%D8%AC%D8%A7%D8%B1xyz", Component(0, 21), CanonHostInfo::BROKEN, -1,
542        ""},
543       // Labels starting with a RTL character can end with BC=EN (European
544       // number). Disallowed in IDNA 2003 but now allowed.
545       {"\xd8\xac\xd8\xa7\xd8\xb1"
546        "2",
547        L"\x62c\x627\x631"
548        L"2",
549        "xn--2-ymcov", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
550       // Labels starting with a RTL character cannot have "L" characters
551       // even if it ends with an BC=EN. Disallowed in both IDNA 2003/2008.
552       {"\xd8\xac\xd8\xa7\xd8\xb1xy2", L"\x62c\x627\x631xy2",
553        "%D8%AC%D8%A7%D8%B1xy2", Component(0, 21), CanonHostInfo::BROKEN, -1,
554        ""},
555       // Labels starting with a RTL character can end with BC=AN (Arabic number)
556       // Disallowed in IDNA 2003, but now allowed.
557       {"\xd8\xac\xd8\xa7\xd8\xb1\xd9\xa2", L"\x62c\x627\x631\x662",
558        "xn--mgbjq0r", Component(0, 11), CanonHostInfo::NEUTRAL, -1, ""},
559       // Labels starting with a RTL character cannot have "L" characters
560       // even if it ends with an BC=AN (Arabic number).
561       // Disallowed in both IDNA 2003/2008.
562       {"\xd8\xac\xd8\xa7\xd8\xb1xy\xd9\xa2", L"\x62c\x627\x631xy\x662",
563        "%D8%AC%D8%A7%D8%B1xy%D9%A2", Component(0, 26), CanonHostInfo::BROKEN,
564        -1, ""},
565       // Labels starting with a RTL character cannot mix BC=EN and BC=AN
566       {"\xd8\xac\xd8\xa7\xd8\xb1xy2\xd9\xa2", L"\x62c\x627\x631xy2\x662",
567        "%D8%AC%D8%A7%D8%B1xy2%D9%A2", Component(0, 27), CanonHostInfo::BROKEN,
568        -1, ""},
569       // As of Unicode 6.2, U+20CF is not assigned. We do not allow it.
570       {"\xe2\x83\x8f.com", L"\x20cf.com", "%E2%83%8F.com", Component(0, 13),
571        CanonHostInfo::BROKEN, -1, ""},
572       // U+0080 is not allowed.
573       {"\xc2\x80.com", L"\x80.com", "%C2%80.com", Component(0, 10),
574        CanonHostInfo::BROKEN, -1, ""},
575       // Mixed UTF-8 and escaped UTF-8 (narrow case) and UTF-16 and escaped
576       // UTF-8 (wide case). The output should be equivalent to the true wide
577       // character input above.
579       {"%E4%BD%A0%E5%A5%BD\xe4\xbd\xa0\xe5\xa5\xbd",
580        L"%E4%BD%A0%E5%A5%BD\x4f60\x597d", "xn--6qqa088eba", Component(0, 14),
581        CanonHostInfo::NEUTRAL, -1, ""},
582       // Invalid escaped characters should fail and the percents should be
583       // escaped.
584       {"%zz%66%a", L"%zz%66%a", "%25zzf%25a", Component(0, 10),
585        CanonHostInfo::BROKEN, -1, ""},
586       // If we get an invalid character that has been escaped.
587       {"%25", L"%25", "%25", Component(0, 3), CanonHostInfo::BROKEN, -1, ""},
588       {"hello%00", L"hello%00", "hello%00", Component(0, 8),
589        CanonHostInfo::BROKEN, -1, ""},
590       // Escaped numbers should be treated like IP addresses if they are.
591       {"%30%78%63%30%2e%30%32%35%30.01", L"%30%78%63%30%2e%30%32%35%30.01",
592        "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
593       {"%30%78%63%30%2e%30%32%35%30.01%2e",
594        L"%30%78%63%30%2e%30%32%35%30.01%2e", "192.168.0.1", Component(0, 11),
595        CanonHostInfo::IPV4, 3, "C0A80001"},
596       // Invalid escaping should trigger the regular host error handling.
597       {"%3g%78%63%30%2e%30%32%35%30%2E.01",
598        L"%3g%78%63%30%2e%30%32%35%30%2E.01", "%253gxc0.0250..01",
599        Component(0, 17), CanonHostInfo::BROKEN, -1, ""},
600       // Something that isn't exactly an IP should get treated as a host and
601       // spaces treated as invalid.
602       {"192.168.0.1 hello", L"192.168.0.1 hello", "192.168.0.1%20hello",
603        Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
604       // Fullwidth and escaped UTF-8 fullwidth should still be treated as IP.
605       // These are "0Xc0.0250.01" in fullwidth.
606       {"\xef\xbc\x90%Ef%bc\xb8%ef%Bd%83\xef\xbc\x90%EF%BC%"
607        "8E\xef\xbc\x90\xef\xbc\x92\xef\xbc\x95\xef\xbc\x90\xef\xbc%"
608        "8E\xef\xbc\x90\xef\xbc\x91",
609        L"\xff10\xff38\xff43\xff10\xff0e\xff10\xff12\xff15\xff10\xff0e\xff10"
610        L"\xff11",
611        "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
612       // Broken IP addresses get marked as such.
613       {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
614        CanonHostInfo::BROKEN, -1, ""},
615       {"[google.com]", L"[google.com]", "[google.com]", Component(0, 12),
616        CanonHostInfo::BROKEN, -1, ""},
617       // Cyrillic letter followed by '(' should return punycode for '(' escaped
618       // before punycode string was created. I.e.
619       // if '(' is escaped after punycode is created we would get xn--%28-8tb
620       // (incorrect).
621       {"\xd1\x82(", L"\x0442(", "xn--(-8tb", Component(0, 9),
622        CanonHostInfo::NEUTRAL, -1, ""},
623       // Address with all hexadecimal characters with leading number of 1<<32
624       // or greater and should return NEUTRAL rather than BROKEN if not all
625       // components are numbers.
626       {"12345678912345.de", L"12345678912345.de", "12345678912345.de",
627        Component(0, 17), CanonHostInfo::NEUTRAL, -1, ""},
628       {"1.12345678912345.de", L"1.12345678912345.de", "1.12345678912345.de",
629        Component(0, 19), CanonHostInfo::NEUTRAL, -1, ""},
630       {"12345678912345.12345678912345.de", L"12345678912345.12345678912345.de",
631        "12345678912345.12345678912345.de", Component(0, 32),
632        CanonHostInfo::NEUTRAL, -1, ""},
633       {"1.2.0xB3A73CE5B59.de", L"1.2.0xB3A73CE5B59.de", "1.2.0xb3a73ce5b59.de",
634        Component(0, 20), CanonHostInfo::NEUTRAL, -1, ""},
635       {"12345678912345.0xde", L"12345678912345.0xde", "12345678912345.0xde",
636        Component(0, 19), CanonHostInfo::BROKEN, -1, ""},
637       // A label that starts with "xn--" but contains non-ASCII characters
638       // should
639       // be an error. Escape the invalid characters.
640       {"xn--m\xc3\xbcnchen", L"xn--m\xfcnchen", "xn--m%C3%BCnchen",
641        Component(0, 16), CanonHostInfo::BROKEN, -1, ""},
642   };
643   // clang-format on
644 
645   // CanonicalizeHost() non-verbose.
646   std::string out_str;
647   for (const auto& host_case : host_cases) {
648     // Narrow version.
649     if (host_case.input8) {
650       int host_len = static_cast<int>(strlen(host_case.input8));
651       Component in_comp(0, host_len);
652       Component out_comp;
653 
654       out_str.clear();
655       StdStringCanonOutput output(&out_str);
656 
657       bool success =
658           CanonicalizeHost(host_case.input8, in_comp, &output, &out_comp);
659       output.Complete();
660 
661       EXPECT_EQ(host_case.expected_family != CanonHostInfo::BROKEN, success)
662           << "for input: " << host_case.input8;
663       EXPECT_EQ(host_case.expected, out_str)
664           << "for input: " << host_case.input8;
665       EXPECT_EQ(host_case.expected_component.begin, out_comp.begin)
666           << "for input: " << host_case.input8;
667       EXPECT_EQ(host_case.expected_component.len, out_comp.len)
668           << "for input: " << host_case.input8;
669     }
670 
671     // Wide version.
672     if (host_case.input16) {
673       std::u16string input16(
674           test_utils::TruncateWStringToUTF16(host_case.input16));
675       int host_len = static_cast<int>(input16.length());
676       Component in_comp(0, host_len);
677       Component out_comp;
678 
679       out_str.clear();
680       StdStringCanonOutput output(&out_str);
681 
682       bool success = CanonicalizeHost(input16.c_str(), in_comp, &output,
683                                       &out_comp);
684       output.Complete();
685 
686       EXPECT_EQ(host_case.expected_family != CanonHostInfo::BROKEN, success);
687       EXPECT_EQ(host_case.expected, out_str);
688       EXPECT_EQ(host_case.expected_component.begin, out_comp.begin);
689       EXPECT_EQ(host_case.expected_component.len, out_comp.len);
690     }
691   }
692 
693   // CanonicalizeHostVerbose()
694   for (const auto& host_case : host_cases) {
695     // Narrow version.
696     if (host_case.input8) {
697       int host_len = static_cast<int>(strlen(host_case.input8));
698       Component in_comp(0, host_len);
699 
700       out_str.clear();
701       StdStringCanonOutput output(&out_str);
702       CanonHostInfo host_info;
703 
704       CanonicalizeHostVerbose(host_case.input8, in_comp, &output, &host_info);
705       output.Complete();
706 
707       EXPECT_EQ(host_case.expected_family, host_info.family);
708       EXPECT_EQ(host_case.expected, out_str);
709       EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
710       EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
711       EXPECT_EQ(
712           host_case.expected_address_hex,
713           base::HexEncode(host_info.address,
714                           static_cast<size_t>(host_info.AddressLength())));
715       if (host_case.expected_family == CanonHostInfo::IPV4) {
716         EXPECT_EQ(host_case.expected_num_ipv4_components,
717                   host_info.num_ipv4_components);
718       }
719     }
720 
721     // Wide version.
722     if (host_case.input16) {
723       std::u16string input16(
724           test_utils::TruncateWStringToUTF16(host_case.input16));
725       int host_len = static_cast<int>(input16.length());
726       Component in_comp(0, host_len);
727 
728       out_str.clear();
729       StdStringCanonOutput output(&out_str);
730       CanonHostInfo host_info;
731 
732       CanonicalizeHostVerbose(input16.c_str(), in_comp, &output, &host_info);
733       output.Complete();
734 
735       EXPECT_EQ(host_case.expected_family, host_info.family);
736       EXPECT_EQ(host_case.expected, out_str);
737       EXPECT_EQ(host_case.expected_component.begin, host_info.out_host.begin);
738       EXPECT_EQ(host_case.expected_component.len, host_info.out_host.len);
739       EXPECT_EQ(
740           host_case.expected_address_hex,
741           base::HexEncode(host_info.address,
742                           static_cast<size_t>(host_info.AddressLength())));
743       if (host_case.expected_family == CanonHostInfo::IPV4) {
744         EXPECT_EQ(host_case.expected_num_ipv4_components,
745                   host_info.num_ipv4_components);
746       }
747     }
748   }
749 }
750 
// Checks how CanonicalizeSpecialHost() treats single ASCII punctuation
// characters used as the whole host: a set that must pass through unchanged,
// a set that must make canonicalization fail, and a set that must succeed but
// come out percent-escaped.
TEST_F(URLCanonTest, SpecialHostPuncutationChar) {
  // '%' is not tested here. '%' is used for percent-escaping.
  const std::string_view allowed_host_chars[] = {"!", "\"", "$", "&", "'", "(",
                                                 ")", "+",  ",", "-", ".", ";",
                                                 "=", "_",  "`", "{", "}", "~"};

  const std::string_view forbidden_host_chars[] = {
      " ", "#", "/", ":", "<", ">", "?", "@", "[", "\\", "]", "^", "|",
  };

  // Standard non-compliant characters which are escaped. See
  // https://crbug.com/1416013.
  struct EscapedCharTestCase {
    std::string_view input;
    std::string_view expected;
  };
  const EscapedCharTestCase escaped_host_chars[] = {{"*", "%2A"}};

  // Allowed characters: canonicalization succeeds and echoes the input.
  for (const std::string_view input : allowed_host_chars) {
    std::string out_str;
    // The length is converted explicitly, matching the static_cast<int> the
    // other tests in this file use when building a Component.
    Component in_comp(0, static_cast<int>(input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    const bool success =
        CanonicalizeSpecialHost(input.data(), in_comp, output, out_comp);
    EXPECT_TRUE(success) << "Input: " << input;
    output.Complete();
    EXPECT_EQ(out_str, input) << "Input: " << input;
  }

  // Forbidden characters: canonicalization reports failure. The produced text
  // is not inspected.
  for (const std::string_view input : forbidden_host_chars) {
    std::string out_str;
    Component in_comp(0, static_cast<int>(input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    EXPECT_FALSE(
        CanonicalizeSpecialHost(input.data(), in_comp, output, out_comp))
        << "Input: " << input;
  }

  // Escaped characters: canonicalization succeeds and emits the escaped form.
  for (const auto& c : escaped_host_chars) {
    std::string out_str;
    Component in_comp(0, static_cast<int>(c.input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    const bool success =
        CanonicalizeSpecialHost(c.input.data(), in_comp, output, out_comp);
    EXPECT_TRUE(success) << "Input: " << c.input;
    output.Complete();
    EXPECT_EQ(out_str, c.expected) << "Input: " << c.input;
  }
}
802 
// Checks that CanonicalizeNonSpecialHost() fails for every forbidden host
// code point, including an embedded NUL.
TEST_F(URLCanonTest, ForbiddenHostCodePoint) {
  // Test only CanonicalizeNonSpecialHost.
  // CanonicalizeSpecialHost is not standard compliant yet.
  // See URLCanonTest::SpecialHostPuncutationChar.

  // https://url.spec.whatwg.org/#forbidden-host-code-point
  const std::string_view forbidden_host_chars[] = {
      "\x09", "\x0A", "\x0D", " ", "#",  "/", ":", "<",
      ">",    "?",    "@",    "[", "\\", "]", "^", "|",
  };

  for (const std::string_view input : forbidden_host_chars) {
    std::string out_str;
    // The length is converted explicitly, matching the static_cast<int> the
    // other tests in this file use when building a Component.
    Component in_comp(0, static_cast<int>(input.size()));
    Component out_comp;
    StdStringCanonOutput output(&out_str);
    EXPECT_FALSE(
        CanonicalizeNonSpecialHost(input.data(), in_comp, output, out_comp))
        << "Input: " << input;
  }

  // Test NULL manually: a string_view built from a literal would stop at the
  // NUL, so the host is given as a char array with an explicit length.
  const char host_with_null[] = "a\0b";
  std::string out_str;
  // sizeof counts the implicit terminator; drop it so the component spans
  // exactly the three bytes 'a', '\0', 'b'.
  Component in_comp(0, static_cast<int>(sizeof(host_with_null) - 1));
  Component out_comp;
  StdStringCanonOutput output(&out_str);
  EXPECT_FALSE(
      CanonicalizeNonSpecialHost(host_with_null, in_comp, output, out_comp));
}
833 
TEST_F(URLCanonTest,IPv4)834 TEST_F(URLCanonTest, IPv4) {
835   // clang-format off
836   IPAddressCase cases[] = {
837     // Empty is not an IP address.
838     {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
839     {".", L".", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
840     // Regular IP addresses in different bases.
841     {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
842     {"0300.0250.00.01", L"0300.0250.00.01", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
843     {"0xC0.0Xa8.0x0.0x1", L"0xC0.0Xa8.0x0.0x1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
844     // Non-IP addresses due to invalid characters.
845     {"192.168.9.com", L"192.168.9.com", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
846     // Hostnames with a numeric final component but other components that don't
847     // parse as numbers should be considered broken.
848     {"19a.168.0.1", L"19a.168.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
849     {"19a.168.0.1.", L"19a.168.0.1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
850     {"0308.0250.00.01", L"0308.0250.00.01", "", Component(), CanonHostInfo::BROKEN, -1, ""},
851     {"0308.0250.00.01.", L"0308.0250.00.01.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
852     {"0xCG.0xA8.0x0.0x1", L"0xCG.0xA8.0x0.0x1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
853     {"0xCG.0xA8.0x0.0x1.", L"0xCG.0xA8.0x0.0x1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
854     // Non-numeric terminal compeonent should be considered not IPv4 hostnames, but valid.
855     {"19.168.0.1a", L"19.168.0.1a", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
856     {"0xC.0xA8.0x0.0x1G", L"0xC.0xA8.0x0.0x1G", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
857     // Hostnames that would be considered broken IPv4 hostnames should be considered valid non-IPv4 hostnames if they end with two dots instead of 0 or 1.
858     {"19a.168.0.1..", L"19a.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
859     {"0308.0250.00.01..", L"0308.0250.00.01..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
860     {"0xCG.0xA8.0x0.0x1..", L"0xCG.0xA8.0x0.0x1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
861     // Hosts with components that aren't considered valid IPv4 numbers but are entirely numeric should be considered invalid.
862     {"1.2.3.08", L"1.2.3.08", "", Component(), CanonHostInfo::BROKEN, -1, ""},
863     {"1.2.3.08.", L"1.2.3.08.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
864     // If there are not enough components, the last one should fill them out.
865     {"192", L"192", "0.0.0.192", Component(0, 9), CanonHostInfo::IPV4, 1, "000000C0"},
866     {"0xC0a80001", L"0xC0a80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
867     {"030052000001", L"030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
868     {"000030052000001", L"000030052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 1, "C0A80001"},
869     {"192.168", L"192.168", "192.0.0.168", Component(0, 11), CanonHostInfo::IPV4, 2, "C00000A8"},
870     {"192.0x00A80001", L"192.0x000A80001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
871     {"0xc0.052000001", L"0xc0.052000001", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 2, "C0A80001"},
872     {"192.168.1", L"192.168.1", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0A80001"},
873     // Hostnames with too many components, but a numeric final numeric component are invalid.
874     {"192.168.0.0.1", L"192.168.0.0.1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
875     // We allow a single trailing dot.
876     {"192.168.0.1.", L"192.168.0.1.", "192.168.0.1", Component(0, 11), CanonHostInfo::IPV4, 4, "C0A80001"},
877     {"192.168.0.1. hello", L"192.168.0.1. hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
878     {"192.168.0.1..", L"192.168.0.1..", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
879     // Hosts with two dots in a row with a final numeric component are considered invalid.
880     {"192.168..1", L"192.168..1", "", Component(), CanonHostInfo::BROKEN, -1, ""},
881     {"192.168..1.", L"192.168..1.", "", Component(), CanonHostInfo::BROKEN, -1, ""},
882     // Any numerical overflow should be marked as BROKEN.
883     {"0x100.0", L"0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
884     {"0x100.0.0", L"0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
885     {"0x100.0.0.0", L"0x100.0.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
886     {"0.0x100.0.0", L"0.0x100.0.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
887     {"0.0.0x100.0", L"0.0.0x100.0", "", Component(), CanonHostInfo::BROKEN, -1, ""},
888     {"0.0.0.0x100", L"0.0.0.0x100", "", Component(), CanonHostInfo::BROKEN, -1, ""},
889     {"0.0.0x10000", L"0.0.0x10000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
890     {"0.0x1000000", L"0.0x1000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
891     {"0x100000000", L"0x100000000", "", Component(), CanonHostInfo::BROKEN, -1, ""},
892     // Repeat the previous tests, minus 1, to verify boundaries.
893     {"0xFF.0", L"0xFF.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 2, "FF000000"},
894     {"0xFF.0.0", L"0xFF.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 3, "FF000000"},
895     {"0xFF.0.0.0", L"0xFF.0.0.0", "255.0.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "FF000000"},
896     {"0.0xFF.0.0", L"0.0xFF.0.0", "0.255.0.0", Component(0, 9), CanonHostInfo::IPV4, 4, "00FF0000"},
897     {"0.0.0xFF.0", L"0.0.0xFF.0", "0.0.255.0", Component(0, 9), CanonHostInfo::IPV4, 4, "0000FF00"},
898     {"0.0.0.0xFF", L"0.0.0.0xFF", "0.0.0.255", Component(0, 9), CanonHostInfo::IPV4, 4, "000000FF"},
899     {"0.0.0xFFFF", L"0.0.0xFFFF", "0.0.255.255", Component(0, 11), CanonHostInfo::IPV4, 3, "0000FFFF"},
900     {"0.0xFFFFFF", L"0.0xFFFFFF", "0.255.255.255", Component(0, 13), CanonHostInfo::IPV4, 2, "00FFFFFF"},
901     {"0xFFFFFFFF", L"0xFFFFFFFF", "255.255.255.255", Component(0, 15), CanonHostInfo::IPV4, 1, "FFFFFFFF"},
902     // Old trunctations tests. They're all "BROKEN" now.
903     {"276.256.0xf1a2.077777", L"276.256.0xf1a2.077777", "", Component(), CanonHostInfo::BROKEN, -1, ""},
904     {"192.168.0.257", L"192.168.0.257", "", Component(), CanonHostInfo::BROKEN, -1, ""},
905     {"192.168.0xa20001", L"192.168.0xa20001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
906     {"192.015052000001", L"192.015052000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
907     {"0X12C0a80001", L"0X12C0a80001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
908     {"276.1.2", L"276.1.2", "", Component(), CanonHostInfo::BROKEN, -1, ""},
909     // Too many components should be rejected, in valid ranges or not.
910     {"255.255.255.255.255", L"255.255.255.255.255", "", Component(), CanonHostInfo::BROKEN, -1, ""},
911     {"256.256.256.256.256", L"256.256.256.256.256", "", Component(), CanonHostInfo::BROKEN, -1, ""},
912     // Spaces should be rejected.
913     {"192.168.0.1 hello", L"192.168.0.1 hello", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
914     // Very large numbers.
915     {"0000000000000300.0x00000000000000fF.00000000000000001", L"0000000000000300.0x00000000000000fF.00000000000000001", "192.255.0.1", Component(0, 11), CanonHostInfo::IPV4, 3, "C0FF0001"},
916     {"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", L"0000000000000300.0xffffffffFFFFFFFF.3022415481470977", "", Component(0, 11), CanonHostInfo::BROKEN, -1, ""},
917     // A number has no length limit, but long numbers can still overflow.
918     {"00000000000000000001", L"00000000000000000001", "0.0.0.1", Component(0, 7), CanonHostInfo::IPV4, 1, "00000001"},
919     {"0000000000000000100000000000000001", L"0000000000000000100000000000000001", "", Component(), CanonHostInfo::BROKEN, -1, ""},
920     // If a long component is non-numeric, it's a hostname, *not* a broken IP.
921     {"0.0.0.000000000000000000z", L"0.0.0.000000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
922     {"0.0.0.100000000000000000z", L"0.0.0.100000000000000000z", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
923     // Truncation of all zeros should still result in 0.
924     {"0.00.0x.0x0", L"0.00.0x.0x0", "0.0.0.0", Component(0, 7), CanonHostInfo::IPV4, 4, "00000000"},
925     // Non-ASCII characters in final component should return NEUTRAL.
926     {"1.2.3.\xF0\x9F\x92\xA9", L"1.2.3.\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
927     {"1.2.3.4\xF0\x9F\x92\xA9", L"1.2.3.4\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
928     {"1.2.3.0x\xF0\x9F\x92\xA9", L"1.2.3.0x\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
929     {"1.2.3.0\xF0\x9F\x92\xA9", L"1.2.3.0\xD83D\xDCA9", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
930     // Non-ASCII characters in other components should result in broken IPs when final component is numeric.
931     {"1.2.\xF0\x9F\x92\xA9.4", L"1.2.\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
932     {"1.2.3\xF0\x9F\x92\xA9.4", L"1.2.3\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
933     {"1.2.0x\xF0\x9F\x92\xA9.4", L"1.2.0x\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
934     {"1.2.0\xF0\x9F\x92\xA9.4", L"1.2.0\xD83D\xDCA9.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
935     {"\xF0\x9F\x92\xA9.2.3.4", L"\xD83D\xDCA9.2.3.4", "", Component(), CanonHostInfo::BROKEN, -1, ""},
936   };
937   // clang-format on
938 
939   for (const auto& test_case : cases) {
940     SCOPED_TRACE(test_case.input8);
941 
942     // 8-bit version.
943     Component component(0, static_cast<int>(strlen(test_case.input8)));
944 
945     std::string out_str1;
946     StdStringCanonOutput output1(&out_str1);
947     CanonHostInfo host_info;
948     CanonicalizeIPAddress(test_case.input8, component, &output1, &host_info);
949     output1.Complete();
950 
951     EXPECT_EQ(test_case.expected_family, host_info.family);
952     EXPECT_EQ(test_case.expected_address_hex,
953               base::HexEncode(host_info.address,
954                               static_cast<size_t>(host_info.AddressLength())));
955     if (host_info.family == CanonHostInfo::IPV4) {
956       EXPECT_STREQ(test_case.expected, out_str1.c_str());
957       EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
958       EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
959       EXPECT_EQ(test_case.expected_num_ipv4_components,
960                 host_info.num_ipv4_components);
961     }
962 
963     // 16-bit version.
964     std::u16string input16(
965         test_utils::TruncateWStringToUTF16(test_case.input16));
966     component = Component(0, static_cast<int>(input16.length()));
967 
968     std::string out_str2;
969     StdStringCanonOutput output2(&out_str2);
970     CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
971     output2.Complete();
972 
973     EXPECT_EQ(test_case.expected_family, host_info.family);
974     EXPECT_EQ(test_case.expected_address_hex,
975               base::HexEncode(host_info.address,
976                               static_cast<size_t>(host_info.AddressLength())));
977     if (host_info.family == CanonHostInfo::IPV4) {
978       EXPECT_STREQ(test_case.expected, out_str2.c_str());
979       EXPECT_EQ(test_case.expected_component.begin, host_info.out_host.begin);
980       EXPECT_EQ(test_case.expected_component.len, host_info.out_host.len);
981       EXPECT_EQ(test_case.expected_num_ipv4_components,
982                 host_info.num_ipv4_components);
983     }
984   }
985 }
986 
TEST_F(URLCanonTest,IPv6)987 TEST_F(URLCanonTest, IPv6) {
988   IPAddressCase cases[] = {
989       // Empty is not an IP address.
990       {"", L"", "", Component(), CanonHostInfo::NEUTRAL, -1, ""},
991       // Non-IPs with [:] characters are marked BROKEN.
992       {":", L":", "", Component(), CanonHostInfo::BROKEN, -1, ""},
993       {"[", L"[", "", Component(), CanonHostInfo::BROKEN, -1, ""},
994       {"[:", L"[:", "", Component(), CanonHostInfo::BROKEN, -1, ""},
995       {"]", L"]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
996       {":]", L":]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
997       {"[]", L"[]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
998       {"[:]", L"[:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
999       // Regular IP address is invalid without bounding '[' and ']'.
1000       {"2001:db8::1", L"2001:db8::1", "", Component(), CanonHostInfo::BROKEN,
1001        -1, ""},
1002       {"[2001:db8::1", L"[2001:db8::1", "", Component(), CanonHostInfo::BROKEN,
1003        -1, ""},
1004       {"2001:db8::1]", L"2001:db8::1]", "", Component(), CanonHostInfo::BROKEN,
1005        -1, ""},
1006       // Regular IP addresses.
1007       {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1,
1008        "00000000000000000000000000000000"},
1009       {"[::1]", L"[::1]", "[::1]", Component(0, 5), CanonHostInfo::IPV6, -1,
1010        "00000000000000000000000000000001"},
1011       {"[1::]", L"[1::]", "[1::]", Component(0, 5), CanonHostInfo::IPV6, -1,
1012        "00010000000000000000000000000000"},
1013 
1014       // Leading zeros should be stripped.
1015       {"[000:01:02:003:004:5:6:007]", L"[000:01:02:003:004:5:6:007]",
1016        "[0:1:2:3:4:5:6:7]", Component(0, 17), CanonHostInfo::IPV6, -1,
1017        "00000001000200030004000500060007"},
1018 
1019       // Upper case letters should be lowercased.
1020       {"[A:b:c:DE:fF:0:1:aC]", L"[A:b:c:DE:fF:0:1:aC]", "[a:b:c:de:ff:0:1:ac]",
1021        Component(0, 20), CanonHostInfo::IPV6, -1,
1022        "000A000B000C00DE00FF0000000100AC"},
1023 
1024       // The same address can be written with different contractions, but should
1025       // get canonicalized to the same thing.
1026       {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14),
1027        CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
1028       {"[1::2:0:0:3:0]", L"[1::2:0:0:3:0]", "[1::2:0:0:3:0]", Component(0, 14),
1029        CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
1030 
1031       // Addresses with embedded IPv4.
1032       {"[::192.168.0.1]", L"[::192.168.0.1]", "[::c0a8:1]", Component(0, 10),
1033        CanonHostInfo::IPV6, -1, "000000000000000000000000C0A80001"},
1034       {"[::ffff:192.168.0.1]", L"[::ffff:192.168.0.1]", "[::ffff:c0a8:1]",
1035        Component(0, 15), CanonHostInfo::IPV6, -1,
1036        "00000000000000000000FFFFC0A80001"},
1037       {"[::eeee:192.168.0.1]", L"[::eeee:192.168.0.1]", "[::eeee:c0a8:1]",
1038        Component(0, 15), CanonHostInfo::IPV6, -1,
1039        "00000000000000000000EEEEC0A80001"},
1040       {"[2001::192.168.0.1]", L"[2001::192.168.0.1]", "[2001::c0a8:1]",
1041        Component(0, 14), CanonHostInfo::IPV6, -1,
1042        "200100000000000000000000C0A80001"},
1043       {"[1:2:192.168.0.1:5:6]", L"[1:2:192.168.0.1:5:6]", "", Component(),
1044        CanonHostInfo::BROKEN, -1, ""},
1045 
1046       // IPv4 embedded IPv6 addresses
1047       {"[::ffff:192.1.2]", L"[::ffff:192.1.2]", "[::ffff:c001:2]", Component(),
1048        CanonHostInfo::BROKEN, -1, ""},
1049       {"[::ffff:192.1]", L"[::ffff:192.1]", "[::ffff:c000:1]", Component(),
1050        CanonHostInfo::BROKEN, -1, ""},
1051       {"[::ffff:192.1.2.3.4]", L"[::ffff:192.1.2.3.4]", "", Component(),
1052        CanonHostInfo::BROKEN, -1, ""},
1053 
1054       // IPv4 using hex.
1055       // TODO(eroman): Should this format be disallowed?
1056       {"[::ffff:0xC0.0Xa8.0x0.0x1]", L"[::ffff:0xC0.0Xa8.0x0.0x1]",
1057        "[::ffff:c0a8:1]", Component(0, 15), CanonHostInfo::IPV6, -1,
1058        "00000000000000000000FFFFC0A80001"},
1059 
1060       // There may be zeros surrounding the "::" contraction.
1061       {"[0:0::0:0:8]", L"[0:0::0:0:8]", "[::8]", Component(0, 5),
1062        CanonHostInfo::IPV6, -1, "00000000000000000000000000000008"},
1063 
1064       {"[2001:db8::1]", L"[2001:db8::1]", "[2001:db8::1]", Component(0, 13),
1065        CanonHostInfo::IPV6, -1, "20010DB8000000000000000000000001"},
1066 
1067       // Can only have one "::" contraction in an IPv6 string literal.
1068       {"[2001::db8::1]", L"[2001::db8::1]", "", Component(),
1069        CanonHostInfo::BROKEN, -1, ""},
1070       // No more than 2 consecutive ':'s.
1071       {"[2001:db8:::1]", L"[2001:db8:::1]", "", Component(),
1072        CanonHostInfo::BROKEN, -1, ""},
1073       {"[:::]", L"[:::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1074       // Non-IP addresses due to invalid characters.
1075       {"[2001::.com]", L"[2001::.com]", "", Component(), CanonHostInfo::BROKEN,
1076        -1, ""},
1077       // If there are not enough components, the last one should fill them out.
1078       // ... omitted at this time ...
1079       // Too many components means not an IP address. Similarly, with too few
1080       // if using IPv4 compat or mapped addresses.
1081       {"[::192.168.0.0.1]", L"[::192.168.0.0.1]", "", Component(),
1082        CanonHostInfo::BROKEN, -1, ""},
1083       {"[::ffff:192.168.0.0.1]", L"[::ffff:192.168.0.0.1]", "", Component(),
1084        CanonHostInfo::BROKEN, -1, ""},
1085       {"[1:2:3:4:5:6:7:8:9]", L"[1:2:3:4:5:6:7:8:9]", "", Component(),
1086        CanonHostInfo::BROKEN, -1, ""},
1087       // Too many bits (even though 8 components, the last one holds 32 bits).
1088       {"[0:0:0:0:0:0:0:192.168.0.1]", L"[0:0:0:0:0:0:0:192.168.0.1]", "",
1089        Component(), CanonHostInfo::BROKEN, -1, ""},
1090 
1091       // Too many bits specified -- the contraction would have to be zero-length
1092       // to not exceed 128 bits.
1093       {"[1:2:3:4:5:6::192.168.0.1]", L"[1:2:3:4:5:6::192.168.0.1]", "",
1094        Component(), CanonHostInfo::BROKEN, -1, ""},
1095 
1096       // The contraction is for 16 bits of zero.
1097       {"[1:2:3:4:5:6::8]", L"[1:2:3:4:5:6::8]", "[1:2:3:4:5:6:0:8]",
1098        Component(0, 17), CanonHostInfo::IPV6, -1,
1099        "00010002000300040005000600000008"},
1100 
1101       // Cannot have a trailing colon.
1102       {"[1:2:3:4:5:6:7:8:]", L"[1:2:3:4:5:6:7:8:]", "", Component(),
1103        CanonHostInfo::BROKEN, -1, ""},
1104       {"[1:2:3:4:5:6:192.168.0.1:]", L"[1:2:3:4:5:6:192.168.0.1:]", "",
1105        Component(), CanonHostInfo::BROKEN, -1, ""},
1106 
1107       // Cannot have negative numbers.
1108       {"[-1:2:3:4:5:6:7:8]", L"[-1:2:3:4:5:6:7:8]", "", Component(),
1109        CanonHostInfo::BROKEN, -1, ""},
1110 
1111       // Scope ID -- the URL may contain an optional ["%" <scope_id>] section.
1112       // The scope_id should be included in the canonicalized URL, and is an
1113       // unsigned decimal number.
1114 
1115       // Invalid because no ID was given after the percent.
1116 
1117       // Don't allow scope-id
1118       {"[1::%1]", L"[1::%1]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1119       {"[1::%eth0]", L"[1::%eth0]", "", Component(), CanonHostInfo::BROKEN, -1,
1120        ""},
1121       {"[1::%]", L"[1::%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1122       {"[%]", L"[%]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1123       {"[::%:]", L"[::%:]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
1124 
1125       // Don't allow leading or trailing colons.
1126       {"[:0:0::0:0:8]", L"[:0:0::0:0:8]", "", Component(),
1127        CanonHostInfo::BROKEN, -1, ""},
1128       {"[0:0::0:0:8:]", L"[0:0::0:0:8:]", "", Component(),
1129        CanonHostInfo::BROKEN, -1, ""},
1130       {"[:0:0::0:0:8:]", L"[:0:0::0:0:8:]", "", Component(),
1131        CanonHostInfo::BROKEN, -1, ""},
1132 
1133       // We allow a single trailing dot.
1134       // ... omitted at this time ...
1135       // Two dots in a row means not an IP address.
1136       {"[::192.168..1]", L"[::192.168..1]", "", Component(),
1137        CanonHostInfo::BROKEN, -1, ""},
1138       // Any non-first components get truncated to one byte.
1139       // ... omitted at this time ...
1140       // Spaces should be rejected.
1141       {"[::1 hello]", L"[::1 hello]", "", Component(), CanonHostInfo::BROKEN,
1142        -1, ""},
1143   };
1144 
1145   for (size_t i = 0; i < std::size(cases); i++) {
1146     // 8-bit version.
1147     Component component(0, static_cast<int>(strlen(cases[i].input8)));
1148 
1149     std::string out_str1;
1150     StdStringCanonOutput output1(&out_str1);
1151     CanonHostInfo host_info;
1152     CanonicalizeIPAddress(cases[i].input8, component, &output1, &host_info);
1153     output1.Complete();
1154 
1155     EXPECT_EQ(cases[i].expected_family, host_info.family);
1156     EXPECT_EQ(cases[i].expected_address_hex,
1157               base::HexEncode(host_info.address,
1158                               static_cast<size_t>(host_info.AddressLength())))
1159         << "iter " << i << " host " << cases[i].input8;
1160     if (host_info.family == CanonHostInfo::IPV6) {
1161       EXPECT_STREQ(cases[i].expected, out_str1.c_str());
1162       EXPECT_EQ(cases[i].expected_component.begin,
1163                 host_info.out_host.begin);
1164       EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
1165     }
1166 
1167     // 16-bit version.
1168     std::u16string input16(
1169         test_utils::TruncateWStringToUTF16(cases[i].input16));
1170     component = Component(0, static_cast<int>(input16.length()));
1171 
1172     std::string out_str2;
1173     StdStringCanonOutput output2(&out_str2);
1174     CanonicalizeIPAddress(input16.c_str(), component, &output2, &host_info);
1175     output2.Complete();
1176 
1177     EXPECT_EQ(cases[i].expected_family, host_info.family);
1178     EXPECT_EQ(cases[i].expected_address_hex,
1179               base::HexEncode(host_info.address,
1180                               static_cast<size_t>(host_info.AddressLength())));
1181     if (host_info.family == CanonHostInfo::IPV6) {
1182       EXPECT_STREQ(cases[i].expected, out_str2.c_str());
1183       EXPECT_EQ(cases[i].expected_component.begin, host_info.out_host.begin);
1184       EXPECT_EQ(cases[i].expected_component.len, host_info.out_host.len);
1185     }
1186   }
1187 }
1188 
// Checks that CanonicalizeIPAddress() does not report an IP address when the
// host component selects none of the spec, even though the spec text itself
// looks like an IPv4 address.
TEST_F(URLCanonTest, IPEmpty) {
  const char spec[] = "192.168.0.1";

  std::string canonical;
  StdStringCanonOutput sink(&canonical);
  CanonHostInfo info;

  // Default-constructed component.
  CanonicalizeIPAddress(spec, Component(), &sink, &info);
  EXPECT_FALSE(info.IsIPAddress());

  // Explicit zero-length component at offset 0.
  CanonicalizeIPAddress(spec, Component(0, 0), &sink, &info);
  EXPECT_FALSE(info.IsIPAddress());
}
1202 
1203 // Verifies that CanonicalizeHostSubstring produces the expected output and
1204 // does not "fix" IP addresses. Because this code is a subset of
1205 // CanonicalizeHost, the shared functionality is not tested.
TEST_F(URLCanonTest,CanonicalizeHostSubstring)1206 TEST_F(URLCanonTest, CanonicalizeHostSubstring) {
1207   // Basic sanity check.
1208   {
1209     std::string out_str;
1210     StdStringCanonOutput output(&out_str);
1211     EXPECT_TRUE(CanonicalizeHostSubstring("M\xc3\x9cNCHEN.com",
1212                                           Component(0, 12), &output));
1213     output.Complete();
1214     EXPECT_EQ("xn--mnchen-3ya.com", out_str);
1215   }
1216 
1217   // Failure case.
1218   {
1219     std::string out_str;
1220     StdStringCanonOutput output(&out_str);
1221     EXPECT_FALSE(CanonicalizeHostSubstring(
1222         test_utils::TruncateWStringToUTF16(L"\xfdd0zyx.com").c_str(),
1223         Component(0, 8), &output));
1224     output.Complete();
1225     EXPECT_EQ("%EF%B7%90zyx.com", out_str);
1226   }
1227 
1228   // Should return true for empty input strings.
1229   {
1230     std::string out_str;
1231     StdStringCanonOutput output(&out_str);
1232     EXPECT_TRUE(CanonicalizeHostSubstring("", Component(0, 0), &output));
1233     output.Complete();
1234     EXPECT_EQ(std::string(), out_str);
1235   }
1236 
1237   // Numbers that look like IP addresses should not be changed.
1238   {
1239     std::string out_str;
1240     StdStringCanonOutput output(&out_str);
1241     EXPECT_TRUE(
1242         CanonicalizeHostSubstring("01.02.03.04", Component(0, 11), &output));
1243     output.Complete();
1244     EXPECT_EQ("01.02.03.04", out_str);
1245   }
1246 }
1247 
TEST_F(URLCanonTest,UserInfo)1248 TEST_F(URLCanonTest, UserInfo) {
1249   // Note that the canonicalizer should escape and treat empty components as
1250   // not being there.
1251 
1252   // We actually parse a full input URL so we can get the initial components.
1253   struct UserComponentCase {
1254     const char* input;
1255     const char* expected;
1256     Component expected_username;
1257     Component expected_password;
1258     bool expected_success;
1259   } user_info_cases[] = {
1260     {"http://user:pass@host.com/", "user:pass@", Component(0, 4), Component(5, 4), true},
1261     {"http://@host.com/", "", Component(0, -1), Component(0, -1), true},
1262     {"http://:@host.com/", "", Component(0, -1), Component(0, -1), true},
1263     {"http://foo:@host.com/", "foo@", Component(0, 3), Component(0, -1), true},
1264     {"http://:foo@host.com/", ":foo@", Component(0, 0), Component(1, 3), true},
1265     {"http://^ :$\t@host.com/", "%5E%20:$%09@", Component(0, 6), Component(7, 4), true},
1266     {"http://user:pass@/", "user:pass@", Component(0, 4), Component(5, 4), true},
1267     {"http://%2540:bar@domain.com/", "%2540:bar@", Component(0, 5), Component(6, 3), true },
1268 
1269       // IE7 compatibility: old versions allowed backslashes in usernames, but
1270       // IE7 does not. We disallow it as well.
1271     {"ftp://me\\mydomain:pass@foo.com/", "", Component(0, -1), Component(0, -1), true},
1272   };
1273 
1274   for (const auto& user_info_case : user_info_cases) {
1275     Parsed parsed = ParseStandardURL(user_info_case.input);
1276     Component out_user, out_pass;
1277     std::string out_str;
1278     StdStringCanonOutput output1(&out_str);
1279 
1280     bool success = CanonicalizeUserInfo(user_info_case.input, parsed.username,
1281                                         user_info_case.input, parsed.password,
1282                                         &output1, &out_user, &out_pass);
1283     output1.Complete();
1284 
1285     EXPECT_EQ(user_info_case.expected_success, success);
1286     EXPECT_EQ(user_info_case.expected, out_str);
1287     EXPECT_EQ(user_info_case.expected_username.begin, out_user.begin);
1288     EXPECT_EQ(user_info_case.expected_username.len, out_user.len);
1289     EXPECT_EQ(user_info_case.expected_password.begin, out_pass.begin);
1290     EXPECT_EQ(user_info_case.expected_password.len, out_pass.len);
1291 
1292     // Now try the wide version
1293     out_str.clear();
1294     StdStringCanonOutput output2(&out_str);
1295     std::u16string wide_input(base::UTF8ToUTF16(user_info_case.input));
1296     success = CanonicalizeUserInfo(wide_input.c_str(),
1297                                    parsed.username,
1298                                    wide_input.c_str(),
1299                                    parsed.password,
1300                                    &output2,
1301                                    &out_user,
1302                                    &out_pass);
1303     output2.Complete();
1304 
1305     EXPECT_EQ(user_info_case.expected_success, success);
1306     EXPECT_EQ(user_info_case.expected, out_str);
1307     EXPECT_EQ(user_info_case.expected_username.begin, out_user.begin);
1308     EXPECT_EQ(user_info_case.expected_username.len, out_user.len);
1309     EXPECT_EQ(user_info_case.expected_password.begin, out_pass.begin);
1310     EXPECT_EQ(user_info_case.expected_password.len, out_pass.len);
1311   }
1312 }
1313 
// Exercises CanonicalizePort() with 8-bit and 16-bit input, checking the
// canonical text, the reported output component and the success flag.
TEST_F(URLCanonTest, Port) {
  // We only need to test that the number gets properly put into the output
  // buffer. The parser unit tests will test scanning the number correctly.
  //
  // Note that whenever a port is written, the expected output starts with a
  // colon separator, so the expected component begins at offset 1.
  struct PortCase {
    const char* input;
    int default_port;
    const char* expected;
    Component expected_component;
    bool expected_success;
  } port_cases[] = {
      // Invalid input should be copied w/ failure.
      {"as df", 80, ":as%20df", Component(1, 7), false},
      {"-2", 80, ":-2", Component(1, 2), false},
      // Default port should be omitted.
      {"80", 80, "", Component(0, -1), true},
      {"8080", 80, ":8080", Component(1, 4), true},
      // PORT_UNSPECIFIED should mean always keep the port.
      {"80", PORT_UNSPECIFIED, ":80", Component(1, 2), true},
  };

  for (const auto& port_case : port_cases) {
    // Identify the failing table entry in the test log.
    SCOPED_TRACE(port_case.input);
    int url_len = static_cast<int>(strlen(port_case.input));
    Component in_comp(0, url_len);

    // 8-bit input.
    {
      Component out_comp;
      std::string out_str;
      StdStringCanonOutput output(&out_str);
      bool success =
          CanonicalizePort(port_case.input, in_comp, port_case.default_port,
                           &output, &out_comp);
      output.Complete();

      EXPECT_EQ(port_case.expected_success, success);
      EXPECT_EQ(port_case.expected, out_str);
      EXPECT_EQ(port_case.expected_component.begin, out_comp.begin);
      EXPECT_EQ(port_case.expected_component.len, out_comp.len);
    }

    // Now try the wide version. A fresh output component is used on purpose:
    // reusing the one filled in by the 8-bit pass would let a stale value
    // satisfy the expectations even if this call wrote nothing.
    {
      Component out_comp;
      std::string out_str;
      StdStringCanonOutput output(&out_str);
      std::u16string wide_input(base::UTF8ToUTF16(port_case.input));
      bool success =
          CanonicalizePort(wide_input.c_str(), in_comp,
                           port_case.default_port, &output, &out_comp);
      output.Complete();

      EXPECT_EQ(port_case.expected_success, success);
      EXPECT_EQ(port_case.expected, out_str);
      EXPECT_EQ(port_case.expected_component.begin, out_comp.begin);
      EXPECT_EQ(port_case.expected_component.len, out_comp.len);
    }
  }
}
1366 
1367 DualComponentCase kCommonPathCases[] = {
1368     // ----- path collapsing tests -----
1369     {"/././foo", L"/././foo", "/foo", Component(0, 4), true},
1370     {"/./.foo", L"/./.foo", "/.foo", Component(0, 5), true},
1371     {"/foo/.", L"/foo/.", "/foo/", Component(0, 5), true},
1372     {"/foo/./", L"/foo/./", "/foo/", Component(0, 5), true},
1373     // double dots followed by a slash or the end of the string count
1374     {"/foo/bar/..", L"/foo/bar/..", "/foo/", Component(0, 5), true},
1375     {"/foo/bar/../", L"/foo/bar/../", "/foo/", Component(0, 5), true},
1376     // don't count double dots when they aren't followed by a slash
1377     {"/foo/..bar", L"/foo/..bar", "/foo/..bar", Component(0, 10), true},
1378     // some in the middle
1379     {"/foo/bar/../ton", L"/foo/bar/../ton", "/foo/ton", Component(0, 8), true},
1380     {"/foo/bar/../ton/../../a", L"/foo/bar/../ton/../../a", "/a",
1381      Component(0, 2), true},
1382     // we should not be able to go above the root
1383     {"/foo/../../..", L"/foo/../../..", "/", Component(0, 1), true},
1384     {"/foo/../../../ton", L"/foo/../../../ton", "/ton", Component(0, 4), true},
1385     // escaped dots should be unescaped and treated the same as dots
1386     {"/foo/%2e", L"/foo/%2e", "/foo/", Component(0, 5), true},
1387     {"/foo/%2e%2", L"/foo/%2e%2", "/foo/.%2", Component(0, 8), true},
1388     {"/foo/%2e./%2e%2e/.%2e/%2e.bar", L"/foo/%2e./%2e%2e/.%2e/%2e.bar",
1389      "/..bar", Component(0, 6), true},
1390     // Multiple slashes in a row should be preserved and treated like empty
1391     // directory names.
1392     {"////../..", L"////../..", "//", Component(0, 2), true},
1393 
1394     // ----- escaping tests -----
1395     {"/foo", L"/foo", "/foo", Component(0, 4), true},
1396     // Valid escape sequence
1397     {"/%20foo", L"/%20foo", "/%20foo", Component(0, 7), true},
1398     // Invalid escape sequence we should pass through unchanged.
1399     {"/foo%", L"/foo%", "/foo%", Component(0, 5), true},
1400     {"/foo%2", L"/foo%2", "/foo%2", Component(0, 6), true},
1401     // Invalid escape sequence: bad characters should be treated the same as
1402     // the surrounding text, not as escaped (in this case, UTF-8).
1403     {"/foo%2zbar", L"/foo%2zbar", "/foo%2zbar", Component(0, 10), true},
1404     {"/foo%2\xc2\xa9zbar", nullptr, "/foo%2%C2%A9zbar", Component(0, 16), true},
1405     {nullptr, L"/foo%2\xc2\xa9zbar", "/foo%2%C3%82%C2%A9zbar", Component(0, 22),
1406      true},
1407     // Regular characters that are escaped should remain escaped
1408     {"/foo%41%7a", L"/foo%41%7a", "/foo%41%7a", Component(0, 10), true},
1409     // Funny characters that are unescaped should be escaped
1410     {"/foo\x09\x91%91", nullptr, "/foo%09%91%91", Component(0, 13), true},
1411     {nullptr, L"/foo\x09\x91%91", "/foo%09%C2%91%91", Component(0, 16), true},
1412     // %00 should not cause failures.
1413     {"/foo%00%51", L"/foo%00%51", "/foo%00%51", Component(0, 10), true},
1414     // Some characters should be passed through unchanged regardless of esc.
1415     {"/(%28:%3A%29)", L"/(%28:%3A%29)", "/(%28:%3A%29)", Component(0, 13),
1416      true},
1417     // Characters that are properly escaped should not have the case changed
1418     // of hex letters.
1419     {"/%3A%3a%3C%3c", L"/%3A%3a%3C%3c", "/%3A%3a%3C%3c", Component(0, 13),
1420      true},
1421     // Funny characters that are unescaped should be escaped
1422     {"/foo\tbar", L"/foo\tbar", "/foo%09bar", Component(0, 10), true},
1423     // Hashes found in paths (possibly only when the caller explicitly sets
1424     // the path on an already-parsed URL) should be escaped.
1425     {"/foo#bar", L"/foo#bar", "/foo%23bar", Component(0, 10), true},
1426     // %7f should be allowed and %3D should not be unescaped (these were wrong
1427     // in a previous version).
1428     {"/%7Ffp3%3Eju%3Dduvgw%3Dd", L"/%7Ffp3%3Eju%3Dduvgw%3Dd",
1429      "/%7Ffp3%3Eju%3Dduvgw%3Dd", Component(0, 24), true},
1430     // @ should be passed through unchanged (escaped or unescaped).
1431     {"/@asdf%40", L"/@asdf%40", "/@asdf%40", Component(0, 9), true},
1432     // Nested escape sequences no longer happen. See https://crbug.com/1252531.
1433     {"/%A%42", L"/%A%42", "/%A%42", Component(0, 6), true},
1434     {"/%%41B", L"/%%41B", "/%%41B", Component(0, 6), true},
1435     {"/%%41%42", L"/%%41%42", "/%%41%42", Component(0, 8), true},
1436     // Make sure truncated "nested" escapes don't result in reading off the
1437     // string end.
1438     {"/%%41", L"/%%41", "/%%41", Component(0, 5), true},
1439     // Don't unescape the leading '%' if unescaping doesn't result in a valid
1440     // new escape sequence.
1441     {"/%%470", L"/%%470", "/%%470", Component(0, 6), true},
1442     {"/%%2D%41", L"/%%2D%41", "/%%2D%41", Component(0, 8), true},
1443     // Don't erroneously downcast a UTF-16 character in a way that makes it
1444     // look like part of an escape sequence.
1445     {nullptr, L"/%%41\x0130", "/%%41%C4%B0", Component(0, 11), true},
1446 
1447     // ----- encoding tests -----
1448     // Basic conversions
1449     {"/\xe4\xbd\xa0\xe5\xa5\xbd\xe4\xbd\xa0\xe5\xa5\xbd",
1450      L"/\x4f60\x597d\x4f60\x597d", "/%E4%BD%A0%E5%A5%BD%E4%BD%A0%E5%A5%BD",
1451      Component(0, 37), true},
1452     // Unicode Noncharacter (U+FDD0) should not fail.
1453     {"/\xef\xb7\x90zyx", nullptr, "/%EF%B7%90zyx", Component(0, 13), true},
1454     {nullptr, L"/\xfdd0zyx", "/%EF%B7%90zyx", Component(0, 13), true},
1455 };
1456 
1457 typedef bool (*CanonFunc8Bit)(const char*,
1458                               const Component&,
1459                               CanonOutput*,
1460                               Component*);
1461 typedef bool (*CanonFunc16Bit)(const char16_t*,
1462                                const Component&,
1463                                CanonOutput*,
1464                                Component*);
1465 
DoPathTest(const DualComponentCase * path_cases,size_t num_cases,CanonFunc8Bit canon_func_8,CanonFunc16Bit canon_func_16)1466 void DoPathTest(const DualComponentCase* path_cases,
1467                 size_t num_cases,
1468                 CanonFunc8Bit canon_func_8,
1469                 CanonFunc16Bit canon_func_16) {
1470   for (size_t i = 0; i < num_cases; i++) {
1471     testing::Message scope_message;
1472     scope_message << path_cases[i].input8 << "," << path_cases[i].input16;
1473     SCOPED_TRACE(scope_message);
1474     if (path_cases[i].input8) {
1475       int len = static_cast<int>(strlen(path_cases[i].input8));
1476       Component in_comp(0, len);
1477       Component out_comp;
1478       std::string out_str;
1479       StdStringCanonOutput output(&out_str);
1480       bool success =
1481           canon_func_8(path_cases[i].input8, in_comp, &output, &out_comp);
1482       output.Complete();
1483 
1484       EXPECT_EQ(path_cases[i].expected_success, success);
1485       EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1486       EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1487       EXPECT_EQ(path_cases[i].expected, out_str);
1488     }
1489 
1490     if (path_cases[i].input16) {
1491       std::u16string input16(
1492           test_utils::TruncateWStringToUTF16(path_cases[i].input16));
1493       int len = static_cast<int>(input16.length());
1494       Component in_comp(0, len);
1495       Component out_comp;
1496       std::string out_str;
1497       StdStringCanonOutput output(&out_str);
1498 
1499       bool success =
1500           canon_func_16(input16.c_str(), in_comp, &output, &out_comp);
1501       output.Complete();
1502 
1503       EXPECT_EQ(path_cases[i].expected_success, success);
1504       EXPECT_EQ(path_cases[i].expected_component.begin, out_comp.begin);
1505       EXPECT_EQ(path_cases[i].expected_component.len, out_comp.len);
1506       EXPECT_EQ(path_cases[i].expected, out_str);
1507     }
1508   }
1509 }
1510 
// Runs the shared path cases, an embedded-NULL case and the special-URL-only
// cases through CanonicalizeSpecialPath() for both character widths.
TEST_F(URLCanonTest, SpecialPath) {
  // Common test cases
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizeSpecialPath, CanonicalizeSpecialPath);

  // Manual test: embedded NULLs should be escaped and the URL should be marked
  // as valid.
  const char path_with_null[] = "/ab\0c";
  Component in_comp(0, 5);
  Component out_comp;

  std::string out_str;
  StdStringCanonOutput output(&out_str);
  bool success =
      CanonicalizeSpecialPath(path_with_null, in_comp, &output, &out_comp);
  output.Complete();
  EXPECT_TRUE(success);
  EXPECT_EQ("/ab%00c", out_str);

  // Test cases specific to special URLs.
  DualComponentCase special_path_cases[] = {
      // Canonical path for empty path is a slash.
      {"", L"", "/", Component(0, 1), true},
      // Backslashes should be used as path separators.
      {"\\a\\b", L"\\a\\b", "/a/b", Component(0, 4), true},
      {"/a\\..\\b", L"/a\\..\\b", "/b", Component(0, 2), true},
      {"/a\\.\\b", L"/a\\.\\b", "/a/b", Component(0, 4), true},
  };

  // Use the same CanonicalizeSpecialPath pair as the common cases above so
  // that the 16-bit special-path entry point is the one under test here too.
  DoPathTest(special_path_cases, std::size(special_path_cases),
             CanonicalizeSpecialPath, CanonicalizeSpecialPath);
}
1543 
// Runs the shared path cases plus the non-special-URL-only cases through
// CanonicalizeNonSpecialPath() for both character widths.
TEST_F(URLCanonTest, NonSpecialPath) {
  // Expectations that hold only for non-special URLs.
  DualComponentCase non_special_only_cases[] = {
      // An empty path stays empty.
      {"", L"", "", Component(0, 0), true},
      // Backslashes are kept verbatim rather than acting as separators.
      {"/a\\..\\b", L"/a\\..\\b", "/a\\..\\b", Component(0, 7), true},
      {"/a\\./b", L"/a\\./b", "/a\\./b", Component(0, 6), true},
  };

  // Shared cases first, then the non-special-only ones.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizeNonSpecialPath, CanonicalizeNonSpecialPath);
  DoPathTest(non_special_only_cases, std::size(non_special_only_cases),
             CanonicalizeNonSpecialPath, CanonicalizeNonSpecialPath);
}
1561 
// Runs the shared path cases plus two partial-path-only cases through
// CanonicalizePartialPath() for both character widths.
TEST_F(URLCanonTest, PartialPath) {
  // Shared cases first.
  DoPathTest(kCommonPathCases, std::size(kCommonPathCases),
             CanonicalizePartialPath, CanonicalizePartialPath);

  // Inputs without a leading slash, including the empty string, are expected
  // to come back unchanged.
  DualComponentCase partial_only_cases[] = {
      {".html", L".html", ".html", Component(0, 5), true},
      {"", L"", "", Component(0, 0), true},
  };
  DoPathTest(partial_only_cases, std::size(partial_only_cases),
             CanonicalizePartialPath, CanonicalizePartialPath);
}
1573 
TEST_F(URLCanonTest,Query)1574 TEST_F(URLCanonTest, Query) {
1575   struct QueryCase {
1576     const char* input8;
1577     const wchar_t* input16;
1578     const char* expected;
1579   } query_cases[] = {
1580       // Regular ASCII case.
1581     {"foo=bar", L"foo=bar", "?foo=bar"},
1582       // Allow question marks in the query without escaping
1583     {"as?df", L"as?df", "?as?df"},
1584       // Always escape '#' since it would mark the ref.
1585     {"as#df", L"as#df", "?as%23df"},
1586       // Escape some questionable 8-bit characters, but never unescape.
1587     {"\x02hello\x7f bye", L"\x02hello\x7f bye", "?%02hello%7F%20bye"},
1588     {"%40%41123", L"%40%41123", "?%40%41123"},
1589       // Chinese input/output
1590     {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "?q=%E4%BD%A0%E5%A5%BD"},
1591       // Invalid UTF-8/16 input should be replaced with invalid characters.
1592     {"q=\xed\xed", L"q=\xd800\xd800", "?q=%EF%BF%BD%EF%BF%BD"},
1593       // Don't allow < or > because sometimes they are used for XSS if the
1594       // URL is echoed in content. Firefox does this, IE doesn't.
1595     {"q=<asdf>", L"q=<asdf>", "?q=%3Casdf%3E"},
1596       // Escape double quotemarks in the query.
1597     {"q=\"asdf\"", L"q=\"asdf\"", "?q=%22asdf%22"},
1598   };
1599 
1600   for (const auto& query_case : query_cases) {
1601     Component out_comp;
1602 
1603     if (query_case.input8) {
1604       int len = static_cast<int>(strlen(query_case.input8));
1605       Component in_comp(0, len);
1606       std::string out_str;
1607 
1608       StdStringCanonOutput output(&out_str);
1609       CanonicalizeQuery(query_case.input8, in_comp, nullptr, &output,
1610                         &out_comp);
1611       output.Complete();
1612 
1613       EXPECT_EQ(query_case.expected, out_str);
1614     }
1615 
1616     if (query_case.input16) {
1617       std::u16string input16(
1618           test_utils::TruncateWStringToUTF16(query_case.input16));
1619       int len = static_cast<int>(input16.length());
1620       Component in_comp(0, len);
1621       std::string out_str;
1622 
1623       StdStringCanonOutput output(&out_str);
1624       CanonicalizeQuery(input16.c_str(), in_comp, nullptr, &output, &out_comp);
1625       output.Complete();
1626 
1627       EXPECT_EQ(query_case.expected, out_str);
1628     }
1629   }
1630 
1631   // Extra test for input with embedded NULL;
1632   std::string out_str;
1633   StdStringCanonOutput output(&out_str);
1634   Component out_comp;
1635   CanonicalizeQuery("a \x00z\x01", Component(0, 5), nullptr, &output,
1636                     &out_comp);
1637   output.Complete();
1638   EXPECT_EQ("?a%20%00z%01", out_str);
1639 }
1640 
TEST_F(URLCanonTest,Ref)1641 TEST_F(URLCanonTest, Ref) {
1642   // Refs are trivial, it just checks the encoding.
1643   DualComponentCase ref_cases[] = {
1644       {"hello!", L"hello!", "#hello!", Component(1, 6), true},
1645       // We should escape spaces, double-quotes, angled braces, and backtics.
1646       {"hello, world", L"hello, world", "#hello,%20world", Component(1, 14),
1647        true},
1648       {"hello,\"world", L"hello,\"world", "#hello,%22world", Component(1, 14),
1649        true},
1650       {"hello,<world", L"hello,<world", "#hello,%3Cworld", Component(1, 14),
1651        true},
1652       {"hello,>world", L"hello,>world", "#hello,%3Eworld", Component(1, 14),
1653        true},
1654       {"hello,`world", L"hello,`world", "#hello,%60world", Component(1, 14),
1655        true},
1656       // UTF-8/wide input should be preserved
1657       {"\xc2\xa9", L"\xa9", "#%C2%A9", Component(1, 6), true},
1658       // Test a characer that takes > 16 bits (U+10300 = old italic letter A)
1659       {"\xF0\x90\x8C\x80ss", L"\xd800\xdf00ss", "#%F0%90%8C%80ss",
1660        Component(1, 14), true},
1661       // Escaping should be preserved unchanged, even invalid ones
1662       {"%41%a", L"%41%a", "#%41%a", Component(1, 5), true},
1663       // Invalid UTF-8/16 input should be flagged and the input made valid
1664       {"\xc2", nullptr, "#%EF%BF%BD", Component(1, 9), true},
1665       {nullptr, L"\xd800\x597d", "#%EF%BF%BD%E5%A5%BD", Component(1, 18), true},
1666       // Test a Unicode invalid character.
1667       {"a\xef\xb7\x90", L"a\xfdd0", "#a%EF%B7%90", Component(1, 10), true},
1668       // Refs can have # signs and we should preserve them.
1669       {"asdf#qwer", L"asdf#qwer", "#asdf#qwer", Component(1, 9), true},
1670       {"#asdf", L"#asdf", "##asdf", Component(1, 5), true},
1671   };
1672 
1673   for (const auto& ref_case : ref_cases) {
1674     // 8-bit input
1675     if (ref_case.input8) {
1676       int len = static_cast<int>(strlen(ref_case.input8));
1677       Component in_comp(0, len);
1678       Component out_comp;
1679 
1680       std::string out_str;
1681       StdStringCanonOutput output(&out_str);
1682       CanonicalizeRef(ref_case.input8, in_comp, &output, &out_comp);
1683       output.Complete();
1684 
1685       EXPECT_EQ(ref_case.expected_component.begin, out_comp.begin);
1686       EXPECT_EQ(ref_case.expected_component.len, out_comp.len);
1687       EXPECT_EQ(ref_case.expected, out_str);
1688     }
1689 
1690     // 16-bit input
1691     if (ref_case.input16) {
1692       std::u16string input16(
1693           test_utils::TruncateWStringToUTF16(ref_case.input16));
1694       int len = static_cast<int>(input16.length());
1695       Component in_comp(0, len);
1696       Component out_comp;
1697 
1698       std::string out_str;
1699       StdStringCanonOutput output(&out_str);
1700       CanonicalizeRef(input16.c_str(), in_comp, &output, &out_comp);
1701       output.Complete();
1702 
1703       EXPECT_EQ(ref_case.expected_component.begin, out_comp.begin);
1704       EXPECT_EQ(ref_case.expected_component.len, out_comp.len);
1705       EXPECT_EQ(ref_case.expected, out_str);
1706     }
1707   }
1708 
1709   // Try one with an embedded NULL. It should be stripped.
1710   const char null_input[5] = "ab\x00z";
1711   Component null_input_component(0, 4);
1712   Component out_comp;
1713 
1714   std::string out_str;
1715   StdStringCanonOutput output(&out_str);
1716   CanonicalizeRef(null_input, null_input_component, &output, &out_comp);
1717   output.Complete();
1718 
1719   EXPECT_EQ(1, out_comp.begin);
1720   EXPECT_EQ(6, out_comp.len);
1721   EXPECT_EQ("#ab%00z", out_str);
1722 }
1723 
TEST_F(URLCanonTest,CanonicalizeStandardURL)1724 TEST_F(URLCanonTest, CanonicalizeStandardURL) {
1725   // The individual component canonicalize tests should have caught the cases
1726   // for each of those components. Here, we just need to test that the various
1727   // parts are included or excluded properly, and have the correct separators.
1728   // clang-format off
1729   struct URLCase {
1730     const char* input;
1731     const char* expected;
1732     bool expected_success;
1733   } cases[] = {
1734     {"http://www.google.com/foo?bar=baz#", "http://www.google.com/foo?bar=baz#",
1735      true},
1736 
1737       // Backslashes should get converted to forward slashes.
1738       {"http:\\\\www.google.com\\foo", "http://www.google.com/foo", true},
1739 
1740       // Busted refs shouldn't make the whole thing fail.
1741       {"http://www.google.com/asdf#\xc2",
1742        "http://www.google.com/asdf#%EF%BF%BD", true},
1743 
1744       // Basic port tests.
1745       {"http://foo:80/", "http://foo/", true},
1746       {"http://foo:81/", "http://foo:81/", true},
1747       {"httpa://foo:80/", "httpa://foo:80/", true},
1748       {"http://foo:-80/", "http://foo:-80/", false},
1749 
1750       {"https://foo:443/", "https://foo/", true},
1751       {"https://foo:80/", "https://foo:80/", true},
1752       {"ftp://foo:21/", "ftp://foo/", true},
1753       {"ftp://foo:80/", "ftp://foo:80/", true},
1754       {"gopher://foo:70/", "gopher://foo:70/", true},
1755       {"gopher://foo:443/", "gopher://foo:443/", true},
1756       {"ws://foo:80/", "ws://foo/", true},
1757       {"ws://foo:81/", "ws://foo:81/", true},
1758       {"ws://foo:443/", "ws://foo:443/", true},
1759       {"ws://foo:815/", "ws://foo:815/", true},
1760       {"wss://foo:80/", "wss://foo:80/", true},
1761       {"wss://foo:81/", "wss://foo:81/", true},
1762       {"wss://foo:443/", "wss://foo/", true},
1763       {"wss://foo:815/", "wss://foo:815/", true},
1764 
1765       // This particular code path ends up "backing up" to replace an invalid
1766       // host ICU generated with an escaped version. Test that in the context
1767       // of a full URL to make sure the backing up doesn't mess up the non-host
1768       // parts of the URL. "EF B9 AA" is U+FE6A which is a type of percent that
1769       // ICU will convert to an ASCII one, generating "%81".
1770       {"ws:)W\x1eW\xef\xb9\xaa"
1771        "81:80/",
1772        "ws://)w%1ew%81/", false},
1773       // Regression test for the last_invalid_percent_index bug described in
1774       // https://crbug.com/1080890#c10.
1775       {R"(HTTP:S/5%\../>%41)", "http://s/%3E%41", true},
1776   };
1777   // clang-format on
1778 
1779   for (const auto& i : cases) {
1780     Parsed parsed = ParseStandardURL(i.input);
1781 
1782     Parsed out_parsed;
1783     std::string out_str;
1784     StdStringCanonOutput output(&out_str);
1785     bool success = CanonicalizeStandardURL(
1786         i.input, parsed, SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1787         &output, &out_parsed);
1788     output.Complete();
1789 
1790     EXPECT_EQ(i.expected_success, success);
1791     EXPECT_EQ(i.expected, out_str);
1792   }
1793 }
1794 
TEST_F(URLCanonTest,CanonicalizeNonSpecialURL)1795 TEST_F(URLCanonTest, CanonicalizeNonSpecialURL) {
1796   // The individual component canonicalize tests should have caught the cases
1797   // for each of those components. Here, we just need to test that the various
1798   // parts are included or excluded properly, and have the correct separators.
1799   struct URLCase {
1800     const std::string_view input;
1801     const std::string_view expected;
1802     bool expected_success;
1803   } cases[] = {
1804       // Basic cases.
1805       {"git://host:80/path?a=b#ref", "git://host:80/path?a=b#ref", true},
1806       {"git://host", "git://host", true},
1807       {"git://host/", "git://host/", true},
1808       {"git://HosT/", "git://HosT/", true},
1809       {"git://..", "git://..", true},
1810       {"git://../", "git://../", true},
1811       {"git://../..", "git://../", true},
1812 
1813       // Empty hosts.
1814       {"git://", "git://", true},
1815       {"git:///", "git:///", true},
1816       {"git:////", "git:////", true},
1817       {"git:///a", "git:///a", true},
1818       {"git:///a/../b", "git:///b", true},
1819       {"git:///..", "git:///", true},
1820 
1821       // No hosts.
1822       {"git:/", "git:/", true},
1823       {"git:/a", "git:/a", true},
1824       {"git:/a/../b", "git:/b", true},
1825       {"git:/..", "git:/", true},
1826       {"git:/../", "git:/", true},
1827       {"git:/../..", "git:/", true},
1828       {"git:/.//a", "git:/.//a", true},
1829 
1830       // Users.
1831       {"git://@host", "git://host", true},
1832       {"git:// @host", "git://%20@host", true},
1833       {"git://\\@host", "git://%5C@host", true},
1834 
1835       // Paths.
1836       {"git://host/path", "git://host/path", true},
1837       {"git://host/p ath", "git://host/p%20ath", true},
1838       {"git://host/a/../b", "git://host/b", true},
1839       {"git://host/..", "git://host/", true},
1840       {"git://host/../", "git://host/", true},
1841       {"git://host/../..", "git://host/", true},
1842       {"git://host/.", "git://host/", true},
1843       {"git://host/./", "git://host/", true},
1844       {"git://host/./.", "git://host/", true},
1845       // Backslashes.
1846       {"git://host/a\\..\\b", "git://host/a\\..\\b", true},
1847 
1848       // IPv6.
1849       {"git://[1:2:0:0:5:0:0:0]", "git://[1:2:0:0:5::]", true},
1850       {"git://[1:2:0:0:5:0:0:0]/", "git://[1:2:0:0:5::]/", true},
1851       {"git://[1:2:0:0:5:0:0:0]/path", "git://[1:2:0:0:5::]/path", true},
1852 
1853       // IPv4 is unsupported.
1854       {"git://127.00.0.1", "git://127.00.0.1", true},
1855       {"git://127.1000.0.1", "git://127.1000.0.1", true},
1856 
1857       // Invalid URLs.
1858       {"git://@", "git://", false},
1859       // Forbidden host code points.
1860       {"git://<", "git://", false},
1861       {"git:// /", "git:///", false},
1862       // Backslashes cannot be used as host terminators.
1863       {"git://host\\a/../b", "git://host/b", false},
1864 
1865       // Opaque paths.
1866       {"git:", "git:", true},
1867       {"git:opaque", "git:opaque", true},
1868       {"git:o p a q u e", "git:o p a q u e", true},
1869       {"git: <", "git: <", true},
1870       {"git:opaque/a/../b", "git:opaque/a/../b", true},
1871       {"git:opaque\\a\\..\\b", "git:opaque\\a\\..\\b", true},
1872       {"git:\\a", "git:\\a", true},
1873       // Like URNs.
1874       {"git:a:b:c:123", "git:a:b:c:123", true},
1875   };
1876 
1877   for (const auto& i : cases) {
1878     SCOPED_TRACE(i.input);
1879     Parsed parsed = ParseNonSpecialURL(i.input);
1880     Parsed out_parsed;
1881     std::string out_str;
1882     StdStringCanonOutput output(&out_str);
1883     bool success = CanonicalizeNonSpecialURL(
1884         i.input.data(), i.input.size(), parsed,
1885         /*query_converter=*/nullptr, output, out_parsed);
1886     output.Complete();
1887     EXPECT_EQ(success, i.expected_success);
1888     EXPECT_EQ(out_str, i.expected);
1889   }
1890 }
1891 
// Checks the Parsed structure that CanonicalizeNonSpecialURL() fills in: the
// host component and the total length reported by Length().
TEST_F(URLCanonTest, CanonicalizeNonSpecialURLOutputParsed) {
  struct URLCase {
    const std::string_view input;
    // Only the host and the overall length are verified for now.
    Component expected_output_parsed_host;
    int expected_output_parsed_length;
  } cases[] = {
      {"git:", Component(), 4},
      {"git:opaque", Component(), 10},
      {"git:/", Component(), 5},
      {"git://", Component(6, 0), 6},
      {"git:///", Component(6, 0), 7},
      // The length of "[1:2:0:0:5::]" is 13.
      {"git://[1:2:0:0:5:0:0:0]/", Component(6, 13), 20},
  };

  for (const auto& url_case : cases) {
    // Label any failure with the input that produced it.
    SCOPED_TRACE(url_case.input);

    Parsed parsed = ParseNonSpecialURL(url_case.input);

    // The canonical text is required by the API but not inspected here.
    std::string ignored_text;
    StdStringCanonOutput ignored_sink(&ignored_text);
    Parsed canonical_parsed;
    const bool ok = CanonicalizeNonSpecialURL(
        url_case.input.data(), url_case.input.size(), parsed,
        /*query_converter=*/nullptr, ignored_sink, canonical_parsed);

    // Every input in the table must canonicalize successfully.
    ASSERT_TRUE(ok);
    EXPECT_EQ(canonical_parsed.host, url_case.expected_output_parsed_host);
    EXPECT_EQ(canonical_parsed.Length(),
              url_case.expected_output_parsed_length);
  }
}
1923 
1924 // The codepath here is the same as for regular canonicalization, so we just
1925 // need to test that things are replaced or not correctly.
TEST_F(URLCanonTest,ReplaceStandardURL)1926 TEST_F(URLCanonTest, ReplaceStandardURL) {
1927   ReplaceCase replace_cases[] = {
1928       // Common case of truncating the path.
1929       {"http://www.google.com/foo?bar=baz#ref", nullptr, nullptr, nullptr,
1930        nullptr, nullptr, "/", kDeleteComp, kDeleteComp,
1931        "http://www.google.com/"},
1932       // Replace everything
1933       {"http://a:b@google.com:22/foo;bar?baz@cat", "https", "me", "pw",
1934        "host.com", "99", "/path", "query", "ref",
1935        "https://me:pw@host.com:99/path?query#ref"},
1936       // Replace nothing
1937       {"http://a:b@google.com:22/foo?baz@cat", nullptr, nullptr, nullptr,
1938        nullptr, nullptr, nullptr, nullptr, nullptr,
1939        "http://a:b@google.com:22/foo?baz@cat"},
1940       // Replace scheme with filesystem. The result is garbage, but you asked
1941       // for it.
1942       {"http://a:b@google.com:22/foo?baz@cat", "filesystem", nullptr, nullptr,
1943        nullptr, nullptr, nullptr, nullptr, nullptr,
1944        "filesystem://a:b@google.com:22/foo?baz@cat"},
1945   };
1946 
1947   for (const auto& replace_case : replace_cases) {
1948     const ReplaceCase& cur = replace_case;
1949     Parsed parsed = ParseStandardURL(cur.base);
1950 
1951     Replacements<char> r;
1952     typedef Replacements<char> R;  // Clean up syntax.
1953 
1954     // Note that for the scheme we pass in a different clear function since
1955     // there is no function to clear the scheme.
1956     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
1957     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
1958     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
1959     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
1960     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
1961     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
1962     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
1963     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
1964 
1965     std::string out_str;
1966     StdStringCanonOutput output(&out_str);
1967     Parsed out_parsed;
1968     ReplaceStandardURL(replace_case.base, parsed, r,
1969                        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1970                        &output, &out_parsed);
1971     output.Complete();
1972 
1973     EXPECT_EQ(replace_case.expected, out_str);
1974   }
1975 
1976   // The path pointer should be ignored if the address is invalid.
1977   {
1978     const char src[] = "http://www.google.com/here_is_the_path";
1979     Parsed parsed = ParseStandardURL(src);
1980 
1981     // Replace the path to 0 length string. By using 1 as the string address,
1982     // the test should get an access violation if it tries to dereference it.
1983     Replacements<char> r;
1984     r.SetPath(reinterpret_cast<char*>(0x00000001), Component(0, 0));
1985     std::string out_str1;
1986     StdStringCanonOutput output1(&out_str1);
1987     Parsed new_parsed;
1988     ReplaceStandardURL(src, parsed, r,
1989                        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
1990                        &output1, &new_parsed);
1991     output1.Complete();
1992     EXPECT_STREQ("http://www.google.com/", out_str1.c_str());
1993 
1994     // Same with an "invalid" path.
1995     r.SetPath(reinterpret_cast<char*>(0x00000001), Component());
1996     std::string out_str2;
1997     StdStringCanonOutput output2(&out_str2);
1998     ReplaceStandardURL(src, parsed, r,
1999                        SCHEME_WITH_HOST_PORT_AND_USER_INFORMATION, nullptr,
2000                        &output2, &new_parsed);
2001     output2.Complete();
2002     EXPECT_STREQ("http://www.google.com/", out_str2.c_str());
2003   }
2004 }
2005 
TEST_F(URLCanonTest,ReplaceFileURL)2006 TEST_F(URLCanonTest, ReplaceFileURL) {
2007   ReplaceCase replace_cases[] = {
2008       // Replace everything
2009       {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, "filer", nullptr,
2010        "/foo", "b", "c", "file://filer/foo?b#c"},
2011       // Replace nothing
2012       {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
2013        nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
2014       {"file:///Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2015        nullptr, nullptr, "file:///Y:"},
2016       {"file:///Y:/", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2017        nullptr, nullptr, "file:///Y:/"},
2018       {"file:///./Y", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2019        nullptr, nullptr, "file:///Y"},
2020       {"file:///./Y:", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2021        nullptr, nullptr, "file:///Y:"},
2022       // Clear non-path components (common)
2023       {"file:///C:/gaba?query#ref", nullptr, nullptr, nullptr, nullptr, nullptr,
2024        nullptr, kDeleteComp, kDeleteComp, "file:///C:/gaba"},
2025       // Replace path with something that doesn't begin with a slash and make
2026       // sure it gets added properly.
2027       {"file:///C:/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
2028        "interesting/", nullptr, nullptr, "file:///interesting/"},
2029       {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, "filer",
2030        nullptr, "/foo", "b", "c", "file://filer/foo?b#c"},
2031       {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
2032        nullptr, nullptr, nullptr, nullptr, "file:///home/gaba?query#ref"},
2033       {"file:///home/gaba?query#ref", nullptr, nullptr, nullptr, nullptr,
2034        nullptr, nullptr, kDeleteComp, kDeleteComp, "file:///home/gaba"},
2035       {"file:///home/gaba", nullptr, nullptr, nullptr, nullptr, nullptr,
2036        "interesting/", nullptr, nullptr, "file:///interesting/"},
2037       // Replace scheme -- shouldn't do anything.
2038       {"file:///C:/gaba?query#ref", "http", nullptr, nullptr, nullptr, nullptr,
2039        nullptr, nullptr, nullptr, "file:///C:/gaba?query#ref"},
2040   };
2041 
2042   for (const auto& replace_case : replace_cases) {
2043     const ReplaceCase& cur = replace_case;
2044     SCOPED_TRACE(cur.base);
2045     Parsed parsed = ParseFileURL(cur.base);
2046 
2047     Replacements<char> r;
2048     typedef Replacements<char> R;  // Clean up syntax.
2049     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
2050     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
2051     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
2052     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
2053     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
2054     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
2055     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
2056     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
2057 
2058     std::string out_str;
2059     StdStringCanonOutput output(&out_str);
2060     Parsed out_parsed;
2061     ReplaceFileURL(cur.base, parsed, r, nullptr, &output, &out_parsed);
2062     output.Complete();
2063 
2064     EXPECT_EQ(replace_case.expected, out_str);
2065   }
2066 }
2067 
TEST_F(URLCanonTest,ReplaceFileSystemURL)2068 TEST_F(URLCanonTest, ReplaceFileSystemURL) {
2069   ReplaceCase replace_cases[] = {
2070       // Replace everything in the outer URL.
2071       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2072        nullptr, nullptr, "/foo", "b", "c",
2073        "filesystem:file:///temporary/foo?b#c"},
2074       // Replace nothing
2075       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2076        nullptr, nullptr, nullptr, nullptr, nullptr,
2077        "filesystem:file:///temporary/gaba?query#ref"},
2078       // Clear non-path components (common)
2079       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2080        nullptr, nullptr, nullptr, kDeleteComp, kDeleteComp,
2081        "filesystem:file:///temporary/gaba"},
2082       // Replace path with something that doesn't begin with a slash and make
2083       // sure it gets added properly.
2084       {"filesystem:file:///temporary/gaba?query#ref", nullptr, nullptr, nullptr,
2085        nullptr, nullptr, "interesting/", nullptr, nullptr,
2086        "filesystem:file:///temporary/interesting/?query#ref"},
2087       // Replace scheme -- shouldn't do anything except canonicalize.
2088       {"filesystem:http://u:p@bar.com/t/gaba?query#ref", "http", nullptr,
2089        nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2090        "filesystem:http://bar.com/t/gaba?query#ref"},
2091       // Replace username -- shouldn't do anything except canonicalize.
2092       {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, "u2", nullptr,
2093        nullptr, nullptr, nullptr, nullptr, nullptr,
2094        "filesystem:http://bar.com/t/gaba?query#ref"},
2095       // Replace password -- shouldn't do anything except canonicalize.
2096       {"filesystem:http://u:p@bar.com/t/gaba?query#ref", nullptr, nullptr,
2097        "pw2", nullptr, nullptr, nullptr, nullptr, nullptr,
2098        "filesystem:http://bar.com/t/gaba?query#ref"},
2099       // Replace host -- shouldn't do anything except canonicalize.
2100       {"filesystem:http://u:p@bar.com:80/t/gaba?query#ref", nullptr, nullptr,
2101        nullptr, "foo.com", nullptr, nullptr, nullptr, nullptr,
2102        "filesystem:http://bar.com/t/gaba?query#ref"},
2103       // Replace port -- shouldn't do anything except canonicalize.
2104       {"filesystem:http://u:p@bar.com:40/t/gaba?query#ref", nullptr, nullptr,
2105        nullptr, nullptr, "41", nullptr, nullptr, nullptr,
2106        "filesystem:http://bar.com:40/t/gaba?query#ref"},
2107   };
2108 
2109   for (const auto& replace_case : replace_cases) {
2110     const ReplaceCase& cur = replace_case;
2111     Parsed parsed = ParseFileSystemURL(cur.base);
2112 
2113     Replacements<char> r;
2114     typedef Replacements<char> R;  // Clean up syntax.
2115     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
2116     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
2117     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
2118     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
2119     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
2120     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
2121     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
2122     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
2123 
2124     std::string out_str;
2125     StdStringCanonOutput output(&out_str);
2126     Parsed out_parsed;
2127     ReplaceFileSystemURL(cur.base, parsed, r, nullptr, &output, &out_parsed);
2128     output.Complete();
2129 
2130     EXPECT_EQ(replace_case.expected, out_str);
2131   }
2132 }
2133 
TEST_F(URLCanonTest, ReplacePathURL) {
  // Table of path-URL replacements. Each row holds the base URL, the
  // per-component replacement values, and the expected output.
  ReplaceCase replace_cases[] = {
      // Replace everything
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr,
       "alert('foo?');", nullptr, nullptr, "javascript:alert('foo?');"},
      // Replace nothing
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "data:foo"},
      // Replace one or the other
      {"data:foo", "javascript", nullptr, nullptr, nullptr, nullptr, nullptr,
       nullptr, nullptr, "javascript:foo"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, "bar", nullptr,
       nullptr, "data:bar"},
      {"data:foo", nullptr, nullptr, nullptr, nullptr, nullptr, kDeleteComp,
       nullptr, nullptr, "data:"},
  };

  using R = Replacements<char>;  // Clean up syntax.

  for (const ReplaceCase& cur : replace_cases) {
    // Label any failure below with the base URL of the offending table row.
    SCOPED_TRACE(cur.base);

    // The scheme has no dedicated clear function, so ClearRef is passed as a
    // stand-in for it.
    R r;
    SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
    SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
    SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
    SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
    SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
    SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
    SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
    SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);

    std::string out_str;
    StdStringCanonOutput output(&out_str);
    Parsed out_parsed;
    ReplacePathURL(cur.base, ParsePathURL(cur.base, false), r, &output,
                   &out_parsed);
    output.Complete();

    EXPECT_EQ(cur.expected, out_str);
  }
}
2175 
TEST_F(URLCanonTest,ReplaceMailtoURL)2176 TEST_F(URLCanonTest, ReplaceMailtoURL) {
2177   ReplaceCase replace_cases[] = {
2178       // Replace everything
2179       {"mailto:jon@foo.com?body=sup", "mailto", nullptr, nullptr, nullptr,
2180        nullptr, "addr1", "to=tony", nullptr, "mailto:addr1?to=tony"},
2181       // Replace nothing
2182       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2183        nullptr, nullptr, nullptr, nullptr, "mailto:jon@foo.com?body=sup"},
2184       // Replace the path
2185       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2186        nullptr, "jason", nullptr, nullptr, "mailto:jason?body=sup"},
2187       // Replace the query
2188       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2189        nullptr, nullptr, "custom=1", nullptr, "mailto:jon@foo.com?custom=1"},
2190       // Replace the path and query
2191       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2192        nullptr, "jason", "custom=1", nullptr, "mailto:jason?custom=1"},
2193       // Set the query to empty (should leave trailing question mark)
2194       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2195        nullptr, nullptr, "", nullptr, "mailto:jon@foo.com?"},
2196       // Clear the query
2197       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2198        nullptr, nullptr, "|", nullptr, "mailto:jon@foo.com"},
2199       // Clear the path
2200       {"mailto:jon@foo.com?body=sup", nullptr, nullptr, nullptr, nullptr,
2201        nullptr, "|", nullptr, nullptr, "mailto:?body=sup"},
2202       // Clear the path + query
2203       {"mailto:", nullptr, nullptr, nullptr, nullptr, nullptr, "|", "|",
2204        nullptr, "mailto:"},
2205       // Setting the ref should have no effect
2206       {"mailto:addr1", nullptr, nullptr, nullptr, nullptr, nullptr, nullptr,
2207        nullptr, "BLAH", "mailto:addr1"},
2208   };
2209 
2210   for (const auto& replace_case : replace_cases) {
2211     const ReplaceCase& cur = replace_case;
2212     Parsed parsed = ParseMailtoURL(cur.base);
2213 
2214     Replacements<char> r;
2215     typedef Replacements<char> R;
2216     SetupReplComp(&R::SetScheme, &R::ClearRef, &r, cur.scheme);
2217     SetupReplComp(&R::SetUsername, &R::ClearUsername, &r, cur.username);
2218     SetupReplComp(&R::SetPassword, &R::ClearPassword, &r, cur.password);
2219     SetupReplComp(&R::SetHost, &R::ClearHost, &r, cur.host);
2220     SetupReplComp(&R::SetPort, &R::ClearPort, &r, cur.port);
2221     SetupReplComp(&R::SetPath, &R::ClearPath, &r, cur.path);
2222     SetupReplComp(&R::SetQuery, &R::ClearQuery, &r, cur.query);
2223     SetupReplComp(&R::SetRef, &R::ClearRef, &r, cur.ref);
2224 
2225     std::string out_str;
2226     StdStringCanonOutput output(&out_str);
2227     Parsed out_parsed;
2228     ReplaceMailtoURL(cur.base, parsed, r, &output, &out_parsed);
2229     output.Complete();
2230 
2231     EXPECT_EQ(replace_case.expected, out_str);
2232   }
2233 }
2234 
TEST_F(URLCanonTest,CanonicalizeFileURL)2235 TEST_F(URLCanonTest, CanonicalizeFileURL) {
2236   struct URLCase {
2237     const char* input;
2238     const char* expected;
2239     bool expected_success;
2240     Component expected_host;
2241     Component expected_path;
2242   } cases[] = {
2243 #ifdef _WIN32
2244       // Windows-style paths
2245       {"file:c:\\foo\\bar.html", "file:///C:/foo/bar.html", true, Component(),
2246        Component(7, 16)},
2247       {"  File:c|////foo\\bar.html", "file:///C:////foo/bar.html", true,
2248        Component(), Component(7, 19)},
2249       {"file:", "file:///", true, Component(), Component(7, 1)},
2250       {"file:UNChost/path", "file://unchost/path", true, Component(7, 7),
2251        Component(14, 5)},
2252       // CanonicalizeFileURL supports absolute Windows style paths for IE
2253       // compatibility. Note that the caller must decide that this is a file
2254       // URL itself so it can call the file canonicalizer. This is usually
2255       // done automatically as part of relative URL resolving.
2256       {"c:\\foo\\bar", "file:///C:/foo/bar", true, Component(),
2257        Component(7, 11)},
2258       {"C|/foo/bar", "file:///C:/foo/bar", true, Component(), Component(7, 11)},
2259       {"/C|\\foo\\bar", "file:///C:/foo/bar", true, Component(),
2260        Component(7, 11)},
2261       {"//C|/foo/bar", "file:///C:/foo/bar", true, Component(),
2262        Component(7, 11)},
2263       {"//server/file", "file://server/file", true, Component(7, 6),
2264        Component(13, 5)},
2265       {"\\\\server\\file", "file://server/file", true, Component(7, 6),
2266        Component(13, 5)},
2267       {"/\\server/file", "file://server/file", true, Component(7, 6),
2268        Component(13, 5)},
2269       // We should preserve the number of slashes after the colon for IE
2270       // compatibility, except when there is none, in which case we should
2271       // add one.
2272       {"file:c:foo/bar.html", "file:///C:/foo/bar.html", true, Component(),
2273        Component(7, 16)},
2274       {"file:/\\/\\C:\\\\//foo\\bar.html", "file:///C:////foo/bar.html", true,
2275        Component(), Component(7, 19)},
2276       // Three slashes should be non-UNC, even if there is no drive spec (IE
2277       // does this, which makes the resulting request invalid).
2278       {"file:///foo/bar.txt", "file:///foo/bar.txt", true, Component(),
2279        Component(7, 12)},
2280       // TODO(brettw) we should probably fail for invalid host names, which
2281       // would change the expected result on this test. We also currently allow
2282       // colon even though it's probably invalid, because its currently the
2283       // "natural" result of the way the canonicalizer is written. There doesn't
2284       // seem to be a strong argument for why allowing it here would be bad, so
2285       // we just tolerate it and the load will fail later.
2286       {"FILE:/\\/\\7:\\\\//foo\\bar.html", "file://7:////foo/bar.html", false,
2287        Component(7, 2), Component(9, 16)},
2288       {"file:filer/home\\me", "file://filer/home/me", true, Component(7, 5),
2289        Component(12, 8)},
2290       // Make sure relative paths can't go above the "C:"
2291       {"file:///C:/foo/../../../bar.html", "file:///C:/bar.html", true,
2292        Component(), Component(7, 12)},
2293       // Busted refs shouldn't make the whole thing fail.
2294       {"file:///C:/asdf#\xc2", "file:///C:/asdf#%EF%BF%BD", true, Component(),
2295        Component(7, 8)},
2296       {"file:///./s:", "file:///S:", true, Component(), Component(7, 3)},
2297 #else
2298       // Unix-style paths
2299       {"file:///home/me", "file:///home/me", true, Component(),
2300        Component(7, 8)},
2301       // Windowsy ones should get still treated as Unix-style.
2302       {"file:c:\\foo\\bar.html", "file:///c:/foo/bar.html", true, Component(),
2303        Component(7, 16)},
2304       {"file:c|//foo\\bar.html", "file:///c%7C//foo/bar.html", true,
2305        Component(), Component(7, 19)},
2306       {"file:///./s:", "file:///s:", true, Component(), Component(7, 3)},
2307       // file: tests from WebKit (LayoutTests/fast/loader/url-parse-1.html)
2308       {"//", "file:///", true, Component(), Component(7, 1)},
2309       {"///", "file:///", true, Component(), Component(7, 1)},
2310       {"///test", "file:///test", true, Component(), Component(7, 5)},
2311       {"file://test", "file://test/", true, Component(7, 4), Component(11, 1)},
2312       {"file://localhost", "file://localhost/", true, Component(7, 9),
2313        Component(16, 1)},
2314       {"file://localhost/", "file://localhost/", true, Component(7, 9),
2315        Component(16, 1)},
2316       {"file://localhost/test", "file://localhost/test", true, Component(7, 9),
2317        Component(16, 5)},
2318 #endif  // _WIN32
2319   };
2320 
2321   for (const auto& i : cases) {
2322     Parsed parsed = ParseFileURL(i.input);
2323 
2324     Parsed out_parsed;
2325     std::string out_str;
2326     StdStringCanonOutput output(&out_str);
2327     bool success =
2328         CanonicalizeFileURL(i.input, static_cast<int>(strlen(i.input)), parsed,
2329                             nullptr, &output, &out_parsed);
2330     output.Complete();
2331 
2332     EXPECT_EQ(i.expected_success, success);
2333     EXPECT_EQ(i.expected, out_str);
2334 
2335     // Make sure the spec was properly identified, the file canonicalizer has
2336     // different code for writing the spec.
2337     EXPECT_EQ(0, out_parsed.scheme.begin);
2338     EXPECT_EQ(4, out_parsed.scheme.len);
2339 
2340     EXPECT_EQ(i.expected_host.begin, out_parsed.host.begin);
2341     EXPECT_EQ(i.expected_host.len, out_parsed.host.len);
2342 
2343     EXPECT_EQ(i.expected_path.begin, out_parsed.path.begin);
2344     EXPECT_EQ(i.expected_path.len, out_parsed.path.len);
2345   }
2346 }
2347 
TEST_F(URLCanonTest, CanonicalizeFileSystemURL) {
  // Each case gives the raw input, the expected canonical spec, and whether
  // canonicalization should report success.
  struct URLCase {
    const char* input;
    const char* expected;
    bool expected_success;
  } cases[] = {
      {"Filesystem:htTp://www.Foo.com:80/tempoRary",
       "filesystem:http://www.foo.com/tempoRary/", true},
      {"filesystem:httpS://www.foo.com/temporary/",
       "filesystem:https://www.foo.com/temporary/", true},
      {"filesystem:http://www.foo.com//", "filesystem:http://www.foo.com//",
       false},
      {"filesystem:http://www.foo.com/persistent/bob?query#ref",
       "filesystem:http://www.foo.com/persistent/bob?query#ref", true},
      {"filesystem:fIle://\\temporary/", "filesystem:file:///temporary/", true},
      {"filesystem:fiLe:///temporary", "filesystem:file:///temporary/", true},
      {"filesystem:File:///temporary/Bob?qUery#reF",
       "filesystem:file:///temporary/Bob?qUery#reF", true},
      {"FilEsysteM:htTp:E=/.", "filesystem:http://e=//", false},
  };

  for (const auto& i : cases) {
    // Label any failure below with the input of the offending table row.
    SCOPED_TRACE(i.input);
    Parsed parsed = ParseFileSystemURL(i.input);

    Parsed out_parsed;
    std::string out_str;
    StdStringCanonOutput output(&out_str);
    bool success = CanonicalizeFileSystemURL(i.input, parsed, nullptr, &output,
                                             &out_parsed);
    output.Complete();

    EXPECT_EQ(i.expected_success, success);
    EXPECT_EQ(i.expected, out_str);

    // Make sure the spec was properly identified, the filesystem canonicalizer
    // has different code for writing the spec.
    EXPECT_EQ(0, out_parsed.scheme.begin);
    EXPECT_EQ(10, out_parsed.scheme.len);
    // A successfully canonicalized filesystem URL must have a non-empty path.
    if (success) {
      EXPECT_GT(out_parsed.path.len, 0);
    }
  }
}
2390 
TEST_F(URLCanonTest, CanonicalizePathURL) {
  // For path URLs only the scheme is expected to be canonicalized; the rest
  // of the spec should come through as shown in each expected value.
  struct PathCase {
    const char* input;
    const char* expected;
  } path_cases[] = {
      {"javascript:", "javascript:"},
      {"JavaScript:Foo", "javascript:Foo"},
      {"Foo:\":This /is interesting;?#", "foo:\":This /is interesting;?#"},

      // Unicode invalid characters should not cause failure. See
      // https://crbug.com/925614.
      {"javascript:\uFFFF", "javascript:%EF%BF%BF"},
  };

  for (const PathCase& test_case : path_cases) {
    const char* const spec = test_case.input;
    const int spec_len = static_cast<int>(strlen(spec));

    Parsed canon_parsed;
    std::string canonical;
    StdStringCanonOutput canon_output(&canonical);
    const bool ok =
        CanonicalizePathURL(spec, spec_len, ParsePathURL(spec, true),
                            &canon_output, &canon_parsed);
    canon_output.Complete();

    EXPECT_TRUE(ok);
    EXPECT_EQ(test_case.expected, canonical);

    // Path URLs never carry a host component.
    EXPECT_EQ(0, canon_parsed.host.begin);
    EXPECT_EQ(-1, canon_parsed.host.len);

    // An input that stops right after the scheme's colon has no content.
    const bool ends_with_colon = spec[spec_len - 1] == ':';
    if (ends_with_colon) {
      EXPECT_EQ(0, canon_parsed.GetContent().begin);
      EXPECT_EQ(-1, canon_parsed.GetContent().len);
    }
  }
}
2430 
TEST_F(URLCanonTest, CanonicalizePathURLPath) {
  // Each case supplies the same path as 8-bit and wide input together with the
  // expected (always 8-bit) canonical output.
  struct PathCase {
    std::string input;
    std::wstring input16;
    std::string expected;
  } path_cases[] = {
      {"Foo", L"Foo", "Foo"},
      {"\":This /is interesting;?#", L"\":This /is interesting;?#",
       "\":This /is interesting;?#"},
      {"\uFFFF", L"\uFFFF", "%EF%BF%BF"},
  };

  for (const auto& path_case : path_cases) {
    // 8-bit string input
    std::string out_str;
    StdStringCanonOutput output(&out_str);
    Component out_component;
    CanonicalizePathURLPath(
        path_case.input.data(),
        Component(0, static_cast<int>(path_case.input.size())), &output,
        &out_component);
    output.Complete();

    EXPECT_EQ(path_case.expected, out_str);

    EXPECT_EQ(0, out_component.begin);
    EXPECT_EQ(path_case.expected.size(),
              static_cast<size_t>(out_component.len));

    // 16-bit string input
    std::string out_str16;
    StdStringCanonOutput output16(&out_str16);
    Component out_component16;
    const std::u16string input16(
        test_utils::TruncateWStringToUTF16(path_case.input16.data()));
    // Measure the buffer that is actually handed to the canonicalizer rather
    // than the wide string it was derived from, so the component can never
    // describe more code units than |input16| holds.
    CanonicalizePathURLPath(input16.c_str(),
                            Component(0, static_cast<int>(input16.size())),
                            &output16, &out_component16);
    output16.Complete();

    EXPECT_EQ(path_case.expected, out_str16);

    EXPECT_EQ(0, out_component16.begin);
    EXPECT_EQ(path_case.expected.size(),
              static_cast<size_t>(out_component16.len));
  }
}
2477 
TEST_F(URLCanonTest,CanonicalizeMailtoURL)2478 TEST_F(URLCanonTest, CanonicalizeMailtoURL) {
2479   struct URLCase {
2480     const char* input;
2481     const char* expected;
2482     bool expected_success;
2483     Component expected_path;
2484     Component expected_query;
2485   } cases[] = {
2486     // Null character should be escaped to %00.
2487     // Keep this test first in the list as it is handled specially below.
2488     {"mailto:addr1\0addr2?foo",
2489      "mailto:addr1%00addr2?foo",
2490      true, Component(7, 13), Component(21, 3)},
2491     {"mailto:addr1",
2492      "mailto:addr1",
2493      true, Component(7, 5), Component()},
2494     {"mailto:addr1@foo.com",
2495      "mailto:addr1@foo.com",
2496      true, Component(7, 13), Component()},
2497     // Trailing whitespace is stripped.
2498     {"MaIlTo:addr1 \t ",
2499      "mailto:addr1",
2500      true, Component(7, 5), Component()},
2501     {"MaIlTo:addr1?to=jon",
2502      "mailto:addr1?to=jon",
2503      true, Component(7, 5), Component(13,6)},
2504     {"mailto:addr1,addr2",
2505      "mailto:addr1,addr2",
2506      true, Component(7, 11), Component()},
2507     // Embedded spaces must be encoded.
2508     {"mailto:addr1, addr2",
2509      "mailto:addr1,%20addr2",
2510      true, Component(7, 14), Component()},
2511     {"mailto:addr1, addr2?subject=one two ",
2512      "mailto:addr1,%20addr2?subject=one%20two",
2513      true, Component(7, 14), Component(22, 17)},
2514     {"mailto:addr1%2caddr2",
2515      "mailto:addr1%2caddr2",
2516      true, Component(7, 13), Component()},
2517     {"mailto:\xF0\x90\x8C\x80",
2518      "mailto:%F0%90%8C%80",
2519      true, Component(7, 12), Component()},
2520     // Invalid -- UTF-8 encoded surrogate value.
2521     {"mailto:\xed\xa0\x80",
2522      "mailto:%EF%BF%BD%EF%BF%BD%EF%BF%BD",
2523      false, Component(7, 27), Component()},
2524     {"mailto:addr1?",
2525      "mailto:addr1?",
2526      true, Component(7, 5), Component(13, 0)},
2527     // Certain characters have special meanings and must be encoded.
2528     {"mailto:! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~\x7f?Query! \x22$&()+,-./09:;<=>@AZ[\\]&_`az{|}~",
2529      "mailto:!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_%60az%7B%7C%7D~%7F?Query!%20%22$&()+,-./09:;%3C=%3E@AZ[\\]&_`az{|}~",
2530      true, Component(7, 53), Component(61, 47)},
2531   };
2532 
2533   // Define outside of loop to catch bugs where components aren't reset
2534   Parsed out_parsed;
2535 
2536   for (size_t i = 0; i < std::size(cases); i++) {
2537     int url_len = static_cast<int>(strlen(cases[i].input));
2538     if (i == 0) {
2539       // The first test case purposely has a '\0' in it -- don't count it
2540       // as the string terminator.
2541       url_len = 22;
2542     }
2543 
2544     std::string out_str;
2545     StdStringCanonOutput output(&out_str);
2546     bool success = CanonicalizeMailtoURL(
2547         cases[i].input, url_len,
2548         ParseMailtoURL(std::string_view(cases[i].input, url_len)), &output,
2549         &out_parsed);
2550     output.Complete();
2551 
2552     EXPECT_EQ(cases[i].expected_success, success);
2553     EXPECT_EQ(cases[i].expected, out_str);
2554 
2555     // Make sure the spec was properly identified
2556     EXPECT_EQ(0, out_parsed.scheme.begin);
2557     EXPECT_EQ(6, out_parsed.scheme.len);
2558 
2559     EXPECT_EQ(cases[i].expected_path.begin, out_parsed.path.begin);
2560     EXPECT_EQ(cases[i].expected_path.len, out_parsed.path.len);
2561 
2562     EXPECT_EQ(cases[i].expected_query.begin, out_parsed.query.begin);
2563     EXPECT_EQ(cases[i].expected_query.len, out_parsed.query.len);
2564   }
2565 }
2566 
2567 #ifndef WIN32
2568 
TEST_F(URLCanonTest, _itoa_s) {
  // The buffer is one byte larger than the size reported to the explicit-size
  // overload and is refilled with 0xff before most calls. This lets the test
  // confirm both that the output is null-terminated and that the extra
  // trailing byte is never written.
  char buf[6];
  const auto refill = [&buf] { memset(buf, 0xff, sizeof(buf)); };
  const size_t reported_size = sizeof(buf) - 1;

  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, reported_size, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  // Edge cases: a value that exactly fills the reported size, then one that
  // is a digit too long.
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, reported_size, 10));
  EXPECT_STREQ("1234", buf);
  EXPECT_EQ('\xFF', buf[5]);

  refill();
  EXPECT_EQ(EINVAL, _itoa_s(12345, buf, reported_size, 10));
  EXPECT_EQ('\xFF', buf[5]);  // should never write to this location

  // The template overload deduces the size from the array, so it sees the
  // full six-byte buffer.
  refill();
  EXPECT_EQ(0, _itoa_s(12, buf, 10));
  EXPECT_STREQ("12", buf);
  EXPECT_EQ('\xFF', buf[3]);

  refill();
  EXPECT_EQ(0, _itoa_s(12345, buf, 10));
  EXPECT_STREQ("12345", buf);

  EXPECT_EQ(EINVAL, _itoa_s(123456, buf, 10));

  // Radix 16 must be supported as well.
  refill();
  EXPECT_EQ(0, _itoa_s(1234, buf, reported_size, 16));
  EXPECT_STREQ("4d2", buf);
  EXPECT_EQ('\xFF', buf[5]);
}
2607 
TEST_F(URLCanonTest, _itow_s) {
  // We fill the buffer with 0xffff code units to ensure that it's getting
  // properly null-terminated. We also allocate one element more than what we
  // tell the explicit-size overload of _itow_s about, and ensure that the
  // extra element is untouched.
  char16_t buf[6];
  // memset() takes the fill value as an int; keeping it an int avoids an
  // out-of-range conversion to a possibly-signed char.
  const int fill_mem = 0xff;
  const char16_t fill_char = 0xffff;
  // Element count reported to _itow_s: one less than the real capacity.
  const size_t reported_size = std::size(buf) - 1;

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, reported_size, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  // Test the edge cases - exactly the buffer size and one over. The buffer is
  // refilled first so this check does not depend on what the previous call
  // left behind.
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(1234, buf, reported_size, 10));
  EXPECT_EQ(u"1234", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[5]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(EINVAL, _itow_s(12345, buf, reported_size, 10));
  EXPECT_EQ(fill_char, buf[5]);  // should never write to this location

  // Test the template overload (note that this will see the full buffer)
  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12, buf, 10));
  EXPECT_EQ(u"12", std::u16string(buf));
  EXPECT_EQ(fill_char, buf[3]);

  memset(buf, fill_mem, sizeof(buf));
  EXPECT_EQ(0, _itow_s(12345, buf, 10));
  EXPECT_EQ(u"12345", std::u16string(buf));

  EXPECT_EQ(EINVAL, _itow_s(123456, buf, 10));
}
2641 
2642 #endif  // !WIN32
2643 
2644 // Returns true if the given two structures are the same.
ParsedIsEqual(const Parsed & a,const Parsed & b)2645 static bool ParsedIsEqual(const Parsed& a, const Parsed& b) {
2646   return a.scheme.begin == b.scheme.begin && a.scheme.len == b.scheme.len &&
2647          a.username.begin == b.username.begin && a.username.len == b.username.len &&
2648          a.password.begin == b.password.begin && a.password.len == b.password.len &&
2649          a.host.begin == b.host.begin && a.host.len == b.host.len &&
2650          a.port.begin == b.port.begin && a.port.len == b.port.len &&
2651          a.path.begin == b.path.begin && a.path.len == b.path.len &&
2652          a.query.begin == b.query.begin && a.query.len == b.query.len &&
2653          a.ref.begin == b.ref.begin && a.ref.len == b.ref.len;
2654 }
2655 
// Table-driven test of IsRelativeURL() and ResolveRelativeURL(). For every
// case the base URL is parsed with the parser matching its kind, the test
// input is classified with IsRelativeURL(), and, when the input is relative,
// it is resolved against the base. The resolved string, the success flag and
// the resulting Parsed structure are all checked.
TEST_F(URLCanonTest, ResolveRelativeURL) {
  struct RelativeCase {
    const char* base;      // Input base URL: MUST BE CANONICAL
    bool is_base_hier;     // Is the base URL hierarchical
    bool is_base_file;     // Tells us if the base is a file URL.
    const char* test;      // Input URL to test against.
    bool succeed_relative; // Whether we expect IsRelativeURL to succeed
    bool is_rel;           // Whether we expect |test| to be relative or not.
    bool succeed_resolve;  // Whether we expect ResolveRelativeURL to succeed.
    const char* resolved;  // What we expect in the result when resolving.
  } rel_cases[] = {
      // Basic absolute input.
      {"http://host/a", true, false, "http://another/", true, false, false,
       nullptr},
      {"http://host/a", true, false, "http:////another/", true, false, false,
       nullptr},
      // Empty relative URLs should only remove the ref part of the URL,
      // leaving the rest unchanged.
      {"http://foo/bar", true, false, "", true, true, true, "http://foo/bar"},
      {"http://foo/bar#ref", true, false, "", true, true, true,
       "http://foo/bar"},
      {"http://foo/bar#", true, false, "", true, true, true, "http://foo/bar"},
      // Spaces at the ends of the relative path should be ignored.
      {"http://foo/bar", true, false, "  another  ", true, true, true,
       "http://foo/another"},
      {"http://foo/bar", true, false, "  .  ", true, true, true, "http://foo/"},
      {"http://foo/bar", true, false, " \t ", true, true, true,
       "http://foo/bar"},
      // Matching schemes without two slashes are treated as relative.
      {"http://host/a", true, false, "http:path", true, true, true,
       "http://host/path"},
      {"http://host/a/", true, false, "http:path", true, true, true,
       "http://host/a/path"},
      {"http://host/a", true, false, "http:/path", true, true, true,
       "http://host/path"},
      {"http://host/a", true, false, "HTTP:/path", true, true, true,
       "http://host/path"},
      // Nonmatching schemes are absolute.
      {"http://host/a", true, false, "https:host2", true, false, false,
       nullptr},
      {"http://host/a", true, false, "htto:/host2", true, false, false,
       nullptr},
      // Absolute path input
      {"http://host/a", true, false, "/b/c/d", true, true, true,
       "http://host/b/c/d"},
      {"http://host/a", true, false, "\\b\\c\\d", true, true, true,
       "http://host/b/c/d"},
      {"http://host/a", true, false, "/b/../c", true, true, true,
       "http://host/c"},
      {"http://host/a?b#c", true, false, "/b/../c", true, true, true,
       "http://host/c"},
      {"http://host/a", true, false, "\\b/../c?x#y", true, true, true,
       "http://host/c?x#y"},
      {"http://host/a?b#c", true, false, "/b/../c?x#y", true, true, true,
       "http://host/c?x#y"},
      // Relative path input
      {"http://host/a", true, false, "b", true, true, true, "http://host/b"},
      {"http://host/a", true, false, "bc/de", true, true, true,
       "http://host/bc/de"},
      {"http://host/a/", true, false, "bc/de?query#ref", true, true, true,
       "http://host/a/bc/de?query#ref"},
      {"http://host/a/", true, false, ".", true, true, true, "http://host/a/"},
      {"http://host/a/", true, false, "..", true, true, true, "http://host/"},
      {"http://host/a/", true, false, "./..", true, true, true, "http://host/"},
      {"http://host/a/", true, false, "../.", true, true, true, "http://host/"},
      {"http://host/a/", true, false, "././.", true, true, true,
       "http://host/a/"},
      {"http://host/a?query#ref", true, false, "../../../foo", true, true, true,
       "http://host/foo"},
      // Query input
      {"http://host/a", true, false, "?foo=bar", true, true, true,
       "http://host/a?foo=bar"},
      {"http://host/a?x=y#z", true, false, "?", true, true, true,
       "http://host/a?"},
      {"http://host/a?x=y#z", true, false, "?foo=bar#com", true, true, true,
       "http://host/a?foo=bar#com"},
      // Ref input
      {"http://host/a", true, false, "#ref", true, true, true,
       "http://host/a#ref"},
      {"http://host/a#b", true, false, "#", true, true, true, "http://host/a#"},
      {"http://host/a?foo=bar#hello", true, false, "#bye", true, true, true,
       "http://host/a?foo=bar#bye"},
      // Non-hierarchical base: no relative handling. Relative input should
      // error, and if a scheme is present, it should be treated as absolute.
      {"data:foobar", false, false, "baz.html", false, false, false, nullptr},
      {"data:foobar", false, false, "data:baz", true, false, false, nullptr},
      {"data:foobar", false, false, "data:/base", true, false, false, nullptr},
      // Non-hierarchical base: absolute input should succeed.
      {"data:foobar", false, false, "http://host/", true, false, false,
       nullptr},
      {"data:foobar", false, false, "http:host", true, false, false, nullptr},
      // Non-hierarchical base: empty URL should give error.
      {"data:foobar", false, false, "", false, false, false, nullptr},
      // Invalid schemes should be treated as relative.
      {"http://foo/bar", true, false, "./asd:fgh", true, true, true,
       "http://foo/asd:fgh"},
      {"http://foo/bar", true, false, ":foo", true, true, true,
       "http://foo/:foo"},
      {"http://foo/bar", true, false, " hello world", true, true, true,
       "http://foo/hello%20world"},
      {"data:asdf", false, false, ":foo", false, false, false, nullptr},
      {"data:asdf", false, false, "bad(':foo')", false, false, false, nullptr},
      // We should treat semicolons like any other character in URL resolving
      {"http://host/a", true, false, ";foo", true, true, true,
       "http://host/;foo"},
      {"http://host/a;", true, false, ";foo", true, true, true,
       "http://host/;foo"},
      {"http://host/a", true, false, ";/../bar", true, true, true,
       "http://host/bar"},
      // Relative URLs can also be written as "//foo/bar" which is relative to
      // the scheme. In this case, it would take the old scheme, so for http
      // the example would resolve to "http://foo/bar".
      {"http://host/a", true, false, "//another", true, true, true,
       "http://another/"},
      {"http://host/a", true, false, "//another/path?query#ref", true, true,
       true, "http://another/path?query#ref"},
      {"http://host/a", true, false, "///another/path", true, true, true,
       "http://another/path"},
      {"http://host/a", true, false, "//Another\\path", true, true, true,
       "http://another/path"},
      {"http://host/a", true, false, "//", true, true, false, "http:"},
      // IE will also allow one or the other to be a backslash to get the same
      // behavior.
      {"http://host/a", true, false, "\\/another/path", true, true, true,
       "http://another/path"},
      {"http://host/a", true, false, "/\\Another\\path", true, true, true,
       "http://another/path"},
#ifdef WIN32
      // Resolving against Windows file base URLs.
      {"file:///C:/foo", true, true, "http://host/", true, false, false,
       nullptr},
      {"file:///C:/foo", true, true, "bar", true, true, true, "file:///C:/bar"},
      {"file:///C:/foo", true, true, "../../../bar.html", true, true, true,
       "file:///C:/bar.html"},
      {"file:///C:/foo", true, true, "/../bar.html", true, true, true,
       "file:///C:/bar.html"},
      // But two backslashes on Windows should be UNC so should be treated
      // as absolute.
      {"http://host/a", true, false, "\\\\another\\path", true, false, false,
       nullptr},
      // IE doesn't support drive specs starting with two slashes. It fails
      // immediately and doesn't even try to load. We fix it up to either
      // an absolute path or UNC depending on what it looks like.
      {"file:///C:/something", true, true, "//c:/foo", true, true, true,
       "file:///C:/foo"},
      {"file:///C:/something", true, true, "//localhost/c:/foo", true, true,
       true, "file:///C:/foo"},
      // Windows drive specs should be allowed and treated as absolute.
      {"file:///C:/foo", true, true, "c:", true, false, false, nullptr},
      {"file:///C:/foo", true, true, "c:/foo", true, false, false, nullptr},
      {"http://host/a", true, false, "c:\\foo", true, false, false, nullptr},
      // Relative paths with drive letters should be allowed when the base is
      // also a file.
      {"file:///C:/foo", true, true, "/z:/bar", true, true, true,
       "file:///Z:/bar"},
      // Treat absolute paths as being off of the drive.
      {"file:///C:/foo", true, true, "/bar", true, true, true,
       "file:///C:/bar"},
      {"file://localhost/C:/foo", true, true, "/bar", true, true, true,
       "file://localhost/C:/bar"},
      {"file:///C:/foo/com/", true, true, "/bar", true, true, true,
       "file:///C:/bar"},
      // On Windows, two slashes without a drive letter when the base is a file
      // means that the path is UNC.
      {"file:///C:/something", true, true, "//somehost/path", true, true, true,
       "file://somehost/path"},
      {"file:///C:/something", true, true, "/\\//somehost/path", true, true,
       true, "file://somehost/path"},
#else
      // On Unix we fall back to relative behavior since there's nothing else
      // reasonable to do.
      {"http://host/a", true, false, "\\\\Another\\path", true, true, true,
       "http://another/path"},
#endif
      // Even on Windows, we don't allow relative drive specs when the base
      // is not file.
      {"http://host/a", true, false, "/c:\\foo", true, true, true,
       "http://host/c:/foo"},
      {"http://host/a", true, false, "//c:\\foo", true, true, true,
       "http://c/foo"},
      // Cross-platform relative file: resolution behavior.
      {"file://host/a", true, true, "/", true, true, true, "file://host/"},
      {"file://host/a", true, true, "//", true, true, true, "file:///"},
      {"file://host/a", true, true, "/b", true, true, true, "file://host/b"},
      {"file://host/a", true, true, "//b", true, true, true, "file://b/"},
      // Ensure that ports aren't allowed for hosts relative to a file url.
      // Although the result string shows a host:port portion, the call to
      // resolve the relative URL returns false, indicating parse failure,
      // which is what is required.
      {"file:///foo.txt", true, true, "//host:80/bar.txt", true, true, false,
       "file://host:80/bar.txt"},
      // Filesystem URL tests; filesystem URLs are only valid and relative if
      // they have no scheme, e.g. "./index.html". There's no valid equivalent
      // to http:index.html.
      {"filesystem:http://host/t/path", true, false,
       "filesystem:http://host/t/path2", true, false, false, nullptr},
      {"filesystem:http://host/t/path", true, false,
       "filesystem:https://host/t/path2", true, false, false, nullptr},
      {"filesystem:http://host/t/path", true, false, "http://host/t/path2",
       true, false, false, nullptr},
      {"http://host/t/path", true, false, "filesystem:http://host/t/path2",
       true, false, false, nullptr},
      {"filesystem:http://host/t/path", true, false, "./path2", true, true,
       true, "filesystem:http://host/t/path2"},
      {"filesystem:http://host/t/path/", true, false, "path2", true, true, true,
       "filesystem:http://host/t/path/path2"},
      {"filesystem:http://host/t/path", true, false, "filesystem:http:path2",
       true, false, false, nullptr},
      // Absolute URLs are still not relative to a non-standard base URL.
      {"about:blank", false, false, "http://X/A", true, false, true, ""},
      {"about:blank", false, false, "content://content.Provider/", true, false,
       true, ""},
  };

  for (const auto& cur_case : rel_cases) {
    // Parse the base with the parser that matches its declared kind: file,
    // hierarchical (standard), or path-only.
    Parsed parsed;
    if (cur_case.is_base_file)
      parsed = ParseFileURL(cur_case.base);
    else if (cur_case.is_base_hier)
      parsed = ParseStandardURL(cur_case.base);
    else
      parsed = ParsePathURL(cur_case.base, false);

    // First see if it is relative.
    int test_len = static_cast<int>(strlen(cur_case.test));
    bool is_relative;
    Component relative_component;
    bool succeed_is_rel = IsRelativeURL(
        cur_case.base, parsed, cur_case.test, test_len, cur_case.is_base_hier,
        &is_relative, &relative_component);

    EXPECT_EQ(cur_case.succeed_relative, succeed_is_rel) <<
        "succeed is rel failure on " << cur_case.test;
    EXPECT_EQ(cur_case.is_rel, is_relative) <<
        "is rel failure on " << cur_case.test;
    // Now resolve it. This is only attempted when the input was detected as
    // relative and the case also expects it to be; every such case in the
    // table provides a non-null |resolved| string to compare against.
    if (succeed_is_rel && is_relative && cur_case.is_rel) {
      std::string resolved;
      StdStringCanonOutput output(&resolved);
      Parsed resolved_parsed;

      bool succeed_resolve = ResolveRelativeURL(
          cur_case.base, parsed, cur_case.is_base_file, cur_case.test,
          relative_component, nullptr, &output, &resolved_parsed);
      output.Complete();

      EXPECT_EQ(cur_case.succeed_resolve, succeed_resolve);
      EXPECT_EQ(cur_case.resolved, resolved) << " on " << cur_case.test;

      // Verify that the output parsed structure is the same as the one
      // obtained by parsing the resolved URL afresh with the same parser that
      // was used for the base.
      Parsed ref_parsed;
      if (cur_case.is_base_file) {
        ref_parsed = ParseFileURL(resolved);
      } else if (cur_case.is_base_hier) {
        ref_parsed = ParseStandardURL(resolved);
      } else {
        ref_parsed = ParsePathURL(resolved, false);
      }
      EXPECT_TRUE(ParsedIsEqual(ref_parsed, resolved_parsed));
    }
  }
}
2919 
2920 class URLCanonTypedTest : public ::testing::TestWithParam<bool> {
2921  public:
URLCanonTypedTest()2922   URLCanonTypedTest()
2923       : use_standard_compliant_non_special_scheme_url_parsing_(GetParam()) {
2924     if (use_standard_compliant_non_special_scheme_url_parsing_) {
2925       scoped_feature_list_.InitAndEnableFeature(
2926           kStandardCompliantNonSpecialSchemeURLParsing);
2927     } else {
2928       scoped_feature_list_.InitAndDisableFeature(
2929           kStandardCompliantNonSpecialSchemeURLParsing);
2930     }
2931   }
2932 
2933  protected:
2934   struct URLCase {
2935     const std::string_view input;
2936     const std::string_view expected;
2937     bool expected_success;
2938   };
2939 
2940   struct ResolveRelativeURLCase {
2941     const std::string_view base;
2942     const std::string_view rel;
2943     const bool is_base_hier;
2944     const bool expected_base_is_valid;
2945     const bool expected_is_relative;
2946     const bool expected_succeed_resolve;
2947     const std::string_view expected_resolved_url;
2948   };
2949 
TestNonSpecialResolveRelativeURL(const ResolveRelativeURLCase & relative_case)2950   void TestNonSpecialResolveRelativeURL(
2951       const ResolveRelativeURLCase& relative_case) {
2952     // The following test is similar to URLCanonTest::ResolveRelativeURL, but
2953     // simplified.
2954     Parsed parsed = use_standard_compliant_non_special_scheme_url_parsing_
2955                         ? ParseNonSpecialURL(relative_case.base)
2956                         : ParsePathURL(relative_case.base,
2957                                        /*trim_path_end=*/true);
2958 
2959     // First see if it is relative.
2960     bool is_relative;
2961     Component relative_component;
2962     bool succeed_is_rel = IsRelativeURL(
2963         relative_case.base.data(), parsed, relative_case.rel.data(),
2964         relative_case.rel.size(), relative_case.is_base_hier, &is_relative,
2965         &relative_component);
2966 
2967     EXPECT_EQ(is_relative, relative_case.expected_is_relative);
2968     if (succeed_is_rel && is_relative) {
2969       std::string resolved_url;
2970       StdStringCanonOutput output(&resolved_url);
2971       Parsed resolved_parsed;
2972 
2973       bool succeed_resolve = ResolveRelativeURL(
2974           relative_case.base.data(), parsed, relative_case.is_base_hier,
2975           relative_case.rel.data(), relative_component, nullptr, &output,
2976           &resolved_parsed);
2977       output.Complete();
2978 
2979       EXPECT_EQ(succeed_resolve, relative_case.expected_succeed_resolve);
2980       EXPECT_EQ(resolved_url, relative_case.expected_resolved_url);
2981     }
2982   }
2983 
2984   bool use_standard_compliant_non_special_scheme_url_parsing_;
2985 
2986  private:
2987   base::test::ScopedFeatureList scoped_feature_list_;
2988 };
2989 
// Checks the flag-dependent result of resolving "path" against the non-special
// base "git://host".
TEST_P(URLCanonTypedTest, NonSpecialResolveRelativeURL) {
  // Only the expected resolved URL differs between the two flag states.
  const std::string_view expected_resolved =
      use_standard_compliant_non_special_scheme_url_parsing_
          ? std::string_view("git://host/path")
          : std::string_view("git://path");

  const ResolveRelativeURLCase cases[] = {
      {"git://host", "path", true, true, true, true, expected_resolved},
  };
  for (const ResolveRelativeURLCase& relative_case : cases) {
    TestNonSpecialResolveRelativeURL(relative_case);
  }
}
3008 
// Instantiates the URLCanonTypedTest cases with ::testing::Bool() as the
// parameter generator. The fixture reads the bool parameter to enable or
// disable kStandardCompliantNonSpecialSchemeURLParsing.
INSTANTIATE_TEST_SUITE_P(All, URLCanonTypedTest, ::testing::Bool());
3010 
3011 // It used to be the case that when we did a replacement with a long buffer of
3012 // UTF-16 characters, we would get invalid data in the URL. This is because the
3013 // buffer that it used to hold the UTF-8 data was resized, while some pointers
3014 // were still kept to the old buffer that was removed.
TEST_F(URLCanonTest, ReplacementOverflow) {
  const char src[] = "file:///C:/foo/bar";
  Parsed parsed = ParseFileURL(src);

  // Two components are overridden: the path gets something short, and the
  // query gets a run of 'a' characters long enough to trigger the bug.
  constexpr size_t kLongQueryLength = 4800;
  const std::u16string long_query(kLongQueryLength, u'a');
  const std::u16string short_path(test_utils::TruncateWStringToUTF16(L"/foo"));

  Replacements<char16_t> repl;
  repl.SetPath(short_path.c_str(), Component(0, 4));
  repl.SetQuery(long_query.c_str(),
                Component(0, static_cast<int>(long_query.length())));

  // Run the replacement. Which flavor of ReplaceComponents is used does not
  // matter (standard URLs, file URLs, etc.), since they all go to the same
  // replacement function that was buggy.
  std::string repl_str;
  StdStringCanonOutput repl_output(&repl_str);
  Parsed repl_parsed;
  ReplaceFileURL(src, parsed, repl, nullptr, &repl_output, &repl_parsed);
  repl_output.Complete();

  // The result must be the new path followed by the complete long query.
  const std::string expected =
      "file:///foo?" + std::string(long_query.length(), 'a');
  EXPECT_TRUE(expected == repl_str);
}
3046 
// Checks the port DefaultPortForScheme() reports for a set of lower-case,
// upper-case and unknown scheme names.
TEST_F(URLCanonTest, DefaultPortForScheme) {
  struct PortExpectation {
    const char* scheme;
    const int expected_port;
  };
  const PortExpectation kExpectations[] = {
      {"http", 80},
      {"https", 443},
      {"ftp", 21},
      {"ws", 80},
      {"wss", 443},
      {"fake-scheme", PORT_UNSPECIFIED},
      {"HTTP", PORT_UNSPECIFIED},
      {"HTTPS", PORT_UNSPECIFIED},
      {"FTP", PORT_UNSPECIFIED},
      {"WS", PORT_UNSPECIFIED},
      {"WSS", PORT_UNSPECIFIED},
  };

  for (const PortExpectation& expectation : kExpectations) {
    SCOPED_TRACE(expectation.scheme);
    // Spans the whole null-terminated scheme name.
    const std::string_view scheme(expectation.scheme);
    EXPECT_EQ(expectation.expected_port, DefaultPortForScheme(scheme));
  }
}
3072 
// Runs FindWindowsDriveLetter() over the [begin, end) range of each spec, for
// both the 8-bit and the 16-bit form, and checks the reported position.
TEST_F(URLCanonTest, FindWindowsDriveLetter) {
  struct TestCase {
    std::string_view spec;
    int begin;
    int end;  // -1 for end of spec
    int expected_drive_letter_pos;
  };
  const TestCase kCases[] = {
      {"/", 0, -1, -1},

      {"c:/foo", 0, -1, 0},
      {"/c:/foo", 0, -1, 1},
      {"//c:/foo", 0, -1, -1},  // "//" does not canonicalize to "/"
      {"\\C|\\foo", 0, -1, 1},
      {"/cd:/foo", 0, -1, -1},  // "/c" does not canonicalize to "/"
      {"/./c:/foo", 0, -1, 3},
      {"/.//c:/foo", 0, -1, -1},  // "/.//" does not canonicalize to "/"
      {"/././c:/foo", 0, -1, 5},
      {"/abc/c:/foo", 0, -1, -1},  // "/abc/" does not canonicalize to "/"
      {"/abc/./../c:/foo", 0, -1, 10},

      {"/c:/c:/foo", 3, -1, 4},  // actual input is "/c:/foo"
      {"/c:/foo", 3, -1, -1},    // actual input is "/foo"
      {"/c:/foo", 0, 1, -1},     // actual input is "/"
  };

  for (const TestCase& test : kCases) {
    // An |end| of -1 in the table stands for the end of the spec.
    const int end =
        test.end == -1 ? static_cast<int>(test.spec.size()) : test.end;

    // 8-bit input.
    const int found8 = FindWindowsDriveLetter(test.spec.data(), test.begin, end);
    EXPECT_EQ(test.expected_drive_letter_pos, found8)
        << "for " << test.spec << "[" << test.begin << ":" << end
        << "] (UTF-8)";

    // The same input widened to 16-bit characters.
    const std::u16string spec16 = base::ASCIIToUTF16(test.spec);
    const int found16 = FindWindowsDriveLetter(spec16.data(), test.begin, end);
    EXPECT_EQ(test.expected_drive_letter_pos, found16)
        << "for " << test.spec << "[" << test.begin << ":" << end
        << "] (UTF-16)";
  }
}
3113 
// Exercises IDNToASCII() over a table of inputs. For inputs expected to
// convert, the output is compared with the expected ASCII form; for the
// others only the failure result is checked.
TEST_F(URLCanonTest, IDNToASCII) {
  struct IdnCase {
    std::u16string input;
    bool expected_success;
    std::u16string expected_output;  // Compared only when success is expected.
  };
  const IdnCase kCases[] = {
      // Basic ASCII test.
      {u"hello", true, u"hello"},
      // Mixed ASCII/non-ASCII.
      {u"hellö", true, u"xn--hell-8qa"},
      // All non-ASCII.
      {u"你好", true, u"xn--6qq79v"},
      // Characters that need mapping (the resulting Punycode is the encoding
      // for "1⁄4").
      {u"¼", true, u"xn--14-c6t"},
      // String to encode already starts with "xn--", and all ASCII. Should not
      // modify the string.
      {u"xn--hell-8qa", true, u"xn--hell-8qa"},
      // String to encode already starts with "xn--", and mixed
      // ASCII/non-ASCII. Should fail, due to a special case: if the label
      // starts with "xn--", it should be parsed as Punycode, which must be all
      // ASCII.
      {u"xn--hellö", false, u""},
      // String to encode already starts with "xn--", and mixed
      // ASCII/non-ASCII. This tests that there is still an error for the
      // character '⁄' (U+2044), which would be a valid ASCII character,
      // U+0044, if the high byte were ignored.
      {u"xn--1⁄4", false, u""},
  };

  RawCanonOutputW<1024> output;
  for (const IdnCase& idn_case : kCases) {
    const bool converted = IDNToASCII(idn_case.input, &output);
    if (idn_case.expected_success) {
      EXPECT_TRUE(converted);
      EXPECT_EQ(idn_case.expected_output, std::u16string(output.data()));
    } else {
      EXPECT_FALSE(converted);
    }
    // Reuse the same output buffer for the next case.
    output.set_length(0);
  }
}
3164 
ComponentCaseMatches(bool success,std::string_view out_str,const Component & out_comp,const DualComponentCase & expected)3165 void ComponentCaseMatches(bool success,
3166                           std::string_view out_str,
3167                           const Component& out_comp,
3168                           const DualComponentCase& expected) {
3169   EXPECT_EQ(success, expected.expected_success);
3170   EXPECT_STREQ(out_str.data(), expected.expected);
3171   EXPECT_EQ(out_comp, expected.expected_component);
3172 }
3173 
// Feeds each host through CanonicalizeNonSpecialHost(), first as 8-bit input
// and then as 16-bit input, and checks the result with
// ComponentCaseMatches().
TEST_F(URLCanonTest, OpaqueHost) {
  DualComponentCase host_cases[] = {
      {"", L"", "", Component(), true},
      {"google.com", L"google.com", "google.com", Component(0, 10), true},
      // Upper case letters should be preserved.
      {"gooGle.com", L"gooGle.com", "gooGle.com", Component(0, 10), true},
      {"\x41", L"\x41", "A", Component(0, 1), true},
      {"\x61", L"\x61", "a", Component(0, 1), true},
      // Percent encode.
      {"\x10", L"\x10", "%10", Component(0, 3), true},
      // A valid percent encoding should be preserved.
      {"%41", L"%41", "%41", Component(0, 3), true},
      // An invalid percent encoding should be preserved too.
      {"%zz", L"%zz", "%zz", Component(0, 3), true},
      // UTF-16 HIRAGANA LETTER A (codepoint U+3042, "\xe3\x81\x82" in UTF-8).
      {"\xe3\x81\x82", L"\x3042", "%E3%81%82", Component(0, 9), true},
  };

  // 8-bit version.
  for (const DualComponentCase& test : host_cases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << test.input8 << "\"");
    const Component whole_input(0, static_cast<int>(strlen(test.input8)));
    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Component out_comp;
    const bool success = CanonicalizeNonSpecialHost(test.input8, whole_input,
                                                    output, out_comp);
    output.Complete();
    ComponentCaseMatches(success, canonical, out_comp, test);
  }

  // UTF-16 version.
  for (const DualComponentCase& test : host_cases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << test.input16 << "\"");
    const std::u16string input16(
        test_utils::TruncateWStringToUTF16(test.input16));
    const Component whole_input(0, static_cast<int>(input16.length()));
    std::string canonical;
    StdStringCanonOutput output(&canonical);
    Component out_comp;
    const bool success = CanonicalizeNonSpecialHost(
        input16.c_str(), whole_input, output, out_comp);
    output.Complete();
    ComponentCaseMatches(success, canonical, out_comp, test);
  }
}
3220 
IPAddressCaseMatches(std::string_view out_str,const CanonHostInfo & host_info,const IPAddressCase & expected)3221 void IPAddressCaseMatches(std::string_view out_str,
3222                           const CanonHostInfo& host_info,
3223                           const IPAddressCase& expected) {
3224   EXPECT_EQ(host_info.family, expected.expected_family);
3225   EXPECT_STREQ(out_str.data(), expected.expected);
3226   EXPECT_EQ(base::HexEncode(host_info.address,
3227                             static_cast<size_t>(host_info.AddressLength())),
3228             expected.expected_address_hex);
3229   if (expected.expected_family == CanonHostInfo::IPV4) {
3230     EXPECT_EQ(host_info.num_ipv4_components,
3231               expected.expected_num_ipv4_components);
3232   }
3233 }
3234 
// Feeds each host through CanonicalizeNonSpecialHostVerbose(), first as 8-bit
// input and then as 16-bit input, and checks the output text and the
// CanonHostInfo fields with IPAddressCaseMatches().
TEST_F(URLCanonTest, NonSpecialHostIPv6Address) {
  IPAddressCase ip_address_cases[] = {
      // Non-special URLs don't support IPv4. Family must be NEUTRAL.
      {"192.168.0.1", L"192.168.0.1", "192.168.0.1", Component(0, 11),
       CanonHostInfo::NEUTRAL, 0, ""},
      {"192", L"192", "192", Component(0, 3), CanonHostInfo::NEUTRAL, 0, ""},
      // "257" is allowed since the number is not considered as a part of IPv4.
      {"192.168.0.257", L"192.168.0.257", "192.168.0.257", Component(0, 13),
       CanonHostInfo::NEUTRAL, 0, ""},
      // IPv6.
      {"[1:0:0:2::3:0]", L"[1:0:0:2::3:0]", "[1::2:0:0:3:0]", Component(0, 14),
       CanonHostInfo::IPV6, -1, "00010000000000020000000000030000"},
      {"[::]", L"[::]", "[::]", Component(0, 4), CanonHostInfo::IPV6, -1,
       "00000000000000000000000000000000"},
      // Invalid hosts.
      {"#[::]", L"#[::]", "", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]", L"[]", "[]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"a]", L"a]", "a]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[a", L"[a", "[a", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"a[]", L"a[]", "a[]", Component(), CanonHostInfo::BROKEN, -1, ""},
      {"[]a", L"[]a", "[]a", Component(), CanonHostInfo::BROKEN, -1, ""},
  };

  // 8-bit version.
  for (const IPAddressCase& test : ip_address_cases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << test.input8 << "\"");
    const Component whole_input(0, static_cast<int>(strlen(test.input8)));
    std::string canonical;
    StdStringCanonOutput output(&canonical);
    CanonHostInfo host_info;
    CanonicalizeNonSpecialHostVerbose(test.input8, whole_input, output,
                                      host_info);
    output.Complete();
    IPAddressCaseMatches(canonical, host_info, test);
  }

  // UTF-16 version.
  for (const IPAddressCase& test : ip_address_cases) {
    SCOPED_TRACE(testing::Message() << "url: \"" << test.input16 << "\"");
    const std::u16string input16(
        test_utils::TruncateWStringToUTF16(test.input16));
    const Component whole_input(0, static_cast<int>(input16.length()));
    std::string canonical;
    StdStringCanonOutput output(&canonical);
    CanonHostInfo host_info;
    CanonicalizeNonSpecialHostVerbose(input16.c_str(), whole_input, output,
                                      host_info);
    output.Complete();
    IPAddressCaseMatches(canonical, host_info, test);
  }
}
3288 
3289 }  // namespace url
3290