1 // Copyright 2014 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "url/url_canon_icu.h"
6
7 #include <stddef.h>
8
9 #include "base/logging.h"
10 #include "base/memory/raw_ptr.h"
11 #include "testing/gtest/include/gtest/gtest.h"
12 #include "third_party/icu/source/common/unicode/ucnv.h"
13 #include "url/url_canon.h"
14 #include "url/url_canon_stdstring.h"
15 #include "url/url_test_utils.h"
16
17 namespace url {
18
19 namespace {
20
21 // Wrapper around a UConverter object that managers creation and destruction.
22 class UConvScoper {
23 public:
UConvScoper(const char * charset_name)24 explicit UConvScoper(const char* charset_name) {
25 UErrorCode err = U_ZERO_ERROR;
26 converter_ = ucnv_open(charset_name, &err);
27 if (!converter_) {
28 LOG(ERROR) << "Failed to open charset " << charset_name << ": "
29 << u_errorName(err);
30 }
31 }
32
~UConvScoper()33 ~UConvScoper() {
34 if (converter_)
35 ucnv_close(converter_.ExtractAsDangling());
36 }
37
38 // Returns the converter object, may be NULL.
converter() const39 UConverter* converter() const { return converter_; }
40
41 private:
42 raw_ptr<UConverter> converter_;
43 };
44
// Checks ICUCharsetConverter::ConvertFromUTF16() in two ways: a table of
// inputs converted to several destination encodings, and a sweep of input
// lengths around the fixed capacity of the output buffer.
TEST(URLCanonIcuTest, ICUCharsetConverter) {
  struct ICUCase {
    const wchar_t* input;
    const char* encoding;
    const char* expected;
  };
  const ICUCase kCases[] = {
      // UTF-8.
      {L"Hello, world", "utf-8", "Hello, world"},
      {L"\x4f60\x597d", "utf-8", "\xe4\xbd\xa0\xe5\xa5\xbd"},
      // Non-BMP UTF-8.
      {L"!\xd800\xdf00!", "utf-8", "!\xf0\x90\x8c\x80!"},
      // Big5
      {L"\x4f60\x597d", "big5", "\xa7\x41\xa6\x6e"},
      // Unrepresentable character in the destination set.
      {L"hello\x4f60\x06de\x597dworld", "big5",
       "hello\xa7\x41%26%231758%3B\xa6\x6eworld"},
  };

  for (const ICUCase& test_case : kCases) {
    // Each case gets its own ICU converter for the requested encoding.
    UConvScoper conv(test_case.encoding);
    ASSERT_TRUE(conv.converter() != nullptr);
    ICUCharsetConverter converter(conv.converter());

    std::string str;
    StdStringCanonOutput output(&str);

    const std::u16string input_str =
        test_utils::TruncateWStringToUTF16(test_case.input);
    converter.ConvertFromUTF16(input_str.c_str(),
                               static_cast<int>(input_str.length()), &output);
    output.Complete();

    EXPECT_STREQ(test_case.expected, str.c_str());
  }

  // Test string sizes around the resize boundary for the output to make sure
  // the converter resizes as needed.
  constexpr int kStaticSize = 16;
  UConvScoper conv("utf-8");
  ASSERT_TRUE(conv.converter());
  ICUCharsetConverter converter(conv.converter());
  for (int length = kStaticSize - 2; length <= kStaticSize + 2; ++length) {
    // An ASCII-only input of exactly |length| code units; in UTF-8 the output
    // is expected to have the same length.
    const std::u16string input(static_cast<size_t>(length), u'a');

    RawCanonOutput<kStaticSize> output;
    converter.ConvertFromUTF16(input.c_str(), static_cast<int>(input.length()),
                               &output);
    EXPECT_EQ(input.length(), output.length());
  }
}
98
// Checks CanonicalizeQuery() with an ICU-backed charset converter. Every case
// is run through both the 8-bit and the 16-bit input overloads and must
// produce the same canonical query string.
TEST(URLCanonIcuTest, QueryWithConverter) {
  struct QueryCase {
    const char* input8;
    const wchar_t* input16;
    const char* encoding;
    const char* expected;
  };
  const QueryCase kCases[] = {
      // Regular ASCII case in some different encodings.
      {"foo=bar", L"foo=bar", "utf-8", "?foo=bar"},
      {"foo=bar", L"foo=bar", "shift_jis", "?foo=bar"},
      {"foo=bar", L"foo=bar", "gb2312", "?foo=bar"},
      // Chinese input/output
      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "gb2312",
       "?q=%C4%E3%BA%C3"},
      {"q=\xe4\xbd\xa0\xe5\xa5\xbd", L"q=\x4f60\x597d", "big5", "?q=%A7A%A6n"},
      // Unencodable character in the destination character set should be
      // escaped. The expected output is the percent-escaped form of a numeric
      // character reference: '&' (%26), '#' (%23), the decimal code point
      // 65319 (0xff27), and ';' (%3B).
      {"q=Chinese\xef\xbc\xa7", L"q=Chinese\xff27", "iso-8859-1",
       "?q=Chinese%26%2365319%3B"},
  };

  for (const QueryCase& test_case : kCases) {
    Component out_comp;

    UConvScoper conv(test_case.encoding);
    ASSERT_TRUE(!test_case.encoding || conv.converter());
    ICUCharsetConverter converter(conv.converter());

    // 8-bit input.
    if (test_case.input8) {
      const Component in_comp(0, static_cast<int>(strlen(test_case.input8)));
      std::string out_str;

      StdStringCanonOutput output(&out_str);
      CanonicalizeQuery(test_case.input8, in_comp, &converter, &output,
                        &out_comp);
      output.Complete();

      EXPECT_EQ(test_case.expected, out_str);
    }

    // 16-bit input.
    if (test_case.input16) {
      const std::u16string input16 =
          test_utils::TruncateWStringToUTF16(test_case.input16);
      const Component in_comp(0, static_cast<int>(input16.length()));
      std::string out_str;

      StdStringCanonOutput output(&out_str);
      CanonicalizeQuery(input16.c_str(), in_comp, &converter, &output,
                        &out_comp);
      output.Complete();

      EXPECT_EQ(test_case.expected, out_str);
    }
  }

  // Extra test for input with an embedded NUL byte. No converter is supplied,
  // and the component length (5) is given explicitly because the input cannot
  // be measured with strlen().
  std::string out_str;
  StdStringCanonOutput output(&out_str);
  Component out_comp;
  CanonicalizeQuery("a \x00z\x01", Component(0, 5), nullptr, &output,
                    &out_comp);
  output.Complete();
  EXPECT_EQ("?a%20%00z%01", out_str);
}
165
166 } // namespace
167
168 } // namespace url
169