1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/strings/utf_offset_string_conversions.h"
6
7 #include <stddef.h>
8
9 #include <algorithm>
10
11 #include "base/strings/string_piece.h"
12 #include "testing/gtest/include/gtest/gtest.h"
13
14 namespace base {
15
namespace {

// Shorthand for the npos sentinel that the expectation tables in this file
// compare adjusted offsets against.
constexpr size_t kNpos = std::u16string::npos;

}  // namespace
21
// Feeds a single offset through UTF8ToUTF16AndAdjustOffsets() and
// UTF16ToUTF8AndAdjustOffsets() and checks where it lands in the converted
// string. Each table row is {input string, offset into the input, expected
// offset into the output}; kNpos is the expected result for offsets that the
// conversion is supposed to invalidate.
TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
  struct UTF8ToUTF16Case {
    const char* utf8;
    size_t input_offset;
    size_t output_offset;
  } utf8_to_utf16_cases[] = {
      {"", 0, 0},
      {"", kNpos, kNpos},
      // Two 3-byte sequences (U+4F60 U+597D).
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
      // "\xed\xb0\x80" is the byte pattern of an encoded surrogate (U+DC00).
      {"\xed\xb0\x80z", 3, 3},
      // A 4-byte sequence (U+10300) between two ASCII characters.
      {"A\xF0\x90\x8C\x80z", 1, 1},
      {"A\xF0\x90\x8C\x80z", 2, kNpos},
      {"A\xF0\x90\x8C\x80z", 5, 3},
      {"A\xF0\x90\x8C\x80z", 6, 4},
      {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
  };
  for (size_t i = 0; i < std::size(utf8_to_utf16_cases); ++i) {
    const UTF8ToUTF16Case& test_case = utf8_to_utf16_cases[i];
    std::vector<size_t> offsets(1, test_case.input_offset);
    UTF8ToUTF16AndAdjustOffsets(test_case.utf8, &offsets);
    // Stream the row index so a failure can be traced back to its table row.
    EXPECT_EQ(test_case.output_offset, offsets[0])
        << "utf8_to_utf16_cases[" << i << "]";
  }

  struct UTF16ToUTF8Case {
    // Unused trailing elements are zero-initialized, which keeps every entry
    // null-terminated.
    char16_t utf16[10];
    size_t input_offset;
    size_t output_offset;
  } utf16_to_utf8_cases[] = {
      {{}, 0, 0},
      // Converted to 3-byte utf-8 sequences
      {{0x5909, 0x63DB}, 3, kNpos},
      {{0x5909, 0x63DB}, 2, 6},
      {{0x5909, 0x63DB}, 1, 3},
      {{0x5909, 0x63DB}, 0, 0},
      // Converted to 2-byte utf-8 sequences
      {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
      {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
      {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
      {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
      // Surrogate pair
      {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
      {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
      {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
      {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
  };
  for (size_t i = 0; i < std::size(utf16_to_utf8_cases); ++i) {
    const UTF16ToUTF8Case& test_case = utf16_to_utf8_cases[i];
    std::vector<size_t> offsets(1, test_case.input_offset);
    UTF16ToUTF8AndAdjustOffsets(test_case.utf16, &offsets);
    EXPECT_EQ(test_case.output_offset, offsets[0])
        << "utf16_to_utf8_cases[" << i << "]";
  }
}
77
// Passes 20 consecutive offsets, first in ascending and then in descending
// order, through OffsetAdjuster::AdjustOffset() with an empty adjustment list
// and a limit of 10. In both orders exactly 11 offsets are expected to come
// back as something other than kNpos.
TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
  const OffsetAdjuster::Adjustments kNoAdjustments;
  const size_t kLimit = 10;
  const size_t kItems = 20;

  // Number of entries in |values| that were not turned into kNpos.
  const auto count_unlimited = [](const std::vector<size_t>& values) {
    size_t unlimited = 0;
    for (const size_t value : values)
      unlimited += (value != kNpos) ? 1 : 0;
    return unlimited;
  };

  // Ascending: 0, 1, ..., kItems - 1. Each offset is adjusted in place right
  // after it is appended.
  std::vector<size_t> ascending;
  for (size_t value = 0; value < kItems; ++value) {
    ascending.push_back(value);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &ascending.back(), kLimit);
  }
  EXPECT_EQ(11U, count_unlimited(ascending));

  // Descending: kItems - 1, ..., 1, 0.
  std::vector<size_t> descending;
  for (size_t value = kItems; value-- > 0;) {
    descending.push_back(value);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &descending.back(), kLimit);
  }
  EXPECT_EQ(11U, count_unlimited(descending));
}
107
// Runs every offset of an original string through
// OffsetAdjuster::AdjustOffsets() for three adjustment lists and compares the
// results against hand-computed tables. The Adjustment arguments appear to be
// (original offset, original length, output length), judging by how they line
// up with the diagrams below.
TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
  // Imagine we have strings as shown in the following cases where the
  // X's represent encoded characters.
  // 1: abcXXXdef ==> abcXdef
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 9; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
    // Fatal on mismatch: the loop below indexes |offsets| by position in
    // |expected_1| and must not read past the end of the vector.
    ASSERT_EQ(std::size(expected_1), offsets.size());
    for (size_t i = 0; i < std::size(expected_1); ++i)
      EXPECT_EQ(expected_1[i], offsets[i]) << "case 1, offset " << i;
  }

  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 23; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_2[] = {
        0,     kNpos, kNpos, 1,     2,     kNpos, kNpos, kNpos,
        4,     5,     6,     kNpos, kNpos, kNpos, kNpos, kNpos,
        kNpos, 10,    11,    12,    13,    kNpos, kNpos, 14,
    };
    // Fatal on mismatch, for the same reason as in case 1.
    ASSERT_EQ(std::size(expected_2), offsets.size());
    for (size_t i = 0; i < std::size(expected_2); ++i)
      EXPECT_EQ(expected_2[i], offsets[i]) << "case 2, offset " << i;
  }

  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 17; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_3[] = {
        0, kNpos, kNpos, 0, 1,     kNpos, kNpos, kNpos, 5,
        6, 7,     8,     kNpos, kNpos, 11,    12,    kNpos, 12,
    };
    // Fatal on mismatch, for the same reason as in case 1.
    ASSERT_EQ(std::size(expected_3), offsets.size());
    for (size_t i = 0; i < std::size(expected_3); ++i)
      EXPECT_EQ(expected_3[i], offsets[i]) << "case 3, offset " << i;
  }
}
165
// Runs every offset of an adjusted (output) string through
// OffsetAdjuster::UnadjustOffsets() for three adjustment lists and compares
// the results against hand-computed offsets into the original string. The
// Adjustment arguments appear to be (original offset, original length, output
// length), judging by how they line up with the diagrams below.
TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
  // Imagine we have strings as shown in the following cases where the
  // X's represent encoded characters.
  // 1: abcXXXdef ==> abcXdef
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 7; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
    // Fatal on mismatch: the loop below indexes |offsets| by position in
    // |expected_1| and must not read past the end of the vector.
    ASSERT_EQ(std::size(expected_1), offsets.size());
    for (size_t i = 0; i < std::size(expected_1); ++i)
      EXPECT_EQ(expected_1[i], offsets[i]) << "case 1, offset " << i;
  }

  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 14; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_2[] = {
        0,     3,     4,  kNpos, 8,  9,  10, kNpos,
        kNpos, kNpos, 17, 18,    19, 20, 23,
    };
    // Fatal on mismatch, for the same reason as in case 1.
    ASSERT_EQ(std::size(expected_2), offsets.size());
    for (size_t i = 0; i < std::size(expected_2); ++i)
      EXPECT_EQ(expected_2[i], offsets[i]) << "case 2, offset " << i;
  }

  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 12; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_3[] = {
        0,  // this could just as easily be 3
        4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
        15  // this could just as easily be 17
    };
    // Fatal on mismatch, for the same reason as in case 1.
    ASSERT_EQ(std::size(expected_3), offsets.size());
    for (size_t i = 0; i < std::size(expected_3); ++i)
      EXPECT_EQ(expected_3[i], offsets[i]) << "case 3, offset " << i;
  }
}
223
224 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
225 // net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and
226 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This
227 // is simply a short, additional test.
TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
  using Adjustment = OffsetAdjuster::Adjustment;

  // The imaginary input string is "abcdefghijklmnopqrstuvwxyz".

  // First pass over the input:
  //   - drop the leading "a"
  //   - collapse "bc" into a single character (written "." below)
  //   - drop the "f"
  //   - drop the "tuv"
  // giving ".deghijklmnopqrswxyz".
  OffsetAdjuster::Adjustments first_pass;
  first_pass.push_back(Adjustment(0, 1, 0));
  first_pass.push_back(Adjustment(1, 2, 1));
  first_pass.push_back(Adjustment(5, 1, 0));
  first_pass.push_back(Adjustment(19, 3, 0));

  // Second pass, expressed in offsets of the first pass's output:
  //   - collapse ".d" into a single character (written "?")
  //   - drop the "egh"
  //   - expand the "i" into two characters (written "12")
  //   - collapse "jkl" into a single character (written "@")
  //   - expand the "z" into two characters (written "34")
  // giving "?12@mnopqrswxy34".
  OffsetAdjuster::Adjustments second_pass;
  second_pass.push_back(Adjustment(0, 2, 1));
  second_pass.push_back(Adjustment(2, 3, 0));
  second_pass.push_back(Adjustment(5, 1, 2));
  second_pass.push_back(Adjustment(6, 3, 1));
  second_pass.push_back(Adjustment(19, 1, 2));

  // Merge the two passes; the result is written back into |second_pass|.
  OffsetAdjuster::MergeSequentialAdjustments(first_pass, &second_pass);
  const OffsetAdjuster::Adjustments& merged = second_pass;

  // Six merged adjustments are expected. The size check is fatal because the
  // expectations below index into |merged|.
  ASSERT_EQ(6u, merged.size());

  // [0] Collapse "abcd" into "?". It would be just as reasonable for the
  // merge to emit two adjustments here (drop "a", then collapse "bcd" into
  // "?"); this test pins the current behavior.
  EXPECT_EQ(0u, merged[0].original_offset);
  EXPECT_EQ(4u, merged[0].original_length);
  EXPECT_EQ(1u, merged[0].output_length);

  // [1] Drop "efgh".
  EXPECT_EQ(4u, merged[1].original_offset);
  EXPECT_EQ(4u, merged[1].original_length);
  EXPECT_EQ(0u, merged[1].output_length);

  // [2] Expand "i" into "12".
  EXPECT_EQ(8u, merged[2].original_offset);
  EXPECT_EQ(1u, merged[2].original_length);
  EXPECT_EQ(2u, merged[2].output_length);

  // [3] Collapse "jkl" into "@".
  EXPECT_EQ(9u, merged[3].original_offset);
  EXPECT_EQ(3u, merged[3].original_length);
  EXPECT_EQ(1u, merged[3].output_length);

  // [4] Drop "tuv".
  EXPECT_EQ(19u, merged[4].original_offset);
  EXPECT_EQ(3u, merged[4].original_length);
  EXPECT_EQ(0u, merged[4].output_length);

  // [5] Expand "z" into "34".
  EXPECT_EQ(25u, merged[5].original_offset);
  EXPECT_EQ(1u, merged[5].original_length);
  EXPECT_EQ(2u, merged[5].output_length);
}
296
297 } // namespace base
298