1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "base/strings/utf_offset_string_conversions.h"
11
12 #include <stddef.h>
13
14 #include <algorithm>
15
16 #include "testing/gtest/include/gtest/gtest.h"
17
18 namespace base {
19
namespace {

// Sentinel offset used throughout the tests in this file as the expected
// value for an offset that has no corresponding position after a conversion
// or adjustment. It is the same value as std::u16string::npos.
constexpr size_t kNpos = std::u16string::npos;

}  // namespace
25
// Verifies that a single offset is remapped as expected when a string is
// converted UTF-8 -> UTF-16 and UTF-16 -> UTF-8. Every table row is
// {input string, offset into the input, expected offset into the output};
// kNpos is the expected output for offsets that cannot be mapped.
TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
  struct UTF8ToUTF16Case {
    const char* utf8;
    size_t input_offset;
    size_t output_offset;
  } utf8_to_utf16_cases[] = {
      {"", 0, 0},
      {"", kNpos, kNpos},
      // Two 3-byte UTF-8 sequences; offset 1 falls inside the first one.
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
      // 0xED 0xB0 0x80 is the (invalid) UTF-8 encoding of a lone surrogate.
      {"\xed\xb0\x80z", 3, 3},
      // 'A', one 4-byte UTF-8 sequence, 'z'.
      {"A\xF0\x90\x8C\x80z", 1, 1},
      {"A\xF0\x90\x8C\x80z", 2, kNpos},
      {"A\xF0\x90\x8C\x80z", 5, 3},
      {"A\xF0\x90\x8C\x80z", 6, 4},
      {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
  };
  for (size_t i = 0; i < std::size(utf8_to_utf16_cases); ++i) {
    const UTF8ToUTF16Case& test_case = utf8_to_utf16_cases[i];
    std::vector<size_t> offsets = {test_case.input_offset};
    UTF8ToUTF16AndAdjustOffsets(test_case.utf8, &offsets);
    // Guard the offsets[0] access below and identify the failing row; several
    // rows share the same expected value, so the value alone is ambiguous.
    ASSERT_EQ(1u, offsets.size()) << "UTF-8 -> UTF-16 case #" << i;
    EXPECT_EQ(test_case.output_offset, offsets[0])
        << "UTF-8 -> UTF-16 case #" << i;
  }

  struct UTF16ToUTF8Case {
    // Unused trailing elements are zero-initialized, so the array doubles as
    // a NUL-terminated UTF-16 string.
    char16_t utf16[10];
    size_t input_offset;
    size_t output_offset;
  } utf16_to_utf8_cases[] = {
      {{}, 0, 0},
      // Converted to 3-byte utf-8 sequences
      {{0x5909, 0x63DB}, 3, kNpos},
      {{0x5909, 0x63DB}, 2, 6},
      {{0x5909, 0x63DB}, 1, 3},
      {{0x5909, 0x63DB}, 0, 0},
      // Converted to 2-byte utf-8 sequences
      {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
      {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
      {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
      {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
      // Surrogate pair
      {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
      {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
      {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
      {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
  };
  for (size_t i = 0; i < std::size(utf16_to_utf8_cases); ++i) {
    const UTF16ToUTF8Case& test_case = utf16_to_utf8_cases[i];
    std::vector<size_t> offsets = {test_case.input_offset};
    UTF16ToUTF8AndAdjustOffsets(test_case.utf16, &offsets);
    ASSERT_EQ(1u, offsets.size()) << "UTF-16 -> UTF-8 case #" << i;
    EXPECT_EQ(test_case.output_offset, offsets[0])
        << "UTF-16 -> UTF-8 case #" << i;
  }
}
81
// Runs OffsetAdjuster::AdjustOffset() with an empty adjustment list and a
// limit of 10 over the offsets 0..19, first in ascending and then in
// descending order, and expects exactly 11 of them to come back as something
// other than kNpos in both passes.
TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
  const OffsetAdjuster::Adjustments kNoAdjustments;
  const size_t kLimit = 10;
  const size_t kItems = 20;

  // Number of entries in |values| that were not turned into kNpos.
  const auto count_surviving = [](const std::vector<size_t>& values) {
    return static_cast<size_t>(
        std::count_if(values.begin(), values.end(),
                      [](size_t value) { return value != kNpos; }));
  };

  // Ascending pass: 0, 1, ..., kItems - 1.
  std::vector<size_t> ascending;
  for (size_t value = 0; value < kItems; ++value) {
    size_t adjusted = value;
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &adjusted, kLimit);
    ascending.push_back(adjusted);
  }
  EXPECT_EQ(11U, count_surviving(ascending));

  // Descending pass: kItems - 1, ..., 1, 0. The outcome must not depend on
  // the order in which the offsets are processed.
  std::vector<size_t> descending;
  for (size_t remaining = kItems; remaining > 0; --remaining) {
    size_t adjusted = remaining - 1;
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &adjusted, kLimit);
    descending.push_back(adjusted);
  }
  EXPECT_EQ(11U, count_surviving(descending));
}
111
// Checks OffsetAdjuster::AdjustOffsets(): every offset into the original
// string (0 through its length) is passed in, and the result is compared
// against the expected offset into the adjusted string. kNpos is expected for
// offsets that land inside a replaced run.
TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
  // Imagine we have strings as shown in the following cases where the
  // X's represent encoded characters.
  // 1: abcXXXdef ==> abcXdef
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 9; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
    // Fatal on mismatch: the loop below indexes |offsets| by the expected
    // length, which would read out of bounds if |offsets| were shorter.
    ASSERT_EQ(std::size(expected_1), offsets.size());
    for (size_t i = 0; i < std::size(expected_1); ++i) {
      EXPECT_EQ(expected_1[i], offsets[i]) << "case 1, offset index " << i;
    }
  }

  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 23; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_2[] = {
        0,     kNpos, kNpos, 1,     2,     kNpos, kNpos, kNpos,
        4,     5,     6,     kNpos, kNpos, kNpos, kNpos, kNpos,
        kNpos, 10,    11,    12,    13,    kNpos, kNpos, 14};
    // Fatal on mismatch; see case 1.
    ASSERT_EQ(std::size(expected_2), offsets.size());
    for (size_t i = 0; i < std::size(expected_2); ++i) {
      EXPECT_EQ(expected_2[i], offsets[i]) << "case 2, offset index " << i;
    }
  }

  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 17; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
    const size_t expected_3[] = {0,     kNpos, kNpos, 0,     1,  kNpos,
                                 kNpos, kNpos, 5,     6,     7,  8,
                                 kNpos, kNpos, 11,    12,    kNpos, 12};
    // Fatal on mismatch; see case 1.
    ASSERT_EQ(std::size(expected_3), offsets.size());
    for (size_t i = 0; i < std::size(expected_3); ++i) {
      EXPECT_EQ(expected_3[i], offsets[i]) << "case 3, offset index " << i;
    }
  }
}
169
// Checks OffsetAdjuster::UnadjustOffsets(), the reverse direction of
// AdjustOffsets(): every offset into the adjusted string (0 through its
// length) is passed in, and the result is compared against the expected
// offset into the original string. kNpos is expected where no single original
// offset corresponds to the adjusted one.
TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
  // Imagine we have strings as shown in the following cases where the
  // X's represent encoded characters.
  // 1: abcXXXdef ==> abcXdef
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 7; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
    // Fatal on mismatch: the loop below indexes |offsets| by the expected
    // length, which would read out of bounds if |offsets| were shorter.
    ASSERT_EQ(std::size(expected_1), offsets.size());
    for (size_t i = 0; i < std::size(expected_1); ++i) {
      EXPECT_EQ(expected_1[i], offsets[i]) << "case 1, offset index " << i;
    }
  }

  // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 14; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
    adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_2[] = {0,     3,     4,  kNpos, 8,  9,  10, kNpos,
                                 kNpos, kNpos, 17, 18,    19, 20, 23};
    // Fatal on mismatch; see case 1.
    ASSERT_EQ(std::size(expected_2), offsets.size());
    for (size_t i = 0; i < std::size(expected_2); ++i) {
      EXPECT_EQ(expected_2[i], offsets[i]) << "case 2, offset index " << i;
    }
  }

  // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
  {
    std::vector<size_t> offsets;
    for (size_t t = 0; t <= 12; ++t)
      offsets.push_back(t);
    OffsetAdjuster::Adjustments adjustments;
    adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
    adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
    adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
    adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
    OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
    const size_t expected_3[] = {
        0,  // this could just as easily be 3
        4,  kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
        15  // this could just as easily be 17
    };
    // Fatal on mismatch; see case 1.
    ASSERT_EQ(std::size(expected_3), offsets.size());
    for (size_t i = 0; i < std::size(expected_3); ++i) {
      EXPECT_EQ(expected_3[i], offsets[i]) << "case 3, offset index " << i;
    }
  }
}
227
228 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
229 // net/base/net_util.{h,cc}. The two tests EscapeTest.AdjustOffset and
230 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively. This
231 // is simply a short, additional test.
TEST(UTFOffsetStringConversionsTest,MergeSequentialAdjustments)232 TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
233 // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
234
235 // Set up |first_adjustments| to
236 // - remove the leading "a"
237 // - combine the "bc" into one character (call it ".")
238 // - remove the "f"
239 // - remove the "tuv"
240 // The resulting string should be ".deghijklmnopqrswxyz".
241 OffsetAdjuster::Adjustments first_adjustments;
242 first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
243 first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
244 first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
245 first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
246
247 // Set up |adjustments_on_adjusted_string| to
248 // - combine the "." character that replaced "bc" with "d" into one character
249 // (call it "?")
250 // - remove the "egh"
251 // - expand the "i" into two characters (call them "12")
252 // - combine the "jkl" into one character (call it "@")
253 // - expand the "z" into two characters (call it "34")
254 // The resulting string should be "?12@mnopqrswxy34".
255 OffsetAdjuster::Adjustments adjustments_on_adjusted_string;
256 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
257 0, 2, 1));
258 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
259 2, 3, 0));
260 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
261 5, 1, 2));
262 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
263 6, 3, 1));
264 adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
265 19, 1, 2));
266
267 // Now merge the adjustments and check the results.
268 OffsetAdjuster::MergeSequentialAdjustments(first_adjustments,
269 &adjustments_on_adjusted_string);
270 // The merged adjustments should look like
271 // - combine abcd into "?"
272 // - note: it's also reasonable for the Merge function to instead produce
273 // two adjustments instead of this, one to remove a and another to
274 // combine bcd into "?". This test verifies the current behavior.
275 // - remove efgh
276 // - expand i into "12"
277 // - combine jkl into "@"
278 // - remove tuv
279 // - expand z into "34"
280 ASSERT_EQ(6u, adjustments_on_adjusted_string.size());
281 EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset);
282 EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length);
283 EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length);
284 EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset);
285 EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length);
286 EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length);
287 EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset);
288 EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length);
289 EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length);
290 EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset);
291 EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length);
292 EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length);
293 EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset);
294 EXPECT_EQ(3u, adjustments_on_adjusted_string[4].original_length);
295 EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length);
296 EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset);
297 EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length);
298 EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length);
299 }
300
301 } // namespace base
302