• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "base/strings/utf_offset_string_conversions.h"
11 
12 #include <stddef.h>
13 
14 #include <algorithm>
15 
16 #include "testing/gtest/include/gtest/gtest.h"
17 
18 namespace base {
19 
namespace {

// Shorthand for std::u16string::npos. The tests in this file use it both as
// an input offset and as the expected value of offsets after adjustment.
// The unnamed namespace already gives it internal linkage, so no `static`.
constexpr size_t kNpos = std::u16string::npos;

}  // namespace
25 
// Converts each sample string with a single offset attached and checks the
// value that offset holds after the conversion, in both directions
// (UTF-8 -> UTF-16 and UTF-16 -> UTF-8).
TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
  struct UTF8ToUTF16Case {
    const char* utf8;
    size_t input_offset;
    size_t output_offset;
  };
  const UTF8ToUTF16Case utf8_to_utf16_cases[] = {
      {"", 0, 0},
      {"", kNpos, kNpos},
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
      {"\xed\xb0\x80z", 3, 3},
      {"A\xF0\x90\x8C\x80z", 1, 1},
      {"A\xF0\x90\x8C\x80z", 2, kNpos},
      {"A\xF0\x90\x8C\x80z", 5, 3},
      {"A\xF0\x90\x8C\x80z", 6, 4},
      {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
  };
  for (size_t i = 0; i < std::size(utf8_to_utf16_cases); ++i) {
    const UTF8ToUTF16Case& test_case = utf8_to_utf16_cases[i];
    std::vector<size_t> offsets(1, test_case.input_offset);
    UTF8ToUTF16AndAdjustOffsets(test_case.utf8, &offsets);
    // Stream the case index so a failure points at the offending table row.
    EXPECT_EQ(test_case.output_offset, offsets[0]) << "UTF-8 case " << i;
  }

  struct UTF16ToUTF8Case {
    char16_t utf16[10];
    size_t input_offset;
    size_t output_offset;
  };
  const UTF16ToUTF8Case utf16_to_utf8_cases[] = {
      {{}, 0, 0},
      // Converted to 3-byte utf-8 sequences
      {{0x5909, 0x63DB}, 3, kNpos},
      {{0x5909, 0x63DB}, 2, 6},
      {{0x5909, 0x63DB}, 1, 3},
      {{0x5909, 0x63DB}, 0, 0},
      // Converted to 2-byte utf-8 sequences
      {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
      {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
      {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
      {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
      // Surrogate pair
      {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
      {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
      {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
      {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
  };
  for (size_t i = 0; i < std::size(utf16_to_utf8_cases); ++i) {
    const UTF16ToUTF8Case& test_case = utf16_to_utf8_cases[i];
    std::vector<size_t> offsets(1, test_case.input_offset);
    UTF16ToUTF8AndAdjustOffsets(test_case.utf16, &offsets);
    EXPECT_EQ(test_case.output_offset, offsets[0]) << "UTF-16 case " << i;
  }
}
81 
// Runs OffsetAdjuster::AdjustOffset() with an empty adjustment list and a
// limit of 10 over the offsets 0..19, once in ascending and once in
// descending order, and expects exactly 11 offsets to stay different from
// kNpos each time.
TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
  const OffsetAdjuster::Adjustments kNoAdjustments;
  const size_t kLimit = 10;
  const size_t kItems = 20;

  // Number of entries in |values| that were not replaced by kNpos.
  const auto count_surviving = [](const std::vector<size_t>& values) {
    return static_cast<size_t>(
        std::count_if(values.begin(), values.end(),
                      [](size_t value) { return value != kNpos; }));
  };

  std::vector<size_t> ascending;
  for (size_t value = 0; value < kItems; ++value) {
    ascending.push_back(value);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &ascending.back(), kLimit);
  }
  EXPECT_EQ(11U, count_surviving(ascending));

  // Same values, fed in reverse order.
  std::vector<size_t> descending;
  for (size_t remaining = kItems; remaining > 0; --remaining) {
    descending.push_back(remaining - 1);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &descending.back(), kLimit);
  }
  EXPECT_EQ(11U, count_surviving(descending));
}
111 
TEST(UTFOffsetStringConversionsTest,AdjustOffsets)112 TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
113   // Imagine we have strings as shown in the following cases where the
114   // X's represent encoded characters.
115   // 1: abcXXXdef ==> abcXdef
116   {
117     std::vector<size_t> offsets;
118     for (size_t t = 0; t <= 9; ++t)
119       offsets.push_back(t);
120     OffsetAdjuster::Adjustments adjustments;
121     adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
122     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
123     size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
124     EXPECT_EQ(offsets.size(), std::size(expected_1));
125     for (size_t i = 0; i < std::size(expected_1); ++i)
126       EXPECT_EQ(expected_1[i], offsets[i]);
127   }
128 
129   // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
130   {
131     std::vector<size_t> offsets;
132     for (size_t t = 0; t <= 23; ++t)
133       offsets.push_back(t);
134     OffsetAdjuster::Adjustments adjustments;
135     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
136     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
137     adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
138     adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
139     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
140     size_t expected_2[] = {
141       0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos,
142       kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14
143     };
144     EXPECT_EQ(offsets.size(), std::size(expected_2));
145     for (size_t i = 0; i < std::size(expected_2); ++i)
146       EXPECT_EQ(expected_2[i], offsets[i]);
147   }
148 
149   // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
150   {
151     std::vector<size_t> offsets;
152     for (size_t t = 0; t <= 17; ++t)
153       offsets.push_back(t);
154     OffsetAdjuster::Adjustments adjustments;
155     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
156     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
157     adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
158     adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
159     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
160     size_t expected_3[] = {
161       0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11,
162       12, kNpos, 12
163     };
164     EXPECT_EQ(offsets.size(), std::size(expected_3));
165     for (size_t i = 0; i < std::size(expected_3); ++i)
166       EXPECT_EQ(expected_3[i], offsets[i]);
167   }
168 }
169 
TEST(UTFOffsetStringConversionsTest,UnadjustOffsets)170 TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
171   // Imagine we have strings as shown in the following cases where the
172   // X's represent encoded characters.
173   // 1: abcXXXdef ==> abcXdef
174   {
175     std::vector<size_t> offsets;
176     for (size_t t = 0; t <= 7; ++t)
177       offsets.push_back(t);
178     OffsetAdjuster::Adjustments adjustments;
179     adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
180     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
181     size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
182     EXPECT_EQ(offsets.size(), std::size(expected_1));
183     for (size_t i = 0; i < std::size(expected_1); ++i)
184       EXPECT_EQ(expected_1[i], offsets[i]);
185   }
186 
187   // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
188   {
189     std::vector<size_t> offsets;
190     for (size_t t = 0; t <= 14; ++t)
191       offsets.push_back(t);
192     OffsetAdjuster::Adjustments adjustments;
193     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
194     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
195     adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
196     adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
197     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
198     size_t expected_2[] = {
199       0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23
200     };
201     EXPECT_EQ(offsets.size(), std::size(expected_2));
202     for (size_t i = 0; i < std::size(expected_2); ++i)
203       EXPECT_EQ(expected_2[i], offsets[i]);
204   }
205 
206   // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
207   {
208     std::vector<size_t> offsets;
209     for (size_t t = 0; t <= 12; ++t)
210       offsets.push_back(t);
211     OffsetAdjuster::Adjustments adjustments;
212     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
213     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
214     adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
215     adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
216     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
217     size_t expected_3[] = {
218       0,  // this could just as easily be 3
219       4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
220       15  // this could just as easily be 17
221     };
222     EXPECT_EQ(offsets.size(), std::size(expected_3));
223     for (size_t i = 0; i < std::size(expected_3); ++i)
224       EXPECT_EQ(expected_3[i], offsets[i]);
225   }
226 }
227 
228 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
229 // net/base/net_util.{h,cc}.  The two tests EscapeTest.AdjustOffset and
230 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively.  This
231 // is simply a short, additional test.
TEST(UTFOffsetStringConversionsTest,MergeSequentialAdjustments)232 TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
233   // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
234 
235   // Set up |first_adjustments| to
236   // - remove the leading "a"
237   // - combine the "bc" into one character (call it ".")
238   // - remove the "f"
239   // - remove the "tuv"
240   // The resulting string should be ".deghijklmnopqrswxyz".
241   OffsetAdjuster::Adjustments first_adjustments;
242   first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
243   first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
244   first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
245   first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
246 
247   // Set up |adjustments_on_adjusted_string| to
248   // - combine the "." character that replaced "bc" with "d" into one character
249   //   (call it "?")
250   // - remove the "egh"
251   // - expand the "i" into two characters (call them "12")
252   // - combine the "jkl" into one character (call it "@")
253   // - expand the "z" into two characters (call it "34")
254   // The resulting string should be "?12@mnopqrswxy34".
255   OffsetAdjuster::Adjustments adjustments_on_adjusted_string;
256   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
257       0, 2, 1));
258   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
259       2, 3, 0));
260   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
261       5, 1, 2));
262   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
263       6, 3, 1));
264   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
265       19, 1, 2));
266 
267   // Now merge the adjustments and check the results.
268   OffsetAdjuster::MergeSequentialAdjustments(first_adjustments,
269                                              &adjustments_on_adjusted_string);
270   // The merged adjustments should look like
271   // - combine abcd into "?"
272   //   - note: it's also reasonable for the Merge function to instead produce
273   //     two adjustments instead of this, one to remove a and another to
274   //     combine bcd into "?".  This test verifies the current behavior.
275   // - remove efgh
276   // - expand i into "12"
277   // - combine jkl into "@"
278   // - remove tuv
279   // - expand z into "34"
280   ASSERT_EQ(6u, adjustments_on_adjusted_string.size());
281   EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset);
282   EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length);
283   EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length);
284   EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset);
285   EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length);
286   EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length);
287   EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset);
288   EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length);
289   EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length);
290   EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset);
291   EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length);
292   EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length);
293   EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset);
294   EXPECT_EQ(3u, adjustments_on_adjusted_string[4].original_length);
295   EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length);
296   EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset);
297   EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length);
298   EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length);
299 }
300 
301 }  // namespace base
302