// Copyright 2011 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
4 
#include "base/strings/utf_offset_string_conversions.h"

#include <stddef.h>

#include <algorithm>
#include <numeric>
#include <string>
#include <vector>

#include "base/strings/string_piece.h"
#include "testing/gtest/include/gtest/gtest.h"
13 
14 namespace base {
15 
namespace {

// Sentinel used in the expected data of the tests below for an offset that is
// expected to have no valid position after adjustment. The anonymous
// namespace already gives it internal linkage, so no `static` is needed.
constexpr size_t kNpos = std::u16string::npos;

}  // namespace
21 
// Checks how a single offset is remapped when a string is converted between
// UTF-8 and UTF-16. kNpos in the expected column marks an offset that the
// conversion is expected to invalidate (for example one that points into the
// middle of a multi-unit sequence).
TEST(UTFOffsetStringConversionsTest, AdjustOffset) {
  struct UTF8ToUTF16Case {
    const char* utf8;
    size_t input_offset;
    size_t output_offset;
  } utf8_to_utf16_cases[] = {
      {"", 0, 0},
      {"", kNpos, kNpos},
      // Two 3-byte sequences; offset 3 (start of the second) maps to 1.
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 1, kNpos},
      {"\xe4\xbd\xa0\xe5\xa5\xbd", 3, 1},
      // Bytes that encode a lone surrogate (U+DC00) followed by 'z'; the
      // expected data leaves offset 3 unchanged.
      {"\xed\xb0\x80z", 3, 3},
      // A 4-byte sequence between two ASCII characters; it occupies two
      // UTF-16 code units, so later offsets shift down by two.
      {"A\xF0\x90\x8C\x80z", 1, 1},
      {"A\xF0\x90\x8C\x80z", 2, kNpos},
      {"A\xF0\x90\x8C\x80z", 5, 3},
      {"A\xF0\x90\x8C\x80z", 6, 4},
      {"A\xF0\x90\x8C\x80z", kNpos, kNpos},
  };
  for (size_t i = 0; i < std::size(utf8_to_utf16_cases); ++i) {
    const UTF8ToUTF16Case& test_case = utf8_to_utf16_cases[i];
    std::vector<size_t> offsets = {test_case.input_offset};
    UTF8ToUTF16AndAdjustOffsets(test_case.utf8, &offsets);
    // Stream the case index so a failure identifies the offending row.
    EXPECT_EQ(test_case.output_offset, offsets[0]) << i;
  }

  struct UTF16ToUTF8Case {
    char16_t utf16[10];
    size_t input_offset;
    size_t output_offset;
  } utf16_to_utf8_cases[] = {
      {{}, 0, 0},
      // Converted to 3-byte utf-8 sequences
      {{0x5909, 0x63DB}, 3, kNpos},
      {{0x5909, 0x63DB}, 2, 6},
      {{0x5909, 0x63DB}, 1, 3},
      {{0x5909, 0x63DB}, 0, 0},
      // Converted to 2-byte utf-8 sequences
      {{'A', 0x00bc, 0x00be, 'z'}, 1, 1},
      {{'A', 0x00bc, 0x00be, 'z'}, 2, 3},
      {{'A', 0x00bc, 0x00be, 'z'}, 3, 5},
      {{'A', 0x00bc, 0x00be, 'z'}, 4, 6},
      // Surrogate pair
      {{'A', 0xd800, 0xdf00, 'z'}, 1, 1},
      {{'A', 0xd800, 0xdf00, 'z'}, 2, kNpos},
      {{'A', 0xd800, 0xdf00, 'z'}, 3, 5},
      {{'A', 0xd800, 0xdf00, 'z'}, 4, 6},
  };
  for (size_t i = 0; i < std::size(utf16_to_utf8_cases); ++i) {
    const UTF16ToUTF8Case& test_case = utf16_to_utf8_cases[i];
    std::vector<size_t> offsets = {test_case.input_offset};
    UTF16ToUTF8AndAdjustOffsets(test_case.utf16, &offsets);
    EXPECT_EQ(test_case.output_offset, offsets[0]) << i;
  }
}
77 
// Feeds the offsets 0..kItems-1 through OffsetAdjuster::AdjustOffset() with
// no adjustments and a limit of kLimit, then counts how many were not turned
// into kNpos. The expected count is kLimit + 1 (offsets 0 through kLimit),
// regardless of the order in which the offsets are processed.
TEST(UTFOffsetStringConversionsTest, LimitOffsets) {
  const OffsetAdjuster::Adjustments kNoAdjustments;
  const size_t kLimit = 10;
  const size_t kItems = 20;

  // Returns how many entries of |values| are still valid (not kNpos).
  const auto count_unlimited = [](const std::vector<size_t>& values) {
    return static_cast<size_t>(
        std::count_if(values.begin(), values.end(),
                      [](size_t value) { return value != kNpos; }));
  };

  // Ascending order: 0, 1, ..., kItems - 1.
  std::vector<size_t> size_ts;
  for (size_t t = 0; t < kItems; ++t) {
    size_ts.push_back(t);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit);
  }
  EXPECT_EQ(kLimit + 1, count_unlimited(size_ts));

  // Reverse the values in the vector and try again.
  size_ts.clear();
  for (size_t t = kItems; t > 0; --t) {
    size_ts.push_back(t - 1);
    OffsetAdjuster::AdjustOffset(kNoAdjustments, &size_ts.back(), kLimit);
  }
  EXPECT_EQ(kLimit + 1, count_unlimited(size_ts));
}
107 
TEST(UTFOffsetStringConversionsTest,AdjustOffsets)108 TEST(UTFOffsetStringConversionsTest, AdjustOffsets) {
109   // Imagine we have strings as shown in the following cases where the
110   // X's represent encoded characters.
111   // 1: abcXXXdef ==> abcXdef
112   {
113     std::vector<size_t> offsets;
114     for (size_t t = 0; t <= 9; ++t)
115       offsets.push_back(t);
116     OffsetAdjuster::Adjustments adjustments;
117     adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
118     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
119     size_t expected_1[] = {0, 1, 2, 3, kNpos, kNpos, 4, 5, 6, 7};
120     EXPECT_EQ(offsets.size(), std::size(expected_1));
121     for (size_t i = 0; i < std::size(expected_1); ++i)
122       EXPECT_EQ(expected_1[i], offsets[i]);
123   }
124 
125   // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
126   {
127     std::vector<size_t> offsets;
128     for (size_t t = 0; t <= 23; ++t)
129       offsets.push_back(t);
130     OffsetAdjuster::Adjustments adjustments;
131     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
132     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
133     adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
134     adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
135     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
136     size_t expected_2[] = {
137       0, kNpos, kNpos, 1, 2, kNpos, kNpos, kNpos, 4, 5, 6, kNpos, kNpos, kNpos,
138       kNpos, kNpos, kNpos, 10, 11, 12, 13, kNpos, kNpos, 14
139     };
140     EXPECT_EQ(offsets.size(), std::size(expected_2));
141     for (size_t i = 0; i < std::size(expected_2); ++i)
142       EXPECT_EQ(expected_2[i], offsets[i]);
143   }
144 
145   // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
146   {
147     std::vector<size_t> offsets;
148     for (size_t t = 0; t <= 17; ++t)
149       offsets.push_back(t);
150     OffsetAdjuster::Adjustments adjustments;
151     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
152     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
153     adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
154     adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
155     OffsetAdjuster::AdjustOffsets(adjustments, &offsets);
156     size_t expected_3[] = {
157       0, kNpos, kNpos, 0, 1, kNpos, kNpos, kNpos, 5, 6, 7, 8, kNpos, kNpos, 11,
158       12, kNpos, 12
159     };
160     EXPECT_EQ(offsets.size(), std::size(expected_3));
161     for (size_t i = 0; i < std::size(expected_3); ++i)
162       EXPECT_EQ(expected_3[i], offsets[i]);
163   }
164 }
165 
TEST(UTFOffsetStringConversionsTest,UnadjustOffsets)166 TEST(UTFOffsetStringConversionsTest, UnadjustOffsets) {
167   // Imagine we have strings as shown in the following cases where the
168   // X's represent encoded characters.
169   // 1: abcXXXdef ==> abcXdef
170   {
171     std::vector<size_t> offsets;
172     for (size_t t = 0; t <= 7; ++t)
173       offsets.push_back(t);
174     OffsetAdjuster::Adjustments adjustments;
175     adjustments.push_back(OffsetAdjuster::Adjustment(3, 3, 1));
176     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
177     size_t expected_1[] = {0, 1, 2, 3, 6, 7, 8, 9};
178     EXPECT_EQ(offsets.size(), std::size(expected_1));
179     for (size_t i = 0; i < std::size(expected_1); ++i)
180       EXPECT_EQ(expected_1[i], offsets[i]);
181   }
182 
183   // 2: XXXaXXXXbcXXXXXXXdefXXX ==> XaXXbcXXXXdefX
184   {
185     std::vector<size_t> offsets;
186     for (size_t t = 0; t <= 14; ++t)
187       offsets.push_back(t);
188     OffsetAdjuster::Adjustments adjustments;
189     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 1));
190     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 2));
191     adjustments.push_back(OffsetAdjuster::Adjustment(10, 7, 4));
192     adjustments.push_back(OffsetAdjuster::Adjustment(20, 3, 1));
193     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
194     size_t expected_2[] = {
195       0, 3, 4, kNpos, 8, 9, 10, kNpos, kNpos, kNpos, 17, 18, 19, 20, 23
196     };
197     EXPECT_EQ(offsets.size(), std::size(expected_2));
198     for (size_t i = 0; i < std::size(expected_2); ++i)
199       EXPECT_EQ(expected_2[i], offsets[i]);
200   }
201 
202   // 3: XXXaXXXXbcdXXXeXX ==> aXXXXbcdXXXe
203   {
204     std::vector<size_t> offsets;
205     for (size_t t = 0; t <= 12; ++t)
206       offsets.push_back(t);
207     OffsetAdjuster::Adjustments adjustments;
208     adjustments.push_back(OffsetAdjuster::Adjustment(0, 3, 0));
209     adjustments.push_back(OffsetAdjuster::Adjustment(4, 4, 4));
210     adjustments.push_back(OffsetAdjuster::Adjustment(11, 3, 3));
211     adjustments.push_back(OffsetAdjuster::Adjustment(15, 2, 0));
212     OffsetAdjuster::UnadjustOffsets(adjustments, &offsets);
213     size_t expected_3[] = {
214       0,  // this could just as easily be 3
215       4, kNpos, kNpos, kNpos, 8, 9, 10, 11, kNpos, kNpos, 14,
216       15  // this could just as easily be 17
217     };
218     EXPECT_EQ(offsets.size(), std::size(expected_3));
219     for (size_t i = 0; i < std::size(expected_3); ++i)
220       EXPECT_EQ(expected_3[i], offsets[i]);
221   }
222 }
223 
224 // MergeSequentialAdjustments is used by net/base/escape.{h,cc} and
225 // net/base/net_util.{h,cc}.  The two tests EscapeTest.AdjustOffset and
226 // NetUtilTest.FormatUrlWithOffsets test its behavior extensively.  This
227 // is simply a short, additional test.
TEST(UTFOffsetStringConversionsTest,MergeSequentialAdjustments)228 TEST(UTFOffsetStringConversionsTest, MergeSequentialAdjustments) {
229   // Pretend the input string is "abcdefghijklmnopqrstuvwxyz".
230 
231   // Set up |first_adjustments| to
232   // - remove the leading "a"
233   // - combine the "bc" into one character (call it ".")
234   // - remove the "f"
235   // - remove the "tuv"
236   // The resulting string should be ".deghijklmnopqrswxyz".
237   OffsetAdjuster::Adjustments first_adjustments;
238   first_adjustments.push_back(OffsetAdjuster::Adjustment(0, 1, 0));
239   first_adjustments.push_back(OffsetAdjuster::Adjustment(1, 2, 1));
240   first_adjustments.push_back(OffsetAdjuster::Adjustment(5, 1, 0));
241   first_adjustments.push_back(OffsetAdjuster::Adjustment(19, 3, 0));
242 
243   // Set up |adjustments_on_adjusted_string| to
244   // - combine the "." character that replaced "bc" with "d" into one character
245   //   (call it "?")
246   // - remove the "egh"
247   // - expand the "i" into two characters (call them "12")
248   // - combine the "jkl" into one character (call it "@")
249   // - expand the "z" into two characters (call it "34")
250   // The resulting string should be "?12@mnopqrswxy34".
251   OffsetAdjuster::Adjustments adjustments_on_adjusted_string;
252   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
253       0, 2, 1));
254   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
255       2, 3, 0));
256   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
257       5, 1, 2));
258   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
259       6, 3, 1));
260   adjustments_on_adjusted_string.push_back(OffsetAdjuster::Adjustment(
261       19, 1, 2));
262 
263   // Now merge the adjustments and check the results.
264   OffsetAdjuster::MergeSequentialAdjustments(first_adjustments,
265                                              &adjustments_on_adjusted_string);
266   // The merged adjustments should look like
267   // - combine abcd into "?"
268   //   - note: it's also reasonable for the Merge function to instead produce
269   //     two adjustments instead of this, one to remove a and another to
270   //     combine bcd into "?".  This test verifies the current behavior.
271   // - remove efgh
272   // - expand i into "12"
273   // - combine jkl into "@"
274   // - remove tuv
275   // - expand z into "34"
276   ASSERT_EQ(6u, adjustments_on_adjusted_string.size());
277   EXPECT_EQ(0u, adjustments_on_adjusted_string[0].original_offset);
278   EXPECT_EQ(4u, adjustments_on_adjusted_string[0].original_length);
279   EXPECT_EQ(1u, adjustments_on_adjusted_string[0].output_length);
280   EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_offset);
281   EXPECT_EQ(4u, adjustments_on_adjusted_string[1].original_length);
282   EXPECT_EQ(0u, adjustments_on_adjusted_string[1].output_length);
283   EXPECT_EQ(8u, adjustments_on_adjusted_string[2].original_offset);
284   EXPECT_EQ(1u, adjustments_on_adjusted_string[2].original_length);
285   EXPECT_EQ(2u, adjustments_on_adjusted_string[2].output_length);
286   EXPECT_EQ(9u, adjustments_on_adjusted_string[3].original_offset);
287   EXPECT_EQ(3u, adjustments_on_adjusted_string[3].original_length);
288   EXPECT_EQ(1u, adjustments_on_adjusted_string[3].output_length);
289   EXPECT_EQ(19u, adjustments_on_adjusted_string[4].original_offset);
290   EXPECT_EQ(3u, adjustments_on_adjusted_string[4].original_length);
291   EXPECT_EQ(0u, adjustments_on_adjusted_string[4].output_length);
292   EXPECT_EQ(25u, adjustments_on_adjusted_string[5].original_offset);
293   EXPECT_EQ(1u, adjustments_on_adjusted_string[5].original_length);
294   EXPECT_EQ(2u, adjustments_on_adjusted_string[5].output_length);
295 }
296 
297 }  // namespace base
298