1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include "icing/util/character-iterator.h"
16
17 #include <cstring>
18 #include <string_view>
19
20 #include "gmock/gmock.h"
21 #include "gtest/gtest.h"
22 #include "icing/testing/common-matchers.h"
23 #include "icing/testing/icu-i18n-test-utils.h"
24
25 namespace icing {
26 namespace lib {
27
28 namespace {
29
30 using ::testing::Eq;
31 using ::testing::IsFalse;
32 using ::testing::IsTrue;
33
// A CharacterIterator built with the default constructor is expected to report
// is_valid() == false.
TEST(CharacterIteratorTest, DefaultInstanceShouldBeInvalid) {
  CharacterIterator default_constructed;

  EXPECT_THAT(default_constructed.is_valid(), IsFalse());
}
38
// An iterator over a zero-length view is expected to be valid and to report
// the null character as its current character.
TEST(CharacterIteratorTest, EmptyText) {
  constexpr std::string_view kText = "¿Dónde está la biblioteca?";
  // Zero-length view that still points at the first byte of kText.
  std::string_view zero_length_view = kText.substr(0, 0);

  CharacterIterator cursor(zero_length_view);

  EXPECT_THAT(cursor.is_valid(), IsTrue());
  EXPECT_THAT(cursor.GetCurrentChar(), Eq(0));
}
47
// Walks the iterator forward through kText by UTF-8 index, up to and including
// the end of the text, and then rewinds it back to the start. After every move
// the current character and the utf8/utf16/utf32 indices are checked.
TEST(CharacterIteratorTest, BasicUtf8) {
  constexpr std::string_view kText = "¿Dónde está la biblioteca?";

  // Expected iterator states at each stop. Every stop is checked once on the
  // way forward and once on the way back, so the matchers are built up front.
  const auto at_text_start =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/0,
                              /*expected_utf16_index=*/0,
                              /*expected_utf32_index=*/0);
  const auto at_o_acute =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
                              /*expected_utf16_index=*/2,
                              /*expected_utf32_index=*/2);
  const auto at_letter_b =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
                              /*expected_utf16_index=*/15,
                              /*expected_utf32_index=*/15);
  const auto at_question_mark =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
                              /*expected_utf16_index=*/25,
                              /*expected_utf32_index=*/25);
  const auto at_text_end =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/29,
                              /*expected_utf16_index=*/26,
                              /*expected_utf32_index=*/26);

  CharacterIterator cursor(kText);
  EXPECT_THAT(cursor.is_valid(), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("¿"));

  // ---- Forward pass ----

  // Utf8 index 4 is requested, but the expected resulting utf8 index is 3
  // (see at_o_acute).
  EXPECT_THAT(cursor.AdvanceToUtf8(4), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("ó"));
  EXPECT_THAT(cursor, at_o_acute);

  EXPECT_THAT(cursor.AdvanceToUtf8(18), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("b"));
  EXPECT_THAT(cursor, at_letter_b);

  EXPECT_THAT(cursor.AdvanceToUtf8(28), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("?"));
  EXPECT_THAT(cursor, at_question_mark);

  // Moving to the end of the string is permitted; the current character is
  // then expected to be the null character.
  EXPECT_THAT(cursor.AdvanceToUtf8(29), IsTrue());
  EXPECT_THAT(cursor.GetCurrentChar(), Eq(0));
  EXPECT_THAT(cursor, at_text_end);

  // ---- Backward pass ----

  EXPECT_THAT(cursor.RewindToUtf8(28), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("?"));
  EXPECT_THAT(cursor, at_question_mark);

  EXPECT_THAT(cursor.RewindToUtf8(18), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("b"));
  EXPECT_THAT(cursor, at_letter_b);

  // As in the forward pass, utf8 index 4 is requested and index 3 is expected.
  EXPECT_THAT(cursor.RewindToUtf8(4), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("ó"));
  EXPECT_THAT(cursor, at_o_acute);

  EXPECT_THAT(cursor.RewindToUtf8(0), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("¿"));
  EXPECT_THAT(cursor, at_text_start);
}
113
// Walks the iterator forward through kText by UTF-16 index, up to and
// including the end of the text, and then rewinds it back to the start. After
// every move the current character and the utf8/utf16/utf32 indices are
// checked.
TEST(CharacterIteratorTest, BasicUtf16) {
  constexpr std::string_view kText = "¿Dónde está la biblioteca?";

  // Expected iterator states at each stop. Every stop is checked once on the
  // way forward and once on the way back, so the matchers are built up front.
  const auto at_text_start =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/0,
                              /*expected_utf16_index=*/0,
                              /*expected_utf32_index=*/0);
  const auto at_o_acute =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
                              /*expected_utf16_index=*/2,
                              /*expected_utf32_index=*/2);
  const auto at_letter_b =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
                              /*expected_utf16_index=*/15,
                              /*expected_utf32_index=*/15);
  const auto at_question_mark =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
                              /*expected_utf16_index=*/25,
                              /*expected_utf32_index=*/25);
  const auto at_text_end =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/29,
                              /*expected_utf16_index=*/26,
                              /*expected_utf32_index=*/26);

  CharacterIterator cursor(kText);
  EXPECT_THAT(cursor.is_valid(), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("¿"));

  // ---- Forward pass ----

  EXPECT_THAT(cursor.AdvanceToUtf16(2), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("ó"));
  EXPECT_THAT(cursor, at_o_acute);

  EXPECT_THAT(cursor.AdvanceToUtf16(15), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("b"));
  EXPECT_THAT(cursor, at_letter_b);

  EXPECT_THAT(cursor.AdvanceToUtf16(25), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("?"));
  EXPECT_THAT(cursor, at_question_mark);

  // Moving to the end of the string is permitted; the current character is
  // then expected to be the null character.
  EXPECT_THAT(cursor.AdvanceToUtf16(26), IsTrue());
  EXPECT_THAT(cursor.GetCurrentChar(), Eq(0));
  EXPECT_THAT(cursor, at_text_end);

  // ---- Backward pass ----

  EXPECT_THAT(cursor.RewindToUtf16(25), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("?"));
  EXPECT_THAT(cursor, at_question_mark);

  EXPECT_THAT(cursor.RewindToUtf16(15), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("b"));
  EXPECT_THAT(cursor, at_letter_b);

  EXPECT_THAT(cursor.RewindToUtf16(2), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("ó"));
  EXPECT_THAT(cursor, at_o_acute);

  EXPECT_THAT(cursor.RewindToUtf16(0), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("¿"));
  EXPECT_THAT(cursor, at_text_start);
}
179
// Walks the iterator forward through kText by UTF-32 index, up to and
// including the end of the text, and then rewinds it back to the start. After
// every move the current character and the utf8/utf16/utf32 indices are
// checked.
TEST(CharacterIteratorTest, BasicUtf32) {
  constexpr std::string_view kText = "¿Dónde está la biblioteca?";

  // Expected iterator states at each stop. Every stop is checked once on the
  // way forward and once on the way back, so the matchers are built up front.
  const auto at_text_start =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/0,
                              /*expected_utf16_index=*/0,
                              /*expected_utf32_index=*/0);
  const auto at_o_acute =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
                              /*expected_utf16_index=*/2,
                              /*expected_utf32_index=*/2);
  const auto at_letter_b =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
                              /*expected_utf16_index=*/15,
                              /*expected_utf32_index=*/15);
  const auto at_question_mark =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
                              /*expected_utf16_index=*/25,
                              /*expected_utf32_index=*/25);
  const auto at_text_end =
      EqualsCharacterIterator(kText, /*expected_utf8_index=*/29,
                              /*expected_utf16_index=*/26,
                              /*expected_utf32_index=*/26);

  CharacterIterator cursor(kText);
  EXPECT_THAT(cursor.is_valid(), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("¿"));

  // ---- Forward pass ----

  EXPECT_THAT(cursor.AdvanceToUtf32(2), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("ó"));
  EXPECT_THAT(cursor, at_o_acute);

  EXPECT_THAT(cursor.AdvanceToUtf32(15), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("b"));
  EXPECT_THAT(cursor, at_letter_b);

  EXPECT_THAT(cursor.AdvanceToUtf32(25), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("?"));
  EXPECT_THAT(cursor, at_question_mark);

  // Moving to the end of the string is permitted; the current character is
  // then expected to be the null character.
  EXPECT_THAT(cursor.AdvanceToUtf32(26), IsTrue());
  EXPECT_THAT(cursor.GetCurrentChar(), Eq(0));
  EXPECT_THAT(cursor, at_text_end);

  // ---- Backward pass ----

  EXPECT_THAT(cursor.RewindToUtf32(25), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("?"));
  EXPECT_THAT(cursor, at_question_mark);

  EXPECT_THAT(cursor.RewindToUtf32(15), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("b"));
  EXPECT_THAT(cursor, at_letter_b);

  EXPECT_THAT(cursor.RewindToUtf32(2), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("ó"));
  EXPECT_THAT(cursor, at_o_acute);

  EXPECT_THAT(cursor.RewindToUtf32(0), IsTrue());
  EXPECT_THAT(UCharToString(cursor.GetCurrentChar()), Eq("¿"));
  EXPECT_THAT(cursor, at_text_start);
}
245
// Advancing across an invalid byte sequence is expected to fail for every
// index flavour (utf8, utf16 and utf32).
TEST(CharacterIteratorTest, InvalidUtf) {
  // The "\255" escape in the middle of the text is an invalid sequence.
  constexpr std::string_view kMalformedText = "foo \255 bar";
  // Index of the 'b' in "bar", which sits after the invalid sequence.
  constexpr int kIndexOfB = 6;

  CharacterIterator cursor(kMalformedText);
  EXPECT_THAT(cursor.is_valid(), IsTrue());

  // Each attempt to reach the 'b' must fail. A failed advance leaves the
  // iterator in an undefined state, so neither its indices nor
  // GetCurrentChar() are inspected afterwards.
  EXPECT_THAT(cursor.AdvanceToUtf8(kIndexOfB), IsFalse());
  EXPECT_THAT(cursor.AdvanceToUtf16(kIndexOfB), IsFalse());
  EXPECT_THAT(cursor.AdvanceToUtf32(kIndexOfB), IsFalse());
}
259
// Exercises AdvanceToUtf8() on a zero-length view that points into the middle
// of an uninitialized buffer. Index 0 must succeed; any positive or negative
// index must fail. None of the calls may read the surrounding uninitialized
// bytes (verified by running under "--config=msan").
//
// Every case uses its own freshly constructed iterator so that no assertion
// runs on an iterator that has already been through a failed advance.
TEST(CharacterIteratorTest, AdvanceToUtf8_emptyText) {
  // Create an uninitialized buffer.
  char buf[30];

  // Create a string_view that points to the 10-th byte of the buffer with
  // length 0.
  std::string_view text(buf + 10, 0);

  CharacterIterator iter0(text);
  // Advance to utf8 index 0. This should succeed without memory or
  // use-of-uninitialized-value errors (tested with "--config=msan").
  EXPECT_THAT(iter0.AdvanceToUtf8(0), IsTrue());
  // We should get null character after succeeding.
  EXPECT_THAT(iter0.GetCurrentChar(), Eq(0));

  // Advance to utf8 indices with positive values. This should fail cleanly
  // without memory or use-of-uninitialized-value errors (tested with
  // "--config=msan").
  CharacterIterator iter1(text);
  EXPECT_THAT(iter1.AdvanceToUtf8(1), IsFalse());

  CharacterIterator iter2(text);
  EXPECT_THAT(iter2.AdvanceToUtf8(2), IsFalse());

  // Advance to utf8 indices with negative values. This should fail cleanly
  // without memory or use-of-uninitialized-value errors (tested with
  // "--config=msan").
  CharacterIterator iter3(text);
  EXPECT_THAT(iter3.AdvanceToUtf8(-1), IsFalse());

  CharacterIterator iter4(text);
  EXPECT_THAT(iter4.AdvanceToUtf8(-2), IsFalse());
}
293
// AdvanceToUtf8() with a negative index on non-empty text must fail, and must
// not read the uninitialized bytes that precede the text in the buffer
// (verified by running under "--config=msan").
TEST(CharacterIteratorTest, AdvanceToUtf8_negativeIndex) {
  constexpr std::string_view kText = "abcdefghijklmnopqrstuvwxyz";
  // Number of bytes deliberately left uninitialized in front of the text.
  constexpr int kLeadingGap = 4;

  // 30-byte buffer: the first 4 bytes stay uninitialized and the remaining 26
  // bytes receive a copy of kText.
  char buf[30];
  char* const payload = buf + kLeadingGap;
  std::memcpy(payload, kText.data(), kText.size());

  const std::string_view text(payload, kText.size());

  // Each negative index is tried on its own fresh iterator and must be
  // rejected.
  CharacterIterator minus_one_cursor(text);
  EXPECT_THAT(minus_one_cursor.AdvanceToUtf8(-1), IsFalse());

  CharacterIterator minus_two_cursor(text);
  EXPECT_THAT(minus_two_cursor.AdvanceToUtf8(-2), IsFalse());
}
312
// AdvanceToUtf8() to an index equal to the text length must succeed and yield
// the null character, without reading the uninitialized bytes that follow the
// text in the buffer (verified by running under "--config=msan").
TEST(CharacterIteratorTest, AdvanceToUtf8_indexEqCharLength) {
  constexpr std::string_view kText = "abcdefghijklmnopqrstuvwxyz";

  // 30-byte buffer: the first 26 bytes receive a copy of kText and the last 4
  // bytes are deliberately left uninitialized.
  char buf[30];
  std::memcpy(&buf[0], kText.data(), kText.size());

  const std::string_view text(&buf[0], kText.size());

  CharacterIterator cursor(text);
  // Index == kText.size() is the one-past-the-end position and is allowed.
  EXPECT_THAT(cursor.AdvanceToUtf8(kText.size()), IsTrue());
  // At that position the current character is the null character.
  EXPECT_THAT(cursor.GetCurrentChar(), Eq(0));
}
329
// AdvanceToUtf8() to an index greater than the text length must fail, and
// must not read the uninitialized bytes that follow the text in the buffer
// (verified by running under "--config=msan").
//
// Each out-of-range index is tried on its own freshly constructed iterator so
// that no assertion runs on an iterator that has already been through a failed
// advance.
TEST(CharacterIteratorTest, AdvanceToUtf8_indexGtCharLength) {
  constexpr std::string_view kText = "abcdefghijklmnopqrstuvwxyz";
  // Create a buffer with extra 4 bytes. Copy kText to the first 26 bytes and
  // intentionally leave the last 4 bytes uninitialized.
  char buf[30];
  memcpy(buf, kText.data(), kText.size());

  std::string_view text(buf, kText.size());

  // Advance to utf8 index greater than the length of the string. This should
  // fail cleanly without memory or use-of-uninitialized-value errors
  // (tested with "--config=msan").

  CharacterIterator iter0(text);
  EXPECT_THAT(iter0.AdvanceToUtf8(kText.size() + 1), IsFalse());

  CharacterIterator iter1(text);
  EXPECT_THAT(iter1.AdvanceToUtf8(kText.size() + 2), IsFalse());
}
349
350 } // namespace
351
352 } // namespace lib
353 } // namespace icing
354