• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (C) 2019 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "icing/util/character-iterator.h"
16 
17 #include <cstring>
18 #include <string_view>
19 
20 #include "gmock/gmock.h"
21 #include "gtest/gtest.h"
22 #include "icing/testing/common-matchers.h"
23 #include "icing/testing/icu-i18n-test-utils.h"
24 
25 namespace icing {
26 namespace lib {
27 
28 namespace {
29 
30 using ::testing::Eq;
31 using ::testing::IsFalse;
32 using ::testing::IsTrue;
33 
// A default-constructed CharacterIterator is expected to report itself as
// invalid.
TEST(CharacterIteratorTest, DefaultInstanceShouldBeInvalid) {
  CharacterIterator default_constructed;
  const auto validity = default_constructed.is_valid();
  EXPECT_THAT(validity, IsFalse());
}
38 
// An iterator over a zero-length view is expected to be valid and to yield the
// null character, even though the view points at the start of real text.
TEST(CharacterIteratorTest, EmptyText) {
  constexpr std::string_view kText = "¿Dónde está la biblioteca?";
  // Zero-length window that still points at the first byte of kText.
  const std::string_view zero_length_view = kText.substr(0, 0);

  CharacterIterator iterator(zero_length_view);

  const auto validity = iterator.is_valid();
  EXPECT_THAT(validity, IsTrue());

  const auto current = iterator.GetCurrentChar();
  EXPECT_THAT(current, Eq(0));
}
47 
TEST(CharacterIteratorTest,BasicUtf8)48 TEST(CharacterIteratorTest, BasicUtf8) {
49   constexpr std::string_view kText = "¿Dónde está la biblioteca?";
50 
51   CharacterIterator iterator(kText);
52   EXPECT_THAT(iterator.is_valid(), IsTrue());
53   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
54 
55   EXPECT_THAT(iterator.AdvanceToUtf8(4), IsTrue());
56   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
57   EXPECT_THAT(iterator,
58               EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
59                                       /*expected_utf16_index=*/2,
60                                       /*expected_utf32_index=*/2));
61 
62   EXPECT_THAT(iterator.AdvanceToUtf8(18), IsTrue());
63   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
64   EXPECT_THAT(iterator,
65               EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
66                                       /*expected_utf16_index=*/15,
67                                       /*expected_utf32_index=*/15));
68 
69   EXPECT_THAT(iterator.AdvanceToUtf8(28), IsTrue());
70   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
71   EXPECT_THAT(iterator,
72               EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
73                                       /*expected_utf16_index=*/25,
74                                       /*expected_utf32_index=*/25));
75 
76   // Advance to the end of the string. This is allowed and we should get null
77   // character.
78   EXPECT_THAT(iterator.AdvanceToUtf8(29), IsTrue());
79   EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
80   EXPECT_THAT(iterator,
81               EqualsCharacterIterator(kText, /*expected_utf8_index=*/29,
82                                       /*expected_utf16_index=*/26,
83                                       /*expected_utf32_index=*/26));
84 
85   EXPECT_THAT(iterator.RewindToUtf8(28), IsTrue());
86   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
87   EXPECT_THAT(iterator,
88               EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
89                                       /*expected_utf16_index=*/25,
90                                       /*expected_utf32_index=*/25));
91 
92   EXPECT_THAT(iterator.RewindToUtf8(18), IsTrue());
93   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
94   EXPECT_THAT(iterator,
95               EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
96                                       /*expected_utf16_index=*/15,
97                                       /*expected_utf32_index=*/15));
98 
99   EXPECT_THAT(iterator.RewindToUtf8(4), IsTrue());
100   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
101   EXPECT_THAT(iterator,
102               EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
103                                       /*expected_utf16_index=*/2,
104                                       /*expected_utf32_index=*/2));
105 
106   EXPECT_THAT(iterator.RewindToUtf8(0), IsTrue());
107   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
108   EXPECT_THAT(iterator,
109               EqualsCharacterIterator(kText, /*expected_utf8_index=*/0,
110                                       /*expected_utf16_index=*/0,
111                                       /*expected_utf32_index=*/0));
112 }
113 
TEST(CharacterIteratorTest,BasicUtf16)114 TEST(CharacterIteratorTest, BasicUtf16) {
115   constexpr std::string_view kText = "¿Dónde está la biblioteca?";
116 
117   CharacterIterator iterator(kText);
118   EXPECT_THAT(iterator.is_valid(), IsTrue());
119   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
120 
121   EXPECT_THAT(iterator.AdvanceToUtf16(2), IsTrue());
122   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
123   EXPECT_THAT(iterator,
124               EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
125                                       /*expected_utf16_index=*/2,
126                                       /*expected_utf32_index=*/2));
127 
128   EXPECT_THAT(iterator.AdvanceToUtf16(15), IsTrue());
129   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
130   EXPECT_THAT(iterator,
131               EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
132                                       /*expected_utf16_index=*/15,
133                                       /*expected_utf32_index=*/15));
134 
135   EXPECT_THAT(iterator.AdvanceToUtf16(25), IsTrue());
136   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
137   EXPECT_THAT(iterator,
138               EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
139                                       /*expected_utf16_index=*/25,
140                                       /*expected_utf32_index=*/25));
141 
142   // Advance to the end of the string. This is allowed and we should get null
143   // character.
144   EXPECT_THAT(iterator.AdvanceToUtf16(26), IsTrue());
145   EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
146   EXPECT_THAT(iterator,
147               EqualsCharacterIterator(kText, /*expected_utf8_index=*/29,
148                                       /*expected_utf16_index=*/26,
149                                       /*expected_utf32_index=*/26));
150 
151   EXPECT_THAT(iterator.RewindToUtf16(25), IsTrue());
152   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
153   EXPECT_THAT(iterator,
154               EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
155                                       /*expected_utf16_index=*/25,
156                                       /*expected_utf32_index=*/25));
157 
158   EXPECT_THAT(iterator.RewindToUtf16(15), IsTrue());
159   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
160   EXPECT_THAT(iterator,
161               EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
162                                       /*expected_utf16_index=*/15,
163                                       /*expected_utf32_index=*/15));
164 
165   EXPECT_THAT(iterator.RewindToUtf16(2), IsTrue());
166   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
167   EXPECT_THAT(iterator,
168               EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
169                                       /*expected_utf16_index=*/2,
170                                       /*expected_utf32_index=*/2));
171 
172   EXPECT_THAT(iterator.RewindToUtf16(0), IsTrue());
173   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
174   EXPECT_THAT(iterator,
175               EqualsCharacterIterator(kText, /*expected_utf8_index=*/0,
176                                       /*expected_utf16_index=*/0,
177                                       /*expected_utf32_index=*/0));
178 }
179 
TEST(CharacterIteratorTest,BasicUtf32)180 TEST(CharacterIteratorTest, BasicUtf32) {
181   constexpr std::string_view kText = "¿Dónde está la biblioteca?";
182 
183   CharacterIterator iterator(kText);
184   EXPECT_THAT(iterator.is_valid(), IsTrue());
185   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
186 
187   EXPECT_THAT(iterator.AdvanceToUtf32(2), IsTrue());
188   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
189   EXPECT_THAT(iterator,
190               EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
191                                       /*expected_utf16_index=*/2,
192                                       /*expected_utf32_index=*/2));
193 
194   EXPECT_THAT(iterator.AdvanceToUtf32(15), IsTrue());
195   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
196   EXPECT_THAT(iterator,
197               EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
198                                       /*expected_utf16_index=*/15,
199                                       /*expected_utf32_index=*/15));
200 
201   EXPECT_THAT(iterator.AdvanceToUtf32(25), IsTrue());
202   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
203   EXPECT_THAT(iterator,
204               EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
205                                       /*expected_utf16_index=*/25,
206                                       /*expected_utf32_index=*/25));
207 
208   // Advance to the end of the string. This is allowed and we should get null
209   // character.
210   EXPECT_THAT(iterator.AdvanceToUtf32(26), IsTrue());
211   EXPECT_THAT(iterator.GetCurrentChar(), Eq(0));
212   EXPECT_THAT(iterator,
213               EqualsCharacterIterator(kText, /*expected_utf8_index=*/29,
214                                       /*expected_utf16_index=*/26,
215                                       /*expected_utf32_index=*/26));
216 
217   EXPECT_THAT(iterator.RewindToUtf32(25), IsTrue());
218   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("?"));
219   EXPECT_THAT(iterator,
220               EqualsCharacterIterator(kText, /*expected_utf8_index=*/28,
221                                       /*expected_utf16_index=*/25,
222                                       /*expected_utf32_index=*/25));
223 
224   EXPECT_THAT(iterator.RewindToUtf32(15), IsTrue());
225   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("b"));
226   EXPECT_THAT(iterator,
227               EqualsCharacterIterator(kText, /*expected_utf8_index=*/18,
228                                       /*expected_utf16_index=*/15,
229                                       /*expected_utf32_index=*/15));
230 
231   EXPECT_THAT(iterator.RewindToUtf32(2), IsTrue());
232   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("ó"));
233   EXPECT_THAT(iterator,
234               EqualsCharacterIterator(kText, /*expected_utf8_index=*/3,
235                                       /*expected_utf16_index=*/2,
236                                       /*expected_utf32_index=*/2));
237 
238   EXPECT_THAT(iterator.RewindToUtf32(0), IsTrue());
239   EXPECT_THAT(UCharToString(iterator.GetCurrentChar()), Eq("¿"));
240   EXPECT_THAT(iterator,
241               EqualsCharacterIterator(kText, /*expected_utf8_index=*/0,
242                                       /*expected_utf16_index=*/0,
243                                       /*expected_utf32_index=*/0));
244 }
245 
// Advancing across an invalid byte sequence is expected to fail for every
// index flavour (UTF-8, UTF-16 and UTF-32).
TEST(CharacterIteratorTest, InvalidUtf) {
  // The octal escape "\255" embeds a byte that is an invalid sequence.
  constexpr std::string_view kText = "foo \255 bar";

  CharacterIterator iter(kText);
  EXPECT_THAT(iter.is_valid(), IsTrue());

  // Index 6 is the 'b' of "bar", which lies beyond the invalid byte, so none
  // of these moves can succeed. A failed move leaves the iterator in an
  // undefined state, hence neither its position nor GetCurrentChar() is
  // inspected afterwards.
  const auto utf8_result = iter.AdvanceToUtf8(6);
  EXPECT_THAT(utf8_result, IsFalse());

  const auto utf16_result = iter.AdvanceToUtf16(6);
  EXPECT_THAT(utf16_result, IsFalse());

  const auto utf32_result = iter.AdvanceToUtf32(6);
  EXPECT_THAT(utf32_result, IsFalse());
}
259 
// Exercises AdvanceToUtf8 on a zero-length view that sits in the middle of an
// uninitialized buffer. Index 0 is expected to succeed; any positive or
// negative index is expected to fail. In all cases the iterator must not touch
// the uninitialized bytes around the view (checked with "--config=msan").
//
// Every expectation uses its own freshly constructed iterator so that a
// previous failed advance cannot influence the result.
TEST(CharacterIteratorTest, AdvanceToUtf8_emptyText) {
  // Create an uninitialized buffer. It is deliberately left uninitialized so
  // that MSAN flags any read outside the (empty) view.
  char buf[30];

  // Create a string_view that points to the 10-th byte of the buffer with
  // length 0.
  std::string_view text(buf + 10, 0);

  CharacterIterator iter0(text);
  // Advance to utf8 index 0. This should succeed without memory or
  // use-of-uninitialized-value errors (tested with "--config=msan").
  EXPECT_THAT(iter0.AdvanceToUtf8(0), IsTrue());
  // We should get null character after succeeding.
  EXPECT_THAT(iter0.GetCurrentChar(), Eq(0));

  // Advance to utf8 indices with positive values. This should fail successfully
  // without memory or use-of-uninitialized-value errors (tested with
  // "--config=msan").
  CharacterIterator iter1(text);
  EXPECT_THAT(iter1.AdvanceToUtf8(1), IsFalse());

  CharacterIterator iter2(text);
  EXPECT_THAT(iter2.AdvanceToUtf8(2), IsFalse());

  // Advance to utf8 indices with negative values. This should fail successfully
  // without memory or use-of-uninitialized-value errors (tested with
  // "--config=msan").
  // These checks run on the fresh iter3/iter4 rather than on iter1/iter2,
  // which have already failed an advance.
  CharacterIterator iter3(text);
  EXPECT_THAT(iter3.AdvanceToUtf8(-1), IsFalse());

  CharacterIterator iter4(text);
  EXPECT_THAT(iter4.AdvanceToUtf8(-2), IsFalse());
}
293 
// Negative UTF-8 indices are expected to be rejected without reading the
// bytes that precede the text (checked with "--config=msan").
TEST(CharacterIteratorTest, AdvanceToUtf8_negativeIndex) {
  constexpr std::string_view kText = "abcdefghijklmnopqrstuvwxyz";

  // Layout: [4 uninitialized bytes][26 bytes of kText]. The leading bytes are
  // left uninitialized on purpose.
  char storage[30];
  char* const text_begin = storage + 4;
  std::memcpy(text_begin, kText.data(), kText.size());

  const std::string_view text(text_begin, kText.size());

  // Each negative target gets its own fresh iterator; both advances must fail.
  CharacterIterator minus_one_iter(text);
  EXPECT_THAT(minus_one_iter.AdvanceToUtf8(-1), IsFalse());

  CharacterIterator minus_two_iter(text);
  EXPECT_THAT(minus_two_iter.AdvanceToUtf8(-2), IsFalse());
}
312 
// Advancing to a UTF-8 index equal to the text length is expected to succeed
// and yield the null character, without reading the bytes that follow the
// text (checked with "--config=msan").
TEST(CharacterIteratorTest, AdvanceToUtf8_indexEqCharLength) {
  constexpr std::string_view kText = "abcdefghijklmnopqrstuvwxyz";

  // Layout: [26 bytes of kText][4 uninitialized bytes]. The trailing bytes are
  // left uninitialized on purpose.
  char storage[30];
  std::memcpy(storage, kText.data(), kText.size());

  const std::string_view text(storage, kText.size());

  CharacterIterator end_iter(text);
  // Target is exactly one past the last character.
  EXPECT_THAT(end_iter.AdvanceToUtf8(kText.size()), IsTrue());
  // At that position the current character is expected to be null.
  EXPECT_THAT(end_iter.GetCurrentChar(), Eq(0));
}
329 
// Advancing to a UTF-8 index greater than the text length is expected to fail
// without reading the uninitialized bytes that follow the text (checked with
// "--config=msan").
//
// Every expectation uses its own freshly constructed iterator so that a
// previous failed advance cannot influence the result.
TEST(CharacterIteratorTest, AdvanceToUtf8_indexGtCharLength) {
  constexpr std::string_view kText = "abcdefghijklmnopqrstuvwxyz";
  // Create a buffer with extra 4 bytes. Copy kText to the first 26 bytes and
  // intentionally leave the last 4 bytes uninitialized.
  char buf[30];
  memcpy(buf, kText.data(), kText.size());

  std::string_view text(buf, kText.size());

  // Advance to utf8 index greater than the length of the string. This should
  // fail successfully without memory or use-of-uninitialized-value errors
  // (tested with "--config=msan").

  CharacterIterator iter0(text);
  EXPECT_THAT(iter0.AdvanceToUtf8(kText.size() + 1), IsFalse());

  // This check runs on the fresh iter1 rather than on iter0, which has already
  // failed an advance.
  CharacterIterator iter1(text);
  EXPECT_THAT(iter1.AdvanceToUtf8(kText.size() + 2), IsFalse());
}
349 
350 }  // namespace
351 
352 }  // namespace lib
353 }  // namespace icing
354