1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9
10 #include "base/i18n/string_search.h"
11
12 #include <stddef.h>
13
14 #include <string>
15 #include <vector>
16
17 #include "base/i18n/rtl.h"
18 #include "base/strings/utf_string_conversions.h"
19 #include "testing/gtest/include/gtest/gtest.h"
20 #include "third_party/icu/source/i18n/unicode/usearch.h"
21
22 namespace base {
23 namespace i18n {
24
// Expects a forward, case-insensitive search for |find_this| in |in_this| to
// succeed, with the reported match starting at |ex_start| and having length
// |ex_len|. The expectation is verified twice: once through
// StringSearchIgnoringCaseAndAccents() and once through StringSearch() with
// case sensitivity off and the forward direction selected. Each search gets
// its own zero-initialised output variables.
#define EXPECT_MATCH_IGNORE_CASE(find_this, in_this, ex_start, ex_len) \
  { \
    { \
      size_t match_pos = 0; \
      size_t match_len = 0; \
      EXPECT_TRUE(StringSearchIgnoringCaseAndAccents( \
          find_this, in_this, &match_pos, &match_len)); \
      EXPECT_EQ(ex_start, match_pos); \
      EXPECT_EQ(ex_len, match_len); \
    } \
    { \
      size_t match_pos = 0; \
      size_t match_len = 0; \
      EXPECT_TRUE(StringSearch(find_this, in_this, &match_pos, &match_len, \
                               false, true)); \
      EXPECT_EQ(ex_start, match_pos); \
      EXPECT_EQ(ex_len, match_len); \
    } \
  }
40
// Expects a forward, case-sensitive StringSearch() for |find_this| in
// |in_this| to succeed, with the reported match starting at |ex_start| and
// having length |ex_len|.
#define EXPECT_MATCH_SENSITIVE(find_this, in_this, ex_start, ex_len) \
  { \
    size_t match_pos = 0; \
    size_t match_len = 0; \
    const bool was_found = \
        StringSearch(find_this, in_this, &match_pos, &match_len, true, true); \
    EXPECT_TRUE(was_found); \
    EXPECT_EQ(ex_start, match_pos); \
    EXPECT_EQ(ex_len, match_len); \
  }
50
// Expects a backward, case-insensitive StringSearch() for |find_this| in
// |in_this| to succeed, with the reported match starting at |ex_start| and
// having length |ex_len|.
#define EXPECT_MATCH_IGNORE_CASE_BACKWARDS(find_this, in_this, ex_start, \
                                           ex_len) \
  { \
    size_t match_pos = 0; \
    size_t match_len = 0; \
    const bool was_found = StringSearch(find_this, in_this, &match_pos, \
                                        &match_len, false, false); \
    EXPECT_TRUE(was_found); \
    EXPECT_EQ(ex_start, match_pos); \
    EXPECT_EQ(ex_len, match_len); \
  }
61
// Expects a backward, case-sensitive StringSearch() for |find_this| in
// |in_this| to succeed, with the reported match starting at |ex_start| and
// having length |ex_len|.
#define EXPECT_MATCH_SENSITIVE_BACKWARDS(find_this, in_this, ex_start, ex_len) \
  { \
    size_t match_pos = 0; \
    size_t match_len = 0; \
    const bool was_found = StringSearch(find_this, in_this, &match_pos, \
                                        &match_len, true, false); \
    EXPECT_TRUE(was_found); \
    EXPECT_EQ(ex_start, match_pos); \
    EXPECT_EQ(ex_len, match_len); \
  }
71
// Expects a forward, case-insensitive search for |find_this| in |in_this| to
// find nothing. The expectation is verified twice: once through
// StringSearchIgnoringCaseAndAccents() and once through StringSearch() with
// case sensitivity off and the forward direction selected. Each search gets
// its own zero-initialised output variables.
#define EXPECT_MISS_IGNORE_CASE(find_this, in_this) \
  { \
    { \
      size_t match_pos = 0; \
      size_t match_len = 0; \
      EXPECT_FALSE(StringSearchIgnoringCaseAndAccents( \
          find_this, in_this, &match_pos, &match_len)); \
    } \
    { \
      size_t match_pos = 0; \
      size_t match_len = 0; \
      EXPECT_FALSE(StringSearch(find_this, in_this, &match_pos, &match_len, \
                                false, true)); \
    } \
  }
83
// Expects a forward, case-sensitive StringSearch() for |find_this| in
// |in_this| to find nothing.
#define EXPECT_MISS_SENSITIVE(find_this, in_this) \
  { \
    size_t match_pos = 0; \
    size_t match_len = 0; \
    const bool was_found = \
        StringSearch(find_this, in_this, &match_pos, &match_len, true, true); \
    EXPECT_FALSE(was_found); \
  }
91
// Expects a backward, case-insensitive StringSearch() for |find_this| in
// |in_this| to find nothing.
#define EXPECT_MISS_IGNORE_CASE_BACKWARDS(find_this, in_this) \
  { \
    size_t match_pos = 0; \
    size_t match_len = 0; \
    const bool was_found = StringSearch(find_this, in_this, &match_pos, \
                                        &match_len, false, false); \
    EXPECT_FALSE(was_found); \
  }
99
// Expects a backward, case-sensitive StringSearch() for |find_this| in
// |in_this| to find nothing.
#define EXPECT_MISS_SENSITIVE_BACKWARDS(find_this, in_this) \
  { \
    size_t match_pos = 0; \
    size_t match_len = 0; \
    const bool was_found = StringSearch(find_this, in_this, &match_pos, \
                                        &match_len, true, false); \
    EXPECT_FALSE(was_found); \
  }
107
108 // Note on setting default locale for testing: The current default locale on
109 // the Mac trybot is en_US_POSIX, with which primary-level collation strength
110 // string search is case-sensitive, when normally it should be
111 // case-insensitive. In other locales (including en_US which English speakers
112 // in the U.S. use), this search would be case-insensitive as expected.
113
// Forward searches over plain ASCII text, in both the case-insensitive and the
// case-sensitive configurations.
TEST(StringSearchTest, ASCII) {
  // Primary-strength search is case-sensitive under en_US_POSIX, so the test
  // runs under en_US instead and puts the original default back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool swap_locale = (original_locale == "en_US_POSIX");
  if (swap_locale) {
    SetICUDefaultLocale("en_US");
  }

  // --- Case-insensitive ---
  EXPECT_MATCH_IGNORE_CASE(u"hello", u"hello world", 0U, 5U);

  // The pattern and the text must differ in the length of their whitespace
  // runs (four spaces vs. three); with identical literals the search would
  // trivially succeed and this expectation could never hold.
  EXPECT_MISS_IGNORE_CASE(u"h    e l l o", u"h   e l l o");

  // The first occurrence overlaps an earlier partial match of the pattern.
  EXPECT_MATCH_IGNORE_CASE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_IGNORE_CASE(u"searching within empty string", std::u16string());

  // An empty pattern is reported as a zero-length match at the start.
  EXPECT_MATCH_IGNORE_CASE(std::u16string(), u"searching for empty string", 0U,
                           0U);

  EXPECT_MATCH_IGNORE_CASE(u"case insensitivity", u"CaSe InSeNsItIvItY", 0U,
                           18U);

  // --- Case-sensitive ---
  EXPECT_MATCH_SENSITIVE(u"aabaaa", u"aaabaabaaa", 4U, 6U);

  EXPECT_MISS_SENSITIVE(u"searching within empty string", std::u16string());

  EXPECT_MATCH_SENSITIVE(std::u16string(), u"searching for empty string", 0U,
                         0U);

  EXPECT_MISS_SENSITIVE(u"case insensitivity", u"CaSe InSeNsItIvItY");

  if (swap_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
146
// Checks how accented Latin letters compare, in precomposed and decomposed
// (base letter + combining mark) form, for both the case/accent-insensitive
// and the case-sensitive searches.
TEST(StringSearchTest, UnicodeLocaleIndependent) {
  // Unaccented base letters.
  const std::u16string e_base(u"e");
  const std::u16string E_base(u"E");
  const std::u16string a_base(u"a");

  // Precomposed forms: a single code unit per accented letter.
  const std::u16string e_with_acute_accent(u"\u00e9");
  const std::u16string E_with_acute_accent(u"\u00c9");
  const std::u16string e_with_grave_accent(u"\u00e8");
  const std::u16string E_with_grave_accent(u"\u00c8");
  const std::u16string a_with_acute_accent(u"\u00e1");

  // Decomposed forms: base letter followed by a combining mark.
  const std::u16string e_with_acute_combining_mark(u"e\u0301");
  const std::u16string E_with_acute_combining_mark(u"E\u0301");
  const std::u16string e_with_grave_combining_mark(u"e\u0300");
  const std::u16string E_with_grave_combining_mark(u"E\u0300");
  const std::u16string a_with_acute_combining_mark(u"a\u0301");

  // Run under en_US when the ambient default is en_US_POSIX; the original
  // default is put back at the end of the test.
  const std::string original_locale(uloc_getDefault());
  const bool swap_locale = (original_locale == "en_US_POSIX");
  if (swap_locale) {
    SetICUDefaultLocale("en_US");
  }

  // --- Insensitive search: base letter vs. accented letter ---
  EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_base, 0U, e_base.size());

  EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_base, 0U,
                           e_base.size());

  // --- Insensitive search: precomposed vs. decomposed, acute vs. grave ---
  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark,
                           e_with_grave_combining_mark, 0U,
                           e_with_grave_combining_mark.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_grave_combining_mark,
                           e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());

  EXPECT_MATCH_IGNORE_CASE(e_with_grave_accent, e_with_acute_combining_mark, 0U,
                           e_with_acute_combining_mark.size());

  // --- Insensitive search: upper-case pattern vs. lower-case text ---
  EXPECT_MATCH_IGNORE_CASE(E_with_acute_accent, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());

  EXPECT_MATCH_IGNORE_CASE(E_with_grave_accent, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());

  EXPECT_MATCH_IGNORE_CASE(E_with_acute_combining_mark, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());

  EXPECT_MATCH_IGNORE_CASE(E_with_grave_combining_mark, e_with_acute_accent, 0U,
                           e_with_acute_accent.size());

  EXPECT_MATCH_IGNORE_CASE(E_base, e_with_grave_accent, 0U,
                           e_with_grave_accent.size());

  // --- Insensitive search: a different base letter never matches ---
  EXPECT_MISS_IGNORE_CASE(a_with_acute_accent, e_with_acute_accent);

  EXPECT_MISS_IGNORE_CASE(a_with_acute_combining_mark,
                          e_with_acute_combining_mark);

  // --- Sensitive search: accents are significant ---
  EXPECT_MISS_SENSITIVE(e_base, e_with_acute_accent);

  EXPECT_MISS_SENSITIVE(e_with_acute_accent, e_base);

  EXPECT_MISS_SENSITIVE(e_base, e_with_acute_combining_mark);

  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_base);

  // --- Sensitive search: precomposed and decomposed forms still match; the
  // reported length is that of the text searched (1 vs. 2 code units) ---
  EXPECT_MATCH_SENSITIVE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
                         1U);

  EXPECT_MATCH_SENSITIVE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
                         2U);

  // --- Sensitive search: different accents, case or base letter all miss ---
  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark,
                        e_with_grave_combining_mark);

  EXPECT_MISS_SENSITIVE(e_with_grave_combining_mark,
                        e_with_acute_combining_mark);

  EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_with_grave_accent);

  EXPECT_MISS_SENSITIVE(e_with_grave_accent, e_with_acute_combining_mark);

  EXPECT_MISS_SENSITIVE(E_with_acute_accent, e_with_acute_accent);

  EXPECT_MISS_SENSITIVE(E_with_grave_accent, e_with_acute_accent);

  EXPECT_MISS_SENSITIVE(E_with_acute_combining_mark, e_with_grave_accent);

  EXPECT_MISS_SENSITIVE(E_with_grave_combining_mark, e_with_acute_accent);

  EXPECT_MISS_SENSITIVE(E_base, e_with_grave_accent);

  EXPECT_MISS_SENSITIVE(a_with_acute_accent, e_with_acute_accent);

  EXPECT_MISS_SENSITIVE(a_with_acute_combining_mark,
                        e_with_acute_combining_mark);

  // --- Sensitive search: a decomposed string matches itself ---
  EXPECT_MATCH_SENSITIVE(a_with_acute_combining_mark,
                         a_with_acute_combining_mark, 0U, 2U);

  if (swap_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
268
// Checks that the result of an accent-insensitive search depends on the ICU
// default locale: "a" is found in U+00E5 (a with ring above) under the ambient
// default locale, but not once the default locale is switched to "da".
TEST(StringSearchTest, UnicodeLocaleDependent) {
  const std::u16string plain_a(u"a");
  const std::u16string ringed_a(u"\u00e5");

  // Ambient default locale: the ring is ignored and the search succeeds.
  EXPECT_TRUE(
      StringSearchIgnoringCaseAndAccents(plain_a, ringed_a, nullptr, nullptr));
  EXPECT_TRUE(StringSearch(plain_a, ringed_a, nullptr, nullptr, false, true));

  // ICU documents the string returned by uloc_getDefault() as remaining valid
  // after the default locale changes, so the raw pointer can be held across
  // the switch below and used to restore the original default afterwards.
  const char* const saved_locale = uloc_getDefault();
  SetICUDefaultLocale("da");

  // "da" default locale: the same searches are expected to miss.
  EXPECT_FALSE(
      StringSearchIgnoringCaseAndAccents(plain_a, ringed_a, nullptr, nullptr));
  EXPECT_FALSE(StringSearch(plain_a, ringed_a, nullptr, nullptr, false, true));

  SetICUDefaultLocale(saved_locale);
}
290
// Checks that a backward search reports the last occurrence of the pattern
// (index 2 in a four-character text) rather than the first.
TEST(StringSearchTest, SearchBackwards) {
  // Run under en_US when the ambient default is en_US_POSIX; the original
  // default is put back at the end of the test.
  const std::string original_locale(uloc_getDefault());
  const bool swap_locale = (original_locale == "en_US_POSIX");
  if (swap_locale) {
    SetICUDefaultLocale("en_US");
  }

  EXPECT_MATCH_IGNORE_CASE_BACKWARDS(u"ab", u"ABAB", 2U, 2U);
  EXPECT_MATCH_SENSITIVE_BACKWARDS(u"ab", u"abab", 2U, 2U);
  EXPECT_MISS_SENSITIVE_BACKWARDS(u"ab", u"ABAB");

  if (swap_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
304
// Checks that a single fixed-pattern searcher can be reused across several
// different texts, for both the case-sensitive searcher and the
// case/accent-insensitive one.
TEST(StringSearchTest, FixedPatternMultipleSearch) {
  // Run under en_US when the ambient default is en_US_POSIX; the original
  // default is put back at the end of the test.
  const std::string original_locale(uloc_getDefault());
  const bool swap_locale = (original_locale == "en_US_POSIX");
  if (swap_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Shared output slots; they are only inspected after a successful search.
  size_t match_pos = 0;
  size_t match_len = 0;

  // Case-sensitive "foo" searched in several texts.
  FixedPatternStringSearch foo_search(u"foo", true);
  EXPECT_TRUE(foo_search.Search(u"12foo34", &match_pos, &match_len, true));
  EXPECT_EQ(2U, match_pos);
  EXPECT_EQ(3U, match_len);
  EXPECT_FALSE(foo_search.Search(u"bye", &match_pos, &match_len, true));
  EXPECT_FALSE(foo_search.Search(u"FOO", &match_pos, &match_len, true));
  // Same text searched forwards, then backwards: first vs. last occurrence.
  EXPECT_TRUE(foo_search.Search(u"foobarfoo", &match_pos, &match_len, true));
  EXPECT_EQ(0U, match_pos);
  EXPECT_EQ(3U, match_len);
  EXPECT_TRUE(foo_search.Search(u"foobarfoo", &match_pos, &match_len, false));
  EXPECT_EQ(6U, match_pos);
  EXPECT_EQ(3U, match_len);

  // Case/accent-insensitive "hello" searched in several texts.
  FixedPatternStringSearchIgnoringCaseAndAccents hello_search(u"hello");
  EXPECT_TRUE(hello_search.Search(u"12hello34", &match_pos, &match_len));
  EXPECT_EQ(2U, match_pos);
  EXPECT_EQ(5U, match_len);
  EXPECT_FALSE(hello_search.Search(u"bye", &match_pos, &match_len));
  EXPECT_TRUE(hello_search.Search(u"hELLo", &match_pos, &match_len));
  EXPECT_EQ(0U, match_pos);
  EXPECT_EQ(5U, match_len);

  if (swap_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
341
// Checks that RepeatingStringSearch enumerates every occurrence of a pattern
// in a text, in order, for both the case-sensitive and the case-insensitive
// configurations.
TEST(StringSearchTest, RepeatingStringSearch) {
  struct MatchResult {
    int match_index;
    int match_length;
  };

  // Puts the saved ICU default locale back when the enclosing scope is left by
  // any path. A failing ASSERT_* returns from the function it appears in, so a
  // plain restore statement at the end of the test could be skipped, leaving
  // en_US as the default locale for every test that runs afterwards.
  class ScopedLocaleRestorer {
   public:
    ScopedLocaleRestorer(const std::string& locale, bool active)
        : locale_(locale), active_(active) {}
    ScopedLocaleRestorer(const ScopedLocaleRestorer&) = delete;
    ScopedLocaleRestorer& operator=(const ScopedLocaleRestorer&) = delete;
    ~ScopedLocaleRestorer() {
      if (active_) {
        SetICUDefaultLocale(locale_.data());
      }
    }

   private:
    const std::string locale_;
    const bool active_;
  };

  // Run under en_US when the ambient default is en_US_POSIX.
  const std::string default_locale(uloc_getDefault());
  const bool locale_is_posix = (default_locale == "en_US_POSIX");
  const ScopedLocaleRestorer locale_restorer(default_locale, locale_is_posix);
  if (locale_is_posix) {
    SetICUDefaultLocale("en_US");
  }

  const char16_t kPattern[] = u"fox";
  const char16_t kTarget[] = u"The quick brown fox jumped over the lazy Fox";

  // Collects every match that |searcher| reports for kPattern in kTarget and
  // compares the sequence against |expected|. The size check is fatal so the
  // element-wise loop never indexes past the end of |expected|.
  const auto expect_matches = [&](bool case_sensitive,
                                  const std::vector<MatchResult>& expected) {
    SCOPED_TRACE(case_sensitive ? "case sensitive" : "case insensitive");

    RepeatingStringSearch searcher(kPattern, kTarget, case_sensitive);
    std::vector<MatchResult> results;
    int match_index = 0;
    int match_length = 0;
    while (searcher.NextMatchResult(match_index, match_length)) {
      results.push_back(
          {.match_index = match_index, .match_length = match_length});
    }

    ASSERT_EQ(expected.size(), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, expected[i].match_index);
      EXPECT_EQ(results[i].match_length, expected[i].match_length);
    }
  };

  // Case sensitive: only the lower-case "fox" is found.
  const std::vector<MatchResult> kSensitiveExpectation = {{16, 3}};
  expect_matches(/*case_sensitive=*/true, kSensitiveExpectation);

  // Case insensitive: the trailing "Fox" is found as well.
  const std::vector<MatchResult> kInsensitiveExpectation = {{16, 3}, {41, 3}};
  expect_matches(/*case_sensitive=*/false, kInsensitiveExpectation);
}
399
400 } // namespace i18n
401 } // namespace base
402