• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifdef UNSAFE_BUFFERS_BUILD
6 // TODO(crbug.com/40284755): Remove this and spanify to fix the errors.
7 #pragma allow_unsafe_buffers
8 #endif
9 
10 #include "base/i18n/string_search.h"
11 
12 #include <stddef.h>
13 
14 #include <string>
15 #include <vector>
16 
17 #include "base/i18n/rtl.h"
18 #include "base/strings/utf_string_conversions.h"
19 #include "testing/gtest/include/gtest/gtest.h"
20 #include "third_party/icu/source/i18n/unicode/usearch.h"
21 
22 namespace base {
23 namespace i18n {
24 
// Expects that searching for |find_this| inside |in_this| succeeds and reports
// a match starting at |ex_start| with length |ex_len|. The check is performed
// twice: once through StringSearchIgnoringCaseAndAccents() and once through
// StringSearch() with the trailing flags (false, true), which, going by the
// naming of the macros in this file, selects a case-insensitive forward search.
// The index/length out-values are reset between the two calls so the second
// check cannot pass on stale results.
//
// The body is wrapped in do { } while (0) so that an invocation followed by a
// semicolon is exactly one statement, including inside an un-braced if/else.
#define EXPECT_MATCH_IGNORE_CASE(find_this, in_this, ex_start, ex_len)         \
  do {                                                                         \
    size_t index = 0;                                                          \
    size_t length = 0;                                                         \
    EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(find_this, in_this, &index, \
                                                   &length));                  \
    EXPECT_EQ(ex_start, index);                                                \
    EXPECT_EQ(ex_len, length);                                                 \
    index = 0;                                                                 \
    length = 0;                                                                \
    EXPECT_TRUE(                                                               \
        StringSearch(find_this, in_this, &index, &length, false, true));       \
    EXPECT_EQ(ex_start, index);                                                \
    EXPECT_EQ(ex_len, length);                                                 \
  } while (0)
40 
// Expects that StringSearch() finds |find_this| inside |in_this| when called
// with the trailing flags (true, true) -- by this macro's name, a
// case-sensitive forward search -- and that the reported match starts at
// |ex_start| and has length |ex_len|.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MATCH_SENSITIVE(find_this, in_this, ex_start, ex_len)    \
  do {                                                                  \
    size_t index = 0;                                                   \
    size_t length = 0;                                                  \
    EXPECT_TRUE(                                                        \
        StringSearch(find_this, in_this, &index, &length, true, true)); \
    EXPECT_EQ(ex_start, index);                                         \
    EXPECT_EQ(ex_len, length);                                          \
  } while (0)
50 
// Expects that StringSearch() finds |find_this| inside |in_this| when called
// with the trailing flags (false, false) -- by this macro's name, a
// case-insensitive backwards search -- and that the reported match starts at
// |ex_start| and has length |ex_len|.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MATCH_IGNORE_CASE_BACKWARDS(find_this, in_this, ex_start,  \
                                           ex_len)                        \
  do {                                                                    \
    size_t index = 0;                                                     \
    size_t length = 0;                                                    \
    EXPECT_TRUE(                                                          \
        StringSearch(find_this, in_this, &index, &length, false, false)); \
    EXPECT_EQ(ex_start, index);                                           \
    EXPECT_EQ(ex_len, length);                                            \
  } while (0)
61 
// Expects that StringSearch() finds |find_this| inside |in_this| when called
// with the trailing flags (true, false) -- by this macro's name, a
// case-sensitive backwards search -- and that the reported match starts at
// |ex_start| and has length |ex_len|.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MATCH_SENSITIVE_BACKWARDS(find_this, in_this, ex_start, ex_len) \
  do {                                                                         \
    size_t index = 0;                                                          \
    size_t length = 0;                                                         \
    EXPECT_TRUE(                                                               \
        StringSearch(find_this, in_this, &index, &length, true, false));       \
    EXPECT_EQ(ex_start, index);                                                \
    EXPECT_EQ(ex_len, length);                                                 \
  } while (0)
71 
// Expects that |find_this| is NOT found inside |in_this|, both through
// StringSearchIgnoringCaseAndAccents() and through StringSearch() with the
// trailing flags (false, true) -- by this macro's name, a case-insensitive
// forward search. The out-values are reset between the two calls.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MISS_IGNORE_CASE(find_this, in_this)                      \
  do {                                                                   \
    size_t index = 0;                                                    \
    size_t length = 0;                                                   \
    EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(find_this, in_this,  \
                                                    &index, &length));   \
    index = 0;                                                           \
    length = 0;                                                          \
    EXPECT_FALSE(                                                        \
        StringSearch(find_this, in_this, &index, &length, false, true)); \
  } while (0)
83 
// Expects that StringSearch() does NOT find |find_this| inside |in_this| when
// called with the trailing flags (true, true) -- by this macro's name, a
// case-sensitive forward search.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MISS_SENSITIVE(find_this, in_this)                       \
  do {                                                                  \
    size_t index = 0;                                                   \
    size_t length = 0;                                                  \
    EXPECT_FALSE(                                                       \
        StringSearch(find_this, in_this, &index, &length, true, true)); \
  } while (0)
91 
// Expects that StringSearch() does NOT find |find_this| inside |in_this| when
// called with the trailing flags (false, false) -- by this macro's name, a
// case-insensitive backwards search.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MISS_IGNORE_CASE_BACKWARDS(find_this, in_this)             \
  do {                                                                    \
    size_t index = 0;                                                     \
    size_t length = 0;                                                    \
    EXPECT_FALSE(                                                         \
        StringSearch(find_this, in_this, &index, &length, false, false)); \
  } while (0)
99 
// Expects that StringSearch() does NOT find |find_this| inside |in_this| when
// called with the trailing flags (true, false) -- by this macro's name, a
// case-sensitive backwards search.
//
// Wrapped in do { } while (0) so the invocation plus its semicolon forms a
// single statement that is safe inside an un-braced if/else.
#define EXPECT_MISS_SENSITIVE_BACKWARDS(find_this, in_this)              \
  do {                                                                   \
    size_t index = 0;                                                    \
    size_t length = 0;                                                   \
    EXPECT_FALSE(                                                        \
        StringSearch(find_this, in_this, &index, &length, true, false)); \
  } while (0)
107 
// Note on setting the default locale for testing: the current default locale
// on the Mac trybot is en_US_POSIX, under which string search at primary-level
// collation strength is case-sensitive, whereas it would normally be
// case-insensitive. In other locales (including en_US, which English speakers
// in the U.S. use), this search is case-insensitive as expected. Several tests
// below therefore switch the default locale to en_US for their duration when
// they detect en_US_POSIX, and restore the original locale afterwards.
113 
// Covers searches where both the pattern and the text are plain ASCII, using
// the case-insensitive and the case-sensitive helper macros.
TEST(StringSearchTest, ASCII) {
  // Work around the en_US_POSIX default locale (see the note above): run under
  // en_US instead and put the original locale back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool started_in_posix_locale = (original_locale == "en_US_POSIX");
  if (started_in_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Case-insensitive expectations.
  EXPECT_MATCH_IGNORE_CASE(u"hello", u"hello world", 0U, 5U);
  // The runs of spaces differ in length, so this is expected to miss.
  EXPECT_MISS_IGNORE_CASE(u"h    e l l o", u"h   e l l o");
  EXPECT_MATCH_IGNORE_CASE(u"aabaaa", u"aaabaabaaa", 4U, 6U);
  EXPECT_MISS_IGNORE_CASE(u"searching within empty string", std::u16string());
  // An empty pattern is expected to match at the start with zero length.
  EXPECT_MATCH_IGNORE_CASE(std::u16string(), u"searching for empty string",
                           0U, 0U);
  EXPECT_MATCH_IGNORE_CASE(u"case insensitivity", u"CaSe InSeNsItIvItY",
                           0U, 18U);

  // Case-sensitive expectations.
  EXPECT_MATCH_SENSITIVE(u"aabaaa", u"aaabaabaaa", 4U, 6U);
  EXPECT_MISS_SENSITIVE(u"searching within empty string", std::u16string());
  EXPECT_MATCH_SENSITIVE(std::u16string(), u"searching for empty string",
                         0U, 0U);
  EXPECT_MISS_SENSITIVE(u"case insensitivity", u"CaSe InSeNsItIvItY");

  if (started_in_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
146 
TEST(StringSearchTest,UnicodeLocaleIndependent)147 TEST(StringSearchTest, UnicodeLocaleIndependent) {
148   // Base characters
149   const std::u16string e_base = u"e";
150   const std::u16string E_base = u"E";
151   const std::u16string a_base = u"a";
152 
153   // Composed characters
154   const std::u16string e_with_acute_accent = u"\u00e9";
155   const std::u16string E_with_acute_accent = u"\u00c9";
156   const std::u16string e_with_grave_accent = u"\u00e8";
157   const std::u16string E_with_grave_accent = u"\u00c8";
158   const std::u16string a_with_acute_accent = u"\u00e1";
159 
160   // Decomposed characters
161   const std::u16string e_with_acute_combining_mark = u"e\u0301";
162   const std::u16string E_with_acute_combining_mark = u"E\u0301";
163   const std::u16string e_with_grave_combining_mark = u"e\u0300";
164   const std::u16string E_with_grave_combining_mark = u"E\u0300";
165   const std::u16string a_with_acute_combining_mark = u"a\u0301";
166 
167   std::string default_locale(uloc_getDefault());
168   bool locale_is_posix = (default_locale == "en_US_POSIX");
169   if (locale_is_posix)
170     SetICUDefaultLocale("en_US");
171 
172   EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_accent, 0U,
173                            e_with_acute_accent.size());
174 
175   EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_base, 0U, e_base.size());
176 
177   EXPECT_MATCH_IGNORE_CASE(e_base, e_with_acute_combining_mark, 0U,
178                            e_with_acute_combining_mark.size());
179 
180   EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_base, 0U,
181                            e_base.size());
182 
183   EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
184                            e_with_acute_accent.size());
185 
186   EXPECT_MATCH_IGNORE_CASE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
187                            e_with_acute_combining_mark.size());
188 
189   EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark,
190                            e_with_grave_combining_mark, 0U,
191                            e_with_grave_combining_mark.size());
192 
193   EXPECT_MATCH_IGNORE_CASE(e_with_grave_combining_mark,
194                            e_with_acute_combining_mark, 0U,
195                            e_with_acute_combining_mark.size());
196 
197   EXPECT_MATCH_IGNORE_CASE(e_with_acute_combining_mark, e_with_grave_accent, 0U,
198                            e_with_grave_accent.size());
199 
200   EXPECT_MATCH_IGNORE_CASE(e_with_grave_accent, e_with_acute_combining_mark, 0U,
201                            e_with_acute_combining_mark.size());
202 
203   EXPECT_MATCH_IGNORE_CASE(E_with_acute_accent, e_with_acute_accent, 0U,
204                            e_with_acute_accent.size());
205 
206   EXPECT_MATCH_IGNORE_CASE(E_with_grave_accent, e_with_acute_accent, 0U,
207                            e_with_acute_accent.size());
208 
209   EXPECT_MATCH_IGNORE_CASE(E_with_acute_combining_mark, e_with_grave_accent, 0U,
210                            e_with_grave_accent.size());
211 
212   EXPECT_MATCH_IGNORE_CASE(E_with_grave_combining_mark, e_with_acute_accent, 0U,
213                            e_with_acute_accent.size());
214 
215   EXPECT_MATCH_IGNORE_CASE(E_base, e_with_grave_accent, 0U,
216                            e_with_grave_accent.size());
217 
218   EXPECT_MISS_IGNORE_CASE(a_with_acute_accent, e_with_acute_accent);
219 
220   EXPECT_MISS_IGNORE_CASE(a_with_acute_combining_mark,
221                           e_with_acute_combining_mark);
222 
223   EXPECT_MISS_SENSITIVE(e_base, e_with_acute_accent);
224 
225   EXPECT_MISS_SENSITIVE(e_with_acute_accent, e_base);
226 
227   EXPECT_MISS_SENSITIVE(e_base, e_with_acute_combining_mark);
228 
229   EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_base);
230 
231   EXPECT_MATCH_SENSITIVE(e_with_acute_combining_mark, e_with_acute_accent, 0U,
232                          1U);
233 
234   EXPECT_MATCH_SENSITIVE(e_with_acute_accent, e_with_acute_combining_mark, 0U,
235                          2U);
236 
237   EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark,
238                         e_with_grave_combining_mark);
239 
240   EXPECT_MISS_SENSITIVE(e_with_grave_combining_mark,
241                         e_with_acute_combining_mark);
242 
243   EXPECT_MISS_SENSITIVE(e_with_acute_combining_mark, e_with_grave_accent);
244 
245   EXPECT_MISS_SENSITIVE(e_with_grave_accent, e_with_acute_combining_mark);
246 
247   EXPECT_MISS_SENSITIVE(E_with_acute_accent, e_with_acute_accent);
248 
249   EXPECT_MISS_SENSITIVE(E_with_grave_accent, e_with_acute_accent);
250 
251   EXPECT_MISS_SENSITIVE(E_with_acute_combining_mark, e_with_grave_accent);
252 
253   EXPECT_MISS_SENSITIVE(E_with_grave_combining_mark, e_with_acute_accent);
254 
255   EXPECT_MISS_SENSITIVE(E_base, e_with_grave_accent);
256 
257   EXPECT_MISS_SENSITIVE(a_with_acute_accent, e_with_acute_accent);
258 
259   EXPECT_MISS_SENSITIVE(a_with_acute_combining_mark,
260                         e_with_acute_combining_mark);
261 
262   EXPECT_MATCH_SENSITIVE(a_with_acute_combining_mark,
263                          a_with_acute_combining_mark, 0U, 2U);
264 
265   if (locale_is_posix)
266     SetICUDefaultLocale(default_locale.data());
267 }
268 
// Checks that accent-insensitive matching depends on the ICU default locale:
// "a" is expected to match U+00E5 (a with ring above) under the locale the test
// starts in, but not once the default locale has been switched to "da".
TEST(StringSearchTest, UnicodeLocaleDependent) {
  // Base characters
  const std::u16string a_base = u"a";

  // Composed characters
  const std::u16string a_with_ring = u"\u00e5";

  EXPECT_TRUE(StringSearchIgnoringCaseAndAccents(a_base, a_with_ring, nullptr,
                                                 nullptr));
  EXPECT_TRUE(StringSearch(a_base, a_with_ring, nullptr, nullptr, false, true));

  // Copy the locale name before changing the default, as the other tests in
  // this file do, instead of holding on to the pointer returned by
  // uloc_getDefault() across the SetICUDefaultLocale() calls.
  const std::string default_locale(uloc_getDefault());
  SetICUDefaultLocale("da");

  EXPECT_FALSE(StringSearchIgnoringCaseAndAccents(a_base, a_with_ring, nullptr,
                                                  nullptr));
  EXPECT_FALSE(
      StringSearch(a_base, a_with_ring, nullptr, nullptr, false, true));

  // Unlike the en_US_POSIX workaround elsewhere in this file, the locale is
  // always changed here, so it is always restored.
  SetICUDefaultLocale(default_locale.data());
}
290 
// Covers the backwards-search helper macros: with two occurrences of the
// pattern in the text, the match reported is expected to be the later one
// (index 2).
TEST(StringSearchTest, SearchBackwards) {
  // Work around the en_US_POSIX default locale (see the note above): run under
  // en_US instead and put the original locale back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool started_in_posix_locale = (original_locale == "en_US_POSIX");
  if (started_in_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Case-insensitive: lower-case pattern against upper-case text.
  EXPECT_MATCH_IGNORE_CASE_BACKWARDS(u"ab", u"ABAB", 2U, 2U);

  // Case-sensitive: matches only when the case agrees.
  EXPECT_MATCH_SENSITIVE_BACKWARDS(u"ab", u"abab", 2U, 2U);
  EXPECT_MISS_SENSITIVE_BACKWARDS(u"ab", u"ABAB");

  if (started_in_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
304 
// Covers the fixed-pattern searchers: one searcher object is built per pattern
// and then reused against several different texts.
TEST(StringSearchTest, FixedPatternMultipleSearch) {
  // Work around the en_US_POSIX default locale (see the note above): run under
  // en_US instead and put the original locale back at the end.
  const std::string original_locale(uloc_getDefault());
  const bool started_in_posix_locale = (original_locale == "en_US_POSIX");
  if (started_in_posix_locale) {
    SetICUDefaultLocale("en_US");
  }

  // Out-values shared by every Search() call below.
  size_t match_start = 0;
  size_t match_size = 0;

  // "foo" searched over multiple texts. The searcher is constructed with
  // `true`; the "FOO" miss expected below shows it distinguishes case.
  FixedPatternStringSearch foo_search(u"foo", true);

  EXPECT_TRUE(foo_search.Search(u"12foo34", &match_start, &match_size, true));
  EXPECT_EQ(2U, match_start);
  EXPECT_EQ(3U, match_size);

  EXPECT_FALSE(foo_search.Search(u"bye", &match_start, &match_size, true));
  EXPECT_FALSE(foo_search.Search(u"FOO", &match_start, &match_size, true));

  // Same text searched with the last argument true and then false: the first
  // occurrence (index 0) and then the last occurrence (index 6) is expected.
  EXPECT_TRUE(foo_search.Search(u"foobarfoo", &match_start, &match_size, true));
  EXPECT_EQ(0U, match_start);
  EXPECT_EQ(3U, match_size);

  EXPECT_TRUE(
      foo_search.Search(u"foobarfoo", &match_start, &match_size, false));
  EXPECT_EQ(6U, match_start);
  EXPECT_EQ(3U, match_size);

  // "hello" searched over multiple texts, ignoring case and accents.
  FixedPatternStringSearchIgnoringCaseAndAccents hello_search(u"hello");

  EXPECT_TRUE(hello_search.Search(u"12hello34", &match_start, &match_size));
  EXPECT_EQ(2U, match_start);
  EXPECT_EQ(5U, match_size);

  EXPECT_FALSE(hello_search.Search(u"bye", &match_start, &match_size));

  EXPECT_TRUE(hello_search.Search(u"hELLo", &match_start, &match_size));
  EXPECT_EQ(0U, match_start);
  EXPECT_EQ(5U, match_size);

  if (started_in_posix_locale) {
    SetICUDefaultLocale(original_locale.data());
  }
}
341 
// Checks that RepeatingStringSearch reports every occurrence of the pattern in
// the target, in order, for both case-sensitive and case-insensitive matching.
TEST(StringSearchTest, RepeatingStringSearch) {
  struct MatchResult {
    int match_index;
    int match_length;
  };

  // Puts the original ICU default locale back when the test body exits. A
  // destructor is used instead of trailing code because a failing ASSERT_EQ
  // below returns from the test early, which would otherwise skip the restore
  // and leave the default locale switched to en_US.
  struct ScopedLocaleRestorer {
    const std::string& locale;
    const bool enabled;

    ~ScopedLocaleRestorer() {
      if (enabled) {
        SetICUDefaultLocale(locale.data());
      }
    }
  };

  // Work around the en_US_POSIX default locale (see the note above).
  const std::string default_locale(uloc_getDefault());
  const bool locale_is_posix = (default_locale == "en_US_POSIX");
  if (locale_is_posix) {
    SetICUDefaultLocale("en_US");
  }
  // Declared after |default_locale| so it is destroyed first and the reference
  // it holds is still valid when the destructor runs.
  const ScopedLocaleRestorer restore_locale{default_locale, locale_is_posix};

  const char16_t kPattern[] = u"fox";
  const char16_t kTarget[] = u"The quick brown fox jumped over the lazy Fox";

  // Runs a fresh searcher over kTarget and collects every match it reports
  // until NextMatchResult() returns false.
  const auto find_all_matches = [&](bool case_sensitive) {
    RepeatingStringSearch searcher(kPattern, kTarget, case_sensitive);
    std::vector<MatchResult> matches;
    int match_index = 0;
    int match_length = 0;
    while (searcher.NextMatchResult(match_index, match_length)) {
      matches.push_back(
          {.match_index = match_index, .match_length = match_length});
    }
    return matches;
  };

  // Case sensitive: only the lower-case "fox" at index 16 is expected.
  {
    const MatchResult kExpectation[] = {{16, 3}};
    const std::vector<MatchResult> results =
        find_all_matches(/*case_sensitive=*/true);

    // Fatal on mismatch so the loop below never indexes past kExpectation.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }

  // Case insensitive: "fox" at index 16 and "Fox" at index 41 are expected.
  {
    const MatchResult kExpectation[] = {{16, 3}, {41, 3}};
    const std::vector<MatchResult> results =
        find_all_matches(/*case_sensitive=*/false);

    // Fatal on mismatch so the loop below never indexes past kExpectation.
    ASSERT_EQ(std::size(kExpectation), results.size());
    for (size_t i = 0; i < results.size(); ++i) {
      EXPECT_EQ(results[i].match_index, kExpectation[i].match_index);
      EXPECT_EQ(results[i].match_length, kExpectation[i].match_length);
    }
  }
}
399 
400 }  // namespace i18n
401 }  // namespace base
402