• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/break_iterator.h"
6 
7 #include <stddef.h>
8 
9 #include <vector>
10 
11 #include "base/ranges/algorithm.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "build/build_config.h"
16 #include "testing/gtest/include/gtest/gtest.h"
17 
18 namespace base {
19 namespace i18n {
20 
// BREAK_WORD over an empty string: Advance() is expected to fail right away
// and to keep failing when called again.
TEST(BreakIteratorTest, BreakWordEmpty) {
  const std::u16string no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
30 
// BREAK_WORD over ASCII text: words, whitespace and punctuation each come back
// as their own segment, and only the words report IsWord().
TEST(BreakIteratorTest, BreakWord) {
  // One entry per segment the iterator should yield, in order.
  struct Segment {
    const char16_t* text;
    bool is_word;
  };
  const Segment expected_segments[] = {
      {u" ", false},  {u"foo", true},   {u" ", false}, {u"bar", true},
      {u"!", false},  {u" ", false},    {u"\n", false}, {u"pouet", true},
      {u" ", false},  {u"boom", true},
  };

  const std::u16string input(u" foo bar! \npouet boom");
  BreakIterator iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  for (size_t i = 0; i < std::size(expected_segments); ++i) {
    SCOPED_TRACE(::testing::Message() << "segment #" << i);
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected_segments[i].is_word, iter.IsWord());
    EXPECT_EQ(expected_segments[i].text, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
71 
// BREAK_WORD over non-ASCII BMP text: two Greek words separated by a space.
TEST(BreakIteratorTest, BreakWordWide16) {
  const std::u16string greek(
      u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      u"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");

  // First word (10 code units), the separating space, second word (5 units).
  struct Segment {
    std::u16string text;
    bool is_word;
  };
  const Segment expected_segments[] = {
      {greek.substr(0, 10), true},
      {u" ", false},
      {greek.substr(11, 5), true},
  };

  BreakIterator iter(greek, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const Segment& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.is_word, iter.IsWord());
    EXPECT_EQ(expected.text, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
95 
// BREAK_WORD with a character outside the BMP: the two UTF-16 code units of
// U+1D49C are expected to come back together as one word.
TEST(BreakIteratorTest, BreakWordWide32) {
  const std::u16string input = u"\U0001d49c a";

  struct Segment {
    std::u16string text;
    bool is_word;
  };
  const Segment expected_segments[] = {
      {input.substr(0, 2), true},  // The surrogate pair.
      {u" ", false},
      {u"a", true},
  };

  BreakIterator iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const Segment& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.is_word, iter.IsWord());
    EXPECT_EQ(expected.text, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
116 
// BREAK_WORD over Thai terms concatenated without spaces: each term is
// expected to be returned as a separate word.
TEST(BreakIteratorTest, BreakWordThai) {
  const char16_t first_term[] = u"พิมพ์";
  const char16_t second_term[] = u"น้อย";
  const char16_t third_term[] = u"ลง";
  const std::u16string joined(
      base::JoinString({first_term, second_term, third_term}, u""));
  const char16_t* const expected_terms[] = {first_term, second_term,
                                            third_term};

  BreakIterator iter(joined, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const char16_t* expected_term : expected_terms) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(expected_term, iter.GetString());
  }
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
138 
139 // In some languages, the words are not broken by spaces. ICU provides a huge
140 // dictionary to detect word boundaries in Thai, Chinese, Japanese, Burmese,
141 // and Khmer. Due to the size of such a table, the part for Chinese and
142 // Japanese is not shipped on mobile.
143 #if !(BUILDFLAG(IS_IOS) || BUILDFLAG(IS_ANDROID))
144 
// BREAK_WORD over Traditional Chinese terms concatenated without spaces: each
// term is expected to be returned as a separate word.
TEST(BreakIteratorTest, BreakWordChinese) {
  const char16_t first_term[] = u"瀏覽";
  const char16_t second_term[] = u"速度";
  const char16_t third_term[] = u"飛快";
  const std::u16string joined(
      base::JoinString({first_term, second_term, third_term}, u""));
  const char16_t* const expected_terms[] = {first_term, second_term,
                                            third_term};

  BreakIterator iter(joined, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const char16_t* expected_term : expected_terms) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(expected_term, iter.GetString());
  }
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
166 
// BREAK_WORD over Japanese terms concatenated without spaces: each term is
// expected to be returned as a separate word.
TEST(BreakIteratorTest, BreakWordJapanese) {
  const char16_t first_term[] = u"モバイル";
  const char16_t second_term[] = u"でも";
  const std::u16string joined(
      base::JoinString({first_term, second_term}, u""));
  const char16_t* const expected_terms[] = {first_term, second_term};

  BreakIterator iter(joined, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const char16_t* expected_term : expected_terms) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(expected_term, iter.GetString());
  }
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
184 
// BREAK_WORD over Simplified Chinese mixed with English, spaces and
// parentheses: the terms are words, the spaces and punctuation are not.
TEST(BreakIteratorTest, BreakWordChineseEnglish) {
  const char16_t token1[] = u"下载";
  const char16_t token2[] = u"Chrome";
  const char16_t token3[] = u"(";
  const char16_t token4[] = u"Mac";
  const char16_t token5[] = u"版";
  const char16_t token6[] = u")";
  const std::u16string joined(base::JoinString(
      {token1, u" ", token2, token3, token4, u" ", token5, token6}, u""));

  // One entry per segment the iterator should yield, in order.
  struct Segment {
    const char16_t* text;
    bool is_word;
  };
  const Segment expected_segments[] = {
      {token1, true},  {u" ", false}, {token2, true}, {token3, false},
      {token4, true},  {u" ", false}, {token5, true}, {token6, false},
  };

  BreakIterator iter(joined, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  for (size_t i = 0; i < std::size(expected_segments); ++i) {
    SCOPED_TRACE(::testing::Message() << "segment #" << i);
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected_segments[i].is_word, iter.IsWord());
    EXPECT_EQ(expected_segments[i].text, iter.GetString());
  }

  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
235 
236 #endif  // !(BUILDFLAG(IS_IOS) || BUILDFLAG(IS_ANDROID))
237 
// BREAK_SPACE over an empty string: Advance() is expected to fail right away
// and to keep failing when called again.
TEST(BreakIteratorTest, BreakSpaceEmpty) {
  const std::u16string no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
247 
// BREAK_SPACE over ASCII text: each chunk runs through its trailing
// whitespace, and no chunk is reported as a word.
TEST(BreakIteratorTest, BreakSpace) {
  const char16_t* const expected_chunks[] = {
      u" ", u"foo ", u"bar! \n", u"pouet ", u"boom",
  };

  const std::u16string input(u" foo bar! \npouet boom");
  BreakIterator iter(input, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  for (size_t i = 0; i < std::size(expected_chunks); ++i) {
    SCOPED_TRACE(::testing::Message() << "chunk #" << i);
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_chunks[i], iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
272 
// Same as BreakSpace, but the input ends with a space, so the last chunk is
// expected to include that trailing space.
TEST(BreakIteratorTest, BreakSpaceSP) {
  const char16_t* const expected_chunks[] = {
      u" ", u"foo ", u"bar! \n", u"pouet ", u"boom ",
  };

  const std::u16string input(u" foo bar! \npouet boom ");
  BreakIterator iter(input, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  for (size_t i = 0; i < std::size(expected_chunks); ++i) {
    SCOPED_TRACE(::testing::Message() << "chunk #" << i);
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_chunks[i], iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
297 
// BREAK_SPACE over two Greek words: the first chunk is expected to carry the
// separating space (11 code units), the second is the remaining 5 units.
TEST(BreakIteratorTest, BreakSpacekWide16) {
  const std::u16string greek(
      u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      u"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
  const std::u16string expected_chunks[] = {
      greek.substr(0, 11),
      greek.substr(11, 5),
  };

  BreakIterator iter(greek, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (const std::u16string& expected_chunk : expected_chunks) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_chunk, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
318 
// BREAK_SPACE with a character outside the BMP: the first chunk is expected to
// be the surrogate pair plus the following space (3 code units).
TEST(BreakIteratorTest, BreakSpaceWide32) {
  const std::u16string input = u"\U0001d49c a";
  const std::u16string expected_chunks[] = {
      input.substr(0, 3),
      u"a",
  };

  BreakIterator iter(input, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (const std::u16string& expected_chunk : expected_chunks) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_chunk, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
336 
// BREAK_NEWLINE over an empty string: Advance() is expected to fail right away
// and to keep failing when called again.
TEST(BreakIteratorTest, BreakLineEmpty) {
  const std::u16string no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
346 
// BREAK_NEWLINE over ASCII text: each line comes back with its terminating
// newline, and the last line has none.
TEST(BreakIteratorTest, BreakLine) {
  const char16_t* const expected_lines[] = {
      u"\n", u"foo bar!\n", u"\n", u"pouet boom",
  };

  const std::u16string input(u"\nfoo bar!\n\npouet boom");
  BreakIterator iter(input, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  for (size_t i = 0; i < std::size(expected_lines); ++i) {
    SCOPED_TRACE(::testing::Message() << "line #" << i);
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_lines[i], iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
369 
// BREAK_SENTENCE over a multi-sentence string: checks every sentence segment
// in order, then the end of text, then an extra advance past the end (as the
// other Break* tests in this file do).
TEST(BreakIteratorTest, BreakSentence) {
  std::u16string nl(u"\n");
  std::u16string str(
      u"\nFoo bar!\nOne sentence.\n\n\tAnother sentence?One more thing");
  BreakIterator iter(str, BreakIterator::BREAK_SENTENCE);
  ASSERT_TRUE(iter.Init());
  // Leading newline is a segment of its own.
  EXPECT_TRUE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_EQ(nl, iter.GetString());
  EXPECT_TRUE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_EQ(u"Foo bar!\n", iter.GetString());
  EXPECT_TRUE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_EQ(u"One sentence.\n", iter.GetString());
  // The blank line between sentences.
  EXPECT_TRUE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_EQ(nl, iter.GetString());
  // '?' ends a sentence even without following whitespace.
  EXPECT_TRUE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_EQ(u"\tAnother sentence?", iter.GetString());
  EXPECT_TRUE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_EQ(u"One more thing", iter.GetString());
  // End of text.
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
  EXPECT_FALSE(iter.Advance());  // Test unexpected advance after end.
  EXPECT_FALSE(iter.IsWord());
}
397 
// Checks IsSentenceBoundary() at every index of the text: it must be true at
// exactly the listed offsets and false everywhere else.
TEST(BreakIteratorTest, IsSentenceBoundary) {
  const std::u16string text(
      u"Foo bar!\nOne sentence.\n\n\tAnother sentence?One more thing");
  BreakIterator iter(text, BreakIterator::BREAK_SENTENCE);
  ASSERT_TRUE(iter.Init());

  // Offsets (in UTF-16 code units) where a sentence is expected to start.
  const std::vector<size_t> boundaries = {0, 9, 23, 24, 42};
  for (size_t index = 0; index < text.size(); ++index) {
    const bool expect_boundary =
        ranges::find(boundaries, index) != boundaries.end();
    EXPECT_EQ(expect_boundary, iter.IsSentenceBoundary(index))
        << " at index=" << index;
  }
}
418 
// Same as BreakLine, but the input ends with a newline, so the last line is
// expected to include it.
TEST(BreakIteratorTest, BreakLineNL) {
  const char16_t* const expected_lines[] = {
      u"\n", u"foo bar!\n", u"\n", u"pouet boom\n",
  };

  const std::u16string input(u"\nfoo bar!\n\npouet boom\n");
  BreakIterator iter(input, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  for (size_t i = 0; i < std::size(expected_lines); ++i) {
    SCOPED_TRACE(::testing::Message() << "line #" << i);
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_lines[i], iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
441 
// BREAK_NEWLINE over two Greek words separated by a newline: the first line is
// expected to include the newline (11 code units), the second is 5 units.
TEST(BreakIteratorTest, BreakLineWide16) {
  const std::u16string greek(
      u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      u"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2");
  const std::u16string expected_lines[] = {
      greek.substr(0, 11),
      greek.substr(11, 5),
  };

  BreakIterator iter(greek, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (const std::u16string& expected_line : expected_lines) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_line, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
462 
// BREAK_NEWLINE with a character outside the BMP: the first line is expected
// to be the surrogate pair plus the newline (3 code units).
TEST(BreakIteratorTest, BreakLineWide32) {
  const std::u16string input = u"\U0001d49c\na";
  const std::u16string expected_lines[] = {
      input.substr(0, 3),
      u"a",
  };

  BreakIterator iter(input, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (const std::u16string& expected_line : expected_lines) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(expected_line, iter.GetString());
  }

  // The second pass covers an unexpected advance after the end.
  for (int pass = 0; pass < 2; ++pass) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
479 
// BREAK_CHARACTER: concatenates the grapheme clusters below and expects the
// iterator to hand them back one at a time, in the same order.
TEST(BreakIteratorTest, BreakCharacter) {
  static const char16_t* const kGraphemes[] = {
      // Four ASCII characters forming an English word.
      u"w",
      u"o",
      u"r",
      u"d",
      u" ",
      // Two Devanagari grapheme clusters forming a Hindi word (meaning
      // "Hindi").
      u"हि",
      u"न्दी",
      u" ",
      // Three Thai grapheme clusters forming a Thai word (meaning "feel").
      u"รู้",
      u"สึ",
      u"ก",
      u" ",
  };

  std::vector<std::u16string> expected_clusters;
  std::u16string joined;
  for (const char16_t* grapheme : kGraphemes) {
    expected_clusters.emplace_back(grapheme);
    joined += expected_clusters.back();
  }

  BreakIterator iter(joined, BreakIterator::BREAK_CHARACTER);
  ASSERT_TRUE(iter.Init());
  for (const std::u16string& cluster : expected_clusters) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(cluster, iter.GetString());
  }
}
513 
514 // Test for https://code.google.com/p/chromium/issues/detail?id=411213
515 // We should be able to get valid substrings with GetString() function
516 // after setting new content by calling SetText().
TEST(BreakIteratorTest,GetStringAfterSetText)517 TEST(BreakIteratorTest, GetStringAfterSetText) {
518   const std::u16string initial_string(u"str");
519   BreakIterator iter(initial_string, BreakIterator::BREAK_WORD);
520   ASSERT_TRUE(iter.Init());
521 
522   const std::u16string long_string(u"another,string");
523   EXPECT_TRUE(iter.SetText(long_string.c_str(), long_string.size()));
524   EXPECT_TRUE(iter.Advance());
525   EXPECT_TRUE(iter.Advance());  // Advance to ',' in |long_string|
526 
527   // Check that the current position is out of bounds of the |initial_string|.
528   EXPECT_LT(initial_string.size(), iter.pos());
529 
530   // Check that we can get a valid substring of |long_string|.
531   EXPECT_EQ(u",", iter.GetString());
532 }
533 
// GetStringPiece() must agree with GetString() and return the current
// segment.
TEST(BreakIteratorTest, GetStringPiece) {
  const std::u16string text(u"some string");
  const StringPiece16 first_word(u"some");
  const StringPiece16 second_word(u"string");

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  // One advance reaches the first word.
  EXPECT_TRUE(iter.Advance());
  EXPECT_EQ(iter.GetString(), iter.GetStringPiece());
  EXPECT_EQ(first_word, iter.GetStringPiece());

  // Two more advances step over the space and reach the second word.
  EXPECT_TRUE(iter.Advance());
  EXPECT_TRUE(iter.Advance());
  EXPECT_EQ(iter.GetString(), iter.GetStringPiece());
  EXPECT_EQ(second_word, iter.GetStringPiece());
}
548 
549 // Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting
550 // IS_LINE_OR_CHAR_BREAK.
TEST(BreakIteratorTest,GetWordBreakStatusBreakLine)551 TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) {
552   // A string containing the English word "foo", followed by two Khmer
553   // characters, the English word "Can", and then two Russian characters and
554   // punctuation.
555   std::u16string text(u"foo \x1791\x17C1 \nCan \x041C\x0438...");
556   BreakIterator iter(text, BreakIterator::BREAK_LINE);
557   ASSERT_TRUE(iter.Init());
558 
559   EXPECT_TRUE(iter.Advance());
560   // Finds "foo" and the space.
561   EXPECT_EQ(u"foo ", iter.GetString());
562   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
563   EXPECT_TRUE(iter.Advance());
564   // Finds the Khmer characters, the next space, and the newline.
565   EXPECT_EQ(u"\x1791\x17C1 \n", iter.GetString());
566   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
567   EXPECT_TRUE(iter.Advance());
568   // Finds "Can" and the space.
569   EXPECT_EQ(u"Can ", iter.GetString());
570   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
571   EXPECT_TRUE(iter.Advance());
572   // Finds the Russian characters and periods.
573   EXPECT_EQ(u"\x041C\x0438...", iter.GetString());
574   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
575   EXPECT_FALSE(iter.Advance());
576 }
577 
578 // Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and
579 // IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we
580 // finish going over non-punctuation characters while IS_SKIPPABLE_WORD should
581 // be returned on punctuation and spaces.
TEST(BreakIteratorTest,GetWordBreakStatusBreakWord)582 TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) {
583   // A string containing the English word "foo", followed by two Khmer
584   // characters, the English word "Can", and then two Russian characters and
585   // punctuation.
586   std::u16string text(u"foo \x1791\x17C1 \nCan \x041C\x0438...");
587   BreakIterator iter(text, BreakIterator::BREAK_WORD);
588   ASSERT_TRUE(iter.Init());
589 
590   EXPECT_TRUE(iter.Advance());
591   // Finds "foo".
592   EXPECT_EQ(u"foo", iter.GetString());
593   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
594   EXPECT_TRUE(iter.Advance());
595   // Finds the space, and the Khmer characters.
596   EXPECT_EQ(u" ", iter.GetString());
597   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
598   EXPECT_TRUE(iter.Advance());
599   EXPECT_EQ(u"\x1791\x17C1", iter.GetString());
600   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
601   EXPECT_TRUE(iter.Advance());
602   // Finds the space and the newline.
603   EXPECT_EQ(u" ", iter.GetString());
604   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
605   EXPECT_TRUE(iter.Advance());
606   EXPECT_EQ(u"\n", iter.GetString());
607   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
608   EXPECT_TRUE(iter.Advance());
609   // Finds "Can".
610   EXPECT_EQ(u"Can", iter.GetString());
611   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
612   EXPECT_TRUE(iter.Advance());
613   // Finds the space and the Russian characters.
614   EXPECT_EQ(u" ", iter.GetString());
615   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
616   EXPECT_TRUE(iter.Advance());
617   EXPECT_EQ(u"\x041C\x0438", iter.GetString());
618   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
619   EXPECT_TRUE(iter.Advance());
620   // Finds the trailing periods.
621   EXPECT_EQ(u".", iter.GetString());
622   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
623   EXPECT_TRUE(iter.Advance());
624   EXPECT_EQ(u".", iter.GetString());
625   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
626   EXPECT_TRUE(iter.Advance());
627   EXPECT_EQ(u".", iter.GetString());
628   EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
629   EXPECT_FALSE(iter.Advance());
630 }
631 
632 }  // namespace i18n
633 }  // namespace base
634