• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/break_iterator.h"
6 
7 #include <stddef.h>
8 
9 #include "base/macros.h"
10 #include "base/strings/string_piece.h"
11 #include "base/strings/string_util.h"
12 #include "base/strings/stringprintf.h"
13 #include "base/strings/utf_string_conversions.h"
14 #include "build/build_config.h"
15 #include "testing/gtest/include/gtest/gtest.h"
16 
17 namespace base {
18 namespace i18n {
19 
TEST(BreakIteratorTest, BreakWordEmpty) {
  // With empty input in BREAK_WORD mode, Advance() must report false and
  // IsWord() must stay false, including when Advance() is called a second
  // time after the end has already been reached.
  const string16 no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
29 
TEST(BreakIteratorTest, BreakWord) {
  // Segments BREAK_WORD is expected to return for the text below, in order,
  // each paired with the value IsWord() should report for it.
  const struct {
    const char* segment;
    bool is_word;
  } expected_segments[] = {
      {" ", false},
      {"foo", true},
      {" ", false},
      {"bar", true},
      {"!", false},
      {" ", false},
      {"\n", false},
      {"pouet", true},
      {" ", false},
      {"boom", true},
  };

  const string16 text(UTF8ToUTF16(" foo bar! \npouet boom"));
  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const auto& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.is_word, iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(expected.segment), iter.GetString());
  }

  // The first pass hits the end of the text; the second pass is an
  // unexpected extra advance after the end, which must fail the same way.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
70 
TEST(BreakIteratorTest, BreakWordWide16) {
  // Greek text: a 10-unit word, a space (U+0020), then a 5-unit word.
  const string16 text(WideToUTF16(
      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));

  // Expected segments in order, with the IsWord() value for each.
  const struct {
    string16 segment;
    bool is_word;
  } expected_segments[] = {
      {text.substr(0, 10), true},
      {UTF8ToUTF16(" "), false},
      {text.substr(11, 5), true},
  };

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const auto& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.is_word, iter.IsWord());
    EXPECT_EQ(expected.segment, iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
94 
TEST(BreakIteratorTest, BreakWordWide32) {
  // UTF-8 encoding of U+1D49C MATHEMATICAL SCRIPT CAPITAL A.
  const char kScriptCapitalA[] = "\xF0\x9D\x92\x9C";
  const string16 text(
      UTF8ToUTF16(base::StringPrintf("%s a", kScriptCapitalA)));

  // Expected segments in order, with the IsWord() value for each. The first
  // one is taken as the leading two UTF-16 units of the converted text.
  const struct {
    string16 segment;
    bool is_word;
  } expected_segments[] = {
      {text.substr(0, 2), true},
      {UTF8ToUTF16(" "), false},
      {UTF8ToUTF16("a"), true},
  };

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const auto& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.is_word, iter.IsWord());
    EXPECT_EQ(expected.segment, iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
118 
TEST(BreakIteratorTest, BreakWordThai) {
  // Thai terms that are concatenated with nothing in between; each one is
  // expected to come back as its own word.
  const char* const kTerms[] = {"พิมพ์", "น้อย", "ลง"};
  std::string joined;
  for (const char* term : kTerms)
    joined += term;
  const string16 text(UTF8ToUTF16(joined));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const char* term : kTerms) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(term), iter.GetString());
  }
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
140 
141 // In some languages, the words are not broken by spaces. ICU provides a huge
142 // dictionary to detect word boundaries in Thai, Chinese, Japanese, Burmese,
143 // and Khmer. Due to the size of such a table, the part for Chinese and
144 // Japanese is not shipped on mobile.
145 #if !(defined(OS_IOS) || defined(OS_ANDROID))
146 
TEST(BreakIteratorTest, BreakWordChinese) {
  // Traditional Chinese terms concatenated with nothing in between; each one
  // is expected to come back as its own word.
  const char* const kTerms[] = {"瀏覽", "速度", "飛快"};
  std::string joined;
  for (const char* term : kTerms)
    joined += term;
  const string16 text(UTF8ToUTF16(joined));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const char* term : kTerms) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(term), iter.GetString());
  }
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
168 
TEST(BreakIteratorTest, BreakWordJapanese) {
  // Japanese terms concatenated with nothing in between; each one is
  // expected to come back as its own word.
  const char* const kTerms[] = {"モバイル", "でも"};
  std::string joined;
  for (const char* term : kTerms)
    joined += term;
  const string16 text(UTF8ToUTF16(joined));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const char* term : kTerms) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(term), iter.GetString());
  }
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
186 
TEST(BreakIteratorTest, BreakWordChineseEnglish) {
  // Simplified Chinese mixed with English, spaces and parentheses. The input
  // text is the concatenation of these segments in order; each entry also
  // records the IsWord() value expected for it.
  const struct {
    const char* segment;
    bool is_word;
  } expected_segments[] = {
      {"下载", true},
      {" ", false},
      {"Chrome", true},
      {"(", false},
      {"Mac", true},
      {" ", false},
      {"版", true},
      {")", false},
  };

  std::string joined;
  for (const auto& expected : expected_segments)
    joined += expected.segment;
  const string16 text(UTF8ToUTF16(joined));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const auto& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.is_word, iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(expected.segment), iter.GetString());
  }

  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
237 
238 #endif  // !(defined(OS_IOS) || defined(OS_ANDROID))
239 
TEST(BreakIteratorTest, BreakSpaceEmpty) {
  // With empty input in BREAK_SPACE mode, Advance() must report false and
  // IsWord() must stay false, including when Advance() is called a second
  // time after the end has already been reached.
  const string16 no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
249 
TEST(BreakIteratorTest, BreakSpace) {
  // Segments BREAK_SPACE is expected to return for the text below, in order.
  // IsWord() is expected to be false for every one of them.
  const char* const kExpectedSegments[] = {
      " ", "foo ", "bar! \n", "pouet ", "boom",
  };

  const string16 text(UTF8ToUTF16(" foo bar! \npouet boom"));
  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (const char* segment : kExpectedSegments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(segment), iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
274 
TEST(BreakIteratorTest, BreakSpaceSP) {
  // Same text as the BreakSpace test but with a trailing space, which is
  // expected to be reported as part of the last segment. IsWord() is expected
  // to be false for every segment.
  const char* const kExpectedSegments[] = {
      " ", "foo ", "bar! \n", "pouet ", "boom ",
  };

  const string16 text(UTF8ToUTF16(" foo bar! \npouet boom "));
  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (const char* segment : kExpectedSegments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(segment), iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
299 
TEST(BreakIteratorTest, BreakSpacekWide16) {
  // Greek text: a 10-unit word, a space (U+0020), then a 5-unit word.
  const string16 text(WideToUTF16(
      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));

  // In BREAK_SPACE mode the first expected segment spans 11 units, i.e. it
  // includes the space; the second is the remaining 5 units.
  const string16 expected_segments[] = {
      text.substr(0, 11),
      text.substr(11, 5),
  };

  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (const string16& segment : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(segment, iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
320 
TEST(BreakIteratorTest, BreakSpaceWide32) {
  // UTF-8 encoding of U+1D49C MATHEMATICAL SCRIPT CAPITAL A.
  const char kScriptCapitalA[] = "\xF0\x9D\x92\x9C";
  const string16 text(
      UTF8ToUTF16(base::StringPrintf("%s a", kScriptCapitalA)));

  // The first expected segment is the leading three UTF-16 units of the
  // converted text; the second is the trailing "a".
  const string16 expected_segments[] = {
      text.substr(0, 3),
      UTF8ToUTF16("a"),
  };

  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());
  for (const string16& segment : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(segment, iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
341 
TEST(BreakIteratorTest, BreakLineEmpty) {
  // With empty input in BREAK_NEWLINE mode, Advance() must report false and
  // IsWord() must stay false, including when Advance() is called a second
  // time after the end has already been reached.
  const string16 no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
351 
TEST(BreakIteratorTest, BreakLine) {
  // Segments BREAK_NEWLINE is expected to return for the text below, in
  // order. IsWord() is expected to be false for every one of them.
  const char* const kExpectedLines[] = {
      "\n", "foo bar!\n", "\n", "pouet boom",
  };

  const string16 text(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (const char* line : kExpectedLines) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(line), iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
374 
TEST(BreakIteratorTest, BreakLineNL) {
  // Same text as the BreakLine test but with a trailing newline, which is
  // expected to be reported as part of the last segment. IsWord() is expected
  // to be false for every segment.
  const char* const kExpectedLines[] = {
      "\n", "foo bar!\n", "\n", "pouet boom\n",
  };

  const string16 text(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (const char* line : kExpectedLines) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(line), iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
397 
TEST(BreakIteratorTest, BreakLineWide16) {
  // Greek text: a 10-unit word, a newline (U+000A), then a 5-unit word.
  const string16 text(WideToUTF16(
      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));

  // The first expected line spans 11 units, i.e. it includes the newline;
  // the second is the remaining 5 units.
  const string16 expected_lines[] = {
      text.substr(0, 11),
      text.substr(11, 5),
  };

  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (const string16& line : expected_lines) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(line, iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
418 
TEST(BreakIteratorTest, BreakLineWide32) {
  // UTF-8 encoding of U+1D49C MATHEMATICAL SCRIPT CAPITAL A.
  const char kScriptCapitalA[] = "\xF0\x9D\x92\x9C";
  const string16 text(
      UTF8ToUTF16(base::StringPrintf("%s\na", kScriptCapitalA)));

  // The first expected line is the leading three UTF-16 units of the
  // converted text; the second is the trailing "a".
  const string16 expected_lines[] = {
      text.substr(0, 3),
      UTF8ToUTF16("a"),
  };

  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());
  for (const string16& line : expected_lines) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
    EXPECT_EQ(line, iter.GetString());
  }

  // End of text, followed by one unexpected extra advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(iter.Advance());
    EXPECT_FALSE(iter.IsWord());
  }
}
438 
TEST(BreakIteratorTest, BreakCharacter) {
  // Each entry is one segment that BREAK_CHARACTER is expected to return;
  // several of them are made of more than one UTF-16 unit. The input text is
  // the concatenation of all entries in order.
  static const wchar_t* kCharacters[] = {
    // An English word consisting of four ASCII characters.
    L"w", L"o", L"r", L"d", L" ",
    // A Hindi word (which means "Hindi") consisting of three Devanagari
    // characters.
    L"\x0939\x093F", L"\x0928\x094D", L"\x0926\x0940", L" ",
    // A Thai word (which means "feel") consisting of three Thai characters.
    L"\x0E23\x0E39\x0E49", L"\x0E2A\x0E36", L"\x0E01", L" ",
  };
  std::vector<string16> characters;
  string16 text;
  for (const wchar_t* character : kCharacters) {
    characters.push_back(WideToUTF16(character));
    text.append(characters.back());
  }
  BreakIterator iter(text, BreakIterator::BREAK_CHARACTER);
  ASSERT_TRUE(iter.Init());
  for (const string16& expected : characters) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected, iter.GetString());
  }
  // Every expected character has been consumed, so a further advance must
  // fail; without this check, extra trailing segments would go unnoticed.
  EXPECT_FALSE(iter.Advance());
}
462 
463 // Test for https://code.google.com/p/chromium/issues/detail?id=411213
464 // We should be able to get valid substrings with GetString() function
465 // after setting new content by calling SetText().
TEST(BreakIteratorTest, GetStringAfterSetText) {
  const string16 short_text(ASCIIToUTF16("str"));
  const string16 replacement_text(ASCIIToUTF16("another,string"));

  // Start on the short text, then swap in the longer one via SetText().
  BreakIterator iter(short_text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  EXPECT_TRUE(iter.SetText(replacement_text.c_str(), replacement_text.size()));

  // Two advances: the second one lands on the ',' in |replacement_text|.
  for (int step = 0; step < 2; ++step)
    EXPECT_TRUE(iter.Advance());

  // The position now lies beyond the end of |short_text|, so a correct
  // result can only come from |replacement_text|.
  EXPECT_LT(short_text.size(), iter.pos());
  EXPECT_EQ(ASCIIToUTF16(","), iter.GetString());
}
482 
TEST(BreakIteratorTest, GetStringPiece) {
  // GetStringPiece() must agree with GetString() and with the expected word
  // at each checked position.
  const string16 text(ASCIIToUTF16("some string"));
  const string16 first_word(ASCIIToUTF16("some"));
  const string16 second_word(ASCIIToUTF16("string"));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  // One advance reaches the first word.
  EXPECT_TRUE(iter.Advance());
  EXPECT_EQ(iter.GetString(), iter.GetStringPiece().as_string());
  EXPECT_EQ(StringPiece16(first_word), iter.GetStringPiece());

  // Two more advances reach the second word.
  for (int step = 0; step < 2; ++step)
    EXPECT_TRUE(iter.Advance());
  EXPECT_EQ(iter.GetString(), iter.GetStringPiece().as_string());
  EXPECT_EQ(StringPiece16(second_word), iter.GetStringPiece());
}
497 
498 // Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting
499 // IS_LINE_OR_CHAR_BREAK.
TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) {
  // A string containing the English word "foo", followed by two Khmer
  // characters, the English word "Can", and then two Russian characters and
  // punctuation.
  const string16 text(
      WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438..."));

  // Segments BREAK_LINE is expected to return, in order. Every one of them
  // must report IS_LINE_OR_CHAR_BREAK.
  const string16 expected_segments[] = {
      // "foo" and the space.
      UTF8ToUTF16("foo "),
      // The Khmer characters, the next space, and the newline.
      WideToUTF16(L"\x1791\x17C1 \n"),
      // "Can" and the space.
      UTF8ToUTF16("Can "),
      // The Russian characters and the periods.
      WideToUTF16(L"\x041C\x0438..."),
  };

  BreakIterator iter(text, BreakIterator::BREAK_LINE);
  ASSERT_TRUE(iter.Init());
  for (const string16& segment : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(segment, iter.GetString());
    // Expected value first, matching the other assertions in this file.
    EXPECT_EQ(BreakIterator::IS_LINE_OR_CHAR_BREAK, iter.GetWordBreakStatus());
  }
  EXPECT_FALSE(iter.Advance());
}
527 
528 // Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and
529 // IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we
530 // finish going over non-punctuation characters while IS_SKIPPABLE_WORD should
531 // be returned on punctuation and spaces.
TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) {
  // A string containing the English word "foo", followed by two Khmer
  // characters, the English word "Can", and then two Russian characters and
  // punctuation.
  const string16 text(
      WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438..."));

  // Segments BREAK_WORD is expected to return, in order. |is_word| selects
  // the expected status: IS_WORD_BREAK when true, IS_SKIPPABLE_WORD when
  // false.
  const struct {
    string16 segment;
    bool is_word;
  } expected_segments[] = {
      {UTF8ToUTF16("foo"), true},
      {UTF8ToUTF16(" "), false},
      // The two Khmer characters.
      {WideToUTF16(L"\x1791\x17C1"), true},
      // The space and the newline come back as separate segments.
      {UTF8ToUTF16(" "), false},
      {UTF8ToUTF16("\n"), false},
      {UTF8ToUTF16("Can"), true},
      {UTF8ToUTF16(" "), false},
      // The two Russian characters.
      {WideToUTF16(L"\x041C\x0438"), true},
      // Each trailing period is its own segment.
      {UTF8ToUTF16("."), false},
      {UTF8ToUTF16("."), false},
      {UTF8ToUTF16("."), false},
  };

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());
  for (const auto& expected : expected_segments) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected.segment, iter.GetString());
    // Expected value first, matching the other assertions in this file.
    EXPECT_EQ(expected.is_word ? BreakIterator::IS_WORD_BREAK
                               : BreakIterator::IS_SKIPPABLE_WORD,
              iter.GetWordBreakStatus());
  }
  EXPECT_FALSE(iter.Advance());
}
582 
583 }  // namespace i18n
584 }  // namespace base
585