1 // Copyright 2011 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "base/i18n/break_iterator.h"
6
7 #include <stddef.h>
8
9 #include <vector>
10
11 #include "base/ranges/algorithm.h"
12 #include "base/strings/string_piece.h"
13 #include "base/strings/string_util.h"
14 #include "base/strings/utf_string_conversions.h"
15 #include "build/build_config.h"
16 #include "testing/gtest/include/gtest/gtest.h"
17
18 namespace base {
19 namespace i18n {
20
TEST(BreakIteratorTest, BreakWordEmpty) {
  // An empty string offers nothing to segment in BREAK_WORD mode.
  const std::u16string no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  // Round 0 is the first Advance(); round 1 repeats it to cover an unexpected
  // advance after the end. Both rounds must report "no segment, not a word".
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
30
TEST(BreakIteratorTest, BreakWord) {
  // What BREAK_WORD is expected to yield for |text|, in order. Runs of letters
  // are flagged as words; every space, the "!" and the newline come back as
  // separate non-word segments.
  const struct {
    const char16_t* text;
    bool is_word;
  } kExpectedSegments[] = {
      {u" ", false},
      {u"foo", true},
      {u" ", false},
      {u"bar", true},
      {u"!", false},
      {u" ", false},
      {u"\n", false},
      {u"pouet", true},
      {u" ", false},
      {u"boom", true},
  };

  const std::u16string text(u" foo bar! \npouet boom");
  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const auto& segment : kExpectedSegments) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_EQ(segment.is_word, iter.IsWord()) << " at step " << step;
    EXPECT_EQ(segment.text, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end. Neither may report a segment or a word.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
71
TEST(BreakIteratorTest, BreakWordWide16) {
  // Two Greek words separated by a space. Code units [0, 10) are the first
  // word, unit 10 is the space and units [11, 16) are the second word.
  const std::u16string text(
      u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      u"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
  const struct {
    std::u16string piece;
    bool is_word;
  } expected_segments[] = {
      {text.substr(0, 10), true},
      {u" ", false},
      {text.substr(11, 5), true},
  };

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const auto& segment : expected_segments) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_EQ(segment.is_word, iter.IsWord()) << " at step " << step;
    EXPECT_EQ(segment.piece, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
95
TEST(BreakIteratorTest, BreakWordWide32) {
  // U+1D49C lies outside the BMP, so it occupies two UTF-16 code units. The
  // test expects those two units to come back together as a single word.
  const std::u16string text = u"\U0001d49c a";
  const struct {
    std::u16string piece;
    bool is_word;
  } expected_segments[] = {
      {text.substr(0, 2), true},
      {u" ", false},
      {u"a", true},
  };

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const auto& segment : expected_segments) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_EQ(segment.is_word, iter.IsWord()) << " at step " << step;
    EXPECT_EQ(segment.piece, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
116
TEST(BreakIteratorTest, BreakWordThai) {
  // Three Thai terms concatenated with no separator; BREAK_WORD is expected
  // to hand each one back as its own word.
  const char16_t first_term[] = u"พิมพ์";
  const char16_t second_term[] = u"น้อย";
  const char16_t third_term[] = u"ลง";
  const std::u16string text(
      base::JoinString({first_term, second_term, third_term}, u""));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  for (const char16_t* term : {first_term, second_term, third_term}) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(term, iter.GetString());
  }

  // Nothing is left after the third term.
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
138
139 // In some languages, the words are not broken by spaces. ICU provides a huge
140 // dictionary to detect word boundaries in Thai, Chinese, Japanese, Burmese,
141 // and Khmer. Due to the size of such a table, the part for Chinese and
142 // Japanese is not shipped on mobile.
143 #if !(BUILDFLAG(IS_IOS) || BUILDFLAG(IS_ANDROID))
144
TEST(BreakIteratorTest, BreakWordChinese) {
  // Three Traditional Chinese terms concatenated with no separator;
  // BREAK_WORD is expected to hand each one back as its own word.
  const char16_t first_term[] = u"瀏覽";
  const char16_t second_term[] = u"速度";
  const char16_t third_term[] = u"飛快";
  const std::u16string text(
      base::JoinString({first_term, second_term, third_term}, u""));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  for (const char16_t* term : {first_term, second_term, third_term}) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(term, iter.GetString());
  }

  // Nothing is left after the third term.
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
166
TEST(BreakIteratorTest, BreakWordJapanese) {
  // Two Japanese terms concatenated with no separator; BREAK_WORD is expected
  // to hand each one back as its own word.
  const char16_t first_term[] = u"モバイル";
  const char16_t second_term[] = u"でも";
  const std::u16string text(base::JoinString({first_term, second_term}, u""));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  for (const char16_t* term : {first_term, second_term}) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_TRUE(iter.IsWord());
    EXPECT_EQ(term, iter.GetString());
  }

  // Nothing is left after the second term.
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
184
TEST(BreakIteratorTest, BreakWordChineseEnglish) {
  // Simplified Chinese terms mixed with English words, spaces and
  // parentheses.
  const char16_t cjk_head[] = u"下载";
  const char16_t latin_first[] = u"Chrome";
  const char16_t open_paren[] = u"(";
  const char16_t latin_second[] = u"Mac";
  const char16_t cjk_tail[] = u"版";
  const char16_t close_paren[] = u")";
  const std::u16string text(
      base::JoinString({cjk_head, u" ", latin_first, open_paren, latin_second,
                        u" ", cjk_tail, close_paren},
                       u""));

  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  // Advances once, then checks both the text and the word flag of the segment
  // the iterator landed on.
  const auto expect_next = [&iter](const char16_t* segment, bool is_word) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(is_word, iter.IsWord());
    EXPECT_EQ(segment, iter.GetString());
  };

  // Terms are words; the spaces and both parentheses are not.
  expect_next(cjk_head, true);
  expect_next(u" ", false);
  expect_next(latin_first, true);
  expect_next(open_paren, false);
  expect_next(latin_second, true);
  expect_next(u" ", false);
  expect_next(cjk_tail, true);
  expect_next(close_paren, false);

  // Nothing is left after the closing parenthesis.
  EXPECT_FALSE(iter.Advance());
  EXPECT_FALSE(iter.IsWord());
}
235
236 #endif // !(BUILDFLAG(IS_IOS) || BUILDFLAG(IS_ANDROID))
237
TEST(BreakIteratorTest, BreakSpaceEmpty) {
  // An empty string offers nothing to segment in BREAK_SPACE mode.
  const std::u16string no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  // Round 0 is the first Advance(); round 1 repeats it to cover an unexpected
  // advance after the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
247
TEST(BreakIteratorTest, BreakSpace) {
  // Pieces BREAK_SPACE is expected to yield, in order. Each piece ends with
  // the whitespace that follows it, and none of them is flagged as a word.
  const char16_t* const kExpectedPieces[] = {
      u" ", u"foo ", u"bar! \n", u"pouet ", u"boom",
  };

  const std::u16string text(u" foo bar! \npouet boom");
  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const char16_t* piece : kExpectedPieces) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(piece, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
272
TEST(BreakIteratorTest, BreakSpaceSP) {
  // Same input as the BreakSpace test plus a trailing space; that space is
  // expected to stay attached to the final piece.
  const char16_t* const kExpectedPieces[] = {
      u" ", u"foo ", u"bar! \n", u"pouet ", u"boom ",
  };

  const std::u16string text(u" foo bar! \npouet boom ");
  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const char16_t* piece : kExpectedPieces) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(piece, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
297
// NOTE(review): the "k" in this test's name looks like a typo for
// "BreakSpaceWide16"; it is kept so existing test filters keep matching.
TEST(BreakIteratorTest, BreakSpacekWide16) {
  // Two Greek words separated by a space. In BREAK_SPACE mode the space at
  // code unit 10 is expected to stay with the first piece, so the pieces are
  // units [0, 11) and [11, 16).
  const std::u16string text(
      u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      u"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2");
  const std::u16string expected_pieces[] = {
      text.substr(0, 11),
      text.substr(11, 5),
  };

  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const std::u16string& piece : expected_pieces) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(piece, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
318
TEST(BreakIteratorTest, BreakSpaceWide32) {
  // U+1D49C occupies two UTF-16 code units; together with the space that
  // follows it, the first expected piece is three code units long.
  const std::u16string text = u"\U0001d49c a";
  const std::u16string expected_pieces[] = {
      text.substr(0, 3),
      u"a",
  };

  BreakIterator iter(text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const std::u16string& piece : expected_pieces) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(piece, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
336
TEST(BreakIteratorTest, BreakLineEmpty) {
  // An empty string offers nothing to segment in BREAK_NEWLINE mode.
  const std::u16string no_text;
  BreakIterator iter(no_text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  // Round 0 is the first Advance(); round 1 repeats it to cover an unexpected
  // advance after the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
346
TEST(BreakIteratorTest, BreakLine) {
  // Lines BREAK_NEWLINE is expected to yield, in order. Each line keeps its
  // terminating "\n"; the last line has none because the input does not end
  // with a newline.
  const char16_t* const kExpectedLines[] = {
      u"\n", u"foo bar!\n", u"\n", u"pouet boom",
  };

  const std::u16string text(u"\nfoo bar!\n\npouet boom");
  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const char16_t* line : kExpectedLines) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(line, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
369
TEST(BreakIteratorTest, BreakSentence) {
  // Sentences BREAK_SENTENCE is expected to yield, in order. A sentence keeps
  // the newline that terminates it, a bare "\n" forms a sentence of its own,
  // and "?" ends a sentence even with no whitespace after it.
  const char16_t* const kExpectedSentences[] = {
      u"\n",
      u"Foo bar!\n",
      u"One sentence.\n",
      u"\n",
      u"\tAnother sentence?",
      u"One more thing",
  };

  const std::u16string text(
      u"\nFoo bar!\nOne sentence.\n\n\tAnother sentence?One more thing");
  BreakIterator iter(text, BreakIterator::BREAK_SENTENCE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const char16_t* sentence : kExpectedSentences) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(sentence, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is the unexpected advance after
  // the end. The earlier version of this test ran only a single end check
  // while labelling it as the "advance after end" case, so the repeated
  // advance was never exercised for BREAK_SENTENCE. Running both rounds
  // brings it in line with the other break-mode tests in this file.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
397
TEST(BreakIteratorTest, IsSentenceBoundary) {
  const std::u16string text(
      u"Foo bar!\nOne sentence.\n\n\tAnother sentence?One more thing");
  BreakIterator iter(text, BreakIterator::BREAK_SENTENCE);
  ASSERT_TRUE(iter.Init());

  // Code-unit offsets expected to be sentence boundaries: the start of the
  // text and the first offset after "Foo bar!\n", "One sentence.\n", the
  // empty line, and "\tAnother sentence?".
  const std::vector<size_t> expected_boundaries = {0, 9, 23, 24, 42};

  // Every offset inside the text is probed; the one-past-the-end offset is
  // not.
  for (size_t offset = 0; offset < text.size(); ++offset) {
    const bool should_be_boundary =
        ranges::find(expected_boundaries, offset) != expected_boundaries.end();
    EXPECT_EQ(should_be_boundary, iter.IsSentenceBoundary(offset))
        << " at index=" << offset;
  }
}
418
TEST(BreakIteratorTest, BreakLineNL) {
  // Same input as the BreakLine test plus a trailing newline; that newline is
  // expected to stay attached to the final line rather than producing an
  // extra empty one.
  const char16_t* const kExpectedLines[] = {
      u"\n", u"foo bar!\n", u"\n", u"pouet boom\n",
  };

  const std::u16string text(u"\nfoo bar!\n\npouet boom\n");
  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const char16_t* line : kExpectedLines) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(line, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
441
TEST(BreakIteratorTest, BreakLineWide16) {
  // Two Greek words separated by a newline (code unit 10). The newline is
  // expected to stay with the first line, so the lines are units [0, 11) and
  // [11, 16).
  const std::u16string text(
      u"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      u"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2");
  const std::u16string expected_lines[] = {
      text.substr(0, 11),
      text.substr(11, 5),
  };

  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const std::u16string& line : expected_lines) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(line, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
462
TEST(BreakIteratorTest, BreakLineWide32) {
  // U+1D49C occupies two UTF-16 code units; together with the newline that
  // follows it, the first expected line is three code units long.
  const std::u16string text = u"\U0001d49c\na";
  const std::u16string expected_lines[] = {
      text.substr(0, 3),
      u"a",
  };

  BreakIterator iter(text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(iter.Init());

  int step = 0;
  for (const std::u16string& line : expected_lines) {
    EXPECT_TRUE(iter.Advance()) << " at step " << step;
    EXPECT_FALSE(iter.IsWord()) << " at step " << step;
    EXPECT_EQ(line, iter.GetString()) << " at step " << step;
    ++step;
  }

  // Round 0 hits the end of the text; round 1 is an unexpected advance after
  // the end.
  for (int round = 0; round < 2; ++round) {
    EXPECT_FALSE(iter.Advance()) << " in round " << round;
    EXPECT_FALSE(iter.IsWord()) << " in round " << round;
  }
}
479
TEST(BreakIteratorTest, BreakCharacter) {
  // Units BREAK_CHARACTER is expected to return one at a time. Several of
  // them span more than one code point.
  static const char16_t* const kExpectedClusters[] = {
      // An English word consisting of four ASCII characters.
      u"w",
      u"o",
      u"r",
      u"d",
      u" ",
      // A Hindi word (which means "Hindi") consisting of two Devanagari
      // grapheme clusters.
      u"हि",
      u"न्दी",
      u" ",
      // A Thai word (which means "feel") consisting of three Thai grapheme
      // clusters.
      u"รู้",
      u"สึ",
      u"ก",
      u" ",
  };

  // The input is simply all expected clusters glued together in order.
  std::u16string text;
  for (const char16_t* cluster : kExpectedClusters) {
    text.append(cluster);
  }

  BreakIterator iter(text, BreakIterator::BREAK_CHARACTER);
  ASSERT_TRUE(iter.Init());

  // Iterating must reproduce exactly the clusters the text was built from.
  for (const char16_t* cluster : kExpectedClusters) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(cluster, iter.GetString());
  }
}
513
514 // Test for https://code.google.com/p/chromium/issues/detail?id=411213
515 // We should be able to get valid substrings with GetString() function
516 // after setting new content by calling SetText().
TEST(BreakIteratorTest,GetStringAfterSetText)517 TEST(BreakIteratorTest, GetStringAfterSetText) {
518 const std::u16string initial_string(u"str");
519 BreakIterator iter(initial_string, BreakIterator::BREAK_WORD);
520 ASSERT_TRUE(iter.Init());
521
522 const std::u16string long_string(u"another,string");
523 EXPECT_TRUE(iter.SetText(long_string.c_str(), long_string.size()));
524 EXPECT_TRUE(iter.Advance());
525 EXPECT_TRUE(iter.Advance()); // Advance to ',' in |long_string|
526
527 // Check that the current position is out of bounds of the |initial_string|.
528 EXPECT_LT(initial_string.size(), iter.pos());
529
530 // Check that we can get a valid substring of |long_string|.
531 EXPECT_EQ(u",", iter.GetString());
532 }
533
TEST(BreakIteratorTest, GetStringPiece) {
  const std::u16string text(u"some string");
  BreakIterator iter(text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(iter.Init());

  // For the current segment, GetStringPiece() must agree with GetString() and
  // must equal |expected|.
  const auto expect_current_piece = [&iter](StringPiece16 expected) {
    EXPECT_EQ(iter.GetString(), iter.GetStringPiece());
    EXPECT_EQ(expected, iter.GetStringPiece());
  };

  EXPECT_TRUE(iter.Advance());
  expect_current_piece(StringPiece16(u"some"));

  // Step over the segment between the two words (its content is not checked
  // here), then land on the second word.
  EXPECT_TRUE(iter.Advance());
  EXPECT_TRUE(iter.Advance());
  expect_current_piece(StringPiece16(u"string"));
}
548
549 // Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting
550 // IS_LINE_OR_CHAR_BREAK.
TEST(BreakIteratorTest,GetWordBreakStatusBreakLine)551 TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) {
552 // A string containing the English word "foo", followed by two Khmer
553 // characters, the English word "Can", and then two Russian characters and
554 // punctuation.
555 std::u16string text(u"foo \x1791\x17C1 \nCan \x041C\x0438...");
556 BreakIterator iter(text, BreakIterator::BREAK_LINE);
557 ASSERT_TRUE(iter.Init());
558
559 EXPECT_TRUE(iter.Advance());
560 // Finds "foo" and the space.
561 EXPECT_EQ(u"foo ", iter.GetString());
562 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
563 EXPECT_TRUE(iter.Advance());
564 // Finds the Khmer characters, the next space, and the newline.
565 EXPECT_EQ(u"\x1791\x17C1 \n", iter.GetString());
566 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
567 EXPECT_TRUE(iter.Advance());
568 // Finds "Can" and the space.
569 EXPECT_EQ(u"Can ", iter.GetString());
570 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
571 EXPECT_TRUE(iter.Advance());
572 // Finds the Russian characters and periods.
573 EXPECT_EQ(u"\x041C\x0438...", iter.GetString());
574 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
575 EXPECT_FALSE(iter.Advance());
576 }
577
578 // Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and
579 // IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we
580 // finish going over non-punctuation characters while IS_SKIPPABLE_WORD should
581 // be returned on punctuation and spaces.
TEST(BreakIteratorTest,GetWordBreakStatusBreakWord)582 TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) {
583 // A string containing the English word "foo", followed by two Khmer
584 // characters, the English word "Can", and then two Russian characters and
585 // punctuation.
586 std::u16string text(u"foo \x1791\x17C1 \nCan \x041C\x0438...");
587 BreakIterator iter(text, BreakIterator::BREAK_WORD);
588 ASSERT_TRUE(iter.Init());
589
590 EXPECT_TRUE(iter.Advance());
591 // Finds "foo".
592 EXPECT_EQ(u"foo", iter.GetString());
593 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
594 EXPECT_TRUE(iter.Advance());
595 // Finds the space, and the Khmer characters.
596 EXPECT_EQ(u" ", iter.GetString());
597 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
598 EXPECT_TRUE(iter.Advance());
599 EXPECT_EQ(u"\x1791\x17C1", iter.GetString());
600 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
601 EXPECT_TRUE(iter.Advance());
602 // Finds the space and the newline.
603 EXPECT_EQ(u" ", iter.GetString());
604 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
605 EXPECT_TRUE(iter.Advance());
606 EXPECT_EQ(u"\n", iter.GetString());
607 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
608 EXPECT_TRUE(iter.Advance());
609 // Finds "Can".
610 EXPECT_EQ(u"Can", iter.GetString());
611 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
612 EXPECT_TRUE(iter.Advance());
613 // Finds the space and the Russian characters.
614 EXPECT_EQ(u" ", iter.GetString());
615 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
616 EXPECT_TRUE(iter.Advance());
617 EXPECT_EQ(u"\x041C\x0438", iter.GetString());
618 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
619 EXPECT_TRUE(iter.Advance());
620 // Finds the trailing periods.
621 EXPECT_EQ(u".", iter.GetString());
622 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
623 EXPECT_TRUE(iter.Advance());
624 EXPECT_EQ(u".", iter.GetString());
625 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
626 EXPECT_TRUE(iter.Advance());
627 EXPECT_EQ(u".", iter.GetString());
628 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
629 EXPECT_FALSE(iter.Advance());
630 }
631
632 } // namespace i18n
633 } // namespace base
634