1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
#include "base/i18n/break_iterator.h"

#include <stddef.h>

#include <vector>

#include "base/macros.h"
#include "base/strings/string_piece.h"
#include "base/strings/string_util.h"
#include "base/strings/stringprintf.h"
#include "base/strings/utf_string_conversions.h"
#include "build/build_config.h"
#include "testing/gtest/include/gtest/gtest.h"
16
17 namespace base {
18 namespace i18n {
19
TEST(BreakIteratorTest, BreakWordEmpty) {
  // With no text to segment, BREAK_WORD iteration must end immediately.
  const string16 no_text;
  BreakIterator word_iter(no_text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(word_iter.Advance());
    EXPECT_FALSE(word_iter.IsWord());
  }
}
29
TEST(BreakIteratorTest, BreakWord) {
  // The segments BREAK_WORD is expected to yield for the input, in order,
  // together with whether the iterator should classify each one as a word.
  struct Segment {
    const char* text;
    bool is_word;
  };
  static const Segment kSegments[] = {
      {" ", false},
      {"foo", true},
      {" ", false},
      {"bar", true},
      {"!", false},
      {" ", false},
      {"\n", false},
      {"pouet", true},
      {" ", false},
      {"boom", true},
  };

  const string16 input(UTF8ToUTF16(" foo bar! \npouet boom"));
  BreakIterator word_iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());
  for (const Segment& segment : kSegments) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_EQ(segment.is_word, word_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(segment.text), word_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(word_iter.Advance());
    EXPECT_FALSE(word_iter.IsWord());
  }
}
70
TEST(BreakIteratorTest, BreakWordWide16) {
  // Two Greek words, 10 and 5 characters long, joined by U+0020.
  const string16 greek(WideToUTF16(
      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
  const string16 first_word = greek.substr(0, 10);
  const string16 second_word = greek.substr(11, 5);

  BreakIterator word_iter(greek, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());

  // Steps to the next segment and checks its text and word classification.
  const auto expect_next = [&word_iter](const string16& text, bool is_word) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_EQ(is_word, word_iter.IsWord());
    EXPECT_EQ(text, word_iter.GetString());
  };
  expect_next(first_word, true);
  expect_next(UTF8ToUTF16(" "), false);
  expect_next(second_word, true);

  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(word_iter.Advance());
    EXPECT_FALSE(word_iter.IsWord());
  }
}
94
TEST(BreakIteratorTest, BreakWordWide32) {
  // UTF-8 encoding of U+1D49C MATHEMATICAL SCRIPT CAPITAL A. The expectation
  // below treats it as the first two UTF-16 code units of the input.
  const char kScriptCapitalA[] = "\xF0\x9D\x92\x9C";
  const string16 input(
      UTF8ToUTF16(base::StringPrintf("%s a", kScriptCapitalA)));
  const string16 wide_word = input.substr(0, 2);

  BreakIterator word_iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());

  // Steps to the next segment and checks its text and word classification.
  const auto expect_next = [&word_iter](const string16& text, bool is_word) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_EQ(is_word, word_iter.IsWord());
    EXPECT_EQ(text, word_iter.GetString());
  };
  expect_next(wide_word, true);
  expect_next(UTF8ToUTF16(" "), false);
  expect_next(UTF8ToUTF16("a"), true);

  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(word_iter.Advance());
    EXPECT_FALSE(word_iter.IsWord());
  }
}
118
TEST(BreakIteratorTest, BreakWordThai) {
  // Thai terms that are concatenated with no separator; the iterator is
  // expected to hand each one back as its own word.
  static const char* const kTerms[] = {"พิมพ์", "น้อย", "ลง"};
  const string16 input(
      UTF8ToUTF16(base::JoinString({kTerms[0], kTerms[1], kTerms[2]}, "")));

  BreakIterator word_iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());
  for (const char* term : kTerms) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_TRUE(word_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(term), word_iter.GetString());
  }
  EXPECT_FALSE(word_iter.Advance());
  EXPECT_FALSE(word_iter.IsWord());
}
140
141 // In some languages, the words are not broken by spaces. ICU provides a huge
142 // dictionary to detect word boundaries in Thai, Chinese, Japanese, Burmese,
143 // and Khmer. Due to the size of such a table, the part for Chinese and
144 // Japanese is not shipped on mobile.
145 #if !(defined(OS_IOS) || defined(OS_ANDROID))
146
TEST(BreakIteratorTest, BreakWordChinese) {
  // Traditional Chinese terms concatenated with no separator; the iterator is
  // expected to hand each one back as its own word.
  static const char* const kTerms[] = {"瀏覽", "速度", "飛快"};
  const string16 input(
      UTF8ToUTF16(base::JoinString({kTerms[0], kTerms[1], kTerms[2]}, "")));

  BreakIterator word_iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());
  for (const char* term : kTerms) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_TRUE(word_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(term), word_iter.GetString());
  }
  EXPECT_FALSE(word_iter.Advance());
  EXPECT_FALSE(word_iter.IsWord());
}
168
TEST(BreakIteratorTest, BreakWordJapanese) {
  // Japanese terms concatenated with no separator; the iterator is expected
  // to hand each one back as its own word.
  static const char* const kTerms[] = {"モバイル", "でも"};
  const string16 input(
      UTF8ToUTF16(base::JoinString({kTerms[0], kTerms[1]}, "")));

  BreakIterator word_iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());
  for (const char* term : kTerms) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_TRUE(word_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(term), word_iter.GetString());
  }
  EXPECT_FALSE(word_iter.Advance());
  EXPECT_FALSE(word_iter.IsWord());
}
186
TEST(BreakIteratorTest, BreakWordChineseEnglish) {
  // Simplified Chinese mixed with English, spaces and parentheses. The table
  // lists every expected segment in order; the input is the concatenation of
  // the segment texts.
  struct Segment {
    const char* text;
    bool is_word;
  };
  static const Segment kSegments[] = {
      {"下载", true},
      {" ", false},
      {"Chrome", true},
      {"(", false},
      {"Mac", true},
      {" ", false},
      {"版", true},
      {")", false},
  };

  string16 input;
  for (const Segment& segment : kSegments) {
    input += UTF8ToUTF16(segment.text);
  }

  BreakIterator word_iter(input, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());

  for (const Segment& segment : kSegments) {
    EXPECT_TRUE(word_iter.Advance());
    EXPECT_EQ(segment.is_word, word_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(segment.text), word_iter.GetString());
  }

  EXPECT_FALSE(word_iter.Advance());
  EXPECT_FALSE(word_iter.IsWord());
}
237
238 #endif // !(defined(OS_IOS) || defined(OS_ANDROID))
239
TEST(BreakIteratorTest, BreakSpaceEmpty) {
  // With no text to segment, BREAK_SPACE iteration must end immediately.
  const string16 no_text;
  BreakIterator space_iter(no_text, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(space_iter.Init());
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
  }
}
249
TEST(BreakIteratorTest, BreakSpace) {
  // Expected BREAK_SPACE segments, in order. Each one keeps the whitespace
  // that follows it, and none of them is reported as a word.
  static const char* const kPieces[] = {
      " ", "foo ", "bar! \n", "pouet ", "boom",
  };

  const string16 input(UTF8ToUTF16(" foo bar! \npouet boom"));
  BreakIterator space_iter(input, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(space_iter.Init());
  for (const char* piece : kPieces) {
    EXPECT_TRUE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(piece), space_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
  }
}
274
TEST(BreakIteratorTest, BreakSpaceSP) {
  // Same input as BreakSpace but with a trailing space, which is expected to
  // stay attached to the final segment.
  static const char* const kPieces[] = {
      " ", "foo ", "bar! \n", "pouet ", "boom ",
  };

  const string16 input(UTF8ToUTF16(" foo bar! \npouet boom "));
  BreakIterator space_iter(input, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(space_iter.Init());
  for (const char* piece : kPieces) {
    EXPECT_TRUE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(piece), space_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
  }
}
299
// NOTE(review): the test name contains a stray 'k'; it is left untouched so
// that existing test filters keep matching.
TEST(BreakIteratorTest, BreakSpacekWide16) {
  // Two Greek words separated by U+0020.
  const string16 greek(WideToUTF16(
      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      L"\x03bf\x03c2\x0020\x0399\x03c3\x03c4\x03cc\x03c2"));
  // The first expected segment includes the separating space.
  const string16 expected_pieces[] = {greek.substr(0, 11),
                                      greek.substr(11, 5)};

  BreakIterator space_iter(greek, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(space_iter.Init());
  for (const string16& piece : expected_pieces) {
    EXPECT_TRUE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
    EXPECT_EQ(piece, space_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
  }
}
320
TEST(BreakIteratorTest, BreakSpaceWide32) {
  // UTF-8 encoding of U+1D49C MATHEMATICAL SCRIPT CAPITAL A.
  const char kScriptCapitalA[] = "\xF0\x9D\x92\x9C";
  const string16 input(
      UTF8ToUTF16(base::StringPrintf("%s a", kScriptCapitalA)));
  // The first expected segment is the leading three UTF-16 code units of the
  // input; the second is the trailing "a".
  const string16 expected_pieces[] = {input.substr(0, 3), UTF8ToUTF16("a")};

  BreakIterator space_iter(input, BreakIterator::BREAK_SPACE);
  ASSERT_TRUE(space_iter.Init());
  for (const string16& piece : expected_pieces) {
    EXPECT_TRUE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
    EXPECT_EQ(piece, space_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(space_iter.Advance());
    EXPECT_FALSE(space_iter.IsWord());
  }
}
341
TEST(BreakIteratorTest, BreakLineEmpty) {
  // With no text to segment, BREAK_NEWLINE iteration must end immediately.
  const string16 no_text;
  BreakIterator line_iter(no_text, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(line_iter.Init());
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
  }
}
351
TEST(BreakIteratorTest, BreakLine) {
  // Expected BREAK_NEWLINE segments, in order. Each line keeps its
  // terminating '\n'; the last line of this input has none.
  static const char* const kLines[] = {
      "\n", "foo bar!\n", "\n", "pouet boom",
  };

  const string16 input(UTF8ToUTF16("\nfoo bar!\n\npouet boom"));
  BreakIterator line_iter(input, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(line_iter.Init());
  for (const char* line : kLines) {
    EXPECT_TRUE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(line), line_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
  }
}
374
TEST(BreakIteratorTest, BreakLineNL) {
  // Same input as BreakLine but ending in '\n', which is expected to stay
  // attached to the final segment.
  static const char* const kLines[] = {
      "\n", "foo bar!\n", "\n", "pouet boom\n",
  };

  const string16 input(UTF8ToUTF16("\nfoo bar!\n\npouet boom\n"));
  BreakIterator line_iter(input, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(line_iter.Init());
  for (const char* line : kLines) {
    EXPECT_TRUE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
    EXPECT_EQ(UTF8ToUTF16(line), line_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
  }
}
397
TEST(BreakIteratorTest, BreakLineWide16) {
  // Two Greek words separated by U+000A (newline).
  const string16 greek(WideToUTF16(
      L"\x03a0\x03b1\x03b3\x03ba\x03cc\x03c3\x03bc\x03b9"
      L"\x03bf\x03c2\x000a\x0399\x03c3\x03c4\x03cc\x03c2"));
  // The first expected line includes the newline that terminates it.
  const string16 expected_lines[] = {greek.substr(0, 11),
                                     greek.substr(11, 5)};

  BreakIterator line_iter(greek, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(line_iter.Init());
  for (const string16& line : expected_lines) {
    EXPECT_TRUE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
    EXPECT_EQ(line, line_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
  }
}
418
TEST(BreakIteratorTest, BreakLineWide32) {
  // UTF-8 encoding of U+1D49C MATHEMATICAL SCRIPT CAPITAL A.
  const char kScriptCapitalA[] = "\xF0\x9D\x92\x9C";
  const string16 input(
      UTF8ToUTF16(base::StringPrintf("%s\na", kScriptCapitalA)));
  // The first expected line is the leading three UTF-16 code units of the
  // input; the second is the trailing "a".
  const string16 expected_lines[] = {input.substr(0, 3), UTF8ToUTF16("a")};

  BreakIterator line_iter(input, BreakIterator::BREAK_NEWLINE);
  ASSERT_TRUE(line_iter.Init());
  for (const string16& line : expected_lines) {
    EXPECT_TRUE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
    EXPECT_EQ(line, line_iter.GetString());
  }
  // The first pass reaches the end; the second pass tests an unexpected
  // advance after the end.
  for (int attempt = 0; attempt < 2; ++attempt) {
    EXPECT_FALSE(line_iter.Advance());
    EXPECT_FALSE(line_iter.IsWord());
  }
}
438
TEST(BreakIteratorTest, BreakCharacter) {
  // Each entry is one unit that BREAK_CHARACTER iteration is expected to
  // return; several entries consist of more than one code point.
  static const wchar_t* const kCharacters[] = {
      // An English word consisting of four ASCII characters.
      L"w", L"o", L"r", L"d", L" ",
      // A Hindi word (which means "Hindi") consisting of three Devanagari
      // characters.
      L"\x0939\x093F", L"\x0928\x094D", L"\x0926\x0940", L" ",
      // A Thai word (which means "feel") consisting of three Thai characters.
      L"\x0E23\x0E39\x0E49", L"\x0E2A\x0E36", L"\x0E01", L" ",
  };

  // Convert every entry to UTF-16 and concatenate them into the input text.
  std::vector<string16> characters;
  string16 text;
  for (const wchar_t* character : kCharacters) {
    characters.push_back(WideToUTF16(character));
    text.append(characters.back());
  }

  BreakIterator iter(text, BreakIterator::BREAK_CHARACTER);
  ASSERT_TRUE(iter.Init());
  for (const string16& expected : characters) {
    EXPECT_TRUE(iter.Advance());
    EXPECT_EQ(expected, iter.GetString());
  }
  // All expected characters have been consumed, so the iterator must now
  // report the end instead of yielding any further segment.
  EXPECT_FALSE(iter.Advance());
}
462
463 // Test for https://code.google.com/p/chromium/issues/detail?id=411213
464 // We should be able to get valid substrings with GetString() function
465 // after setting new content by calling SetText().
TEST(BreakIteratorTest,GetStringAfterSetText)466 TEST(BreakIteratorTest, GetStringAfterSetText) {
467 const string16 initial_string(ASCIIToUTF16("str"));
468 BreakIterator iter(initial_string, BreakIterator::BREAK_WORD);
469 ASSERT_TRUE(iter.Init());
470
471 const string16 long_string(ASCIIToUTF16("another,string"));
472 EXPECT_TRUE(iter.SetText(long_string.c_str(), long_string.size()));
473 EXPECT_TRUE(iter.Advance());
474 EXPECT_TRUE(iter.Advance()); // Advance to ',' in |long_string|
475
476 // Check that the current position is out of bounds of the |initial_string|.
477 EXPECT_LT(initial_string.size(), iter.pos());
478
479 // Check that we can get a valid substring of |long_string|.
480 EXPECT_EQ(ASCIIToUTF16(","), iter.GetString());
481 }
482
TEST(BreakIteratorTest, GetStringPiece) {
  // GetStringPiece() must describe the same segment as GetString().
  const string16 source_text(ASCIIToUTF16("some string"));
  BreakIterator word_iter(source_text, BreakIterator::BREAK_WORD);
  ASSERT_TRUE(word_iter.Init());

  // First segment: "some".
  EXPECT_TRUE(word_iter.Advance());
  EXPECT_EQ(word_iter.GetString(), word_iter.GetStringPiece().as_string());
  EXPECT_EQ(StringPiece16(ASCIIToUTF16("some")), word_iter.GetStringPiece());

  // Advance twice more to land on "string".
  for (int step = 0; step < 2; ++step) {
    EXPECT_TRUE(word_iter.Advance());
  }
  EXPECT_EQ(word_iter.GetString(), word_iter.GetStringPiece().as_string());
  EXPECT_EQ(StringPiece16(ASCIIToUTF16("string")), word_iter.GetStringPiece());
}
497
498 // Make sure that when not in RULE_BASED or BREAK_WORD mode we're getting
499 // IS_LINE_OR_CHAR_BREAK.
TEST(BreakIteratorTest,GetWordBreakStatusBreakLine)500 TEST(BreakIteratorTest, GetWordBreakStatusBreakLine) {
501 // A string containing the English word "foo", followed by two Khmer
502 // characters, the English word "Can", and then two Russian characters and
503 // punctuation.
504 base::string16 text(
505 base::WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438..."));
506 BreakIterator iter(text, BreakIterator::BREAK_LINE);
507 ASSERT_TRUE(iter.Init());
508
509 EXPECT_TRUE(iter.Advance());
510 // Finds "foo" and the space.
511 EXPECT_EQ(base::UTF8ToUTF16("foo "), iter.GetString());
512 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
513 EXPECT_TRUE(iter.Advance());
514 // Finds the Khmer characters, the next space, and the newline.
515 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1 \n"), iter.GetString());
516 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
517 EXPECT_TRUE(iter.Advance());
518 // Finds "Can" and the space.
519 EXPECT_EQ(base::UTF8ToUTF16("Can "), iter.GetString());
520 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
521 EXPECT_TRUE(iter.Advance());
522 // Finds the Russian characters and periods.
523 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438..."), iter.GetString());
524 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_LINE_OR_CHAR_BREAK);
525 EXPECT_FALSE(iter.Advance());
526 }
527
528 // Make sure that in BREAK_WORD mode we're getting IS_WORD_BREAK and
529 // IS_SKIPPABLE_WORD when we should be. IS_WORD_BREAK should be returned when we
530 // finish going over non-punctuation characters while IS_SKIPPABLE_WORD should
531 // be returned on punctuation and spaces.
TEST(BreakIteratorTest,GetWordBreakStatusBreakWord)532 TEST(BreakIteratorTest, GetWordBreakStatusBreakWord) {
533 // A string containing the English word "foo", followed by two Khmer
534 // characters, the English word "Can", and then two Russian characters and
535 // punctuation.
536 base::string16 text(
537 base::WideToUTF16(L"foo \x1791\x17C1 \nCan \x041C\x0438..."));
538 BreakIterator iter(text, BreakIterator::BREAK_WORD);
539 ASSERT_TRUE(iter.Init());
540
541 EXPECT_TRUE(iter.Advance());
542 // Finds "foo".
543 EXPECT_EQ(base::UTF8ToUTF16("foo"), iter.GetString());
544 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
545 EXPECT_TRUE(iter.Advance());
546 // Finds the space, and the Khmer characters.
547 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
548 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
549 EXPECT_TRUE(iter.Advance());
550 EXPECT_EQ(base::WideToUTF16(L"\x1791\x17C1"), iter.GetString());
551 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
552 EXPECT_TRUE(iter.Advance());
553 // Finds the space and the newline.
554 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
555 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
556 EXPECT_TRUE(iter.Advance());
557 EXPECT_EQ(base::UTF8ToUTF16("\n"), iter.GetString());
558 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
559 EXPECT_TRUE(iter.Advance());
560 // Finds "Can".
561 EXPECT_EQ(base::UTF8ToUTF16("Can"), iter.GetString());
562 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
563 EXPECT_TRUE(iter.Advance());
564 // Finds the space and the Russian characters.
565 EXPECT_EQ(base::UTF8ToUTF16(" "), iter.GetString());
566 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
567 EXPECT_TRUE(iter.Advance());
568 EXPECT_EQ(base::WideToUTF16(L"\x041C\x0438"), iter.GetString());
569 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_WORD_BREAK);
570 EXPECT_TRUE(iter.Advance());
571 // Finds the trailing periods.
572 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
573 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
574 EXPECT_TRUE(iter.Advance());
575 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
576 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
577 EXPECT_TRUE(iter.Advance());
578 EXPECT_EQ(base::UTF8ToUTF16("."), iter.GetString());
579 EXPECT_EQ(iter.GetWordBreakStatus(), BreakIterator::IS_SKIPPABLE_WORD);
580 EXPECT_FALSE(iter.Advance());
581 }
582
583 } // namespace i18n
584 } // namespace base
585