1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "Minikin"
18
19 #include <android/log.h>
20 #include <gtest/gtest.h>
21
22 #include "ICUTestBase.h"
23 #include "UnicodeUtils.h"
24 #include <minikin/WordBreaker.h>
25 #include <unicode/locid.h>
26 #include <unicode/uclean.h>
27 #include <unicode/udata.h>
28
// Number of elements in a statically sized array. Only defined here when the
// build has not already supplied a NELEM macro.
#if !defined(NELEM)
#define NELEM(array) (sizeof(array) / sizeof((array)[0]))
#endif
32
// Expands one code point into two comma-separated values, U16_LEAD(cp) followed by
// U16_TRAIL(cp), so that it can be dropped straight into a uint16_t array initializer.
#define UTF16(cp) U16_LEAD(cp), U16_TRAIL(cp)
34
35 namespace minikin {
36
37 typedef ICUTestBase WordBreakerTest;
38
// Plain ASCII text with one space: expects a break after the space and another at the end of the
// text, each with badness 0 and a word range covering the preceding word.
TEST_F(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: after "hello "; the word range is "hello".
    EXPECT_EQ(6, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second break: end of the text; the word range is "world".
    EXPECT_EQ(textLength, wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
56
// A soft hyphen (U+00AD) inside a word: the expectations keep "hel{SOFT HYPHEN}lo" together as a
// single word, with the first break only after the space that follows it.
TEST_F(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after "hel{SOFT HYPHEN}lo "; the word range is "hel{SOFT HYPHEN}lo".
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Break at the end of the text; the word range is "world".
    EXPECT_EQ(textLength, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
72
// Hyphens should not allow breaks anymore: "sugar-free" is expected to come back as one segment
// whose word range spans the whole text.
TEST_F(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The only break is the end of the text.
    EXPECT_EQ(textLength, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textLength, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
85
// Currency signs attached to letters ("US¢ JP¥"): each sign is expected to fall inside the word
// range of the letters it follows.
TEST_F(WordBreakerTest, postfixAndPrefix) {
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after "US¢ " (CENT SIGN plus the space); the word range is "US¢".
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(3, wb.wordEnd());

    // Break at the end of the string; the word range is "JP¥".
    EXPECT_EQ(textLength, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(textLength, wb.wordEnd());
}
101
// A Myanmar sequence starting with NGA + ASAT + VIRAMA (the test name calls it a kinzi) under the
// "my" locale: no break is expected inside it, so the only break is the end of the text.
TEST_F(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, AA (U+102C is MYANMAR VOWEL SIGN AA; UU would be U+1030).
    uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    WordBreaker breaker;
    icu::Locale burmese("my");
    breaker.setLocale(burmese);
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end of string
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.wordEnd());
}
114
// Four emoji sequences joined internally by ZERO WIDTH JOINER (U+200D), laid out back to back.
// Breaks are expected only between the sequences, never at a ZWJ inside one.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
            // man + zwj + heart + zwj + man (7 UTF-16 units)
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 UTF-16 units)
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // eye + zwj + left speech bubble (5 UTF-16 units)
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // CAT FACE + zwj + BUST IN SILHOUETTE (5 UTF-16 units)
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(7, breaker.next());  // after man + zwj + heart + zwj + man
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());
    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + kiss mark + zwj + woman
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(22, breaker.wordStart());
    EXPECT_EQ(27, breaker.wordEnd());
}
143
// Emoji followed by a Fitzpatrick modifier: the expectations keep each base emoji and its
// modifier in one segment, with a break only between the two emoji.
TEST_F(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // victory hand + emoji style + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after boy + type 1-2 fitzpatrick modifier.
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // Break at the end of the text.
    EXPECT_EQ(textLength, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
160
// Should break between emojis: every emoji (together with a variation selector that directly
// follows it) is expected to form its own segment.
TEST_F(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Expected break offsets in order. Each segment's word range runs from the previous break
    // (0 for the first segment) to the break itself.
    const ssize_t kExpectedBreaks[] = {2, 4, 7, 9, 10, 12, 14, 16};
    ssize_t previousBreak = 0;
    for (const ssize_t expectedBreak : kExpectedBreaks) {
        EXPECT_EQ(expectedBreak, wb.next());
        EXPECT_EQ(previousBreak, wb.wordStart());
        EXPECT_EQ(expectedBreak, wb.wordEnd());
        previousBreak = expectedBreak;
    }
}
209
// Two consecutive U+1F3F4 characters with no tag sequence attached: a break is expected between
// them.
TEST_F(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kSingleFlag = "U+1F3F4";
    const std::string twoFlags = kSingleFlag + " " + kSingleFlag;

    // Each flag is expected to occupy two UTF-16 units; the buffer holds exactly two flags.
    constexpr int kUnitsPerFlag = 2;
    constexpr size_t kCapacity = kUnitsPerFlag * 2;

    uint16_t text[kCapacity];
    size_t parsedLength;
    // ParseUnicode reports the length through its out-parameter; it is used as the text length.
    ParseUnicode(text, kCapacity, twoFlags.c_str(), &parsedLength, nullptr);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, parsedLength);
    EXPECT_EQ(0, wb.current());

    // End of the first flag.
    EXPECT_EQ(kUnitsPerFlag, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag, wb.wordEnd());

    // End of the second flag, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(parsedLength), wb.next());
    EXPECT_EQ(kUnitsPerFlag, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag * 2, wb.wordEnd());
}
232
// Two consecutive emoji tag sequences: a break is expected between the two sequences and nowhere
// inside either of them.
TEST_F(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
    // of Scotland.
    const std::string kScotlandFlag = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoFlags = kScotlandFlag + " " + kScotlandFlag;

    // One sequence is expected to occupy 14 UTF-16 units; the buffer holds exactly two of them.
    constexpr int kUnitsPerFlag = 14;
    constexpr size_t kCapacity = kUnitsPerFlag * 2;

    uint16_t text[kCapacity];
    size_t parsedLength;
    // ParseUnicode reports the length through its out-parameter; it is used as the text length.
    ParseUnicode(text, kCapacity, twoFlags.c_str(), &parsedLength, nullptr);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, parsedLength);
    EXPECT_EQ(0, wb.current());

    // End of the first flag sequence.
    EXPECT_EQ(kUnitsPerFlag, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag, wb.wordEnd());

    // End of the second flag sequence, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(parsedLength), wb.next());
    EXPECT_EQ(kUnitsPerFlag, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag * 2, wb.wordEnd());
}
257
// Words wrapped in punctuation ("¡¡hello, world!!"): break offsets include the punctuation, but
// the expected word ranges cover only the letters.
TEST_F(WordBreakerTest, punct) {
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',', ' ',
                       'w',    'o',    'r', 'l', 'd', '!', '!'};
    const ssize_t textLength = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after "¡¡hello, "; the word range is "hello".
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Break at the end of the text; the word range is "world".
    EXPECT_EQ(textLength, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
274
// An email address followed by a space and one more word. The break inside the address is
// expected to carry badness 1, and no segment of the address may report a non-empty word range
// (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, email) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                      ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "foo@example"
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(16, breaker.next());  // after ".com "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(16, breaker.wordStart());                           // "x"
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
293
// A "mailto:" URL followed by a space and one more word. Breaks inside the URL are expected to
// carry badness 1, and no segment of the URL may report a non-empty word range
// (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, mailto) {
    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                      'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(7, breaker.next());  // after "mailto:"
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(18, breaker.next());  // after "foo@example"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(23, breaker.next());  // after ".com "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(23, breaker.wordStart());                           // "x"
    EXPECT_EQ(24, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
315
316 // The current logic always places a line break after a detected email address or URL
317 // and an immediately following non-ASCII character.
TEST_F(WordBreakerTest,emailNonAscii)318 TEST_F(WordBreakerTest, emailNonAscii) {
319 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
320 0x4E00};
321 WordBreaker breaker;
322 breaker.setLocale(icu::Locale::getUS());
323 breaker.setText(buf, NELEM(buf));
324 EXPECT_EQ(0, breaker.current());
325 EXPECT_EQ(11, breaker.next()); // after "foo@example"
326 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
327 EXPECT_EQ(1, breaker.breakBadness());
328 EXPECT_EQ(15, breaker.next()); // after ".com"
329 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
330 EXPECT_EQ(0, breaker.breakBadness());
331 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
332 EXPECT_EQ(15, breaker.wordStart()); // "一"
333 EXPECT_EQ(16, breaker.wordEnd());
334 EXPECT_EQ(0, breaker.breakBadness());
335 }
336
// An email address whose last letter carries a combining mark (U+0303), followed by a space and
// one more word. The mark is expected to stay with the address: the break after it comes only
// after the following space.
TEST_F(WordBreakerTest, emailCombining) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
                      0x0303, ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "foo@example"
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(17, breaker.wordStart());                           // "x"
    EXPECT_EQ(18, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
355
// An '@' standing alone between spaces: the surrounding words keep their word ranges and every
// break has badness 0; only the "@ " segment reports no non-empty word range.
TEST_F(WordBreakerTest, lonelyAt) {
    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(2, breaker.next());       // after "a "
    EXPECT_EQ(0, breaker.wordStart());  // "a"
    EXPECT_EQ(1, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(4, breaker.next());  // after "@ "
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(4, breaker.wordStart());                            // "b"
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
374
// An http URL followed by a space and one more word. Breaks inside the URL are expected to carry
// badness 1, and no segment of the URL may report a non-empty word range
// (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, url) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
                      '.', 'c', 'o', 'm', ' ', 'x'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(5, breaker.next());  // after "http:"
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(14, breaker.next());  // after "example"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(19, breaker.next());  // after ".com "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(19, breaker.wordStart());                           // "x"
    EXPECT_EQ(20, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
399
400 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST_F(WordBreakerTest,urlBreakChars)401 TEST_F(WordBreakerTest, urlBreakChars) {
402 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
403 '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
404 WordBreaker breaker;
405 breaker.setLocale(icu::Locale::getUS());
406 breaker.setText(buf, NELEM(buf));
407 EXPECT_EQ(0, breaker.current());
408 EXPECT_EQ(5, breaker.next()); // after "http:"
409 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
410 EXPECT_EQ(1, breaker.breakBadness());
411 EXPECT_EQ(7, breaker.next()); // after "//"
412 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
413 EXPECT_EQ(1, breaker.breakBadness());
414 EXPECT_EQ(8, breaker.next()); // after "a"
415 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
416 EXPECT_EQ(1, breaker.breakBadness());
417 EXPECT_EQ(10, breaker.next()); // after ".b"
418 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
419 EXPECT_EQ(1, breaker.breakBadness());
420 EXPECT_EQ(11, breaker.next()); // after "/"
421 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
422 EXPECT_EQ(1, breaker.breakBadness());
423 EXPECT_EQ(13, breaker.next()); // after "~c"
424 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
425 EXPECT_EQ(1, breaker.breakBadness());
426 EXPECT_EQ(15, breaker.next()); // after ",d"
427 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
428 EXPECT_EQ(1, breaker.breakBadness());
429 EXPECT_EQ(17, breaker.next()); // after "-e"
430 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
431 EXPECT_EQ(1, breaker.breakBadness());
432 EXPECT_EQ(19, breaker.next()); // after "?f"
433 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
434 EXPECT_EQ(1, breaker.breakBadness());
435 EXPECT_EQ(20, breaker.next()); // after "="
436 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
437 EXPECT_EQ(1, breaker.breakBadness());
438 EXPECT_EQ(21, breaker.next()); // after "g"
439 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440 EXPECT_EQ(1, breaker.breakBadness());
441 EXPECT_EQ(22, breaker.next()); // after "&"
442 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443 EXPECT_EQ(1, breaker.breakBadness());
444 EXPECT_EQ(23, breaker.next()); // after "h"
445 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
446 EXPECT_EQ(1, breaker.breakBadness());
447 EXPECT_EQ(25, breaker.next()); // after "#i"
448 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
449 EXPECT_EQ(1, breaker.breakBadness());
450 EXPECT_EQ(27, breaker.next()); // after "%j"
451 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
452 EXPECT_EQ(1, breaker.breakBadness());
453 EXPECT_EQ(29, breaker.next()); // after "_k"
454 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
455 EXPECT_EQ(1, breaker.breakBadness());
456 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
457 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
458 EXPECT_EQ(0, breaker.breakBadness());
459 }
460
// A URL in which a hyphen is directly followed by a slash ("a-/b"): after the break at offset 8
// the next break is the end of the text, so none is expected at the hyphen or the slash.
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(5, breaker.next());  // after "http:"
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
476
// A URL whose last character is a slash: the final break is expected at the end of the text,
// and no segment may report a non-empty word range (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, urlEndsWithSlash) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(5, breaker.next());  // after "http:"
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
492
// An email-like string that begins with a slash ("/a@b"): the only break expected is the end of
// the text, and it must not report a non-empty word range (wordStart() >= wordEnd()).
TEST_F(WordBreakerTest, emailStartsWithSlash) {
    uint16_t buf[] = {'/', 'a', '@', 'b'};
    WordBreaker breaker;
    breaker.setLocale(icu::Locale::getUS());
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    // EXPECT_GE instead of EXPECT_TRUE(a >= b): on failure it prints both offsets.
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
502
503 } // namespace minikin
504