• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "Minikin"
18 
19 #include <android/log.h>
20 #include <gtest/gtest.h>
21 
22 #include "ICUTestBase.h"
23 #include "UnicodeUtils.h"
24 #include <minikin/WordBreaker.h>
25 #include <unicode/locid.h>
26 #include <unicode/uclean.h>
27 #include <unicode/udata.h>
28 
// Element count of a statically sized array. Only defined here when no
// previously included header has already supplied NELEM.
#if !defined(NELEM)
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
32 
// Expands to two comma-separated values, U16_LEAD(cp) and U16_TRAIL(cp), so a
// supplementary code point can be written inline in a uint16_t array initializer.
#define UTF16(cp) U16_LEAD(cp), U16_TRAIL(cp)
34 
35 namespace minikin {
36 
37 typedef ICUTestBase WordBreakerTest;
38 
// Plain ASCII "hello world": one break after the space, one at the end of text.
TEST_F(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break lands just past "hello "; the reported word is "hello".
    EXPECT_EQ(6, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second break is the end of the text; the reported word is "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
56 
// A SOFT HYPHEN (U+00AD) inside "hello" must not introduce a break of its own.
TEST_F(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break just past "hel{SOFT HYPHEN}lo "; the word is "hel{SOFT HYPHEN}lo".
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
72 
// A hard hyphen no longer provides a break opportunity: "sugar-free" is
// expected to come back as a single word spanning the whole buffer.
TEST_F(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
85 
// Currency signs attached to a word ("US¢", "JP¥") are reported as part of it.
TEST_F(WordBreakerTest, postfixAndPrefix) {
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break just past "US¢ "; the word "US¢" includes the CENT SIGN.
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(3, wb.wordEnd());

    // End of text; the word "JP¥" includes the YEN SIGN.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
101 
// A Myanmar kinzi sequence under the Burmese locale must stay in one piece.
TEST_F(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, VOWEL SIGN AA
    uint16_t text[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale("my"));
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The only break is the end of text, and the word covers everything.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
114 
// Emoji joined by ZERO WIDTH JOINER (U+200D) must not be split; breaks are
// expected only between whole ZWJ sequences.
TEST_F(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
        // man + zwj + heart + zwj + man  (7 code units)
        UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
        // woman + zwj + heart + zwj + kiss mark + zwj + woman  (10 code units)
        UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
        // eye + zwj + left speech bubble  (5 code units)
        UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
        // CAT FACE + zwj + BUST IN SILHOUETTE  (5 code units)
        UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After man + zwj + heart + zwj + man.
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // After woman + zwj + heart + zwj + kiss mark + zwj + woman.
    EXPECT_EQ(17, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());

    // After eye + zwj + left speech bubble.
    EXPECT_EQ(22, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(22, wb.wordEnd());

    // End of text, after CAT FACE + zwj + BUST IN SILHOUETTE.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(22, wb.wordStart());
    EXPECT_EQ(27, wb.wordEnd());
}
143 
// An emoji followed by a Fitzpatrick skin-tone modifier is kept together.
TEST_F(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
        // boy + type 1-2 fitzpatrick modifier
        UTF16(0x1F466), UTF16(0x1F3FB),
        // victory hand + emoji style + type 6 fitzpatrick modifier
        0x270C, 0xFE0F, UTF16(0x1F3FF)
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After boy + modifier.
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // End of text, after victory hand + VS16 + modifier.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
160 
// Adjacent emoji (including a Unicode 10 one, SLED) must be separated by a
// break, whether or not a variation selector follows the first emoji.
TEST_F(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
        // SLED + SLED
        UTF16(0x1F6F7), UTF16(0x1F6F7),
        // SLED + VS15 + SLED
        UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
        // WHITE SMILING FACE + SLED
        0x263A, UTF16(0x1F6F7),
        // WHITE SMILING FACE + VS16 + SLED
        0x263A, 0xFE0F, UTF16(0x1F6F7),
    };

    WordBreaker wb;
    wb.setLocale(icu::Locale::getEnglish());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Each break closes a word that starts at the previous break.
    const ssize_t expectedBreaks[] = {2, 4, 7, 9, 10, 12, 14, 16};
    ssize_t previousBreak = 0;
    for (const ssize_t expectedBreak : expectedBreaks) {
        EXPECT_EQ(expectedBreak, wb.next());
        EXPECT_EQ(previousBreak, wb.wordStart());
        EXPECT_EQ(expectedBreak, wb.wordEnd());
        previousBreak = expectedBreak;
    }
}
209 
// Two bare U+1F3F4 flags in a row: a break is expected between them.
TEST_F(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kFlag = "U+1F3F4";
    const std::string twoFlags = kFlag + " " + kFlag;

    // Each flag occupies two UTF-16 code units; the buffer holds two flags.
    const int kFlagLength = 2;
    const size_t kBufferSize = kFlagLength * 2;

    uint16_t text[kBufferSize];
    size_t textLength;
    ParseUnicode(text, kBufferSize, twoFlags.c_str(), &textLength, nullptr);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, textLength);
    EXPECT_EQ(0, wb.current());

    // End of the first flag.
    EXPECT_EQ(kFlagLength, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // End of text, after the second flag.
    EXPECT_EQ(static_cast<ssize_t>(textLength), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
232 
// Two emoji tag sequences in a row: each sequence must stay intact, with a
// single break between them.
TEST_F(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is the emoji tag
    // sequence for the flag of Scotland.
    const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoSequences = kFlagSequence + " " + kFlagSequence;

    // Seven supplementary code points, two UTF-16 code units each.
    const int kFlagLength = 14;
    const size_t kBufferSize = kFlagLength * 2;

    uint16_t text[kBufferSize];
    size_t textLength;
    ParseUnicode(text, kBufferSize, twoSequences.c_str(), &textLength, nullptr);

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, textLength);
    EXPECT_EQ(0, wb.current());

    // End of the first flag sequence.
    EXPECT_EQ(kFlagLength, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // End of text, after the second flag sequence.
    EXPECT_EQ(static_cast<ssize_t>(textLength), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
257 
// Leading and trailing punctuation is excluded from the reported word range.
TEST_F(WordBreakerTest, punct) {
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',', ' ',
                       'w', 'o', 'r', 'l', 'd', '!', '!'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break just past "¡¡hello, "; the word is "hello" without the punctuation.
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "world" without the trailing "!!".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
274 
// Inside an email address no word is reported (wordStart >= wordEnd) and the
// interior break carries badness 1.
TEST_F(WordBreakerTest, email) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
                       '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example": a penalized break with no word.
    EXPECT_EQ(11, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ": an ordinary break, still with no word.
    EXPECT_EQ(16, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(16, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
293 
// A "mailto:" prefix adds one more penalized break in front of the address.
TEST_F(WordBreakerTest, mailto) {
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                       'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "mailto:".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(23, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(23, wb.wordStart());
    EXPECT_EQ(24, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
315 
316 // The current logic always places a line break after a detected email address or URL
317 // and an immediately following non-ASCII character.
TEST_F(WordBreakerTest,emailNonAscii)318 TEST_F(WordBreakerTest, emailNonAscii) {
319     uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm',
320         0x4E00};
321     WordBreaker breaker;
322     breaker.setLocale(icu::Locale::getUS());
323     breaker.setText(buf, NELEM(buf));
324     EXPECT_EQ(0, breaker.current());
325     EXPECT_EQ(11, breaker.next());  // after "foo@example"
326     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
327     EXPECT_EQ(1, breaker.breakBadness());
328     EXPECT_EQ(15, breaker.next());  // after ".com"
329     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
330     EXPECT_EQ(0, breaker.breakBadness());
331     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
332     EXPECT_EQ(15, breaker.wordStart());  // "一"
333     EXPECT_EQ(16, breaker.wordEnd());
334     EXPECT_EQ(0, breaker.breakBadness());
335 }
336 
// A combining mark (U+0303) right after the address does not create a break
// of its own; the break falls after the following space.
TEST_F(WordBreakerTest, emailCombining) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p', 'l', 'e',
                       '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example".
    EXPECT_EQ(11, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com" + COMBINING TILDE + space.
    EXPECT_EQ(17, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(18, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
355 
// A stand-alone '@' surrounded by spaces gets no break penalty: every break
// in "a @ b" has badness 0.
TEST_F(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "a "; the word is "a".
    EXPECT_EQ(2, wb.next());
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(1, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // After "@ "; no word is reported for the lone '@'.
    EXPECT_EQ(4, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "b".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
374 
// Breaks inside a URL are penalized (badness 1) and report no word; the break
// after the URL's trailing space is an ordinary one.
TEST_F(WordBreakerTest, url) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                       'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "example".
    EXPECT_EQ(14, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(19, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(19, wb.wordStart());
    EXPECT_EQ(20, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
399 
400 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST_F(WordBreakerTest,urlBreakChars)401 TEST_F(WordBreakerTest, urlBreakChars) {
402     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/', '~', 'c', ',', 'd',
403         '-', 'e', '?', 'f', '=', 'g', '&', 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
404     WordBreaker breaker;
405     breaker.setLocale(icu::Locale::getUS());
406     breaker.setText(buf, NELEM(buf));
407     EXPECT_EQ(0, breaker.current());
408     EXPECT_EQ(5, breaker.next());  // after "http:"
409     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
410     EXPECT_EQ(1, breaker.breakBadness());
411     EXPECT_EQ(7, breaker.next());  // after "//"
412     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
413     EXPECT_EQ(1, breaker.breakBadness());
414     EXPECT_EQ(8, breaker.next());  // after "a"
415     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
416     EXPECT_EQ(1, breaker.breakBadness());
417     EXPECT_EQ(10, breaker.next());  // after ".b"
418     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
419     EXPECT_EQ(1, breaker.breakBadness());
420     EXPECT_EQ(11, breaker.next());  // after "/"
421     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
422     EXPECT_EQ(1, breaker.breakBadness());
423     EXPECT_EQ(13, breaker.next());  // after "~c"
424     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
425     EXPECT_EQ(1, breaker.breakBadness());
426     EXPECT_EQ(15, breaker.next());  // after ",d"
427     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
428     EXPECT_EQ(1, breaker.breakBadness());
429     EXPECT_EQ(17, breaker.next());  // after "-e"
430     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
431     EXPECT_EQ(1, breaker.breakBadness());
432     EXPECT_EQ(19, breaker.next());  // after "?f"
433     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
434     EXPECT_EQ(1, breaker.breakBadness());
435     EXPECT_EQ(20, breaker.next());  // after "="
436     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
437     EXPECT_EQ(1, breaker.breakBadness());
438     EXPECT_EQ(21, breaker.next());  // after "g"
439     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
440     EXPECT_EQ(1, breaker.breakBadness());
441     EXPECT_EQ(22, breaker.next());  // after "&"
442     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
443     EXPECT_EQ(1, breaker.breakBadness());
444     EXPECT_EQ(23, breaker.next());  // after "h"
445     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
446     EXPECT_EQ(1, breaker.breakBadness());
447     EXPECT_EQ(25, breaker.next());  // after "#i"
448     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
449     EXPECT_EQ(1, breaker.breakBadness());
450     EXPECT_EQ(27, breaker.next());  // after "%j"
451     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
452     EXPECT_EQ(1, breaker.breakBadness());
453     EXPECT_EQ(29, breaker.next());  // after "_k"
454     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
455     EXPECT_EQ(1, breaker.breakBadness());
456     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
457     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
458     EXPECT_EQ(0, breaker.breakBadness());
459 }
460 
// In "http://a-/b" the break after "a" is followed directly by the end of
// text: neither the hyphen nor the slash after it yields a break.
TEST_F(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
476 
// A URL whose last character is '/' ("http://a/"): the break after "a" is
// followed by the end of text, and no word is reported anywhere.
TEST_F(WordBreakerTest, urlEndsWithSlash) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());

    // End of text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
492 
// "/a@b" yields a single break at the end of text and reports no word.
TEST_F(WordBreakerTest, emailStartsWithSlash) {
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setLocale(icu::Locale::getUS());
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // End of text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_GE(wb.wordStart(), wb.wordEnd());
}
502 
503 }  // namespace minikin
504