• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <cstdio>
20 
21 #include <gtest/gtest.h>
22 #include <unicode/locid.h>
23 #include <unicode/uclean.h>
24 #include <unicode/udata.h>
25 
26 #include "UnicodeUtils.h"
27 
28 #ifndef NELEM
29 #define NELEM(x) ((sizeof(x) / sizeof((x)[0])))
30 #endif
31 
32 #define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
33 
34 namespace minikin {
35 
// Two ASCII words separated by a single space: expects one interior break after the space and
// a final break at the end of the text.
TEST(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break is expected after "hello "; the reported word is "hello".
    EXPECT_EQ(6, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second break is expected at the end of the text; the reported word is "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(6, wb.wordStart());
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
52 
// A SOFT HYPHEN (U+00AD) inside a word: expects no break at the soft hyphen and expects it to
// be counted as part of the word range.
TEST(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break is expected after "hel{SOFT HYPHEN}lo "; the word is "hel{SOFT HYPHEN}lo".
    EXPECT_EQ(7, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Second break is expected at the end of the text; the word is "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
68 
// A hard hyphen ('-') between letters: expects no break at the hyphen, so the whole of
// "sugar-free" is reported as a single word.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The only break is expected at the end of the text.
    EXPECT_EQ(textEnd, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
80 
// Currency signs attached to letters ("US¢ JP¥"): expects each sign to be included in the word
// range of the letters it follows.
TEST(WordBreakerTest, postfixAndPrefix) {
    uint16_t text[] = {'U', 'S', 0x00A2 /* CENT SIGN */, ' ', 'J', 'P', 0x00A5 /* YEN SIGN */};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break is expected after "US¢ "; the word is "US¢".
    EXPECT_EQ(4, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(3, wb.wordEnd());

    // Second break is expected at the end of the text; the word is "JP¥".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
95 
// A Myanmar kinzi sequence (NGA + ASAT + VIRAMA) followed by KA + AA: expects no break inside
// the sequence, so the whole buffer is reported as one word.
//
// NOTE(review): the breaker is driven with the "en-US" locale even though the text is Burmese.
// A previously declared but never used icu::Locale("my") was removed here; confirm whether the
// test was meant to exercise a Burmese locale instead.
TEST(WordBreakerTest, myanmarKinzi) {
    uint16_t buf[] = {
            0x1004,  // MYANMAR LETTER NGA
            0x103A,  // MYANMAR SIGN ASAT
            0x1039,  // MYANMAR SIGN VIRAMA
            0x1000,  // MYANMAR LETTER KA
            0x102C,  // MYANMAR VOWEL SIGN AA
    };
    const ssize_t bufEnd = static_cast<ssize_t>(NELEM(buf));

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // The only break is expected at the end of the string.
    EXPECT_EQ(bufEnd, breaker.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(bufEnd, breaker.wordEnd());
}
108 
// Four emoji ZWJ sequences back to back: expects breaks only between sequences, never at a
// ZERO WIDTH JOINER (U+200D) inside one.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
            // man + zwj + heart + zwj + man (7 UTF-16 units)
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 UTF-16 units)
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // eye + zwj + left speech bubble (5 UTF-16 units)
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // CAT FACE + zwj + BUST IN SILHOUETTE (5 UTF-16 units)
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after man + zwj + heart + zwj + man
    EXPECT_EQ(7, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // after woman + zwj + heart + zwj + kiss mark + zwj + woman
    EXPECT_EQ(17, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());

    // after eye + zwj + left speech bubble
    EXPECT_EQ(22, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(22, wb.wordEnd());

    // after CAT FACE + zwj + BUST IN SILHOUETTE, i.e. the end of the text
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(22, wb.wordStart());
    EXPECT_EQ(27, wb.wordEnd());
}
137 
// Emoji followed by Fitzpatrick skin-tone modifiers: expects no break between a base emoji
// (optionally followed by a variation selector) and its modifier.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // victory hand + emoji style + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after boy + type 1-2 fitzpatrick modifier
    EXPECT_EQ(4, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // after victory hand + emoji style + modifier, i.e. the end of the text
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
155 
// Adjacent emoji, some followed by a variation selector: expects a break between every pair of
// emoji, with each variation selector staying attached to the emoji before it.
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };

    // Expected break offsets, in the order the breaker should report them. Each reported word is
    // expected to span from the previous break (0 for the first) to the current one.
    const ssize_t kExpectedBreaks[] = {2, 4, 7, 9, 10, 12, 14, 16};

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    ssize_t previousBreak = 0;
    for (size_t i = 0; i < NELEM(kExpectedBreaks); ++i) {
        // The first break is requested with an explicit locale and start offset; the remaining
        // ones simply advance the breaker.
        const ssize_t actualBreak = (i == 0) ? wb.followingWithLocale(Locale("en"), 0) : wb.next();

        EXPECT_EQ(kExpectedBreaks[i], actualBreak) << "break index " << i;
        EXPECT_EQ(previousBreak, wb.wordStart()) << "break index " << i;
        EXPECT_EQ(kExpectedBreaks[i], wb.wordEnd()) << "break index " << i;

        previousBreak = kExpectedBreaks[i];
    }
}
203 
// Two bare WAVING BLACK FLAG emoji (U+1F3F4) in a row: expects a break between them.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kSingleFlag = "U+1F3F4";
    const std::string twoFlags = kSingleFlag + " " + kSingleFlag;

    // U+1F3F4 is outside the BMP, so one flag occupies two UTF-16 units.
    constexpr int kUnitsPerFlag = 2;
    constexpr size_t kCapacity = kUnitsPerFlag * 2;

    uint16_t text[kCapacity];
    size_t parsedLen = 0;
    ParseUnicode(text, kCapacity, twoFlags.c_str(), &parsedLen, nullptr);

    const Locale enUS("en-US");
    WordBreaker wb;
    wb.setText(text, parsedLen);
    EXPECT_EQ(0, wb.current());

    // end of the first flag
    EXPECT_EQ(kUnitsPerFlag, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag, wb.wordEnd());

    // end of the second flag, i.e. the end of the parsed text
    EXPECT_EQ(static_cast<ssize_t>(parsedLen), wb.next());
    EXPECT_EQ(kUnitsPerFlag, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag * 2, wb.wordEnd());
}
226 
// Two emoji tag sequences in a row: expects no break inside a tag sequence and one break between
// the two sequences.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is the emoji tag sequence for the
    // flag of Scotland.
    const std::string kScotlandFlag = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoFlags = kScotlandFlag + " " + kScotlandFlag;

    // Seven code points, all outside the BMP, so one sequence occupies 14 UTF-16 units.
    constexpr int kUnitsPerFlag = 14;
    constexpr size_t kCapacity = kUnitsPerFlag * 2;

    uint16_t text[kCapacity];
    size_t parsedLen = 0;
    ParseUnicode(text, kCapacity, twoFlags.c_str(), &parsedLen, nullptr);

    const Locale enUS("en-US");
    WordBreaker wb;
    wb.setText(text, parsedLen);
    EXPECT_EQ(0, wb.current());

    // end of the first flag sequence
    EXPECT_EQ(kUnitsPerFlag, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag, wb.wordEnd());

    // end of the second flag sequence, i.e. the end of the parsed text
    EXPECT_EQ(static_cast<ssize_t>(parsedLen), wb.next());
    EXPECT_EQ(kUnitsPerFlag, wb.wordStart());
    EXPECT_EQ(kUnitsPerFlag * 2, wb.wordEnd());
}
251 
// Words wrapped in punctuation ("¡¡hello, world!!"): expects the punctuation to be excluded
// from the reported word ranges while break offsets still cover it.
TEST(WordBreakerTest, punct) {
    uint16_t text[] = {
            0x00A1, 0x00A1,                 // "¡¡"
            'h', 'e', 'l', 'l', 'o', ',',   // "hello,"
            ' ',                            //
            'w', 'o', 'r', 'l', 'd',        // "world"
            '!', '!',                       // "!!"
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break is expected after "¡¡hello, "; the word is "hello".
    EXPECT_EQ(9, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // Second break is expected at the end of the text; the word is "world".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
267 
// An email address followed by a word ("foo@example.com x"): expects the break inside the
// address to carry badness 1 and to report an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, email) {
    uint16_t text[] = {'f', 'o', 'o', '@',                 // "foo@"
                       'e', 'x', 'a', 'm', 'p', 'l', 'e',  // "example"
                       '.', 'c', 'o', 'm', ' ',            // ".com "
                       'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "foo@example"
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com "
    EXPECT_EQ(16, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end of the text; the word is "x"
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(16, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
285 
// A mailto: URL followed by a word ("mailto:foo@example.com x"): expects every break inside the
// URL to carry badness 1 and to report an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, mailto) {
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':',           // "mailto:"
                       'f', 'o', 'o', '@',                          // "foo@"
                       'e', 'x', 'a', 'm', 'p', 'l', 'e',           // "example"
                       '.', 'c', 'o', 'm', ' ',                     // ".com "
                       'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "mailto:"
    EXPECT_EQ(7, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after "foo@example"
    EXPECT_EQ(18, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com "
    EXPECT_EQ(23, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end of the text; the word is "x"
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(23, wb.wordStart());
    EXPECT_EQ(24, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
306 
307 // The current logic always places a line break after a detected email address or URL
308 // and an immediately following non-ASCII character.
// An email address immediately followed by a non-ASCII character (U+4E00): expects a break
// with badness 0 between the address and that character.
TEST(WordBreakerTest, emailNonAscii) {
    uint16_t text[] = {'f', 'o', 'o', '@',                 // "foo@"
                       'e', 'x', 'a', 'm', 'p', 'l', 'e',  // "example"
                       '.', 'c', 'o', 'm',                 // ".com"
                       0x4E00};                            // "一"
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "foo@example"
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com"
    EXPECT_EQ(15, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end of the text; the word is "一"
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(15, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
326 
// An email address whose last letter carries a combining mark (U+0303): expects no break
// between the address and the combining mark.
TEST(WordBreakerTest, emailCombining) {
    uint16_t text[] = {'f', 'o', 'o', '@',                 // "foo@"
                       'e', 'x', 'a', 'm', 'p', 'l', 'e',  // "example"
                       '.', 'c', 'o', 'm', 0x0303, ' ',    // ".com" + COMBINING TILDE + " "
                       'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "foo@example"
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com" + combining mark + " "
    EXPECT_EQ(17, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end of the text; the word is "x"
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(18, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
344 
// An '@' surrounded by spaces ("a @ b"): expects ordinary breaks with badness 0, and an empty
// word range for the segment that contains only "@ ".
TEST(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "a "; the word is "a"
    EXPECT_EQ(2, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(1, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // after "@ "; no word is reported
    EXPECT_EQ(4, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end of the text; the word is "b"
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
362 
// A URL followed by a word ("http://example.com x"): expects every break inside the URL to carry
// badness 1 and to report an empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, url) {
    uint16_t text[] = {'h', 't', 't', 'p', ':',            // "http:"
                       '/', '/',                           // "//"
                       'e', 'x', 'a', 'm', 'p', 'l', 'e',  // "example"
                       '.', 'c', 'o', 'm', ' ',            // ".com "
                       'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "http:"
    EXPECT_EQ(5, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after "//"
    EXPECT_EQ(7, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after "example"
    EXPECT_EQ(14, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // after ".com "
    EXPECT_EQ(19, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // end of the text; the word is "x"
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(19, wb.wordStart());
    EXPECT_EQ(20, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
386 
387 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)388 TEST(WordBreakerTest, urlBreakChars) {
389     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
390                       '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
391                       'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
392     WordBreaker breaker;
393     breaker.setText(buf, NELEM(buf));
394     EXPECT_EQ(0, breaker.current());
395     EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), 0));  // after "http:"
396     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
397     EXPECT_EQ(1, breaker.breakBadness());
398     EXPECT_EQ(7, breaker.next());  // after "//"
399     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
400     EXPECT_EQ(1, breaker.breakBadness());
401     EXPECT_EQ(8, breaker.next());  // after "a"
402     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
403     EXPECT_EQ(1, breaker.breakBadness());
404     EXPECT_EQ(10, breaker.next());  // after ".b"
405     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
406     EXPECT_EQ(1, breaker.breakBadness());
407     EXPECT_EQ(11, breaker.next());  // after "/"
408     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
409     EXPECT_EQ(1, breaker.breakBadness());
410     EXPECT_EQ(13, breaker.next());  // after "~c"
411     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
412     EXPECT_EQ(1, breaker.breakBadness());
413     EXPECT_EQ(15, breaker.next());  // after ",d"
414     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
415     EXPECT_EQ(1, breaker.breakBadness());
416     EXPECT_EQ(17, breaker.next());  // after "-e"
417     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
418     EXPECT_EQ(1, breaker.breakBadness());
419     EXPECT_EQ(19, breaker.next());  // after "?f"
420     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
421     EXPECT_EQ(1, breaker.breakBadness());
422     EXPECT_EQ(20, breaker.next());  // after "="
423     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
424     EXPECT_EQ(1, breaker.breakBadness());
425     EXPECT_EQ(21, breaker.next());  // after "g"
426     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
427     EXPECT_EQ(1, breaker.breakBadness());
428     EXPECT_EQ(22, breaker.next());  // after "&"
429     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
430     EXPECT_EQ(1, breaker.breakBadness());
431     EXPECT_EQ(23, breaker.next());  // after "h"
432     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
433     EXPECT_EQ(1, breaker.breakBadness());
434     EXPECT_EQ(25, breaker.next());  // after "#i"
435     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
436     EXPECT_EQ(1, breaker.breakBadness());
437     EXPECT_EQ(27, breaker.next());  // after "%j"
438     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
439     EXPECT_EQ(1, breaker.breakBadness());
440     EXPECT_EQ(29, breaker.next());  // after "_k"
441     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
442     EXPECT_EQ(1, breaker.breakBadness());
443     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
444     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
445     EXPECT_EQ(0, breaker.breakBadness());
446 }
447 
// A URL with a hyphen directly before a slash ("http://a-/b"): expects no break between the
// hyphen and the slash, so the break after "a" is followed directly by the end of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "http:"
    EXPECT_EQ(5, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "//"
    EXPECT_EQ(7, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "a"
    EXPECT_EQ(8, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // end of the text
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}
462 
// A URL whose last character is a slash ("http://a/"): expects the break after "a" to be
// followed directly by the end of the text.
TEST(WordBreakerTest, urlEndsWithSlash) {
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // after "http:"
    EXPECT_EQ(5, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "//"
    EXPECT_EQ(7, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "a"
    EXPECT_EQ(8, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // end of the text
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}
477 
// An email-like string that begins with a slash ("/a@b"): expects no interior break and an
// empty word range (wordStart >= wordEnd).
TEST(WordBreakerTest, emailStartsWithSlash) {
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The only break is expected at the end of the text.
    EXPECT_EQ(textEnd, wb.followingWithLocale(enUS, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}
486 
// Re-issues followingWithLocale() from offsets that lie inside a URL and expects the breaker to
// report the same break points it would have reached by plain next() calls.
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");
    const Locale enUS("en-US");

    WordBreaker wb;
    wb.setText(text.data(), text.size());
    EXPECT_EQ(0, wb.current());

    // after "Hello "; the word is "Hello"
    EXPECT_EQ(6, wb.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());

    EXPECT_EQ(6, wb.current());
    // after "http:"
    EXPECT_EQ(11, wb.next());

    // Restart from the beginning of the URL (offset 6): the break after "http:" is expected
    // again.
    EXPECT_EQ(11, wb.followingWithLocale(enUS, 6));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "//"
    EXPECT_EQ(13, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // Restart from the middle of "//" (offset 12): the break after "//" is expected again.
    EXPECT_EQ(13, wb.followingWithLocale(enUS, 12));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "abc"
    EXPECT_EQ(16, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "/d"
    EXPECT_EQ(18, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after ".html " (including the trailing space)
    EXPECT_EQ(24, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // after "World", i.e. the end of the text; the word is "World"
    EXPECT_EQ(29, wb.next());
    EXPECT_EQ(24, wb.wordStart());
    EXPECT_EQ(29, wb.wordEnd());
}
520 
521 // b/68669534
// For each space-like character `sp`, breaks the text 'a', U+0020, sp, 'b' and expects a break
// after the U+0020 and another one after `sp`, the latter with an empty word range.
TEST(WordBreakerTest, spaceAfterSpace) {
    const std::vector<uint16_t> kSpaceLikeChars = {
            '\t',    // TAB
            0x1680,  // OGHAM SPACE MARK
            0x3000,  // IDEOGRAPHIC SPACE
    };

    constexpr uint16_t kAsciiSpace = 0x0020;
    const Locale enUS("en-US");

    for (uint16_t sp : kSpaceLikeChars) {
        // Tag any failure below with the code point under test.
        char traceMsg[64] = {};
        snprintf(traceMsg, sizeof(traceMsg), "Test Space: U+%04X", sp);
        SCOPED_TRACE(traceMsg);

        std::vector<uint16_t> text = {'a', kAsciiSpace, sp, 'b'};
        WordBreaker wb;
        wb.setText(text.data(), text.size());

        EXPECT_EQ(0, wb.current());
        // after "a "; the word is "a"
        EXPECT_EQ(2, wb.followingWithLocale(enUS, 0));
        EXPECT_EQ(0, wb.wordStart());
        EXPECT_EQ(1, wb.wordEnd());

        EXPECT_EQ(2, wb.current());
        // after the `sp` character that follows the U+0020; the word range is empty
        EXPECT_EQ(3, wb.next());
        EXPECT_EQ(2, wb.wordStart());
        EXPECT_EQ(2, wb.wordEnd());

        EXPECT_EQ(3, wb.current());
        // after 'b', i.e. the end of the text; the word is "b"
        EXPECT_EQ(4, wb.next());
        EXPECT_EQ(3, wb.wordStart());
        EXPECT_EQ(4, wb.wordEnd());
    }
}
556 
557 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
558 public:
TestableICULineBreakerPoolImpl()559     TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
560 
561     using ICULineBreakerPoolImpl::getPoolSize;
562     using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
563 };
564 
// Acquires three slots without releasing any: expects three distinct break iterators, with the
// two en-US slots sharing a locale id that differs from the fr-FR slot's.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    // Nothing is released in between, so none of these can be a reused instance.
    ICULineBreakerPool::Slot englishSlotA = pool.acquire(english);
    ICULineBreakerPool::Slot englishSlotB = pool.acquire(english);
    ICULineBreakerPool::Slot frenchSlot = pool.acquire(french);

    // Every slot must hold a breaker.
    EXPECT_NE(nullptr, englishSlotA.breaker.get());
    EXPECT_NE(nullptr, englishSlotB.breaker.get());
    EXPECT_NE(nullptr, frenchSlot.breaker.get());

    // All three breakers must be pairwise distinct.
    EXPECT_NE(englishSlotA.breaker.get(), englishSlotB.breaker.get());
    EXPECT_NE(englishSlotA.breaker.get(), frenchSlot.breaker.get());
    EXPECT_NE(englishSlotB.breaker.get(), frenchSlot.breaker.get());

    // Locale ids match for the same locale and differ across locales.
    EXPECT_EQ(englishSlotA.localeId, englishSlotB.localeId);
    EXPECT_NE(englishSlotA.localeId, frenchSlot.localeId);
    EXPECT_NE(englishSlotB.localeId, frenchSlot.localeId);
}
588 
// Releases a slot back to the pool and expects a later acquire for the same locale to hand back
// the same break iterator, while an acquire for a different locale gets a different one.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    ICULineBreakerPool::Slot releasedSlot = pool.acquire(english);

    // Remember the identity of the slot before handing it back to the pool.
    const uint64_t releasedLocaleId = releasedSlot.localeId;
    icu::BreakIterator* const releasedBreaker = releasedSlot.breaker.get();

    pool.release(std::move(releasedSlot));
    // Deliberate read of the moved-from slot: release() is expected to take the breaker out of it.
    EXPECT_EQ(nullptr, releasedSlot.breaker.get());

    // acquire must return a different instance if the locale is different.
    ICULineBreakerPool::Slot frenchSlot = pool.acquire(french);
    EXPECT_NE(releasedBreaker, frenchSlot.breaker.get());
    EXPECT_NE(releasedLocaleId, frenchSlot.localeId);

    // acquire must return the same instance as released before if the locale is the same.
    ICULineBreakerPool::Slot reacquiredSlot = pool.acquire(english);
    EXPECT_EQ(releasedBreaker, reacquiredSlot.breaker.get());
    EXPECT_EQ(releasedLocaleId, reacquiredSlot.localeId);
}
614 
// Acquires twice MAX_POOL_SIZE slots and releases them all: expects the pool to grow by one per
// release until it holds MAX_POOL_SIZE entries and to stay at that size afterwards.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    // Local copy of the class constant, used for array bounds and in the expectations below.
    const size_t kMaxPoolSize = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");

    ICULineBreakerPool::Slot acquired[kMaxPoolSize * 2];

    // Acquire twice as many slots as the pool can hold. Nothing has been released yet, so the
    // pool is expected to stay empty throughout.
    for (size_t idx = 0; idx < kMaxPoolSize * 2; ++idx) {
        acquired[idx] = pool.acquire(english);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // Releasing the first half is expected to fill the pool one entry at a time.
    for (size_t idx = 0; idx < kMaxPoolSize; ++idx) {
        pool.release(std::move(acquired[idx]));
        EXPECT_EQ(idx + 1, pool.getPoolSize());
    }

    // Releasing the second half is expected to leave the pool size capped at its maximum.
    for (size_t idx = kMaxPoolSize; idx < kMaxPoolSize * 2; ++idx) {
        pool.release(std::move(acquired[idx]));
        EXPECT_EQ(kMaxPoolSize, pool.getPoolSize());
    }
}
639 
640 }  // namespace minikin
641