• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "WordBreaker.h"
18 
19 #include <cstdio>
20 
21 #include <gtest/gtest.h>
22 
23 #include "UnicodeUtils.h"
24 
// Number of elements in a C array. Only meaningful for true arrays: applied to a pointer it
// would divide the pointer size by the element size instead.
#ifndef NELEM
#define NELEM(array) (sizeof(array) / sizeof((array)[0]))
#endif
28 
// Expands to TWO comma-separated array elements: the lead and trail UTF-16 units produced by
// U16_LEAD / U16_TRAIL for `cp`. Intended for use inside uint16_t array initializers.
#define UTF16(cp) U16_LEAD(cp), U16_TRAIL(cp)
30 
31 namespace minikin {
32 
// Plain ASCII text: expects one break after "hello " and a second one at the end of the buffer.
TEST(WordBreakerTest, basic) {
    uint16_t text[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // First break: just past the space that follows "hello".
    EXPECT_EQ(6, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());  // word range [0, 5) is "hello"
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(6, wb.current());

    // Second break: end of the text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(6, wb.wordStart());  // word range [6, 11) is "world"
    EXPECT_EQ(11, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
    EXPECT_EQ(11, wb.current());
}
50 
// A SOFT HYPHEN (U+00AD) inside a word is not expected to produce a break of its own: the first
// break reported is after "hel{SOFT HYPHEN}lo ".
TEST(WordBreakerTest, softHyphen) {
    uint16_t text[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(7, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                        LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());  // [0, 6) covers "hel{SOFT HYPHEN}lo"
    EXPECT_EQ(6, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    EXPECT_EQ(textEnd, wb.next());  // end of text
    EXPECT_EQ(7, wb.wordStart());   // [7, 12) covers "world"
    EXPECT_EQ(12, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
68 
// Hard hyphens no longer provide a break opportunity, so "sugar-free" is expected to come back
// as a single word spanning the whole buffer.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t text[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // The only break is the end of the buffer.
    EXPECT_EQ(textEnd, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                              LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
83 
// A trailing CENT SIGN (U+00A2) and a trailing YEN SIGN (U+00A5) are both expected to be counted
// as part of the word they follow.
TEST(WordBreakerTest, postfixAndPrefix) {
    uint16_t text[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after "US¢ " (the space is consumed); the word itself is "US¢".
    EXPECT_EQ(4, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                        LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(3, wb.wordEnd());

    // End of string; the word is "JP¥".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
101 
// A Myanmar kinzi cluster is expected to stay together: the only break is at the end of the text
// and the reported word covers the entire buffer.
TEST(WordBreakerTest, myanmarKinzi) {
    // U+1004 NGA, U+103A ASAT, U+1039 VIRAMA, U+1000 KA, U+102C vowel sign AA
    uint16_t text[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(textEnd, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                              LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(textEnd, wb.wordEnd());
}
116 
// Emoji joined with ZERO WIDTH JOINER (U+200D): breaks are expected only between complete ZWJ
// sequences, never inside one.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t text[] = {
            // man + zwj + heart + zwj + man (7 code units)
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // woman + zwj + heart + zwj + kiss mark + zwj + woman (10 code units)
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // eye + zwj + left speech bubble (5 code units)
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // CAT FACE + zwj + BUST IN SILHOUETTE (5 code units)
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After man + zwj + heart + zwj + man.
    EXPECT_EQ(7, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // After woman + zwj + heart + zwj + kiss mark + zwj + woman.
    EXPECT_EQ(17, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());

    // After eye + zwj + left speech bubble.
    EXPECT_EQ(22, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(22, wb.wordEnd());

    // End of text, after CAT FACE + zwj + BUST IN SILHOUETTE.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(22, wb.wordStart());
    EXPECT_EQ(27, wb.wordEnd());
}
147 
// An emoji followed by a Fitzpatrick skin-tone modifier is expected to be kept as one unit, with
// or without an intervening emoji-style variation selector.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t text[] = {
            // boy + type 1-2 fitzpatrick modifier (4 code units)
            UTF16(0x1F466), UTF16(0x1F3FB),
            // victory hand + emoji style + type 6 fitzpatrick modifier (4 code units)
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After boy + type 1-2 fitzpatrick modifier.
    EXPECT_EQ(4, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                        LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // End of text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(8, wb.wordEnd());
}
167 
// A break is expected between every pair of adjacent emoji, while a variation selector stays
// attached to the emoji it follows. Each reported word is exactly one emoji (plus selector).
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t text[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // SLED
    EXPECT_EQ(2, wb.followingWithLocale(Locale("en"), style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(2, wb.wordEnd());

    // SLED
    EXPECT_EQ(4, wb.next());
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(4, wb.wordEnd());

    // SLED + VS15
    EXPECT_EQ(7, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());

    // SLED
    EXPECT_EQ(9, wb.next());
    EXPECT_EQ(7, wb.wordStart());
    EXPECT_EQ(9, wb.wordEnd());

    // WHITE SMILING FACE
    EXPECT_EQ(10, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(10, wb.wordEnd());

    // SLED
    EXPECT_EQ(12, wb.next());
    EXPECT_EQ(10, wb.wordStart());
    EXPECT_EQ(12, wb.wordEnd());

    // WHITE SMILING FACE + VS16
    EXPECT_EQ(14, wb.next());
    EXPECT_EQ(12, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());

    // SLED (end of text)
    EXPECT_EQ(16, wb.next());
    EXPECT_EQ(14, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
}
217 
// Two bare U+1F3F4 flags in a row: a break is expected between them, each flag being one word.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string singleFlag = "U+1F3F4";
    const std::string twoFlags = singleFlag + " " + singleFlag;

    // U+1F3F4 is outside the BMP, so each flag takes two UTF-16 code units.
    const int kFlagLength = 2;
    const size_t kCapacity = kFlagLength * 2;

    uint16_t text[kCapacity];
    size_t parsedLength;
    ParseUnicode(text, kCapacity, twoFlags.c_str(), &parsedLength, nullptr);

    WordBreaker wb;
    wb.setText(text, parsedLength);
    EXPECT_EQ(0, wb.current());

    // End of the first flag.
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                                  LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // End of the second flag, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(parsedLength), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
242 
// Two emoji tag sequences in a row: the only break expected inside the text is the one between
// the two sequences, and each sequence is reported as one word.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is the emoji tag sequence for the
    // flag of Scotland.
    const std::string scotland = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string twoFlags = scotland + " " + scotland;

    // Seven code points, all outside the BMP: 14 UTF-16 code units per flag.
    const int kFlagLength = 14;
    const size_t kCapacity = kFlagLength * 2;

    uint16_t text[kCapacity];
    size_t parsedLength;
    ParseUnicode(text, kCapacity, twoFlags.c_str(), &parsedLength, nullptr);

    WordBreaker wb;
    wb.setText(text, parsedLength);
    EXPECT_EQ(0, wb.current());

    // End of the first flag sequence.
    EXPECT_EQ(kFlagLength, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                                  LineBreakWordStyle::None, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(kFlagLength, wb.wordEnd());

    // End of the second flag sequence, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(parsedLength), wb.next());
    EXPECT_EQ(kFlagLength, wb.wordStart());
    EXPECT_EQ(kFlagLength * 2, wb.wordEnd());
}
269 
// Leading and trailing punctuation is expected to be excluded from the reported word range even
// though it does not create break opportunities of its own.
TEST(WordBreakerTest, punct) {
    // "¡¡hello, world!!"
    uint16_t text[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                       ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // Break after "¡¡hello, "; the word proper is "hello" = [2, 7).
    EXPECT_EQ(9, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                        LineBreakWordStyle::None, 0));
    EXPECT_EQ(2, wb.wordStart());
    EXPECT_EQ(7, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word proper is "world" = [9, 14), without the "!!".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(9, wb.wordStart());
    EXPECT_EQ(14, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
288 
// Email address followed by an ordinary word. Inside the address the breaker is expected to
// report no word (wordStart >= wordEnd) and a badness of 1 for the break within the address.
TEST(WordBreakerTest, email) {
    // "foo@example.com x"
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example".
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(16, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(16, wb.wordStart());
    EXPECT_EQ(17, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
309 
// "mailto:" URL followed by an ordinary word. Breaks inside the URL are expected to carry a
// badness of 1 and no word range; the break that leaves the URL has badness 0.
TEST(WordBreakerTest, mailto) {
    // "mailto:foo@example.com x"
    uint16_t text[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                       'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "mailto:".
    EXPECT_EQ(7, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(23, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(23, wb.wordStart());
    EXPECT_EQ(24, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
333 
// The current logic always places a line break between a detected email address or URL
// and an immediately following non-ASCII character.
TEST(WordBreakerTest, emailNonAscii) {
    // "foo@example.com" directly followed by U+4E00, with no space in between.
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
                       'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example".
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com", i.e. between the address and the non-ASCII character.
    EXPECT_EQ(15, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "一".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(15, wb.wordStart());
    EXPECT_EQ(16, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
356 
// Email address whose last letter carries a combining mark (U+0303): the mark is expected to stay
// with the address, so the break after the address comes after ".com̃ ".
TEST(WordBreakerTest, emailCombining) {
    uint16_t text[] = {'f', 'o', 'o', '@', 'e', 'x', 'a',    'm', 'p',
                       'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "foo@example".
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com̃ ".
    EXPECT_EQ(17, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(17, wb.wordStart());
    EXPECT_EQ(18, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
377 
// An '@' surrounded by spaces: all breaks are expected to have badness 0, and the '@' itself is
// not reported as a word.
TEST(WordBreakerTest, lonelyAt) {
    uint16_t text[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "a "; the word is "a".
    EXPECT_EQ(2, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(1, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // After "@ "; no word here.
    EXPECT_EQ(4, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "b".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(4, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
398 
// "http://" URL followed by an ordinary word. Every break inside the URL is expected to report
// badness 1 and no word range; the break that leaves the URL has badness 0.
TEST(WordBreakerTest, url) {
    // "http://example.com x"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                       'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After "example".
    EXPECT_EQ(14, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(1, wb.breakBadness());

    // After ".com ".
    EXPECT_EQ(19, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());

    // End of text; the word is "x".
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_EQ(19, wb.wordStart());
    EXPECT_EQ(20, wb.wordEnd());
    EXPECT_EQ(0, wb.breakBadness());
}
425 
426 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)427 TEST(WordBreakerTest, urlBreakChars) {
428     uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
429                       '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
430                       'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
431     auto lbStyle = LineBreakStyle::None;
432     auto lbWordStyle = LineBreakWordStyle::None;
433     WordBreaker breaker;
434     breaker.setText(buf, NELEM(buf));
435     EXPECT_EQ(0, breaker.current());
436     EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
437                                              0));  // after "http:"
438     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
439     EXPECT_EQ(1, breaker.breakBadness());
440     EXPECT_EQ(7, breaker.next());  // after "//"
441     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
442     EXPECT_EQ(1, breaker.breakBadness());
443     EXPECT_EQ(8, breaker.next());  // after "a"
444     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
445     EXPECT_EQ(1, breaker.breakBadness());
446     EXPECT_EQ(10, breaker.next());  // after ".b"
447     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
448     EXPECT_EQ(1, breaker.breakBadness());
449     EXPECT_EQ(11, breaker.next());  // after "/"
450     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
451     EXPECT_EQ(1, breaker.breakBadness());
452     EXPECT_EQ(13, breaker.next());  // after "~c"
453     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
454     EXPECT_EQ(1, breaker.breakBadness());
455     EXPECT_EQ(15, breaker.next());  // after ",d"
456     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
457     EXPECT_EQ(1, breaker.breakBadness());
458     EXPECT_EQ(17, breaker.next());  // after "-e"
459     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
460     EXPECT_EQ(1, breaker.breakBadness());
461     EXPECT_EQ(19, breaker.next());  // after "?f"
462     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
463     EXPECT_EQ(1, breaker.breakBadness());
464     EXPECT_EQ(20, breaker.next());  // after "="
465     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
466     EXPECT_EQ(1, breaker.breakBadness());
467     EXPECT_EQ(21, breaker.next());  // after "g"
468     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
469     EXPECT_EQ(1, breaker.breakBadness());
470     EXPECT_EQ(22, breaker.next());  // after "&"
471     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
472     EXPECT_EQ(1, breaker.breakBadness());
473     EXPECT_EQ(23, breaker.next());  // after "h"
474     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
475     EXPECT_EQ(1, breaker.breakBadness());
476     EXPECT_EQ(25, breaker.next());  // after "#i"
477     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
478     EXPECT_EQ(1, breaker.breakBadness());
479     EXPECT_EQ(27, breaker.next());  // after "%j"
480     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
481     EXPECT_EQ(1, breaker.breakBadness());
482     EXPECT_EQ(29, breaker.next());  // after "_k"
483     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
484     EXPECT_EQ(1, breaker.breakBadness());
485     EXPECT_EQ((ssize_t)NELEM(buf), breaker.next());  // end
486     EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
487     EXPECT_EQ(0, breaker.breakBadness());
488 }
489 
// Inside a URL, no break is expected after "-" when it is immediately followed by "/": after "a"
// the next break is the end of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    // "http://a-/b"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                        LineBreakWordStyle::None, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // End of text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}
507 
// URL whose last character is "/": the break sequence is expected to run cleanly to the end of
// the text, with no word reported at any step.
TEST(WordBreakerTest, urlEndsWithSlash) {
    // "http://a/"
    uint16_t text[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    // After "http:".
    EXPECT_EQ(5, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                        LineBreakWordStyle::None, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "//".
    EXPECT_EQ(7, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "a".
    EXPECT_EQ(8, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // End of text.
    EXPECT_EQ(textEnd, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}
525 
// Email-like text that begins with "/": the only break expected is the end of the text, and no
// word is reported.
TEST(WordBreakerTest, emailStartsWithSlash) {
    uint16_t text[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(text));

    WordBreaker wb;
    wb.setText(text, NELEM(text));
    EXPECT_EQ(0, wb.current());

    EXPECT_EQ(textEnd, wb.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                              LineBreakWordStyle::None, 0));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());
}
537 
// Calling followingWithLocale() again while iteration is inside a URL is expected to return the
// same break that next() had just produced, and iteration then continues normally.
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> text = utf8ToUtf16("Hello http://abc/d.html World");
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    WordBreaker wb;
    wb.setText(text.data(), text.size());
    EXPECT_EQ(0, wb.current());

    // After "Hello "; the word is "Hello".
    EXPECT_EQ(6, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
    EXPECT_EQ(0, wb.wordStart());
    EXPECT_EQ(5, wb.wordEnd());

    EXPECT_EQ(6, wb.current());
    EXPECT_EQ(11, wb.next());  // after "http:"

    // Restart from offset 6 (the first character of the URL): same break as before.
    EXPECT_EQ(11, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 6));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "//".
    EXPECT_EQ(13, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // Restart from offset 12 (inside "//"): same break as before.
    EXPECT_EQ(13, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 12));
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "abc".
    EXPECT_EQ(16, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "/d".
    EXPECT_EQ(18, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After ".html " (the offset includes the trailing space).
    EXPECT_EQ(24, wb.next());
    EXPECT_TRUE(wb.wordStart() >= wb.wordEnd());

    // After "World"; the word is "World".
    EXPECT_EQ(29, wb.next());
    EXPECT_EQ(24, wb.wordStart());
    EXPECT_EQ(29, wb.wordEnd());
}
576 
577 // b/68669534
TEST(WordBreakerTest, spaceAfterSpace) {
    // Space-like characters placed directly after an ordinary U+0020 SPACE.
    const std::vector<uint16_t> kFollowingSpaces = {
            '\t',    // TAB
            0x1680,  // OGHAM SPACE MARK
            0x3000,  // IDEOGRAPHIC SPACE
    };

    constexpr uint16_t kAsciiSpace = 0x0020;
    const auto style = LineBreakStyle::None;
    const auto wordStyle = LineBreakWordStyle::None;

    for (uint16_t sp : kFollowingSpaces) {
        // Tag any failure below with the character under test.
        char traceMsg[64] = {};
        snprintf(traceMsg, sizeof(traceMsg), "Test Space: U+%04X", sp);
        SCOPED_TRACE(traceMsg);

        // Layout: 'a' at 0, SPACE at 1, sp at 2, 'b' at 3.
        std::vector<uint16_t> text = {'a', kAsciiSpace, sp, 'b'};
        WordBreaker wb;
        wb.setText(text.data(), text.size());

        EXPECT_EQ(0, wb.current());

        // After "a "; the word is "a".
        EXPECT_EQ(2, wb.followingWithLocale(Locale("en-US"), style, wordStyle, 0));
        EXPECT_EQ(0, wb.wordStart());
        EXPECT_EQ(1, wb.wordEnd());

        // After the sp character; the reported word range is empty.
        EXPECT_EQ(2, wb.current());
        EXPECT_EQ(3, wb.next());
        EXPECT_EQ(2, wb.wordStart());
        EXPECT_EQ(2, wb.wordEnd());

        // After "b" (end of text); the word is "b".
        EXPECT_EQ(3, wb.current());
        EXPECT_EQ(4, wb.next());
        EXPECT_EQ(3, wb.wordStart());
        EXPECT_EQ(4, wb.wordEnd());
    }
}
615 
616 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
617 public:
TestableICULineBreakerPoolImpl()618     TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
619 
620     using ICULineBreakerPoolImpl::getPoolSize;
621     using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
622 };
623 
// With nothing released back to the pool, each acquire() is expected to hand out its own breaker
// instance, while the locale ID is the same for slots acquired with the same locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    // Five slots held at the same time; none is released during this test.
    ICULineBreakerPool::Slot enLoose =
            pool.acquire(english, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enLooseAgain =
            pool.acquire(english, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enStrict =
            pool.acquire(english, LineBreakStyle::Strict, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frPlain =
            pool.acquire(french, LineBreakStyle::None, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frPhrase =
            pool.acquire(french, LineBreakStyle::None, LineBreakWordStyle::Phrase);

    // Every slot carries a breaker.
    EXPECT_NE(nullptr, enLoose.breaker.get());
    EXPECT_NE(nullptr, enLooseAgain.breaker.get());
    EXPECT_NE(nullptr, enStrict.breaker.get());
    EXPECT_NE(nullptr, frPlain.breaker.get());
    EXPECT_NE(nullptr, frPhrase.breaker.get());

    // The breakers are distinct objects.
    EXPECT_NE(enLoose.breaker.get(), enLooseAgain.breaker.get());
    EXPECT_NE(enLoose.breaker.get(), enStrict.breaker.get());
    EXPECT_NE(enLoose.breaker.get(), frPlain.breaker.get());
    EXPECT_NE(enLooseAgain.breaker.get(), frPlain.breaker.get());
    EXPECT_NE(enLooseAgain.breaker.get(), frPhrase.breaker.get());
    EXPECT_NE(enLooseAgain.breaker.get(), enStrict.breaker.get());

    // Locale IDs match within a locale and differ across locales.
    EXPECT_EQ(enLoose.localeId, enLooseAgain.localeId);
    EXPECT_EQ(enLoose.localeId, enStrict.localeId);
    EXPECT_NE(enLoose.localeId, frPlain.localeId);
    EXPECT_NE(enLoose.localeId, frPhrase.localeId);
    EXPECT_NE(enLooseAgain.localeId, frPlain.localeId);
    EXPECT_NE(enLooseAgain.localeId, frPhrase.localeId);
    EXPECT_EQ(frPlain.localeId, frPhrase.localeId);
}
663 
// A released breaker is expected to be handed back by a later acquire() with the same locale and
// styles, and never for a different locale or a different style combination.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");
    const Locale french("fr-Latn-FR");

    // Acquire one breaker and remember its identity before giving it back.
    ICULineBreakerPool::Slot enFirst =
            pool.acquire(english, LineBreakStyle::Loose, LineBreakWordStyle::None);

    uint64_t enFirstLocaleId = enFirst.localeId;
    auto* enFirstRawPtr = enFirst.breaker.get();

    // The test expects release() to leave the moved-from slot without a breaker.
    pool.release(std::move(enFirst));
    EXPECT_EQ(nullptr, enFirst.breaker.get());

    // acquire must return a different instance if the locale is different.
    ICULineBreakerPool::Slot frLoose =
            pool.acquire(french, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_NE(enFirstRawPtr, frLoose.breaker.get());
    EXPECT_NE(enFirstLocaleId, frLoose.localeId);

    // acquire must return the same instance as released before if the locale is the same.
    ICULineBreakerPool::Slot enSecond =
            pool.acquire(english, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_EQ(enFirstRawPtr, enSecond.breaker.get());
    EXPECT_EQ(enFirstLocaleId, enSecond.localeId);

    // acquire must return a different instance if the line break style or word style differs,
    // while the locale ID stays the same for the same locale.
    ICULineBreakerPool::Slot frNormal =
            pool.acquire(french, LineBreakStyle::Normal, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frNormalPhrase =
            pool.acquire(french, LineBreakStyle::Normal, LineBreakWordStyle::Phrase);
    EXPECT_NE(frLoose.breaker.get(), frNormal.breaker.get());
    EXPECT_NE(frLoose.breaker.get(), frNormalPhrase.breaker.get());
    EXPECT_NE(frNormal.breaker.get(), frNormalPhrase.breaker.get());
    EXPECT_EQ(frLoose.localeId, frNormal.localeId);
    EXPECT_EQ(frLoose.localeId, frNormalPhrase.localeId);
    EXPECT_EQ(frNormal.localeId, frNormalPhrase.localeId);
}
704 
// Releasing more slots than MAX_POOL_SIZE: the pool is expected to grow by one per release until
// it holds MAX_POOL_SIZE entries and then stay at that size.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t kPoolLimit = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    const size_t kSlotCount = kPoolLimit * 2;
    TestableICULineBreakerPoolImpl pool;

    const Locale english("en-Latn-US");

    ICULineBreakerPool::Slot held[kSlotCount];

    // Acquire twice as many slots as the pool can keep. Nothing has been released yet, so the
    // pool size is expected to stay at zero throughout.
    for (size_t idx = 0; idx < kSlotCount; ++idx) {
        held[idx] = pool.acquire(english, LineBreakStyle::None, LineBreakWordStyle::None);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // First half of the releases: each one adds an entry to the pool.
    for (size_t idx = 0; idx < kPoolLimit; ++idx) {
        pool.release(std::move(held[idx]));
        EXPECT_EQ(idx + 1, pool.getPoolSize());
    }

    // Second half: the pool is already at its limit and must not grow any further.
    for (size_t idx = kPoolLimit; idx < kSlotCount; ++idx) {
        pool.release(std::move(held[idx]));
        EXPECT_EQ(kPoolLimit, pool.getPoolSize());
    }
}
729 
730 }  // namespace minikin
731