1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "WordBreaker.h"
18
19 #include <cstdio>
20
21 #include <gtest/gtest.h>
22 #include <unicode/locid.h>
23 #include <unicode/uclean.h>
24 #include <unicode/udata.h>
25
26 #include "UnicodeUtils.h"
27
// Element count of a statically sized array. Only meaningful for real arrays: a pointer
// argument would silently yield sizeof(pointer) / sizeof(element).
#ifndef NELEM
#define NELEM(arr) (sizeof(arr) / sizeof((arr)[0]))
#endif
31
// Expands to two comma-separated values, U16_LEAD(cp) and U16_TRAIL(cp), so that one code point
// can be written as two consecutive elements of a uint16_t array initializer.
// NOTE(review): U16_LEAD/U16_TRAIL are presumably ICU's surrogate macros, which only make sense
// for supplementary code points (>= U+10000) -- confirm.
#define UTF16(cp) U16_LEAD(cp), U16_TRAIL(cp)
33
34 namespace minikin {
35
// Two ASCII words separated by a single space: the break is reported after the space while the
// word range excludes it.
TEST(WordBreakerTest, basic) {
    uint16_t buf[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // First break: after "hello ".
    EXPECT_EQ(6, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "hello"
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(6, breaker.current());

    // Second break: end of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(6, breaker.wordStart());  // "world"
    EXPECT_EQ(11, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(11, breaker.current());
}
52
// A SOFT HYPHEN (U+00AD) inside a word must not produce a break of its own; it stays inside the
// reported word range.
TEST(WordBreakerTest, softHyphen) {
    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // First break: after "hel{SOFT HYPHEN}lo ".
    EXPECT_EQ(7, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
    EXPECT_EQ(6, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // Second break: end of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(7, breaker.wordStart());  // "world"
    EXPECT_EQ(12, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
68
// Hyphens should not allow breaks anymore: "sugar-free" is expected to come back as one word
// spanning the whole buffer.
TEST(WordBreakerTest, hardHyphen) {
    uint16_t buf[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // The only break is the end of the text.
    EXPECT_EQ(textEnd, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(textEnd, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
80
// Currency signs attached to a word (CENT SIGN, YEN SIGN) are expected to be reported as part
// of that word's range.
TEST(WordBreakerTest, postfixAndPrefix) {
    uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // First break: after the space that follows CENT SIGN.
    EXPECT_EQ(4, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "US¢"
    EXPECT_EQ(3, breaker.wordEnd());

    // Second break: end of string.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(4, breaker.wordStart());  // "JP¥"
    EXPECT_EQ(textEnd, breaker.wordEnd());
}
95
// A Myanmar kinzi sequence (NGA + ASAT + VIRAMA) followed by KA + AA must not be split: the
// whole buffer is expected to come back as a single word.
//
// Note that the query below uses the "en-US" locale; the previously declared (and never used)
// Burmese icu::Locale has been dropped because it had no influence on the test.
TEST(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, VOWEL SIGN AA
    uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // The only break is the end of the string.
    EXPECT_EQ(textEnd, breaker.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(textEnd, breaker.wordEnd());
}
108
// Emoji joined with ZERO WIDTH JOINER (U+200D) are expected to stay together; breaks are only
// expected between complete ZWJ sequences.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
            // [0, 7): man + zwj + heart + zwj + man
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // [7, 17): woman + zwj + heart + zwj + kiss mark + zwj + woman
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // [17, 22): eye + zwj + left speech bubble
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // [22, 27): CAT FACE + zwj + BUST IN SILHOUETTE
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After man + zwj + heart + zwj + man.
    EXPECT_EQ(7, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());

    // After woman + zwj + heart + zwj + kiss mark + zwj + woman.
    EXPECT_EQ(17, breaker.next());
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());

    // After eye + zwj + left speech bubble.
    EXPECT_EQ(22, breaker.next());
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());

    // End of text, after CAT FACE + zwj + BUST IN SILHOUETTE.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(22, breaker.wordStart());
    EXPECT_EQ(27, breaker.wordEnd());
}
137
// An emoji followed by a Fitzpatrick skin-tone modifier is expected to form a single unit, also
// when a variation selector sits between base and modifier.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t buf[] = {
            // [0, 4): boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // [4, 8): victory hand + emoji style + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After boy + type 1-2 fitzpatrick modifier.
    EXPECT_EQ(4, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(4, breaker.wordEnd());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(4, breaker.wordStart());
    EXPECT_EQ(8, breaker.wordEnd());
}
155
// Should break between emojis: every emoji (with its variation selector, if any) is expected to
// be reported as its own word, so each word starts where the previous break was reported.
TEST(WordBreakerTest, unicode10Emoji) {
    uint16_t buf[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };
    // Expected break offsets, in the order the breaker must report them.
    const ssize_t expectedBreaks[] = {
            2,   // SLED
            4,   // SLED
            7,   // SLED + VS15
            9,   // SLED
            10,  // WHITE SMILING FACE
            12,  // SLED
            14,  // WHITE SMILING FACE + VS16
            16,  // SLED
    };

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    ssize_t previousBreak = 0;
    for (size_t i = 0; i < NELEM(expectedBreaks); ++i) {
        // The first break is looked up with an explicit locale; the rest continue from there.
        const ssize_t actualBreak =
                (i == 0) ? breaker.followingWithLocale(Locale("en"), 0) : breaker.next();
        EXPECT_EQ(expectedBreaks[i], actualBreak) << "break #" << i;
        EXPECT_EQ(previousBreak, breaker.wordStart()) << "break #" << i;
        EXPECT_EQ(expectedBreaks[i], breaker.wordEnd()) << "break #" << i;
        previousBreak = expectedBreaks[i];
    }
}
203
// Two consecutive U+1F3F4 code points with no tag sequence attached: a break is expected
// between them.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kFlag = "U+1F3F4";
    const std::string flags = kFlag + " " + kFlag;

    // Each flag is one supplementary code point, i.e. two UTF-16 code units.
    constexpr int kFlagLength = 2;
    constexpr size_t kBufSize = kFlagLength * 2;

    uint16_t buf[kBufSize];
    size_t size = 0;
    ParseUnicode(buf, kBufSize, flags.c_str(), &size, nullptr);

    WordBreaker breaker;
    breaker.setText(buf, size);
    EXPECT_EQ(0, breaker.current());

    // End of the first flag.
    EXPECT_EQ(kFlagLength, breaker.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(kFlagLength, breaker.wordEnd());

    // End of the second flag, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(size), breaker.next());
    EXPECT_EQ(kFlagLength, breaker.wordStart());
    EXPECT_EQ(kFlagLength * 2, breaker.wordEnd());
}
226
// Two consecutive emoji tag sequences: a break is expected between the sequences but never
// inside one.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
    // of Scotland.
    const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string flagSequence = kFlagSequence + " " + kFlagSequence;

    // Seven supplementary code points per flag, two UTF-16 code units each.
    constexpr int kFlagLength = 14;
    constexpr size_t kBufSize = kFlagLength * 2;

    uint16_t buf[kBufSize];
    size_t size = 0;
    ParseUnicode(buf, kBufSize, flagSequence.c_str(), &size, nullptr);

    WordBreaker breaker;
    breaker.setText(buf, size);
    EXPECT_EQ(0, breaker.current());

    // End of the first flag sequence.
    EXPECT_EQ(kFlagLength, breaker.followingWithLocale(Locale("en-US"), 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(kFlagLength, breaker.wordEnd());

    // End of the second flag sequence, which is also the end of the text.
    EXPECT_EQ(static_cast<ssize_t>(size), breaker.next());
    EXPECT_EQ(kFlagLength, breaker.wordStart());
    EXPECT_EQ(kFlagLength * 2, breaker.wordEnd());
}
251
// Leading and trailing punctuation is expected to be excluded from the reported word range
// while the break positions still cover it.
TEST(WordBreakerTest, punct) {
    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                      ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // First break: after "¡¡hello, ".
    EXPECT_EQ(9, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(2, breaker.wordStart());  // "hello"
    EXPECT_EQ(7, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // Second break: end of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(9, breaker.wordStart());  // "world"
    EXPECT_EQ(14, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
267
// Email address followed by an ordinary word. Breaks inside the address are expected to report
// an empty word range (wordEnd <= wordStart); the break within the address carries badness 1.
TEST(WordBreakerTest, email) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                      'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "foo@example".
    EXPECT_EQ(11, breaker.followingWithLocale(enUS, 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com ".
    EXPECT_EQ(16, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(16, breaker.wordStart());  // "x"
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
285
// "mailto:" URL followed by an ordinary word. Every break inside the URL is expected to report
// an empty word range; breaks within the URL carry badness 1, the one after it badness 0.
TEST(WordBreakerTest, mailto) {
    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                      'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "mailto:".
    EXPECT_EQ(7, breaker.followingWithLocale(enUS, 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After "foo@example".
    EXPECT_EQ(18, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com ".
    EXPECT_EQ(23, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(23, breaker.wordStart());  // "x"
    EXPECT_EQ(24, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
306
307 // The current logic always places a line break after a detected email address or URL
308 // and an immediately following non-ASCII character.
TEST(WordBreakerTest,emailNonAscii)309 TEST(WordBreakerTest, emailNonAscii) {
310 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
311 'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
312 WordBreaker breaker;
313 breaker.setText(buf, NELEM(buf));
314 EXPECT_EQ(0, breaker.current());
315 EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), 0)); // after "foo@example"
316 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
317 EXPECT_EQ(1, breaker.breakBadness());
318 EXPECT_EQ(15, breaker.next()); // after ".com"
319 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
320 EXPECT_EQ(0, breaker.breakBadness());
321 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
322 EXPECT_EQ(15, breaker.wordStart()); // "一"
323 EXPECT_EQ(16, breaker.wordEnd());
324 EXPECT_EQ(0, breaker.breakBadness());
325 }
326
// Email address whose last letter carries a combining mark (U+0303): the mark is expected to
// stay attached to the address, so no break is reported between "m" and the mark.
TEST(WordBreakerTest, emailCombining) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',    'p',
                      'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "foo@example".
    EXPECT_EQ(11, breaker.followingWithLocale(enUS, 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com̃ ".
    EXPECT_EQ(17, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(17, breaker.wordStart());  // "x"
    EXPECT_EQ(18, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
344
// A stand-alone '@' surrounded by spaces must not be treated like an email address: all breaks
// are expected to carry badness 0.
TEST(WordBreakerTest, lonelyAt) {
    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "a ".
    EXPECT_EQ(2, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "a"
    EXPECT_EQ(1, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());

    // After "@ ": no word is reported for the lone '@'.
    EXPECT_EQ(4, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(4, breaker.wordStart());  // "b"
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
362
// URL followed by an ordinary word. Breaks inside the URL are expected to report an empty word
// range and badness 1; the break that ends the URL carries badness 0.
TEST(WordBreakerTest, url) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                      'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "http:".
    EXPECT_EQ(5, breaker.followingWithLocale(enUS, 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After "//".
    EXPECT_EQ(7, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After "example".
    EXPECT_EQ(14, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(1, breaker.breakBadness());

    // After ".com ".
    EXPECT_EQ(19, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
    EXPECT_EQ(0, breaker.breakBadness());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_EQ(19, breaker.wordStart());  // "x"
    EXPECT_EQ(20, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
386
387 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)388 TEST(WordBreakerTest, urlBreakChars) {
389 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
390 '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
391 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
392 WordBreaker breaker;
393 breaker.setText(buf, NELEM(buf));
394 EXPECT_EQ(0, breaker.current());
395 EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), 0)); // after "http:"
396 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
397 EXPECT_EQ(1, breaker.breakBadness());
398 EXPECT_EQ(7, breaker.next()); // after "//"
399 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
400 EXPECT_EQ(1, breaker.breakBadness());
401 EXPECT_EQ(8, breaker.next()); // after "a"
402 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
403 EXPECT_EQ(1, breaker.breakBadness());
404 EXPECT_EQ(10, breaker.next()); // after ".b"
405 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
406 EXPECT_EQ(1, breaker.breakBadness());
407 EXPECT_EQ(11, breaker.next()); // after "/"
408 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
409 EXPECT_EQ(1, breaker.breakBadness());
410 EXPECT_EQ(13, breaker.next()); // after "~c"
411 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
412 EXPECT_EQ(1, breaker.breakBadness());
413 EXPECT_EQ(15, breaker.next()); // after ",d"
414 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
415 EXPECT_EQ(1, breaker.breakBadness());
416 EXPECT_EQ(17, breaker.next()); // after "-e"
417 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
418 EXPECT_EQ(1, breaker.breakBadness());
419 EXPECT_EQ(19, breaker.next()); // after "?f"
420 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
421 EXPECT_EQ(1, breaker.breakBadness());
422 EXPECT_EQ(20, breaker.next()); // after "="
423 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
424 EXPECT_EQ(1, breaker.breakBadness());
425 EXPECT_EQ(21, breaker.next()); // after "g"
426 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
427 EXPECT_EQ(1, breaker.breakBadness());
428 EXPECT_EQ(22, breaker.next()); // after "&"
429 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
430 EXPECT_EQ(1, breaker.breakBadness());
431 EXPECT_EQ(23, breaker.next()); // after "h"
432 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
433 EXPECT_EQ(1, breaker.breakBadness());
434 EXPECT_EQ(25, breaker.next()); // after "#i"
435 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
436 EXPECT_EQ(1, breaker.breakBadness());
437 EXPECT_EQ(27, breaker.next()); // after "%j"
438 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
439 EXPECT_EQ(1, breaker.breakBadness());
440 EXPECT_EQ(29, breaker.next()); // after "_k"
441 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
442 EXPECT_EQ(1, breaker.breakBadness());
443 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
444 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
445 EXPECT_EQ(0, breaker.breakBadness());
446 }
447
// Inside a URL, a hyphen directly followed by '/' must not yield a break of its own: after "a"
// the next reported break is the end of the text.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "http:".
    EXPECT_EQ(5, breaker.followingWithLocale(enUS, 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "//".
    EXPECT_EQ(7, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "a".
    EXPECT_EQ(8, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
}
462
// URL whose last character is '/': the trailing slash is the final segment and ends at the end
// of the text.
TEST(WordBreakerTest, urlEndsWithSlash) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // After "http:".
    EXPECT_EQ(5, breaker.followingWithLocale(enUS, 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "//".
    EXPECT_EQ(7, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "a".
    EXPECT_EQ(8, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // End of text.
    EXPECT_EQ(textEnd, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
}
477
// Email-like text that begins with '/': the whole buffer is expected to come back as a single
// segment with an empty word range.
TEST(WordBreakerTest, emailStartsWithSlash) {
    uint16_t buf[] = {'/', 'a', '@', 'b'};
    const ssize_t textEnd = static_cast<ssize_t>(NELEM(buf));

    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // The only break is the end of the text.
    EXPECT_EQ(textEnd, breaker.followingWithLocale(Locale("en-US"), 0));
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());
}
486
// Calling followingWithLocale() again while the breaker is positioned inside a URL must
// reproduce the break that was already reported and must not disturb the remaining URL breaks.
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> buf = utf8ToUtf16("Hello http://abc/d.html World");
    const Locale enUS("en-US");

    WordBreaker breaker;
    breaker.setText(buf.data(), buf.size());
    EXPECT_EQ(0, breaker.current());

    // After "Hello ".
    EXPECT_EQ(6, breaker.followingWithLocale(enUS, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(5, breaker.wordEnd());

    EXPECT_EQ(6, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "http:"

    // Restart from middle point of the URL. It should return the same previous break point.
    EXPECT_EQ(11, breaker.followingWithLocale(enUS, 6));  // after "http:"
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "//".
    EXPECT_EQ(13, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // Restart from middle point of the URL. It should return the same previous break point.
    EXPECT_EQ(13, breaker.followingWithLocale(enUS, 12));  // after "//"
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "abc".
    EXPECT_EQ(16, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "/d".
    EXPECT_EQ(18, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After ".html ".
    EXPECT_EQ(24, breaker.next());
    EXPECT_TRUE(breaker.wordEnd() <= breaker.wordStart());

    // After "World".
    EXPECT_EQ(29, breaker.next());
    EXPECT_EQ(24, breaker.wordStart());
    EXPECT_EQ(29, breaker.wordEnd());
}
520
521 // b/68669534
TEST(WordBreakerTest,spaceAfterSpace)522 TEST(WordBreakerTest, spaceAfterSpace) {
523 const std::vector<uint16_t> SPACES = {
524 '\t', // TAB
525 0x1680, // OGHAM SPACE MARK
526 0x3000, // IDEOGRAPHIC SPACE
527 };
528
529 constexpr uint16_t CHAR_SPACE = 0x0020;
530
531 for (uint16_t sp : SPACES) {
532 char msg[64] = {};
533 snprintf(msg, sizeof(msg), "Test Space: U+%04X", sp);
534 SCOPED_TRACE(msg);
535
536 std::vector<uint16_t> buf = {'a', CHAR_SPACE, sp, 'b'};
537 WordBreaker breaker;
538 breaker.setText(buf.data(), buf.size());
539
540 EXPECT_EQ(0, breaker.current());
541 EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), 0)); // after "a "
542 EXPECT_EQ(0, breaker.wordStart());
543 EXPECT_EQ(1, breaker.wordEnd());
544
545 EXPECT_EQ(2, breaker.current());
546 EXPECT_EQ(3, breaker.next()); // after CHAR_SPACE character.
547 EXPECT_EQ(2, breaker.wordStart());
548 EXPECT_EQ(2, breaker.wordEnd());
549
550 EXPECT_EQ(3, breaker.current());
551 EXPECT_EQ(4, breaker.next()); // after sp character.
552 EXPECT_EQ(3, breaker.wordStart());
553 EXPECT_EQ(4, breaker.wordEnd());
554 }
555 }
556
557 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
558 public:
TestableICULineBreakerPoolImpl()559 TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
560
561 using ICULineBreakerPoolImpl::getPoolSize;
562 using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
563 };
564
// Acquiring repeatedly without releasing must hand out a distinct breaker instance every time,
// even for the same locale, while the locale id depends only on the locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // All following three breakers must be the different instances.
    ICULineBreakerPool::Slot enUSBreaker = pool.acquire(enUS);
    ICULineBreakerPool::Slot enUSBreaker2 = pool.acquire(enUS);
    ICULineBreakerPool::Slot frFRBreaker = pool.acquire(frFR);

    auto* const firstEnUS = enUSBreaker.breaker.get();
    auto* const secondEnUS = enUSBreaker2.breaker.get();
    auto* const onlyFrFR = frFRBreaker.breaker.get();

    // Every slot owns a breaker...
    EXPECT_NE(nullptr, firstEnUS);
    EXPECT_NE(nullptr, secondEnUS);
    EXPECT_NE(nullptr, onlyFrFR);

    // ...and no two slots share one.
    EXPECT_NE(firstEnUS, secondEnUS);
    EXPECT_NE(firstEnUS, onlyFrFR);
    EXPECT_NE(secondEnUS, onlyFrFR);

    // Same locale, same id; different locale, different id.
    EXPECT_EQ(enUSBreaker.localeId, enUSBreaker2.localeId);
    EXPECT_NE(enUSBreaker.localeId, frFRBreaker.localeId);
    EXPECT_NE(enUSBreaker2.localeId, frFRBreaker.localeId);
}
588
// A released breaker must be handed out again for the same locale, but never for a different
// locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    ICULineBreakerPool::Slot enUSBreaker = pool.acquire(enUS);

    // Remember the slot's identity before giving it back to the pool.
    icu::BreakIterator* const enUSBreakerPtr = enUSBreaker.breaker.get();
    const uint64_t enUSBreakerLocaleId = enUSBreaker.localeId;

    // Releasing moves the breaker out of the slot, so the local slot must be empty afterwards.
    pool.release(std::move(enUSBreaker));
    EXPECT_EQ(nullptr, enUSBreaker.breaker.get());

    // acquire must return a different instance if the locale is different.
    ICULineBreakerPool::Slot frFRBreaker = pool.acquire(frFR);
    EXPECT_NE(enUSBreakerPtr, frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreakerLocaleId, frFRBreaker.localeId);

    // acquire must return the same instance as released before if the locale is the same.
    ICULineBreakerPool::Slot enUSBreaker2 = pool.acquire(enUS);
    EXPECT_EQ(enUSBreakerPtr, enUSBreaker2.breaker.get());
    EXPECT_EQ(enUSBreakerLocaleId, enUSBreaker2.localeId);
}
614
// The pool must never hold more than MAX_POOL_SIZE released breakers: releasing beyond that
// limit leaves the pool size capped.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t MAX_POOL_SIZE = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    const size_t slotCount = MAX_POOL_SIZE * 2;
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");

    ICULineBreakerPool::Slot slots[MAX_POOL_SIZE * 2];

    // Acquire twice as many breakers as the pool can hold. Nothing has been released yet, so
    // the pool itself stays empty throughout.
    for (size_t i = 0; i < slotCount; ++i) {
        slots[i] = pool.acquire(enUS);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // Release everything again: the pool grows by one per release until it is full and then
    // stays at MAX_POOL_SIZE for the remaining releases.
    for (size_t i = 0; i < slotCount; ++i) {
        pool.release(std::move(slots[i]));
        if (i < MAX_POOL_SIZE) {
            EXPECT_EQ(i + 1, pool.getPoolSize());
        } else {
            EXPECT_EQ(MAX_POOL_SIZE, pool.getPoolSize());
        }
    }
}
639
640 } // namespace minikin
641