/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16
17 #include "WordBreaker.h"
18
19 #include <cstdio>
20
21 #include <gtest/gtest.h>
22
23 #include "UnicodeUtils.h"
24
// Number of elements in the array `x`. `x` must be a real array, not a pointer:
// with a pointer, sizeof(x) measures the pointer itself and the result is meaningless.
// Only defined here when no earlier header has already provided NELEM.
#ifndef NELEM
#define NELEM(x) (sizeof(x) / sizeof((x)[0]))
#endif
28
// Expands to two comma-separated array initializers: U16_LEAD(codepoint), U16_TRAIL(codepoint).
// U16_LEAD/U16_TRAIL are not defined in this file; presumably they are ICU's UTF-16 surrogate
// macros, in which case `codepoint` should be a supplementary code point (>= U+10000).
// TODO: confirm against the headers this file includes.
#define UTF16(codepoint) U16_LEAD(codepoint), U16_TRAIL(codepoint)
30
31 namespace minikin {
32
// Plain ASCII "hello world": one interior break after the space, then the end of text.
// The reported word ranges exclude the space, and both breaks have badness 0.
TEST(WordBreakerTest, basic) {
    uint16_t buf[] = {'h', 'e', 'l', 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(6, breaker.followingWithLocale(Locale("en-US"), LineBreakStyle::None,
                                             LineBreakWordStyle::None, 0));  // after "hello "
    EXPECT_EQ(0, breaker.wordStart());  // "hello"
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(6, breaker.current());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(6, breaker.wordStart());  // "world"
    EXPECT_EQ(11, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(11, breaker.current());
}
50
// A SOFT HYPHEN (U+00AD) inside a word must not produce a break of its own here: the first
// break is still after the space, and the word range spans the soft hyphen.
TEST(WordBreakerTest, softHyphen) {
    uint16_t buf[] = {'h', 'e', 'l', 0x00AD, 'l', 'o', ' ', 'w', 'o', 'r', 'l', 'd'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "hel{SOFT HYPHEN}lo "
    EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "hel{SOFT HYPHEN}lo"
    EXPECT_EQ(6, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(7, breaker.wordStart());  // "world"
    EXPECT_EQ(12, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
68
// "sugar-free" must come back as a single unit: the first break is the end of the text and
// the word range covers the whole buffer.
TEST(WordBreakerTest, hardHyphen) {
    // Hyphens should not allow breaks anymore.
    uint16_t buf[] = {'s', 'u', 'g', 'a', 'r', '-', 'f', 'r', 'e', 'e'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)),
              breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
83
// Currency signs attached to a word (CENT SIGN U+00A2, YEN SIGN U+00A5) are expected to be
// included in the reported word range.
TEST(WordBreakerTest, postfixAndPrefix) {
    uint16_t buf[] = {'U', 'S', 0x00A2, ' ', 'J', 'P', 0x00A5};  // US¢ JP¥
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // after "US¢ " (CENT SIGN plus the following space)
    EXPECT_EQ(4, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "US¢"
    EXPECT_EQ(3, breaker.wordEnd());

    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end of string
    EXPECT_EQ(4, breaker.wordStart());  // "JP¥"
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.wordEnd());
}
101
// A Myanmar kinzi sequence (NGA + ASAT + VIRAMA before the base consonant) must not be split:
// the only break is the end of the text and the word range covers the whole buffer.
TEST(WordBreakerTest, myanmarKinzi) {
    // NGA, ASAT, VIRAMA, KA, VOWEL SIGN AA (U+102C)
    uint16_t buf[] = {0x1004, 0x103A, 0x1039, 0x1000, 0x102C};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());

    // end of string
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)),
              breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.wordEnd());
}
116
// Emoji joined by ZERO WIDTH JOINER (U+200D) must stay together: breaks are expected only
// between the four ZWJ sequences, at UTF-16 offsets 7, 17, 22 and 27.
TEST(WordBreakerTest, zwjEmojiSequences) {
    uint16_t buf[] = {
            // man + zwj + heart + zwj + man
            UTF16(0x1F468), 0x200D, 0x2764, 0x200D, UTF16(0x1F468),
            // woman + zwj + heart + zwj + kiss mark + zwj + woman
            UTF16(0x1F469), 0x200D, 0x2764, 0x200D, UTF16(0x1F48B), 0x200D, UTF16(0x1F469),
            // eye + zwj + left speech bubble
            UTF16(0x1F441), 0x200D, UTF16(0x1F5E8),
            // CAT FACE + zwj + BUST IN SILHOUETTE
            UTF16(0x1F431), 0x200D, UTF16(0x1F464),
    };
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after man + zwj + heart + zwj + man
    EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());
    EXPECT_EQ(17, breaker.next());  // after woman + zwj + heart + zwj + kiss mark + zwj + woman
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(22, breaker.next());  // after eye + zwj + left speech bubble
    EXPECT_EQ(17, breaker.wordStart());
    EXPECT_EQ(22, breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(22, breaker.wordStart());
    EXPECT_EQ(27, breaker.wordEnd());
}
147
// An emoji followed by a Fitzpatrick skin-tone modifier must not be split from it; the only
// interior break is between the two modified emoji.
TEST(WordBreakerTest, emojiWithModifier) {
    uint16_t buf[] = {
            // boy + type 1-2 fitzpatrick modifier
            UTF16(0x1F466), UTF16(0x1F3FB),
            // victory hand + emoji style (VS16) + type 6 fitzpatrick modifier
            0x270C, 0xFE0F, UTF16(0x1F3FF),
    };
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after boy + type 1-2 fitzpatrick modifier
    EXPECT_EQ(4, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(4, breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(4, breaker.wordStart());
    EXPECT_EQ(8, breaker.wordEnd());
}
167
// Adjacent emoji that are not joined by ZWJ must each be separated by a break, with a
// variation selector (VS15/VS16) staying attached to the emoji it follows.
TEST(WordBreakerTest, unicode10Emoji) {
    // Should break between emojis.
    uint16_t buf[] = {
            // SLED + SLED
            UTF16(0x1F6F7), UTF16(0x1F6F7),
            // SLED + VS15 + SLED
            UTF16(0x1F6F7), 0xFE0E, UTF16(0x1F6F7),
            // WHITE SMILING FACE + SLED
            0x263A, UTF16(0x1F6F7),
            // WHITE SMILING FACE + VS16 + SLED
            0x263A, 0xFE0F, UTF16(0x1F6F7),
    };
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(2, breaker.followingWithLocale(Locale("en"), lbStyle, lbWordStyle, 0));  // SLED
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(2, breaker.wordEnd());

    EXPECT_EQ(4, breaker.next());  // SLED
    EXPECT_EQ(2, breaker.wordStart());
    EXPECT_EQ(4, breaker.wordEnd());

    EXPECT_EQ(7, breaker.next());  // SLED + VS15
    EXPECT_EQ(4, breaker.wordStart());
    EXPECT_EQ(7, breaker.wordEnd());

    EXPECT_EQ(9, breaker.next());  // SLED
    EXPECT_EQ(7, breaker.wordStart());
    EXPECT_EQ(9, breaker.wordEnd());

    EXPECT_EQ(10, breaker.next());  // WHITE SMILING FACE
    EXPECT_EQ(9, breaker.wordStart());
    EXPECT_EQ(10, breaker.wordEnd());

    EXPECT_EQ(12, breaker.next());  // SLED
    EXPECT_EQ(10, breaker.wordStart());
    EXPECT_EQ(12, breaker.wordEnd());

    EXPECT_EQ(14, breaker.next());  // WHITE SMILING FACE + VS16
    EXPECT_EQ(12, breaker.wordStart());
    EXPECT_EQ(14, breaker.wordEnd());

    EXPECT_EQ(16, breaker.next());  // SLED (end)
    EXPECT_EQ(14, breaker.wordStart());
    EXPECT_EQ(16, breaker.wordEnd());
}
217
// Two consecutive U+1F3F4 code points (2 UTF-16 units each) with no tag sequence attached:
// a break is expected between them.
TEST(WordBreakerTest, flagsSequenceSingleFlag) {
    const std::string kFlag = "U+1F3F4";
    const std::string flags = kFlag + " " + kFlag;

    const int kFlagLength = 2;
    const size_t BUF_SIZE = kFlagLength * 2;

    uint16_t buf[BUF_SIZE];
    // Initialized so a parse that writes nothing cannot leave an indeterminate length.
    size_t size = 0;
    ParseUnicode(buf, BUF_SIZE, flags.c_str(), &size, nullptr);
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;

    WordBreaker breaker;
    breaker.setText(buf, size);
    EXPECT_EQ(0, breaker.current());
    // end of the first flag
    EXPECT_EQ(kFlagLength, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(kFlagLength, breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(size), breaker.next());
    EXPECT_EQ(kFlagLength, breaker.wordStart());
    EXPECT_EQ(kFlagLength * 2, breaker.wordEnd());
}
242
// Two consecutive emoji tag sequences must each stay intact, with a single break between them.
TEST(WordBreakerTest, flagsSequence) {
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
    // of Scotland.
    const std::string kFlagSequence = "U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F";
    const std::string flagSequence = kFlagSequence + " " + kFlagSequence;

    // Seven supplementary code points, two UTF-16 units each.
    const int kFlagLength = 14;
    const size_t BUF_SIZE = kFlagLength * 2;

    uint16_t buf[BUF_SIZE];
    // Initialized so a parse that writes nothing cannot leave an indeterminate length.
    size_t size = 0;
    ParseUnicode(buf, BUF_SIZE, flagSequence.c_str(), &size, nullptr);
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;

    WordBreaker breaker;
    breaker.setText(buf, size);
    EXPECT_EQ(0, breaker.current());
    // end of the first flag sequence
    EXPECT_EQ(kFlagLength, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(kFlagLength, breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(size), breaker.next());
    EXPECT_EQ(kFlagLength, breaker.wordStart());
    EXPECT_EQ(kFlagLength * 2, breaker.wordEnd());
}
269
// Leading and trailing punctuation does not move the break positions but is expected to be
// excluded from the reported word range.
TEST(WordBreakerTest, punct) {
    uint16_t buf[] = {0x00A1, 0x00A1, 'h', 'e', 'l', 'l', 'o', ',',
                      ' ',    'w',    'o', 'r', 'l', 'd', '!', '!'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "¡¡hello, "
    EXPECT_EQ(9, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(2, breaker.wordStart());  // "hello"
    EXPECT_EQ(7, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(9, breaker.wordStart());  // "world"
    EXPECT_EQ(14, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
288
// Email address followed by a normal word. Breaks inside the address are expected to carry
// badness 1 and an empty word range (wordStart() >= wordEnd()); the break that ends the
// address has badness 0.
TEST(WordBreakerTest, email) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm', 'p',
                      'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "foo@example"
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(16, breaker.next());  // after ".com "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(16, breaker.wordStart());  // "x"
    EXPECT_EQ(17, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
309
// "mailto:" URL followed by a normal word. Breaks inside the URL are expected to carry
// badness 1 and an empty word range; the break that ends the URL has badness 0.
TEST(WordBreakerTest, mailto) {
    uint16_t buf[] = {'m', 'a', 'i', 'l', 't', 'o', ':', 'f', 'o', 'o', '@', 'e',
                      'x', 'a', 'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "mailto:"
    EXPECT_EQ(7, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(18, breaker.next());  // after "foo@example"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(23, breaker.next());  // after ".com "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(23, breaker.wordStart());  // "x"
    EXPECT_EQ(24, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
333
334 // The current logic always places a line break after a detected email address or URL
335 // and an immediately following non-ASCII character.
TEST(WordBreakerTest,emailNonAscii)336 TEST(WordBreakerTest, emailNonAscii) {
337 uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',
338 'p', 'l', 'e', '.', 'c', 'o', 'm', 0x4E00};
339 auto lbStyle = LineBreakStyle::None;
340 auto lbWordStyle = LineBreakWordStyle::None;
341 WordBreaker breaker;
342 breaker.setText(buf, NELEM(buf));
343 EXPECT_EQ(0, breaker.current());
344 EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
345 0)); // after "foo@example"
346 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
347 EXPECT_EQ(1, breaker.breakBadness());
348 EXPECT_EQ(15, breaker.next()); // after ".com"
349 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
350 EXPECT_EQ(0, breaker.breakBadness());
351 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
352 EXPECT_EQ(15, breaker.wordStart()); // "一"
353 EXPECT_EQ(16, breaker.wordEnd());
354 EXPECT_EQ(0, breaker.breakBadness());
355 }
356
// Email address whose last letter carries a combining mark (U+0303 COMBINING TILDE): the
// mark is expected to stay with the address, so the break falls after the following space.
TEST(WordBreakerTest, emailCombining) {
    uint16_t buf[] = {'f', 'o', 'o', '@', 'e', 'x', 'a', 'm',    'p',
                      'l', 'e', '.', 'c', 'o', 'm', 0x0303, ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "foo@example"
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(17, breaker.next());  // after ".com̃ "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(17, breaker.wordStart());  // "x"
    EXPECT_EQ(18, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
377
// A standalone '@' surrounded by spaces is not part of an email address: every break is
// expected to have badness 0, and the '@' segment reports an empty word range.
TEST(WordBreakerTest, lonelyAt) {
    uint16_t buf[] = {'a', ' ', '@', ' ', 'b'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "a "
    EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());  // "a"
    EXPECT_EQ(1, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(4, breaker.next());  // after "@ "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(4, breaker.wordStart());  // "b"
    EXPECT_EQ(5, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
398
// HTTP URL followed by a normal word. Breaks inside the URL are expected to carry badness 1
// and an empty word range (wordStart() >= wordEnd()); the break that ends the URL has
// badness 0.
TEST(WordBreakerTest, url) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'e', 'x', 'a',
                      'm', 'p', 'l', 'e', '.', 'c', 'o', 'm', ' ', 'x'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "http:"
    EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(14, breaker.next());  // after "example"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(1, breaker.breakBadness());
    EXPECT_EQ(19, breaker.next());  // after ".com "
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_EQ(19, breaker.wordStart());  // "x"
    EXPECT_EQ(20, breaker.wordEnd());
    EXPECT_EQ(0, breaker.breakBadness());
}
425
426 // Breaks according to section 14.12 of Chicago Manual of Style, *URLs or DOIs and line breaks*
TEST(WordBreakerTest,urlBreakChars)427 TEST(WordBreakerTest, urlBreakChars) {
428 uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '.', 'b', '/',
429 '~', 'c', ',', 'd', '-', 'e', '?', 'f', '=', 'g', '&',
430 'h', '#', 'i', '%', 'j', '_', 'k', '/', 'l'};
431 auto lbStyle = LineBreakStyle::None;
432 auto lbWordStyle = LineBreakWordStyle::None;
433 WordBreaker breaker;
434 breaker.setText(buf, NELEM(buf));
435 EXPECT_EQ(0, breaker.current());
436 EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
437 0)); // after "http:"
438 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
439 EXPECT_EQ(1, breaker.breakBadness());
440 EXPECT_EQ(7, breaker.next()); // after "//"
441 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
442 EXPECT_EQ(1, breaker.breakBadness());
443 EXPECT_EQ(8, breaker.next()); // after "a"
444 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
445 EXPECT_EQ(1, breaker.breakBadness());
446 EXPECT_EQ(10, breaker.next()); // after ".b"
447 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
448 EXPECT_EQ(1, breaker.breakBadness());
449 EXPECT_EQ(11, breaker.next()); // after "/"
450 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
451 EXPECT_EQ(1, breaker.breakBadness());
452 EXPECT_EQ(13, breaker.next()); // after "~c"
453 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
454 EXPECT_EQ(1, breaker.breakBadness());
455 EXPECT_EQ(15, breaker.next()); // after ",d"
456 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
457 EXPECT_EQ(1, breaker.breakBadness());
458 EXPECT_EQ(17, breaker.next()); // after "-e"
459 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
460 EXPECT_EQ(1, breaker.breakBadness());
461 EXPECT_EQ(19, breaker.next()); // after "?f"
462 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
463 EXPECT_EQ(1, breaker.breakBadness());
464 EXPECT_EQ(20, breaker.next()); // after "="
465 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
466 EXPECT_EQ(1, breaker.breakBadness());
467 EXPECT_EQ(21, breaker.next()); // after "g"
468 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
469 EXPECT_EQ(1, breaker.breakBadness());
470 EXPECT_EQ(22, breaker.next()); // after "&"
471 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
472 EXPECT_EQ(1, breaker.breakBadness());
473 EXPECT_EQ(23, breaker.next()); // after "h"
474 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
475 EXPECT_EQ(1, breaker.breakBadness());
476 EXPECT_EQ(25, breaker.next()); // after "#i"
477 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
478 EXPECT_EQ(1, breaker.breakBadness());
479 EXPECT_EQ(27, breaker.next()); // after "%j"
480 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
481 EXPECT_EQ(1, breaker.breakBadness());
482 EXPECT_EQ(29, breaker.next()); // after "_k"
483 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
484 EXPECT_EQ(1, breaker.breakBadness());
485 EXPECT_EQ((ssize_t)NELEM(buf), breaker.next()); // end
486 EXPECT_TRUE(breaker.wordStart() >= breaker.wordEnd());
487 EXPECT_EQ(0, breaker.breakBadness());
488 }
489
// A hyphen directly followed by '/' inside a URL: no break is expected between offset 8
// (after "a") and the end of the text, so "-/b" stays in one piece.
TEST(WordBreakerTest, urlNoHyphenBreak) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '-', '/', 'b'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "http:"
    EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
507
// URL whose last character is '/': the trailing slash is expected to be consumed by the
// final break at the end of the text rather than producing an extra break position.
TEST(WordBreakerTest, urlEndsWithSlash) {
    uint16_t buf[] = {'h', 't', 't', 'p', ':', '/', '/', 'a', '/'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    // after "http:"
    EXPECT_EQ(5, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(7, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(8, breaker.next());  // after "a"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)), breaker.next());  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
525
// Text that starts with '/' and contains '@': the first and only break is expected at the
// end of the text, with an empty word range.
TEST(WordBreakerTest, emailStartsWithSlash) {
    uint16_t buf[] = {'/', 'a', '@', 'b'};
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf, NELEM(buf));
    EXPECT_EQ(0, breaker.current());
    EXPECT_EQ(static_cast<ssize_t>(NELEM(buf)),
              breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));  // end
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
}
537
// Calling followingWithLocale() again from an offset inside a URL must yield the same break
// positions as an uninterrupted next() walk over that URL.
TEST(WordBreakerTest, setLocaleInsideUrl) {
    std::vector<uint16_t> buf = utf8ToUtf16("Hello http://abc/d.html World");
    auto lbStyle = LineBreakStyle::None;
    auto lbWordStyle = LineBreakWordStyle::None;
    WordBreaker breaker;
    breaker.setText(buf.data(), buf.size());
    EXPECT_EQ(0, breaker.current());
    // after "Hello "
    EXPECT_EQ(6, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 0));
    EXPECT_EQ(0, breaker.wordStart());
    EXPECT_EQ(5, breaker.wordEnd());

    EXPECT_EQ(6, breaker.current());
    EXPECT_EQ(11, breaker.next());  // after "http:"

    // Restart from offset 6, the first character of the URL. It should return the same
    // break point as the next() call above.
    EXPECT_EQ(11, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 6));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    EXPECT_EQ(13, breaker.next());  // after "//"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    // Restart from offset 12, in the middle of "//". It should return the same break point
    // as the next() call above.
    EXPECT_EQ(13, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle, 12));
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(16, breaker.next());  // after "abc"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(18, breaker.next());  // after "/d"
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());
    EXPECT_EQ(24, breaker.next());  // after ".html " (including the trailing space)
    EXPECT_GE(breaker.wordStart(), breaker.wordEnd());

    EXPECT_EQ(29, breaker.next());  // after "World" (end of text)
    EXPECT_EQ(24, breaker.wordStart());
    EXPECT_EQ(29, breaker.wordEnd());
}
576
577 // b/68669534
TEST(WordBreakerTest,spaceAfterSpace)578 TEST(WordBreakerTest, spaceAfterSpace) {
579 const std::vector<uint16_t> SPACES = {
580 '\t', // TAB
581 0x1680, // OGHAM SPACE MARK
582 0x3000, // IDEOGRAPHIC SPACE
583 };
584
585 constexpr uint16_t CHAR_SPACE = 0x0020;
586 auto lbStyle = LineBreakStyle::None;
587 auto lbWordStyle = LineBreakWordStyle::None;
588
589 for (uint16_t sp : SPACES) {
590 char msg[64] = {};
591 snprintf(msg, sizeof(msg), "Test Space: U+%04X", sp);
592 SCOPED_TRACE(msg);
593
594 std::vector<uint16_t> buf = {'a', CHAR_SPACE, sp, 'b'};
595 WordBreaker breaker;
596 breaker.setText(buf.data(), buf.size());
597
598 EXPECT_EQ(0, breaker.current());
599 EXPECT_EQ(2, breaker.followingWithLocale(Locale("en-US"), lbStyle, lbWordStyle,
600 0)); // after "a "
601 EXPECT_EQ(0, breaker.wordStart());
602 EXPECT_EQ(1, breaker.wordEnd());
603
604 EXPECT_EQ(2, breaker.current());
605 EXPECT_EQ(3, breaker.next()); // after CHAR_SPACE character.
606 EXPECT_EQ(2, breaker.wordStart());
607 EXPECT_EQ(2, breaker.wordEnd());
608
609 EXPECT_EQ(3, breaker.current());
610 EXPECT_EQ(4, breaker.next()); // after sp character.
611 EXPECT_EQ(3, breaker.wordStart());
612 EXPECT_EQ(4, breaker.wordEnd());
613 }
614 }
615
616 class TestableICULineBreakerPoolImpl : public ICULineBreakerPoolImpl {
617 public:
TestableICULineBreakerPoolImpl()618 TestableICULineBreakerPoolImpl() : ICULineBreakerPoolImpl() {}
619
620 using ICULineBreakerPoolImpl::getPoolSize;
621 using ICULineBreakerPoolImpl::MAX_POOL_SIZE;
622 };
623
// Acquiring repeatedly without releasing must hand out a distinct breaker instance each time,
// even for an identical (locale, style, word style) request, while the locale ID depends only
// on the locale.
TEST(WordBreakerTest, LineBreakerPool_acquire_without_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // All of the following five breakers must be different instances.
    ICULineBreakerPool::Slot enUSBreaker =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enUSBreaker2 =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot enUSBreaker3 =
            pool.acquire(enUS, LineBreakStyle::Strict, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frFRBreaker =
            pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frFRBreaker2 =
            pool.acquire(frFR, LineBreakStyle::None, LineBreakWordStyle::Phrase);

    EXPECT_NE(nullptr, enUSBreaker.breaker.get());
    EXPECT_NE(nullptr, enUSBreaker2.breaker.get());
    EXPECT_NE(nullptr, enUSBreaker3.breaker.get());
    EXPECT_NE(nullptr, frFRBreaker.breaker.get());
    EXPECT_NE(nullptr, frFRBreaker2.breaker.get());

    EXPECT_NE(enUSBreaker.breaker.get(), enUSBreaker2.breaker.get());
    EXPECT_NE(enUSBreaker.breaker.get(), enUSBreaker3.breaker.get());
    EXPECT_NE(enUSBreaker.breaker.get(), frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreaker2.breaker.get(), frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreaker2.breaker.get(), frFRBreaker2.breaker.get());
    EXPECT_NE(enUSBreaker2.breaker.get(), enUSBreaker3.breaker.get());

    // Locale IDs match exactly when the locales match, regardless of the break styles.
    EXPECT_EQ(enUSBreaker.localeId, enUSBreaker2.localeId);
    EXPECT_EQ(enUSBreaker.localeId, enUSBreaker3.localeId);
    EXPECT_NE(enUSBreaker.localeId, frFRBreaker.localeId);
    EXPECT_NE(enUSBreaker.localeId, frFRBreaker2.localeId);
    EXPECT_NE(enUSBreaker2.localeId, frFRBreaker.localeId);
    EXPECT_NE(enUSBreaker2.localeId, frFRBreaker2.localeId);
    EXPECT_EQ(frFRBreaker.localeId, frFRBreaker2.localeId);
}
663
// A released breaker must be handed back by a later acquire() with the same locale and
// styles, and must not be handed out for a different locale or different styles.
TEST(WordBreakerTest, LineBreakerPool_acquire_with_release) {
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");
    const Locale frFR("fr-Latn-FR");

    // Acquire one breaker and remember its identity before giving it back to the pool.
    ICULineBreakerPool::Slot enUSBreaker =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);

    uint64_t enUSBreakerLocaleId = enUSBreaker.localeId;
    auto* enUSBreakerPtr = enUSBreaker.breaker.get();

    pool.release(std::move(enUSBreaker));
    // Deliberate read of the moved-from slot: release() must have taken the breaker.
    EXPECT_EQ(nullptr, enUSBreaker.breaker.get());

    // acquire must return a different instance if the locale is different.
    ICULineBreakerPool::Slot frFRBreaker =
            pool.acquire(frFR, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_NE(enUSBreakerPtr, frFRBreaker.breaker.get());
    EXPECT_NE(enUSBreakerLocaleId, frFRBreaker.localeId);

    // acquire must return the same instance as released before if the locale is the same.
    ICULineBreakerPool::Slot enUSBreaker2 =
            pool.acquire(enUS, LineBreakStyle::Loose, LineBreakWordStyle::None);
    EXPECT_EQ(enUSBreakerPtr, enUSBreaker2.breaker.get());
    EXPECT_EQ(enUSBreakerLocaleId, enUSBreaker2.localeId);

    // acquire must return a different instance if the line break style or the line break
    // word style is different.
    ICULineBreakerPool::Slot frFRBreaker2 =
            pool.acquire(frFR, LineBreakStyle::Normal, LineBreakWordStyle::None);
    ICULineBreakerPool::Slot frFRBreaker3 =
            pool.acquire(frFR, LineBreakStyle::Normal, LineBreakWordStyle::Phrase);
    EXPECT_NE(frFRBreaker.breaker.get(), frFRBreaker2.breaker.get());
    EXPECT_NE(frFRBreaker.breaker.get(), frFRBreaker3.breaker.get());
    EXPECT_NE(frFRBreaker2.breaker.get(), frFRBreaker3.breaker.get());
    EXPECT_EQ(frFRBreaker.localeId, frFRBreaker2.localeId);
    EXPECT_EQ(frFRBreaker.localeId, frFRBreaker3.localeId);
    EXPECT_EQ(frFRBreaker2.localeId, frFRBreaker3.localeId);
}
704
// Releasing more slots than MAX_POOL_SIZE must not grow the pool beyond MAX_POOL_SIZE.
TEST(WordBreakerTest, LineBreakerPool_exceeds_pool_size) {
    const size_t MAX_POOL_SIZE = TestableICULineBreakerPoolImpl::MAX_POOL_SIZE;
    TestableICULineBreakerPoolImpl pool;

    const Locale enUS("en-Latn-US");

    ICULineBreakerPool::Slot slots[MAX_POOL_SIZE * 2];

    // Acquire twice as many slots as the pool can hold. Nothing has been released yet, so
    // getPoolSize() is expected to stay at 0 throughout.
    for (size_t i = 0; i < MAX_POOL_SIZE * 2; i++) {
        slots[i] = pool.acquire(enUS, LineBreakStyle::None, LineBreakWordStyle::None);
        EXPECT_EQ(0U, pool.getPoolSize());
    }

    // The first MAX_POOL_SIZE releases each add one entry to the pool.
    for (size_t i = 0; i < MAX_POOL_SIZE; i++) {
        pool.release(std::move(slots[i]));
        EXPECT_EQ(i + 1, pool.getPoolSize());
    }

    // Further releases must leave the pool size capped at MAX_POOL_SIZE.
    for (size_t i = MAX_POOL_SIZE; i < MAX_POOL_SIZE * 2; i++) {
        pool.release(std::move(slots[i]));
        EXPECT_EQ(MAX_POOL_SIZE, pool.getPoolSize());
    }
}
729
730 } // namespace minikin
731