1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <gtest/gtest.h>
18 #include <UnicodeUtils.h>
19 #include <minikin/GraphemeBreak.h>
20
21 namespace minikin {
22
IsBreak(const char * src)23 bool IsBreak(const char* src) {
24 const size_t BUF_SIZE = 256;
25 uint16_t buf[BUF_SIZE];
26 size_t offset;
27 size_t size;
28 ParseUnicode(buf, BUF_SIZE, src, &size, &offset);
29 return GraphemeBreak::isGraphemeBreak(nullptr, buf, 0, size, offset);
30 }
31
IsBreakWithAdvances(const float * advances,const char * src)32 bool IsBreakWithAdvances(const float* advances, const char* src) {
33 const size_t BUF_SIZE = 256;
34 uint16_t buf[BUF_SIZE];
35 size_t offset;
36 size_t size;
37 ParseUnicode(buf, BUF_SIZE, src, &size, &offset);
38 return GraphemeBreak::isGraphemeBreak(advances, buf, 0, size, offset);
39 }
40
// Surrogate handling: a well-formed surrogate pair must never be split, while every kind of
// malformed UTF-16 sequence is expected to report a break at the queried offset.
TEST(GraphemeBreak, utf16) {
    // A valid pair: the break position falls inside the pair, so no break is allowed.
    EXPECT_FALSE(IsBreak("U+D83C | U+DC31"));  // emoji, U+1F431

    // tests for invalid UTF-16
    EXPECT_TRUE(IsBreak("U+D800 | U+D800"));  // two leading surrogates
    EXPECT_TRUE(IsBreak("U+DC00 | U+DC00"));  // two trailing surrogates
    EXPECT_TRUE(IsBreak("'a' | U+D800"));     // lonely leading surrogate
    EXPECT_TRUE(IsBreak("U+DC00 | 'a'"));     // lonely trailing surrogate
    EXPECT_TRUE(IsBreak("U+D800 | 'a'"));     // leading surrogate followed by non-surrogate
    EXPECT_TRUE(IsBreak("'a' | U+DC00"));     // non-surrogate followed by trailing surrogate
}
52
// Walks through the grapheme cluster boundary rules (GB1..GB999) plus the emoji tag sequence
// extension. In the rule headings "÷" denotes a break and "x" denotes no break.
TEST(GraphemeBreak, rules) {
    // Rule GB1, sot ÷; Rule GB2, ÷ eot
    EXPECT_TRUE(IsBreak("| 'a'"));
    EXPECT_TRUE(IsBreak("'a' |"));

    // Rule GB3, CR x LF
    EXPECT_FALSE(IsBreak("U+000D | U+000A"));  // CR x LF

    // Rule GB5, ÷ (Control | CR | LF): break before a control character
    EXPECT_TRUE(IsBreak("'a' | U+2028"));  // Line separator
    EXPECT_TRUE(IsBreak("'a' | U+000D"));  // CR
    EXPECT_TRUE(IsBreak("'a' | U+000A"));  // LF

    // Rule GB4, (Control | CR | LF) ÷: break after a control character
    EXPECT_TRUE(IsBreak("U+2028 | 'a'"));  // Line separator
    EXPECT_TRUE(IsBreak("U+000D | 'a'"));  // CR
    EXPECT_TRUE(IsBreak("U+000A | 'a'"));  // LF

    // Rule GB6, L x ( L | V | LV | LVT )
    EXPECT_FALSE(IsBreak("U+1100 | U+1100"));  // L x L
    EXPECT_FALSE(IsBreak("U+1100 | U+1161"));  // L x V
    EXPECT_FALSE(IsBreak("U+1100 | U+AC00"));  // L x LV
    EXPECT_FALSE(IsBreak("U+1100 | U+AC01"));  // L x LVT

    // Rule GB7, ( LV | V ) x ( V | T )
    EXPECT_FALSE(IsBreak("U+AC00 | U+1161"));  // LV x V
    EXPECT_FALSE(IsBreak("U+1161 | U+1161"));  // V x V
    EXPECT_FALSE(IsBreak("U+AC00 | U+11A8"));  // LV x T
    EXPECT_FALSE(IsBreak("U+1161 | U+11A8"));  // V x T

    // Rule GB8, ( LVT | T ) x T
    EXPECT_FALSE(IsBreak("U+AC01 | U+11A8"));  // LVT x T
    EXPECT_FALSE(IsBreak("U+11A8 | U+11A8"));  // T x T

    // Other hangul pairs not counted above _are_ breaks (GB10)
    EXPECT_TRUE(IsBreak("U+AC00 | U+1100"));  // LV x L
    EXPECT_TRUE(IsBreak("U+AC01 | U+1100"));  // LVT x L
    EXPECT_TRUE(IsBreak("U+11A8 | U+1100"));  // T x L
    EXPECT_TRUE(IsBreak("U+11A8 | U+AC00"));  // T x LV
    EXPECT_TRUE(IsBreak("U+11A8 | U+AC01"));  // T x LVT

    // Rule GB12 and Rule GB13, Regional_Indicator x Regional_Indicator
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8"));   // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)

    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA"));   // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)
    // Same case as the two above, knowing that the first two characters ligate, which is what
    // would typically happen.
    const float firstPairLigated[] = {1.0, 0.0, 0.0, 0.0, 1.0, 0.0};  // Two entries per codepoint
    EXPECT_TRUE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
    EXPECT_FALSE(IsBreakWithAdvances(firstPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));
    // Repeat the tests, but now the font doesn't have a ligature for the first two characters,
    // while it does have a ligature for the last two. This could happen for fonts that do not
    // support some (potentially encoded later than they were developed) flags.
    const float secondPairLigated[] = {1.0, 0.0, 1.0, 0.0, 0.0, 0.0};
    EXPECT_FALSE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA U+1F1F8 | U+1F1FA"));
    EXPECT_TRUE(IsBreakWithAdvances(secondPairLigated, "U+1F1FA | U+1F1F8 U+1F1FA"));

    // A leading non-RI character must not change how the following indicators pair up.
    EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA"));   // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)

    EXPECT_TRUE(
            IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_FALSE(
            IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_FALSE(
            IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)

    // Rule GB9, x (Extend | ZWJ)
    EXPECT_FALSE(IsBreak("'a' | U+0301"));  // combining accent
    EXPECT_FALSE(IsBreak("'a' | U+200D"));  // ZWJ
    // Rule GB9a, x SpacingMark
    EXPECT_FALSE(IsBreak("U+0915 | U+093E"));  // KA, AA (spacing mark)
    // Rule GB9b, Prepend x
    // see tailoring test for prepend, as current ICU doesn't have any characters in the class

    // Rule GB999, Any ÷ Any
    EXPECT_TRUE(IsBreak("'a' | 'b'"));
    EXPECT_TRUE(IsBreak("'f' | 'i'"));               // probable ligature
    EXPECT_TRUE(IsBreak("U+0644 | U+0627"));         // probable ligature, lam + alef
    EXPECT_TRUE(IsBreak("U+4E00 | U+4E00"));         // CJK ideographs
    EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8"));   // Regional indicator pair (flag)
    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'"));   // Regional indicator pair (flag)

    // Extended rule for emoji tag sequence.
    // A tag_base on its own (no tag characters following) behaves like any other character.
    EXPECT_TRUE(IsBreak("'a' | U+1F3F4 'a'"));
    EXPECT_TRUE(IsBreak("'a' U+1F3F4 | 'a'"));

    // Immediate tag_term after tag_base.
    EXPECT_TRUE(IsBreak("'a' | U+1F3F4 U+E007F 'a'"));
    EXPECT_FALSE(IsBreak("U+1F3F4 | U+E007F"));
    EXPECT_TRUE(IsBreak("'a' U+1F3F4 U+E007F | 'a'"));

    // Flag sequence
    // U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F is emoji tag sequence for the flag
    // of Scotland.
    // U+1F3F4 is WAVING BLACK FLAG. This can be a tag_base character.
    // U+E0067 is TAG LATIN SMALL LETTER G. This can be a part of tag_spec.
    // U+E0062 is TAG LATIN SMALL LETTER B. This can be a part of tag_spec.
    // U+E0073 is TAG LATIN SMALL LETTER S. This can be a part of tag_spec.
    // U+E0063 is TAG LATIN SMALL LETTER C. This can be a part of tag_spec.
    // U+E0074 is TAG LATIN SMALL LETTER T. This can be a part of tag_spec.
    // U+E007F is CANCEL TAG. This is a tag_term character.
    EXPECT_TRUE(IsBreak("'a' | U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"));
    EXPECT_FALSE(IsBreak("U+1F3F4 | U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"));
    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 | U+E0062 U+E0073 U+E0063 U+E0074 U+E007F"));
    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 | U+E0073 U+E0063 U+E0074 U+E007F"));
    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 | U+E0063 U+E0074 U+E007F"));
    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 | U+E0074 U+E007F"));
    EXPECT_FALSE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 | U+E007F"));
    EXPECT_TRUE(IsBreak("U+1F3F4 U+E0067 U+E0062 U+E0073 U+E0063 U+E0074 U+E007F | 'a'"));
}
168
// Covers the places where this implementation's expectations are tailored beyond the plain
// rule tests: selected control characters treated as "extend", the Prepend class, virama
// versus "pure killer" handling (with and without advances), and ZWJ emoji sequences.
TEST(GraphemeBreak, tailoring) {
    // control characters that we interpret as "extend"
    EXPECT_FALSE(IsBreak("'a' | U+00AD"));   // soft hyphen
    EXPECT_FALSE(IsBreak("'a' | U+200B"));   // zwsp
    EXPECT_FALSE(IsBreak("'a' | U+200E"));   // lrm
    EXPECT_FALSE(IsBreak("'a' | U+202A"));   // lre
    EXPECT_FALSE(IsBreak("'a' | U+E0041"));  // tag character

    // UTC-approved characters for the Prepend class
    EXPECT_FALSE(IsBreak("U+06DD | U+0661"));  // arabic subtending mark + digit one

    EXPECT_TRUE(IsBreak("U+0E01 | U+0E33"));  // Thai sara am

    // virama is not a grapheme break, but "pure killer" is
    EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
    EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01"));   // thai phinthu = pure killer

    // Repetition of above tests, but with a given advances array that implies everything
    // became just one cluster.
    const float conjoined[] = {1.0, 0.0, 0.0};
    EXPECT_FALSE(IsBreakWithAdvances(conjoined,
                                     "U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreakWithAdvances(conjoined,
                                     "U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreakWithAdvances(conjoined,
                                     "U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
    EXPECT_TRUE(IsBreakWithAdvances(conjoined,
                                    "U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer

    // Repetition of above tests, but with a given advances array in which the virama did not
    // form a cluster with the following consonant. The difference is that there is now
    // a grapheme break after the virama in ka+virama+ka.
    const float separate[] = {1.0, 0.0, 1.0};
    EXPECT_FALSE(IsBreakWithAdvances(separate,
                                     "U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
    EXPECT_TRUE(IsBreakWithAdvances(separate,
                                    "U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreakWithAdvances(separate,
                                     "U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
    EXPECT_TRUE(IsBreakWithAdvances(separate,
                                    "U+0E01 U+0E3A | U+0E01"));  // thai phinthu = pure killer

    // suppress grapheme breaks in zwj emoji sequences
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8"));

    // Do not break before or after zwj with any kind of emoji character.
    EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464"));
    EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464"));

    // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break
    EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
}
231
// Emoji base + skin-tone modifier (U+1F3FB..U+1F3FF) handling: no break when advances are
// unknown or show a ligature, a break when advances show the font did not ligate the pair, and
// a break whenever the left side is not an emoji base or the right side is not a modifier.
//
// The advances arrays hold one entry per UTF-16 code unit; the suffix in each array name gives
// the code-unit length of each character in the sequence (e.g. 1_2 = BMP base + astral modifier).
TEST(GraphemeBreak, emojiModifiers) {
    EXPECT_FALSE(IsBreak("U+261D | U+1F3FB"));   // white up pointing index + modifier
    EXPECT_FALSE(IsBreak("U+270C | U+1F3FB"));   // victory hand + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF"));  // sign of the horns + modifier
    EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF"));  // selfie (Unicode 9) + modifier
    // Repetition of the tests above, with the knowledge that they are ligated.
    const float ligated1_2[] = {1.0, 0.0, 0.0};
    const float ligated2_2[] = {1.0, 0.0, 0.0, 0.0};
    EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+261D | U+1F3FB"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated1_2, "U+270C | U+1F3FB"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FB"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FC"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FD"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FE"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F466 | U+1F3FF"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F918 | U+1F3FF"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated2_2, "U+1F933 | U+1F3FF"));
    // Repetition of the tests above, with the knowledge that they are not ligated.
    const float unligated1_2[] = {1.0, 1.0, 0.0};
    const float unligated2_2[] = {1.0, 0.0, 1.0, 0.0};
    EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+261D | U+1F3FB"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated1_2, "U+270C | U+1F3FB"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FB"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FC"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FD"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FE"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F466 | U+1F3FF"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F918 | U+1F3FF"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_2, "U+1F933 | U+1F3FF"));

    // adding extend characters between emoji base and modifier doesn't affect grapheme cluster
    EXPECT_FALSE(IsBreak("U+270C U+FE0E | U+1F3FB"));  // victory hand + text style + modifier
    EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB"));  // victory hand + emoji style + modifier
    // Repetition of the two tests above, with the knowledge that they are ligated.
    const float ligated1_1_2[] = {1.0, 0.0, 0.0, 0.0};
    EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
    EXPECT_FALSE(IsBreakWithAdvances(ligated1_1_2, "U+270C U+FE0F | U+1F3FB"));
    // Repetition of the first two tests, with the knowledge that they are not ligated.
    const float unligated1_1_2[] = {1.0, 0.0, 1.0, 0.0};
    EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0E | U+1F3FB"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated1_1_2, "U+270C U+FE0F | U+1F3FB"));

    // heart is not an emoji base
    EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB"));         // heart + modifier
    EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB"));  // heart + text style + modifier
    EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB"));  // heart + emoji style + modifier
    EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB"));        // modifier + modifier

    // rat is not an emoji modifier
    EXPECT_TRUE(IsBreak("U+1F466 | U+1F400"));  // boy + rat
}
288
// ZWJ profession sequences (WOMAN + ZWJ + object): kept together when advances are unknown,
// but split after the ZWJ when the advances show the font did not form the ligature.
//
// The advances arrays hold one entry per UTF-16 code unit; the suffix in each array name gives
// the code-unit length of each character in the sequence.
TEST(GraphemeBreak, genderBalancedEmoji) {
    // U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE.
    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC"));
    // The above two cases, when the ligature is not supported in the font. We now expect a
    // break after the ZWJ, while the ZWJ itself still attaches to the preceding emoji.
    const float unligated2_1_2[] = {1.0, 0.0, 0.0, 1.0, 0.0};
    EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 | U+200D U+1F4BC"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_2, "U+1F469 U+200D | U+1F4BC"));

    // U+2695 now has the emoji property, so it should be part of the ZWJ sequence.
    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695"));
    // The above two cases, when the ligature is not supported in the font. We now expect a
    // break after the ZWJ, while the ZWJ itself still attaches to the preceding emoji.
    const float unligated2_1_1[] = {1.0, 0.0, 0.0, 1.0};
    EXPECT_FALSE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 | U+200D U+2695"));
    EXPECT_TRUE(IsBreakWithAdvances(unligated2_1_1, "U+1F469 U+200D | U+2695"));
}
308
// Calls isGraphemeBreak directly with a non-zero start and a count shorter than the buffer, to
// check that the edges of the [start, start + count) window are treated as text boundaries.
TEST(GraphemeBreak, offsets) {
    // 'A', U+06DD, 'E', U+0301, 'I', U+0301 -- the window used below is start = 2, count = 3,
    // i.e. 'E', U+0301, 'I'.
    uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };
    // Window start: a break, even though the code unit just before the window is U+06DD
    // (which the tailoring test expects to attach to a following character).
    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 2));
    // Inside the window: the combining accent stays attached to 'E'.
    EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 3));
    // Inside the window: break between the accent and 'I'.
    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 4));
    // Window end: a break, even though the next code unit in the buffer is a combining accent.
    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(nullptr, string, 2, 3, 5));
}
316
317 } // namespace minikin
318