1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <gtest/gtest.h>
18 #include <UnicodeUtils.h>
19 #include <minikin/GraphemeBreak.h>
20
21 using namespace android;
22
IsBreak(const char * src)23 bool IsBreak(const char* src) {
24 const size_t BUF_SIZE = 256;
25 uint16_t buf[BUF_SIZE];
26 size_t offset;
27 size_t size;
28 ParseUnicode(buf, BUF_SIZE, src, &size, &offset);
29 return GraphemeBreak::isGraphemeBreak(buf, 0, size, offset);
30 }
31
// Surrogate handling: no break inside a well-formed surrogate pair, and a break around every
// malformed (unpaired) surrogate.
TEST(GraphemeBreak, utf16) {
    // Well-formed pair. Note: U+D83C U+DC31 decodes to U+1F031, not U+1F431 (the latter would
    // be U+D83D U+DC31); either way the position is in the middle of a single code point.
    EXPECT_FALSE(IsBreak("U+D83C | U+DC31"));

    // tests for invalid UTF-16
    EXPECT_TRUE(IsBreak("U+D800 | U+D800"));  // two leading surrogates
    EXPECT_TRUE(IsBreak("U+DC00 | U+DC00"));  // two trailing surrogates
    EXPECT_TRUE(IsBreak("'a' | U+D800"));     // lonely leading surrogate
    EXPECT_TRUE(IsBreak("U+DC00 | 'a'"));     // lonely trailing surrogate
    EXPECT_TRUE(IsBreak("U+D800 | 'a'"));     // leading surrogate followed by non-surrogate
    EXPECT_TRUE(IsBreak("'a' | U+DC00"));     // non-surrogate followed by trailing surrogate
}
43
// Exercises the grapheme cluster boundary rules (GB1..GB10) one by one. In the rule names,
// "x" means "no break" and "÷" means "break".
TEST(GraphemeBreak, rules) {
    // Rule GB1, sot ÷; Rule GB2, ÷ eot
    EXPECT_TRUE(IsBreak("| 'a'"));
    EXPECT_TRUE(IsBreak("'a' |"));

    // Rule GB3, CR x LF
    EXPECT_FALSE(IsBreak("U+000D | U+000A"));  // CR x LF

    // Rule GB5, ÷ (Control | CR | LF): break before a control character
    EXPECT_TRUE(IsBreak("'a' | U+2028"));  // Line separator
    EXPECT_TRUE(IsBreak("'a' | U+000D"));  // CR
    EXPECT_TRUE(IsBreak("'a' | U+000A"));  // LF

    // Rule GB4, (Control | CR | LF) ÷: break after a control character
    EXPECT_TRUE(IsBreak("U+2028 | 'a'"));  // Line separator
    EXPECT_TRUE(IsBreak("U+000D | 'a'"));  // CR
    EXPECT_TRUE(IsBreak("U+000A | 'a'"));  // LF

    // Rule GB6, L x ( L | V | LV | LVT )
    EXPECT_FALSE(IsBreak("U+1100 | U+1100"));  // L x L
    EXPECT_FALSE(IsBreak("U+1100 | U+1161"));  // L x V
    EXPECT_FALSE(IsBreak("U+1100 | U+AC00"));  // L x LV
    EXPECT_FALSE(IsBreak("U+1100 | U+AC01"));  // L x LVT

    // Rule GB7, ( LV | V ) x ( V | T )
    EXPECT_FALSE(IsBreak("U+AC00 | U+1161"));  // LV x V
    EXPECT_FALSE(IsBreak("U+1161 | U+1161"));  // V x V
    EXPECT_FALSE(IsBreak("U+AC00 | U+11A8"));  // LV x T
    EXPECT_FALSE(IsBreak("U+1161 | U+11A8"));  // V x T

    // Rule GB8, ( LVT | T ) x T
    EXPECT_FALSE(IsBreak("U+AC01 | U+11A8"));  // LVT x T
    EXPECT_FALSE(IsBreak("U+11A8 | U+11A8"));  // T x T

    // Other hangul pairs not counted above _are_ breaks (GB10)
    EXPECT_TRUE(IsBreak("U+AC00 | U+1100"));  // LV x L
    EXPECT_TRUE(IsBreak("U+AC01 | U+1100"));  // LVT x L
    EXPECT_TRUE(IsBreak("U+11A8 | U+1100"));  // T x L
    EXPECT_TRUE(IsBreak("U+11A8 | U+AC00"));  // T x LV
    EXPECT_TRUE(IsBreak("U+11A8 | U+AC01"));  // T x LVT

    // Rule GB8a, Regional_Indicator x Regional_Indicator
    // Regional indicators are grouped in pairs (flags) counting from the start of the run, so
    // a break is expected only after an even number of them.
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8"));
    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8"));   // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)

    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | U+1F1FA"));   // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)

    EXPECT_TRUE(IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA"));   // Regional indicator pair (flag)
    EXPECT_FALSE(IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA"));  // Regional indicator pair (flag)

    EXPECT_TRUE(
            IsBreak("'a' U+1F1FA U+1F1F8 | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_FALSE(
            IsBreak("'a' U+1F1FA | U+1F1F8 U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_FALSE(
            IsBreak("'a' U+1F1FA U+1F1F8 U+1F1FA | U+1F1F8"));  // Regional indicator pair (flag)

    // Rule GB9, x Extend
    EXPECT_FALSE(IsBreak("'a' | U+0301"));  // combining accent
    // Rule GB9a, x SpacingMark
    EXPECT_FALSE(IsBreak("U+0915 | U+093E"));  // KA, AA (spacing mark)
    // Rule GB9b, Prepend x
    // see tailoring test for prepend, as current ICU doesn't have any characters in the class

    // Rule GB10, Any ÷ Any
    EXPECT_TRUE(IsBreak("'a' | 'b'"));
    EXPECT_TRUE(IsBreak("'f' | 'i'"));              // probable ligature
    EXPECT_TRUE(IsBreak("U+0644 | U+0627"));        // probable ligature, lam + alef
    EXPECT_TRUE(IsBreak("U+4E00 | U+4E00"));        // CJK ideographs
    EXPECT_TRUE(IsBreak("'a' | U+1F1FA U+1F1F8"));  // Regional indicator pair (flag)
    EXPECT_TRUE(IsBreak("U+1F1FA U+1F1F8 | 'a'"));  // Regional indicator pair (flag)
}
119
// Cases where the expected result is a deliberate tailoring rather than one of the numbered
// GB rules: extra "extend"-like characters, Prepend characters, viramas, and ZWJ emoji
// sequences.
TEST(GraphemeBreak, tailoring) {
    // control characters that we interpret as "extend"
    EXPECT_FALSE(IsBreak("'a' | U+00AD"));   // soft hyphen
    EXPECT_FALSE(IsBreak("'a' | U+200B"));   // zwsp
    EXPECT_FALSE(IsBreak("'a' | U+200E"));   // lrm
    EXPECT_FALSE(IsBreak("'a' | U+202A"));   // lre
    EXPECT_FALSE(IsBreak("'a' | U+E0041"));  // tag character

    // UTC-approved characters for the Prepend class
    EXPECT_FALSE(IsBreak("U+06DD | U+0661"));  // arabic subtending mark + digit one

    EXPECT_TRUE(IsBreak("U+0E01 | U+0E33"));  // Thai sara am

    // virama is not a grapheme break, but "pure killer" is
    EXPECT_FALSE(IsBreak("U+0915 | U+094D U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreak("U+0915 U+094D | U+0915"));  // Devanagari ka+virama+ka
    EXPECT_FALSE(IsBreak("U+0E01 | U+0E3A U+0E01"));  // thai phinthu = pure killer
    EXPECT_TRUE(IsBreak("U+0E01 U+0E3A | U+0E01"));   // thai phinthu = pure killer

    // suppress grapheme breaks in zwj emoji sequences, see
    // http://www.unicode.org/emoji/charts/emoji-zwj-sequences.html
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2764 U+FE0F U+200D U+1F48B U+200D U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D | U+1F48B U+200D U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+2764 U+FE0F U+200D U+1F48B U+200D | U+1F468"));
    EXPECT_FALSE(IsBreak("U+1F468 U+200D | U+1F469 U+200D U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F468 U+200D U+1F469 U+200D | U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F469 U+200D U+1F467 U+200D U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D | U+1F467 U+200D U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D U+1F469 U+200D U+1F467 U+200D | U+1F466"));
    EXPECT_FALSE(IsBreak("U+1F441 U+200D | U+1F5E8"));

    // Do not break before or after zwj with all kinds of emoji characters.
    EXPECT_FALSE(IsBreak("U+1F431 | U+200D U+1F464"));
    EXPECT_FALSE(IsBreak("U+1F431 U+200D | U+1F464"));

    // ARABIC LETTER BEH + ZWJ + heart, not a zwj emoji sequence, so we preserve the break
    EXPECT_TRUE(IsBreak("U+0628 U+200D | U+2764"));
}
158
// Emoji modifiers (U+1F3FB..U+1F3FF) attach to a preceding emoji base without a break; after
// anything else they start a new cluster.
TEST(GraphemeBreak, emojiModifiers) {
    EXPECT_FALSE(IsBreak("U+261D | U+1F3FB"));   // white up pointing index + modifier
    EXPECT_FALSE(IsBreak("U+270C | U+1F3FB"));   // victory hand + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FB"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FC"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FD"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FE"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F466 | U+1F3FF"));  // boy + modifier
    EXPECT_FALSE(IsBreak("U+1F918 | U+1F3FF"));  // sign of the horns + modifier
    EXPECT_FALSE(IsBreak("U+1F933 | U+1F3FF"));  // selfie (Unicode 9) + modifier

    // The emoji style variation selector (U+FE0F) doesn't affect the grapheme cluster, but the
    // text style variation selector (U+FE0E) separates the base from the modifier.
    EXPECT_TRUE(IsBreak("U+270C U+FE0E | U+1F3FB"));   // victory hand + text style + modifier
    EXPECT_FALSE(IsBreak("U+270C U+FE0F | U+1F3FB"));  // victory hand + emoji style + modifier

    // heart is not an emoji base
    EXPECT_TRUE(IsBreak("U+2764 | U+1F3FB"));         // heart + modifier
    EXPECT_TRUE(IsBreak("U+2764 U+FE0E | U+1F3FB"));  // heart + text style + modifier
    EXPECT_TRUE(IsBreak("U+2764 U+FE0F | U+1F3FB"));  // heart + emoji style + modifier
    EXPECT_TRUE(IsBreak("U+1F3FB | U+1F3FB"));        // modifier + modifier

    // rat is not an emoji modifier
    EXPECT_TRUE(IsBreak("U+1F466 | U+1F400"));  // boy + rat
}
184
// ZWJ sequences of the form WOMAN + ZWJ + <object> must stay in one cluster: no break on
// either side of the ZWJ.
TEST(GraphemeBreak, genderBalancedEmoji) {
    // U+1F469 is WOMAN, U+200D is ZWJ, U+1F4BC is BRIEFCASE.
    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+1F4BC"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+1F4BC"));

    // U+2695 now has the emoji property, so it should be part of a ZWJ sequence.
    EXPECT_FALSE(IsBreak("U+1F469 | U+200D U+2695"));
    EXPECT_FALSE(IsBreak("U+1F469 U+200D | U+2695"));
}
194
// Calls isGraphemeBreak directly with a non-zero second argument. Judging from the calls in
// this file the arguments are (buffer, start, count, offset), so the examined range here is
// indices [2, 5): U+0045 U+0301 U+0049 (confirm against minikin/GraphemeBreak.h).
TEST(GraphemeBreak, offsets) {
    uint16_t string[] = { 0x0041, 0x06DD, 0x0045, 0x0301, 0x0049, 0x0301 };
    // offset == start: break expected although U+06DD (index 1) sits right before the range.
    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 2));
    // between U+0045 and the combining accent U+0301: no break
    EXPECT_FALSE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 3));
    // between U+0301 and U+0049: break
    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 4));
    // offset == start + count: break expected although U+0301 (index 5) follows the range.
    EXPECT_TRUE(GraphemeBreak::isGraphemeBreak(string, 2, 3, 5));
}
202