1 /*
2 * Copyright (C) 2014 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include <stdint.h>
18 #include <algorithm>
19 #include <unicode/uchar.h>
20 #include <unicode/utf16.h>
21
22 #include <minikin/GraphemeBreak.h>
23 #include "MinikinInternal.h"
24
25 namespace android {
26
tailoredGraphemeClusterBreak(uint32_t c)27 int32_t tailoredGraphemeClusterBreak(uint32_t c) {
28 // Characters defined as Control that we want to treat them as Extend.
29 // These are curated manually.
30 if (c == 0x00AD // SHY
31 || c == 0x061C // ALM
32 || c == 0x180E // MONGOLIAN VOWEL SEPARATOR
33 || c == 0x200B // ZWSP
34 || c == 0x200E // LRM
35 || c == 0x200F // RLM
36 || (0x202A <= c && c <= 0x202E) // LRE, RLE, PDF, LRO, RLO
37 || ((c | 0xF) == 0x206F) // WJ, invisible math operators, LRI, RLI, FSI, PDI,
38 // and the deprecated invisible format controls
39 || c == 0xFEFF // BOM
40 || ((c | 0x7F) == 0xE007F)) // recently undeprecated tag characters in Plane 14
41 return U_GCB_EXTEND;
42 // UTC-approved characters for the Prepend class, per
43 // http://www.unicode.org/L2/L2015/15183r-graph-cluster-brk.txt
44 // These should be removed when our copy of ICU gets updated to Unicode 9.0 (~2016 or 2017).
45 else if ((0x0600 <= c && c <= 0x0605) // Arabic subtending marks
46 || c == 0x06DD // ARABIC SUBTENDING MARK
47 || c == 0x070F // SYRIAC ABBREVIATION MARK
48 || c == 0x0D4E // MALAYALAM LETTER DOT REPH
49 || c == 0x110BD // KAITHI NUMBER SIGN
50 || c == 0x111C2 // SHARADA SIGN JIHVAMULIYA
51 || c == 0x111C3) // SHARADA SIGN UPADHMANIYA
52 return U_GCB_PREPEND;
53 // THAI CHARACTER SARA AM is treated as a normal letter by most other implementations: they
54 // allow a grapheme break before it.
55 else if (c == 0x0E33)
56 return U_GCB_OTHER;
57 else
58 return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
59 }
60
// Returns true for all characters whose IndicSyllabicCategory is Pure_Killer.
// From http://www.unicode.org/Public/8.0.0/ucd/IndicSyllabicCategory.txt
//
// The list is hard-coded and must be kept in sync with that data file by hand.
bool isPureKiller(uint32_t c) {
    switch (c) {
    case 0x0E3A:    // THAI CHARACTER PHINTHU
    case 0x0E4E:    // THAI CHARACTER YAMAKKAN
    case 0x0F84:    // TIBETAN MARK HALANTA
    case 0x103A:    // MYANMAR SIGN ASAT
    case 0x1714:    // TAGALOG SIGN VIRAMA
    case 0x1734:    // HANUNOO SIGN PAMUDPOD
    case 0x17D1:    // KHMER SIGN VIRIAM
    case 0x1A7A:    // TAI THAM SIGN RA HAAM
    case 0x1BAA:    // SUNDANESE SIGN PAMAAEH
    case 0x1BF2:    // BATAK PANGOLAT
    case 0x1BF3:    // BATAK PANONGONAN
    case 0xA806:    // SYLOTI NAGRI SIGN HASANTA
    case 0xA953:    // REJANG VIRAMA
    case 0xABED:    // MEETEI MAYEK APUN IYEK
    case 0x11134:   // CHAKMA MAAYYAA
    case 0x112EA:   // KHUDAWADI SIGN VIRAMA
    case 0x1172B:   // AHOM SIGN KILLER
        return true;
    default:
        return false;
    }
}
68
isGraphemeBreak(const uint16_t * buf,size_t start,size_t count,size_t offset)69 bool GraphemeBreak::isGraphemeBreak(const uint16_t* buf, size_t start, size_t count,
70 size_t offset) {
71 // This implementation closely follows Unicode Standard Annex #29 on
72 // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
73 // implementing a tailored version of extended grapheme clusters.
74 // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
75
76 // Rule GB1, sot ÷; Rule GB2, ÷ eot
77 if (offset <= start || offset >= start + count) {
78 return true;
79 }
80 if (U16_IS_TRAIL(buf[offset])) {
81 // Don't break a surrogate pair, but a lonely trailing surrogate pair is a break
82 return !U16_IS_LEAD(buf[offset - 1]);
83 }
84 uint32_t c1 = 0;
85 uint32_t c2 = 0;
86 size_t offset_back = offset;
87 U16_PREV(buf, start, offset_back, c1);
88 U16_NEXT(buf, offset, start + count, c2);
89 int32_t p1 = tailoredGraphemeClusterBreak(c1);
90 int32_t p2 = tailoredGraphemeClusterBreak(c2);
91 // Rule GB3, CR x LF
92 if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
93 return false;
94 }
95 // Rule GB4, (Control | CR | LF) ÷
96 if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
97 return true;
98 }
99 // Rule GB5, ÷ (Control | CR | LF)
100 if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
101 return true;
102 }
103 // Rule GB6, L x ( L | V | LV | LVT )
104 if (p1 == U_GCB_L && (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
105 return false;
106 }
107 // Rule GB7, ( LV | V ) x ( V | T )
108 if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
109 return false;
110 }
111 // Rule GB8, ( LVT | T ) x T
112 if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
113 return false;
114 }
115 // Rule GB8a that looks at even-off cases.
116 //
117 // sot (RI RI)* RI x RI
118 // [^RI] (RI RI)* RI x RI
119 // RI ÷ RI
120 if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
121 // Look at up to 1000 code units.
122 start = std::max((ssize_t)start, (ssize_t)offset_back - 1000);
123 while (offset_back > start) {
124 U16_PREV(buf, start, offset_back, c1);
125 if (tailoredGraphemeClusterBreak(c1) != U_GCB_REGIONAL_INDICATOR) {
126 offset_back += U16_LENGTH(c1);
127 break;
128 }
129 }
130
131 // Note that the offset has moved forwared 2 code units by U16_NEXT.
132 // The number 4 comes from the number of code units in a whole flag.
133 return (offset - 2 - offset_back) % 4 == 0;
134 }
135 // Rule GB9, x Extend; Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
136 if (p2 == U_GCB_EXTEND || p2 == U_GCB_SPACING_MARK || p1 == U_GCB_PREPEND) {
137 return false;
138 }
139 // Cluster indic syllables together (tailoring of UAX #29)
140 // Known limitation: this is overly conservative, and assumes that the virama may form a
141 // conjunct with the following letter, which doesn't always happen.
142 //
143 // There is no easy solution to do this correctly. Even querying the font does not help (with
144 // the current font technoloies), since the font may be creating the conjunct using multiple
145 // glyphs, while the user may be perceiving that sequence of glyphs as one conjunct or one
146 // letter.
147 if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9 // virama
148 && !isPureKiller(c1)
149 && u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
150 return false;
151 }
152 // Tailoring: make emoji sequences with ZWJ a single grapheme cluster
153 if (c1 == 0x200D && isEmoji(c2) && offset_back > start) {
154 // look at character before ZWJ to see that both can participate in an emoji zwj sequence
155 uint32_t c0 = 0;
156 U16_PREV(buf, start, offset_back, c0);
157 if (c0 == 0xFE0F && offset_back > start) {
158 // skip over emoji variation selector
159 U16_PREV(buf, start, offset_back, c0);
160 }
161 if (isEmoji(c0)) {
162 return false;
163 }
164 }
165 // Proposed Rule GB9c from http://www.unicode.org/L2/L2016/16011r3-break-prop-emoji.pdf
166 // E_Base x E_Modifier
167 if (isEmojiModifier(c2)) {
168 if (c1 == 0xFE0F && offset_back > start) {
169 // skip over emoji variation selector
170 U16_PREV(buf, start, offset_back, c1);
171 }
172 if (isEmojiBase(c1)) {
173 return false;
174 }
175 }
176 // Rule GB10, Any ÷ Any
177 return true;
178 }
179
getTextRunCursor(const uint16_t * buf,size_t start,size_t count,size_t offset,MoveOpt opt)180 size_t GraphemeBreak::getTextRunCursor(const uint16_t* buf, size_t start, size_t count,
181 size_t offset, MoveOpt opt) {
182 switch (opt) {
183 case AFTER:
184 if (offset < start + count) {
185 offset++;
186 }
187 // fall through
188 case AT_OR_AFTER:
189 while (!isGraphemeBreak(buf, start, count, offset)) {
190 offset++;
191 }
192 break;
193 case BEFORE:
194 if (offset > start) {
195 offset--;
196 }
197 // fall through
198 case AT_OR_BEFORE:
199 while (!isGraphemeBreak(buf, start, count, offset)) {
200 offset--;
201 }
202 break;
203 case AT:
204 if (!isGraphemeBreak(buf, start, count, offset)) {
205 offset = (size_t)-1;
206 }
207 break;
208 }
209 return offset;
210 }
211
212 } // namespace android
213