• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2014 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <stdint.h>
18 #include <unicode/uchar.h>
19 #include <unicode/utf16.h>
20 #include <algorithm>
21 
22 #include <minikin/Emoji.h>
23 #include <minikin/GraphemeBreak.h>
24 #include "MinikinInternal.h"
25 #include "utils/WindowsUtils.h"
26 
27 namespace minikin {
28 
tailoredGraphemeClusterBreak(uint32_t c)29 int32_t tailoredGraphemeClusterBreak(uint32_t c) {
30   // Characters defined as Control that we want to treat them as Extend.
31   // These are curated manually.
32   if (c == 0x00AD                      // SHY
33       || c == 0x061C                   // ALM
34       || c == 0x180E                   // MONGOLIAN VOWEL SEPARATOR
35       || c == 0x200B                   // ZWSP
36       || c == 0x200E                   // LRM
37       || c == 0x200F                   // RLM
38       || (0x202A <= c && c <= 0x202E)  // LRE, RLE, PDF, LRO, RLO
39       || ((c | 0xF) ==
40           0x206F)     // WJ, invisible math operators, LRI, RLI, FSI, PDI,
41                       // and the deprecated invisible format controls
42       || c == 0xFEFF  // BOM
43       || ((c | 0x7F) ==
44           0xE007F))  // recently undeprecated tag characters in Plane 14
45     return U_GCB_EXTEND;
46   // THAI CHARACTER SARA AM is treated as a normal letter by most other
47   // implementations: they allow a grapheme break before it.
48   else if (c == 0x0E33)
49     return U_GCB_OTHER;
50   else
51     return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
52 }
53 
// Reports whether |c| is one of the code points whose IndicSyllabicCategory
// is Pure_Killer. The list is hard-coded from
// http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt
bool isPureKiller(uint32_t c) {
  switch (c) {
    case 0x0E3A:
    case 0x0E4E:
    case 0x0F84:
    case 0x103A:
    case 0x1714:
    case 0x1734:
    case 0x17D1:
    case 0x1BAA:
    case 0x1BF2:
    case 0x1BF3:
    case 0xA806:
    case 0xA953:
    case 0xABED:
    case 0x11134:
    case 0x112EA:
    case 0x1172B:
      return true;
    default:
      return false;
  }
}
62 
isGraphemeBreak(const float * advances,const uint16_t * buf,size_t start,size_t count,const size_t offset)63 bool GraphemeBreak::isGraphemeBreak(const float* advances,
64                                     const uint16_t* buf,
65                                     size_t start,
66                                     size_t count,
67                                     const size_t offset) {
68   // This implementation closely follows Unicode Standard Annex #29 on
69   // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
70   // implementing a tailored version of extended grapheme clusters.
71   // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
72 
73   // Rule GB1, sot ÷; Rule GB2, ÷ eot
74   if (offset <= start || offset >= start + count) {
75     return true;
76   }
77   if (U16_IS_TRAIL(buf[offset])) {
78     // Don't break a surrogate pair, but a lonely trailing surrogate pair is a
79     // break
80     return !U16_IS_LEAD(buf[offset - 1]);
81   }
82   uint32_t c1 = 0;
83   uint32_t c2 = 0;
84   size_t offset_back = offset;
85   size_t offset_forward = offset;
86   U16_PREV(buf, start, offset_back, c1);
87   U16_NEXT(buf, offset_forward, start + count, c2);
88   int32_t p1 = tailoredGraphemeClusterBreak(c1);
89   int32_t p2 = tailoredGraphemeClusterBreak(c2);
90   // Rule GB3, CR x LF
91   if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
92     return false;
93   }
94   // Rule GB4, (Control | CR | LF) ÷
95   if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
96     return true;
97   }
98   // Rule GB5, ÷ (Control | CR | LF)
99   if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
100     return true;
101   }
102   // Rule GB6, L x ( L | V | LV | LVT )
103   if (p1 == U_GCB_L &&
104       (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
105     return false;
106   }
107   // Rule GB7, ( LV | V ) x ( V | T )
108   if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
109     return false;
110   }
111   // Rule GB8, ( LVT | T ) x T
112   if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
113     return false;
114   }
115   // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
116   if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK ||
117       p1 == U_GCB_PREPEND) {
118     return false;
119   }
120 
121   // This is used to decide font-dependent grapheme clusters. If we don't have
122   // the advance information, we become conservative in grapheme breaking and
123   // assume that it has no advance.
124   const bool c2_has_advance =
125       (advances != nullptr && advances[offset - start] != 0.0);
126 
127   // All the following rules are font-dependent, in the way that if we know c2
128   // has an advance, we definitely know that it cannot form a grapheme with the
129   // character(s) before it. So we make the decision in favor a grapheme break
130   // early.
131   if (c2_has_advance) {
132     return true;
133   }
134 
135   // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking
136   // properties for determining emoji-ness and carry our own data, because our
137   // data could be more fresh than what ICU provides.
138   //
139   // Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier.
140   // The rule itself says do not break between emoji base and emoji modifiers,
141   // skipping all Extend characters. Variation selectors are considered Extend,
142   // so they are handled fine.
143   //
144   // We tailor this by requiring that an actual ligature is formed. If the font
145   // doesn't form a ligature, we allow a break before the modifier.
146   if (isEmojiModifier(c2)) {
147     uint32_t c0 = c1;
148     size_t offset_backback = offset_back;
149     int32_t p0 = p1;
150     if (p0 == U_GCB_EXTEND && offset_backback > start) {
151       // skip over emoji variation selector
152       U16_PREV(buf, start, offset_backback, c0);
153     }
154     if (isEmojiBase(c0)) {
155       return false;
156     }
157   }
158 
159   // Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG)
160   // We try to make emoji sequences with ZWJ a single grapheme cluster, but only
161   // if they actually merge to one cluster. So we are more relaxed than the UAX
162   // #29 rules in accepting any emoji character after the ZWJ, but are tighter
163   // in that we only treat it as one cluster if a ligature is actually formed
164   // and we also require the character before the ZWJ to also be an emoji.
165   if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
166     // look at character before ZWJ to see that both can participate in an
167     // emoji zwj sequence
168     uint32_t c0 = 0;
169     size_t offset_backback = offset_back;
170     U16_PREV(buf, start, offset_backback, c0);
171     if (c0 == 0xFE0F && offset_backback > start) {
172       // skip over emoji variation selector
173       U16_PREV(buf, start, offset_backback, c0);
174     }
175     if (isEmoji(c0)) {
176       return false;
177     }
178   }
179 
180   // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
181   // sot   (RI RI)*  RI x RI
182   // [^RI] (RI RI)*  RI x RI
183   //
184   // If we have font information, we have already broken the cluster if and only
185   // if the second character had no advance, which means a ligature was formed.
186   // If we don't, we look back like UAX #29 recommends, but only up to 1000 code
187   // units.
188   if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
189     if (advances != nullptr) {
190       // We have advances information. But if we are here, we already know c2
191       // has no advance. So we should definitely disallow a break.
192       return false;
193     } else {
194       // Look at up to 1000 code units.
195       const size_t lookback_barrier =
196           std::max((ssize_t)start, (ssize_t)offset_back - 1000);
197       size_t offset_backback = offset_back;
198       while (offset_backback > lookback_barrier) {
199         uint32_t c0 = 0;
200         U16_PREV(buf, lookback_barrier, offset_backback, c0);
201         if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
202           offset_backback += U16_LENGTH(c0);
203           break;
204         }
205       }
206       // The number 4 comes from the number of code units in a whole flag.
207       return (offset - offset_backback) % 4 == 0;
208     }
209   }
210   // Cluster Indic syllables together (tailoring of UAX #29).
211   // Immediately after each virama (that is not just a pure killer) followed by
212   // a letter, we disallow grapheme breaks (if we are here, we don't know about
213   // advances, or we already know that c2 has no advance).
214   if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
215       && !isPureKiller(c1) &&
216       u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
217     return false;
218   }
219   // Rule GB999, Any ÷ Any
220   return true;
221 }
222 
getTextRunCursor(const float * advances,const uint16_t * buf,size_t start,size_t count,size_t offset,MoveOpt opt)223 size_t GraphemeBreak::getTextRunCursor(const float* advances,
224                                        const uint16_t* buf,
225                                        size_t start,
226                                        size_t count,
227                                        size_t offset,
228                                        MoveOpt opt) {
229   switch (opt) {
230     case AFTER:
231       if (offset < start + count) {
232         offset++;
233       }
234       // fall through
235     case AT_OR_AFTER:
236       while (!isGraphemeBreak(advances, buf, start, count, offset)) {
237         offset++;
238       }
239       break;
240     case BEFORE:
241       if (offset > start) {
242         offset--;
243       }
244       // fall through
245     case AT_OR_BEFORE:
246       while (!isGraphemeBreak(advances, buf, start, count, offset)) {
247         offset--;
248       }
249       break;
250     case AT:
251       if (!isGraphemeBreak(advances, buf, start, count, offset)) {
252         offset = (size_t)-1;
253       }
254       break;
255   }
256   return offset;
257 }
258 
259 }  // namespace minikin
260