/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #define LOG_TAG "Minikin"
18 
19 #include "FontLanguage.h"
20 
21 #include <algorithm>
22 #include <hb.h>
23 #include <string.h>
24 #include <unicode/uloc.h>
25 
26 namespace minikin {
27 
// Combines four character codes into one 32-bit tag: c1 is shifted into the most significant
// byte and c4 occupies the least significant byte. Each argument is evaluated exactly once.
#define SCRIPT_TAG(c1, c2, c3, c4) \
        ((((uint32_t)(c1)) << 24) | (((uint32_t)(c2)) << 16) | (((uint32_t)(c3)) << 8) | \
         ((uint32_t)(c4)))
31 
// Returns true when the first |subtagLen| bytes of |buf| equal |subtag| and the match ends on a
// subtag boundary, i.e. it is followed by the end of the buffer, a NUL, '-' or '_'.
// |bufLen| is the number of readable bytes in |buf|.
static bool isEmojiSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    if (bufLen < subtagLen || strncmp(buf, subtag, subtagLen) != 0) {
        return false;  // Too short to contain the subtag, or the leading bytes differ.
    }
    if (bufLen == subtagLen) {
        return true;  // The subtag reaches the end of the buffer; there is no next byte to read.
    }
    // Reject longer words that merely start with the subtag (e.g. "emojis").
    const char next = buf[subtagLen];
    return next == '\0' || next == '-' || next == '_';
}
43 
// Packs a two- or three-character code into the low 15 bits of a 16-bit integer; the highest bit
// is always 0 for valid input. Each character is stored in a 5-bit field as its offset from a
// base character: a three-digit region code needs 10 values per field and a lowercase language
// code needs 26, so 5 bits per character are enough in both cases.
//
// A two-character code stores the all-ones marker (0x1f) in the top field and its two characters
// in the remaining fields. Any |length| other than 2 is treated as a three-character code.
//
// |twoLetterBase| / |threeLetterBase| are subtracted from each character of a two- /
// three-character code respectively. The caller must have validated |c| beforehand.
static uint16_t packLanguageOrRegion(const char* c, size_t length, uint8_t twoLetterBase,
        uint8_t threeLetterBase) {
    if (length == 2) {
        const uint16_t mid = static_cast<uint16_t>(c[0] - twoLetterBase);
        const uint16_t low = static_cast<uint16_t>(c[1] - twoLetterBase);
        return static_cast<uint16_t>(0x7c00u /* 0x1fu << 10 */ | (mid << 5) | low);
    }
    const uint16_t high = static_cast<uint16_t>(c[0] - threeLetterBase);
    const uint16_t mid = static_cast<uint16_t>(c[1] - threeLetterBase);
    const uint16_t low = static_cast<uint16_t>(c[2] - threeLetterBase);
    return static_cast<uint16_t>((high << 10) | (mid << 5) | low);
}
63 
// Expands a value made of three 5-bit fields back into characters and returns how many
// characters were written to |out| (2 or 3). No NUL terminator is appended, so |out| must have
// room for at least three bytes.
//
// When the top field holds the all-ones marker (0x1f) the value is a two-character code and
// |twoLetterBase| is added to the two lower fields; otherwise all three fields are offset by
// |threeLetterBase|.
static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
        uint8_t threeLetterBase) {
    const uint8_t high = (in >> 10) & 0x1f;
    const uint8_t mid = (in >> 5) & 0x1f;
    const uint8_t low = in & 0x1f;

    if (high == 0x1f) {
        out[0] = static_cast<char>(mid + twoLetterBase);
        out[1] = static_cast<char>(low + twoLetterBase);
        return 2;
    }
    out[0] = static_cast<char>(high + threeLetterBase);
    out[1] = static_cast<char>(mid + threeLetterBase);
    out[2] = static_cast<char>(low + threeLetterBase);
    return 3;
}
81 
// Returns the index of the first '-' or '_' in |buffer| at or after |startOffset|. If there is
// none before |bufferLength| (including when |startOffset| is already at or past the end),
// returns |bufferLength|.
static size_t nextDelimiterIndex(const char* buffer, size_t bufferLength, size_t startOffset) {
    for (size_t pos = startOffset; pos < bufferLength; ++pos) {
        const char ch = buffer[pos];
        if (ch == '-' || ch == '_') {
            return pos;
        }
    }
    return bufferLength;
}
91 
// Returns true if |c| lies in the range 'a'..'z'. Compares character codes directly and makes
// no locale-dependent library call.
static inline bool isLowercase(char c) {
    return c >= 'a' && c <= 'z';
}
95 
// Returns true if |c| lies in the range 'A'..'Z'. Compares character codes directly and makes
// no locale-dependent library call.
static inline bool isUppercase(char c) {
    return c >= 'A' && c <= 'Z';
}
99 
// Returns true if |c| lies in the range '0'..'9'. Compares character codes directly and makes
// no locale-dependent library call.
static inline bool isDigit(char c) {
    return c >= '0' && c <= '9';
}
103 
104 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const char * buffer,size_t length)105 static inline bool isValidLanguageCode(const char* buffer, size_t length) {
106     if (length != 2 && length != 3) return false;
107     if (!isLowercase(buffer[0])) return false;
108     if (!isLowercase(buffer[1])) return false;
109     if (length == 3 && !isLowercase(buffer[2])) return false;
110     return true;
111 }
112 
113 // Returns true if buffer is valid for script code. The length of buffer must be 4.
isValidScriptCode(const char * buffer)114 static inline bool isValidScriptCode(const char* buffer) {
115     return isUppercase(buffer[0]) && isLowercase(buffer[1]) && isLowercase(buffer[2]) &&
116         isLowercase(buffer[3]);
117 }
118 
119 // Returns true if the buffer is valid for region code.
isValidRegionCode(const char * buffer,size_t length)120 static inline bool isValidRegionCode(const char* buffer, size_t length) {
121     return (length == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
122             (length == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
123 }
124 
125 // Parse BCP 47 language identifier into internal structure
FontLanguage(const char * buf,size_t length)126 FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() {
127     size_t firstDelimiterPos = nextDelimiterIndex(buf, length, 0);
128     if (isValidLanguageCode(buf, firstDelimiterPos)) {
129         mLanguage = packLanguageOrRegion(buf, firstDelimiterPos, 'a', 'a');
130     } else {
131         // We don't understand anything other than two-letter or three-letter
132         // language codes, so we skip parsing the rest of the string.
133         return;
134     }
135 
136     if (firstDelimiterPos == length) {
137         mHbLanguage = hb_language_from_string(getString().c_str(), -1);
138         return;  // Language code only.
139     }
140 
141     size_t nextComponentStartPos = firstDelimiterPos + 1;
142     size_t nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
143     size_t componentLength = nextDelimiterPos - nextComponentStartPos;
144 
145     if (componentLength == 4) {
146         // Possibly script code.
147         const char* p = buf + nextComponentStartPos;
148         if (isValidScriptCode(p)) {
149             mScript = SCRIPT_TAG(p[0], p[1], p[2], p[3]);
150             mSubScriptBits = scriptToSubScriptBits(mScript);
151         }
152 
153         if (nextDelimiterPos == length) {
154             mHbLanguage = hb_language_from_string(getString().c_str(), -1);
155             mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
156             return;  // No region code.
157         }
158 
159         nextComponentStartPos = nextDelimiterPos + 1;
160         nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
161         componentLength = nextDelimiterPos - nextComponentStartPos;
162     }
163 
164     if (componentLength == 2 || componentLength == 3) {
165         // Possibly region code.
166         const char* p = buf + nextComponentStartPos;
167         if (isValidRegionCode(p, componentLength)) {
168             mRegion = packLanguageOrRegion(p, componentLength, 'A', '0');
169         }
170     }
171 
172     mHbLanguage = hb_language_from_string(getString().c_str(), -1);
173     mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
174 }
175 
176 // static
resolveEmojiStyle(const char * buf,size_t length,uint32_t script)177 FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf, size_t length,
178         uint32_t script) {
179     // First, lookup emoji subtag.
180     // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
181     // unnecessary comparison can be avoided if total length is smaller than 10.
182     const size_t kMinSubtagLength = 10;
183     if (length >= kMinSubtagLength) {
184         static const char kPrefix[] = "-u-em-";
185         const char *pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
186         if (pos != buf + length) {  // found
187             pos += strlen(kPrefix);
188             const size_t remainingLength = length - (pos - buf);
189             if (isEmojiSubtag(pos, remainingLength, "emoji", 5)){
190                 return EMSTYLE_EMOJI;
191             } else if (isEmojiSubtag(pos, remainingLength, "text", 4)){
192                 return EMSTYLE_TEXT;
193             } else if (isEmojiSubtag(pos, remainingLength, "default", 7)){
194                 return EMSTYLE_DEFAULT;
195             }
196         }
197     }
198 
199     // If no emoji subtag was provided, resolve the emoji style from script code.
200     if (script == SCRIPT_TAG('Z', 's', 'y', 'e')) {
201         return EMSTYLE_EMOJI;
202     } else if (script == SCRIPT_TAG('Z', 's', 'y', 'm')) {
203         return EMSTYLE_TEXT;
204     }
205 
206     return EMSTYLE_EMPTY;
207 }
208 
209 //static
scriptToSubScriptBits(uint32_t script)210 uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) {
211     uint8_t subScriptBits = 0u;
212     switch (script) {
213         case SCRIPT_TAG('B', 'o', 'p', 'o'):
214             subScriptBits = kBopomofoFlag;
215             break;
216         case SCRIPT_TAG('H', 'a', 'n', 'g'):
217             subScriptBits = kHangulFlag;
218             break;
219         case SCRIPT_TAG('H', 'a', 'n', 'b'):
220             // Bopomofo is almost exclusively used in Taiwan.
221             subScriptBits = kHanFlag | kBopomofoFlag;
222             break;
223         case SCRIPT_TAG('H', 'a', 'n', 'i'):
224             subScriptBits = kHanFlag;
225             break;
226         case SCRIPT_TAG('H', 'a', 'n', 's'):
227             subScriptBits = kHanFlag | kSimplifiedChineseFlag;
228             break;
229         case SCRIPT_TAG('H', 'a', 'n', 't'):
230             subScriptBits = kHanFlag | kTraditionalChineseFlag;
231             break;
232         case SCRIPT_TAG('H', 'i', 'r', 'a'):
233             subScriptBits = kHiraganaFlag;
234             break;
235         case SCRIPT_TAG('H', 'r', 'k', 't'):
236             subScriptBits = kKatakanaFlag | kHiraganaFlag;
237             break;
238         case SCRIPT_TAG('J', 'p', 'a', 'n'):
239             subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
240             break;
241         case SCRIPT_TAG('K', 'a', 'n', 'a'):
242             subScriptBits = kKatakanaFlag;
243             break;
244         case SCRIPT_TAG('K', 'o', 'r', 'e'):
245             subScriptBits = kHanFlag | kHangulFlag;
246             break;
247     }
248     return subScriptBits;
249 }
250 
getString() const251 std::string FontLanguage::getString() const {
252     if (isUnsupported()) {
253         return "und";
254     }
255     char buf[16];
256     size_t i = unpackLanguageOrRegion(mLanguage, buf, 'a', 'a');
257     if (mScript != 0) {
258         buf[i++] = '-';
259         buf[i++] = (mScript >> 24) & 0xFFu;
260         buf[i++] = (mScript >> 16) & 0xFFu;
261         buf[i++] = (mScript >> 8) & 0xFFu;
262         buf[i++] = mScript & 0xFFu;
263     }
264     if (mRegion != INVALID_CODE) {
265         buf[i++] = '-';
266         i += unpackLanguageOrRegion(mRegion, buf + i, 'A', '0');
267     }
268     return std::string(buf, i);
269 }
270 
isEqualScript(const FontLanguage & other) const271 bool FontLanguage::isEqualScript(const FontLanguage& other) const {
272     return other.mScript == mScript;
273 }
274 
275 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)276 bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
277     return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
278 }
279 
supportsHbScript(hb_script_t script) const280 bool FontLanguage::supportsHbScript(hb_script_t script) const {
281     static_assert(SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'),
282                   "The Minikin script and HarfBuzz hb_script_t have different encodings.");
283     if (script == mScript) return true;
284     return supportsScript(mSubScriptBits, scriptToSubScriptBits(script));
285 }
286 
calcScoreFor(const FontLanguages & supported) const287 int FontLanguage::calcScoreFor(const FontLanguages& supported) const {
288     bool languageScriptMatch = false;
289     bool subtagMatch = false;
290     bool scriptMatch = false;
291 
292     for (size_t i = 0; i < supported.size(); ++i) {
293         if (mEmojiStyle != EMSTYLE_EMPTY &&
294                mEmojiStyle == supported[i].mEmojiStyle) {
295             subtagMatch = true;
296             if (mLanguage == supported[i].mLanguage) {
297                 return 4;
298             }
299         }
300         if (isEqualScript(supported[i]) ||
301                 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
302             scriptMatch = true;
303             if (mLanguage == supported[i].mLanguage) {
304                 languageScriptMatch = true;
305             }
306         }
307     }
308 
309     if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
310         scriptMatch = true;
311         if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLanguage()) {
312             return 3;
313         }
314     }
315 
316     if (languageScriptMatch) {
317         return 3;
318     } else if (subtagMatch) {
319         return 2;
320     } else if (scriptMatch) {
321         return 1;
322     }
323     return 0;
324 }
325 
FontLanguages(std::vector<FontLanguage> && languages)326 FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages)
327     : mLanguages(std::move(languages)) {
328     if (mLanguages.empty()) {
329         return;
330     }
331 
332     const FontLanguage& lang = mLanguages[0];
333 
334     mIsAllTheSameLanguage = true;
335     mUnionOfSubScriptBits = lang.mSubScriptBits;
336     for (size_t i = 1; i < mLanguages.size(); ++i) {
337         mUnionOfSubScriptBits |= mLanguages[i].mSubScriptBits;
338         if (mIsAllTheSameLanguage && lang.mLanguage != mLanguages[i].mLanguage) {
339             mIsAllTheSameLanguage = false;
340         }
341     }
342 }
343 
344 #undef SCRIPT_TAG
345 }  // namespace minikin
346