1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "Minikin"
18
19 #include "FontLanguage.h"
20
21 #include <hb.h>
22 #include <string.h>
23 #include <unicode/uloc.h>
24 #include <algorithm>
25
26 namespace minikin {
27
// Packs four 8-bit characters into one 32-bit tag, with c1 in the most
// significant byte and c4 in the least significant byte. Kept as a macro so
// the result can be used in `case` labels and static_assert.
#define SCRIPT_TAG(c1, c2, c3, c4)                                         \
  ((static_cast<uint32_t>(c1) << 24) | (static_cast<uint32_t>(c2) << 16) | \
   (static_cast<uint32_t>(c3) << 8) | static_cast<uint32_t>(c4))
31
// Returns true if |buf| (of |bufLen| characters) begins with |subtag| (of
// |subtagLen| characters) and the match ends on a subtag boundary: either the
// end of the buffer, a NUL character, or a '-' / '_' delimiter.
static bool isEmojiSubtag(const char* buf,
                          size_t bufLen,
                          const char* subtag,
                          size_t subtagLen) {
  // The buffer must be long enough to hold the subtag and must start with it.
  if (bufLen < subtagLen || strncmp(buf, subtag, subtagLen) != 0) {
    return false;
  }
  if (bufLen == subtagLen) {
    return true;  // The subtag runs to the very end of the buffer.
  }
  // Otherwise the character right after the match must terminate the subtag.
  const char following = buf[subtagLen];
  return following == '\0' || following == '-' || following == '_';
}
46
// Packs a two- or three-character code into the low 15 bits of a 16-bit
// integer, using 5 bits per character; the highest bit is always 0 for valid
// input.
//
// Each character is stored as its offset from a base character. Language
// codes consist of lowercase letters (26 possible values) and three-character
// region codes consist of digits (10 possible values), so 5 bits per
// character are sufficient.
//
// A two-character code is marked by filling the first 5-bit field with all
// ones (0x1f); its two characters occupy the remaining two fields. Any
// |length| other than 2 is treated as a three-character code.
static uint16_t packLanguageOrRegion(const char* c,
                                     size_t length,
                                     uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
  if (length == 2) {
    const uint16_t middle = static_cast<uint16_t>(c[0] - twoLetterBase);
    const uint16_t low = static_cast<uint16_t>(c[1] - twoLetterBase);
    return (0x1fu << 10) | (middle << 5) | low;
  }

  const uint16_t high = static_cast<uint16_t>(c[0] - threeLetterBase);
  const uint16_t middle = static_cast<uint16_t>(c[1] - threeLetterBase);
  const uint16_t low = static_cast<uint16_t>(c[2] - threeLetterBase);
  return (high << 10) | (middle << 5) | low;
}
69
// Reverses packLanguageOrRegion(): splits |in| into three 5-bit fields and
// writes the decoded characters to |out| (no terminating NUL is written).
// If the top field is all ones (0x1f) the value holds a two-character code
// decoded against |twoLetterBase|; otherwise all three fields are decoded
// against |threeLetterBase|. Returns the number of characters written (2 or
// 3), so |out| must have room for at least that many.
static size_t unpackLanguageOrRegion(uint16_t in,
                                     char* out,
                                     uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
  const uint8_t kFieldMask = 0x1f;
  const uint8_t high = static_cast<uint8_t>((in >> 10) & kFieldMask);
  const uint8_t middle = static_cast<uint8_t>((in >> 5) & kFieldMask);
  const uint8_t low = static_cast<uint8_t>(in & kFieldMask);

  if (high == kFieldMask) {
    // Two-character code: the top field is only a marker.
    out[0] = middle + twoLetterBase;
    out[1] = low + twoLetterBase;
    return 2;
  }

  out[0] = high + threeLetterBase;
  out[1] = middle + threeLetterBase;
  out[2] = low + threeLetterBase;
  return 3;
}
89
// Scans |buffer| forward from |startOffset| and returns the index of the
// first '-' or '_' character. Returns |bufferLength| when no delimiter is
// found (including when |startOffset| is already at or past the end).
static size_t nextDelimiterIndex(const char* buffer,
                                 size_t bufferLength,
                                 size_t startOffset) {
  size_t index = startOffset;
  while (index < bufferLength) {
    const char current = buffer[index];
    if (current == '-' || current == '_') {
      return index;
    }
    ++index;
  }
  return bufferLength;
}
102
// Returns true if |c| is one of the characters 'a' through 'z'.
static inline bool isLowercase(char c) {
  return c >= 'a' && c <= 'z';
}
106
// Returns true if |c| is one of the characters 'A' through 'Z'.
static inline bool isUppercase(char c) {
  return c >= 'A' && c <= 'Z';
}
110
// Returns true if |c| is one of the characters '0' through '9'.
static inline bool isDigit(char c) {
  return c >= '0' && c <= '9';
}
114
115 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const char * buffer,size_t length)116 static inline bool isValidLanguageCode(const char* buffer, size_t length) {
117 if (length != 2 && length != 3)
118 return false;
119 if (!isLowercase(buffer[0]))
120 return false;
121 if (!isLowercase(buffer[1]))
122 return false;
123 if (length == 3 && !isLowercase(buffer[2]))
124 return false;
125 return true;
126 }
127
128 // Returns true if buffer is valid for script code. The length of buffer must
129 // be 4.
isValidScriptCode(const char * buffer)130 static inline bool isValidScriptCode(const char* buffer) {
131 return isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
132 isLowercase(buffer[2]) && isLowercase(buffer[3]);
133 }
134
135 // Returns true if the buffer is valid for region code.
isValidRegionCode(const char * buffer,size_t length)136 static inline bool isValidRegionCode(const char* buffer, size_t length) {
137 return (length == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
138 (length == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) &&
139 isDigit(buffer[2]));
140 }
141
142 // Parse BCP 47 language identifier into internal structure
FontLanguage(const char * buf,size_t length)143 FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() {
144 size_t firstDelimiterPos = nextDelimiterIndex(buf, length, 0);
145 if (isValidLanguageCode(buf, firstDelimiterPos)) {
146 mLanguage = packLanguageOrRegion(buf, firstDelimiterPos, 'a', 'a');
147 } else {
148 // We don't understand anything other than two-letter or three-letter
149 // language codes, so we skip parsing the rest of the string.
150 return;
151 }
152
153 if (firstDelimiterPos == length) {
154 mHbLanguage = hb_language_from_string(getString().c_str(), -1);
155 return; // Language code only.
156 }
157
158 size_t nextComponentStartPos = firstDelimiterPos + 1;
159 size_t nextDelimiterPos =
160 nextDelimiterIndex(buf, length, nextComponentStartPos);
161 size_t componentLength = nextDelimiterPos - nextComponentStartPos;
162
163 if (componentLength == 4) {
164 // Possibly script code.
165 const char* p = buf + nextComponentStartPos;
166 if (isValidScriptCode(p)) {
167 mScript = SCRIPT_TAG(p[0], p[1], p[2], p[3]);
168 mSubScriptBits = scriptToSubScriptBits(mScript);
169 }
170
171 if (nextDelimiterPos == length) {
172 mHbLanguage = hb_language_from_string(getString().c_str(), -1);
173 mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
174 return; // No region code.
175 }
176
177 nextComponentStartPos = nextDelimiterPos + 1;
178 nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
179 componentLength = nextDelimiterPos - nextComponentStartPos;
180 }
181
182 if (componentLength == 2 || componentLength == 3) {
183 // Possibly region code.
184 const char* p = buf + nextComponentStartPos;
185 if (isValidRegionCode(p, componentLength)) {
186 mRegion = packLanguageOrRegion(p, componentLength, 'A', '0');
187 }
188 }
189
190 mHbLanguage = hb_language_from_string(getString().c_str(), -1);
191 mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
192 }
193
194 // static
resolveEmojiStyle(const char * buf,size_t length,uint32_t script)195 FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf,
196 size_t length,
197 uint32_t script) {
198 // First, lookup emoji subtag.
199 // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
200 // unnecessary comparison can be avoided if total length is smaller than 10.
201 const size_t kMinSubtagLength = 10;
202 if (length >= kMinSubtagLength) {
203 static const char kPrefix[] = "-u-em-";
204 const char* pos =
205 std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
206 if (pos != buf + length) { // found
207 pos += strlen(kPrefix);
208 const size_t remainingLength = length - (pos - buf);
209 if (isEmojiSubtag(pos, remainingLength, "emoji", 5)) {
210 return EMSTYLE_EMOJI;
211 } else if (isEmojiSubtag(pos, remainingLength, "text", 4)) {
212 return EMSTYLE_TEXT;
213 } else if (isEmojiSubtag(pos, remainingLength, "default", 7)) {
214 return EMSTYLE_DEFAULT;
215 }
216 }
217 }
218
219 // If no emoji subtag was provided, resolve the emoji style from script code.
220 if (script == SCRIPT_TAG('Z', 's', 'y', 'e')) {
221 return EMSTYLE_EMOJI;
222 } else if (script == SCRIPT_TAG('Z', 's', 'y', 'm')) {
223 return EMSTYLE_TEXT;
224 }
225
226 return EMSTYLE_EMPTY;
227 }
228
229 // static
scriptToSubScriptBits(uint32_t script)230 uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) {
231 uint8_t subScriptBits = 0u;
232 switch (script) {
233 case SCRIPT_TAG('B', 'o', 'p', 'o'):
234 subScriptBits = kBopomofoFlag;
235 break;
236 case SCRIPT_TAG('H', 'a', 'n', 'g'):
237 subScriptBits = kHangulFlag;
238 break;
239 case SCRIPT_TAG('H', 'a', 'n', 'b'):
240 // Bopomofo is almost exclusively used in Taiwan.
241 subScriptBits = kHanFlag | kBopomofoFlag;
242 break;
243 case SCRIPT_TAG('H', 'a', 'n', 'i'):
244 subScriptBits = kHanFlag;
245 break;
246 case SCRIPT_TAG('H', 'a', 'n', 's'):
247 subScriptBits = kHanFlag | kSimplifiedChineseFlag;
248 break;
249 case SCRIPT_TAG('H', 'a', 'n', 't'):
250 subScriptBits = kHanFlag | kTraditionalChineseFlag;
251 break;
252 case SCRIPT_TAG('H', 'i', 'r', 'a'):
253 subScriptBits = kHiraganaFlag;
254 break;
255 case SCRIPT_TAG('H', 'r', 'k', 't'):
256 subScriptBits = kKatakanaFlag | kHiraganaFlag;
257 break;
258 case SCRIPT_TAG('J', 'p', 'a', 'n'):
259 subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
260 break;
261 case SCRIPT_TAG('K', 'a', 'n', 'a'):
262 subScriptBits = kKatakanaFlag;
263 break;
264 case SCRIPT_TAG('K', 'o', 'r', 'e'):
265 subScriptBits = kHanFlag | kHangulFlag;
266 break;
267 }
268 return subScriptBits;
269 }
270
getString() const271 std::string FontLanguage::getString() const {
272 if (isUnsupported()) {
273 return "und";
274 }
275 char buf[16];
276 size_t i = unpackLanguageOrRegion(mLanguage, buf, 'a', 'a');
277 if (mScript != 0) {
278 buf[i++] = '-';
279 buf[i++] = (mScript >> 24) & 0xFFu;
280 buf[i++] = (mScript >> 16) & 0xFFu;
281 buf[i++] = (mScript >> 8) & 0xFFu;
282 buf[i++] = mScript & 0xFFu;
283 }
284 if (mRegion != INVALID_CODE) {
285 buf[i++] = '-';
286 i += unpackLanguageOrRegion(mRegion, buf + i, 'A', '0');
287 }
288 return std::string(buf, i);
289 }
290
isEqualScript(const FontLanguage & other) const291 bool FontLanguage::isEqualScript(const FontLanguage& other) const {
292 return other.mScript == mScript;
293 }
294
295 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)296 bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
297 return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
298 }
299
supportsHbScript(hb_script_t script) const300 bool FontLanguage::supportsHbScript(hb_script_t script) const {
301 static_assert(
302 SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'),
303 "The Minikin script and HarfBuzz hb_script_t have different encodings.");
304 if (script == mScript)
305 return true;
306 return supportsScript(mSubScriptBits, scriptToSubScriptBits(script));
307 }
308
calcScoreFor(const FontLanguages & supported) const309 int FontLanguage::calcScoreFor(const FontLanguages& supported) const {
310 bool languageScriptMatch = false;
311 bool subtagMatch = false;
312 bool scriptMatch = false;
313
314 for (size_t i = 0; i < supported.size(); ++i) {
315 if (mEmojiStyle != EMSTYLE_EMPTY &&
316 mEmojiStyle == supported[i].mEmojiStyle) {
317 subtagMatch = true;
318 if (mLanguage == supported[i].mLanguage) {
319 return 4;
320 }
321 }
322 if (isEqualScript(supported[i]) ||
323 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
324 scriptMatch = true;
325 if (mLanguage == supported[i].mLanguage) {
326 languageScriptMatch = true;
327 }
328 }
329 }
330
331 if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
332 scriptMatch = true;
333 if (mLanguage == supported[0].mLanguage &&
334 supported.isAllTheSameLanguage()) {
335 return 3;
336 }
337 }
338
339 if (languageScriptMatch) {
340 return 3;
341 } else if (subtagMatch) {
342 return 2;
343 } else if (scriptMatch) {
344 return 1;
345 }
346 return 0;
347 }
348
FontLanguages(std::vector<FontLanguage> && languages)349 FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages)
350 : mLanguages(std::move(languages)) {
351 if (mLanguages.empty()) {
352 return;
353 }
354
355 const FontLanguage& lang = mLanguages[0];
356
357 mIsAllTheSameLanguage = true;
358 mUnionOfSubScriptBits = lang.mSubScriptBits;
359 for (size_t i = 1; i < mLanguages.size(); ++i) {
360 mUnionOfSubScriptBits |= mLanguages[i].mSubScriptBits;
361 if (mIsAllTheSameLanguage && lang.mLanguage != mLanguages[i].mLanguage) {
362 mIsAllTheSameLanguage = false;
363 }
364 }
365 }
366
367 #undef SCRIPT_TAG
368 } // namespace minikin
369