• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "Minikin"
18 
19 #include "FontLanguage.h"
20 
21 #include <hb.h>
22 #include <string.h>
23 #include <unicode/uloc.h>
24 #include <algorithm>
25 
26 namespace minikin {
27 
// Combines four character codes into a single 32-bit script tag. |c1| ends up
// in the most significant byte and |c4| in the least significant byte.
#define SCRIPT_TAG(c1, c2, c3, c4)                                          \
  ((static_cast<uint32_t>(c1) << 24) | (static_cast<uint32_t>(c2) << 16) | \
   (static_cast<uint32_t>(c3) << 8) | static_cast<uint32_t>(c4))
31 
// Returns true when |buf| (|bufLen| characters long) starts with the first
// |subtagLen| characters of |subtag| and that match is a whole subtag: it must
// be followed by the end of the buffer, a NUL character, '-' or '_'.
static bool isEmojiSubtag(const char* buf,
                          size_t bufLen,
                          const char* subtag,
                          size_t subtagLen) {
  const bool startsWithSubtag =
      bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
  if (!startsWithSubtag) {
    return false;
  }
  if (bufLen == subtagLen) {
    return true;  // The subtag is the last thing in the buffer.
  }
  // Otherwise the character right after the match must terminate the subtag.
  switch (buf[subtagLen]) {
    case '\0':
    case '-':
    case '_':
      return true;
    default:
      return false;
  }
}
46 
// Packs a two- or three-character code into the low 15 bits of a 16-bit
// integer, five bits per character; the highest bit is always 0.
//
// Each character is stored as its offset from a base character. A
// three-character region code consists of digits only (10 possible values per
// character) and a language code consists of lowercase letters only (26
// possible values per character), so five bits per character are sufficient.
//
// A two-character code stores the all-ones value 0x1f in the first five-bit
// field, followed by the two character offsets from |twoLetterBase|. Any
// |length| other than 2 is treated as a three-character code using
// |threeLetterBase|.
static uint16_t packLanguageOrRegion(const char* c,
                                     size_t length,
                                     uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
  const bool isTwoLetter = (length == 2);
  const uint8_t base = isTwoLetter ? twoLetterBase : threeLetterBase;

  const uint16_t firstOffset = static_cast<uint16_t>(c[0] - base);
  const uint16_t secondOffset = static_cast<uint16_t>(c[1] - base);
  if (isTwoLetter) {
    // 0x7c00 == 0x1f << 10, the marker for a two-character code.
    return static_cast<uint16_t>(0x7c00u | (firstOffset << 5) | secondOffset);
  }

  // The third character is only read for three-character codes.
  const uint16_t thirdOffset = static_cast<uint16_t>(c[2] - base);
  return static_cast<uint16_t>((firstOffset << 10) | (secondOffset << 5) |
                               thirdOffset);
}
69 
// Expands a packed 15-bit code into |out| and returns the number of characters
// written (2 or 3). No terminating NUL is written.
//
// When the top five-bit field holds 0x1f the value is a two-character code and
// the remaining two fields are offsets from |twoLetterBase|; otherwise all
// three fields are offsets from |threeLetterBase|.
static size_t unpackLanguageOrRegion(uint16_t in,
                                     char* out,
                                     uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
  const uint8_t highField = static_cast<uint8_t>((in >> 10) & 0x1f);
  const uint8_t midField = static_cast<uint8_t>((in >> 5) & 0x1f);
  const uint8_t lowField = static_cast<uint8_t>(in & 0x1f);

  char* cursor = out;
  if (highField != 0x1f) {
    *cursor++ = highField + threeLetterBase;
    *cursor++ = midField + threeLetterBase;
    *cursor++ = lowField + threeLetterBase;
  } else {
    *cursor++ = midField + twoLetterBase;
    *cursor++ = lowField + twoLetterBase;
  }
  return static_cast<size_t>(cursor - out);
}
89 
// Scans |buffer| from |startOffset| for the first '-' or '_' and returns its
// index. Returns |bufferLength| when no delimiter is found, including when
// |startOffset| is already at or past the end of the buffer.
static size_t nextDelimiterIndex(const char* buffer,
                                 size_t bufferLength,
                                 size_t startOffset) {
  size_t pos = startOffset;
  while (pos < bufferLength && buffer[pos] != '-' && buffer[pos] != '_') {
    ++pos;
  }
  return pos < bufferLength ? pos : bufferLength;
}
102 
// Returns true if |c| lies in the range 'a'..'z'.
static inline bool isLowercase(char c) {
  return c >= 'a' && c <= 'z';
}
106 
// Returns true if |c| lies in the range 'A'..'Z'.
static inline bool isUppercase(char c) {
  return c >= 'A' && c <= 'Z';
}
110 
// Returns true if |c| lies in the range '0'..'9'.
static inline bool isDigit(char c) {
  return c >= '0' && c <= '9';
}
114 
115 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const char * buffer,size_t length)116 static inline bool isValidLanguageCode(const char* buffer, size_t length) {
117   if (length != 2 && length != 3)
118     return false;
119   if (!isLowercase(buffer[0]))
120     return false;
121   if (!isLowercase(buffer[1]))
122     return false;
123   if (length == 3 && !isLowercase(buffer[2]))
124     return false;
125   return true;
126 }
127 
128 // Returns true if buffer is valid for script code. The length of buffer must
129 // be 4.
isValidScriptCode(const char * buffer)130 static inline bool isValidScriptCode(const char* buffer) {
131   return isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
132          isLowercase(buffer[2]) && isLowercase(buffer[3]);
133 }
134 
135 // Returns true if the buffer is valid for region code.
isValidRegionCode(const char * buffer,size_t length)136 static inline bool isValidRegionCode(const char* buffer, size_t length) {
137   return (length == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
138          (length == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) &&
139           isDigit(buffer[2]));
140 }
141 
142 // Parse BCP 47 language identifier into internal structure
FontLanguage(const char * buf,size_t length)143 FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() {
144   size_t firstDelimiterPos = nextDelimiterIndex(buf, length, 0);
145   if (isValidLanguageCode(buf, firstDelimiterPos)) {
146     mLanguage = packLanguageOrRegion(buf, firstDelimiterPos, 'a', 'a');
147   } else {
148     // We don't understand anything other than two-letter or three-letter
149     // language codes, so we skip parsing the rest of the string.
150     return;
151   }
152 
153   if (firstDelimiterPos == length) {
154     mHbLanguage = hb_language_from_string(getString().c_str(), -1);
155     return;  // Language code only.
156   }
157 
158   size_t nextComponentStartPos = firstDelimiterPos + 1;
159   size_t nextDelimiterPos =
160       nextDelimiterIndex(buf, length, nextComponentStartPos);
161   size_t componentLength = nextDelimiterPos - nextComponentStartPos;
162 
163   if (componentLength == 4) {
164     // Possibly script code.
165     const char* p = buf + nextComponentStartPos;
166     if (isValidScriptCode(p)) {
167       mScript = SCRIPT_TAG(p[0], p[1], p[2], p[3]);
168       mSubScriptBits = scriptToSubScriptBits(mScript);
169     }
170 
171     if (nextDelimiterPos == length) {
172       mHbLanguage = hb_language_from_string(getString().c_str(), -1);
173       mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
174       return;  // No region code.
175     }
176 
177     nextComponentStartPos = nextDelimiterPos + 1;
178     nextDelimiterPos = nextDelimiterIndex(buf, length, nextComponentStartPos);
179     componentLength = nextDelimiterPos - nextComponentStartPos;
180   }
181 
182   if (componentLength == 2 || componentLength == 3) {
183     // Possibly region code.
184     const char* p = buf + nextComponentStartPos;
185     if (isValidRegionCode(p, componentLength)) {
186       mRegion = packLanguageOrRegion(p, componentLength, 'A', '0');
187     }
188   }
189 
190   mHbLanguage = hb_language_from_string(getString().c_str(), -1);
191   mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
192 }
193 
194 // static
resolveEmojiStyle(const char * buf,size_t length,uint32_t script)195 FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf,
196                                                          size_t length,
197                                                          uint32_t script) {
198   // First, lookup emoji subtag.
199   // 10 is the length of "-u-em-text", which is the shortest emoji subtag,
200   // unnecessary comparison can be avoided if total length is smaller than 10.
201   const size_t kMinSubtagLength = 10;
202   if (length >= kMinSubtagLength) {
203     static const char kPrefix[] = "-u-em-";
204     const char* pos =
205         std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
206     if (pos != buf + length) {  // found
207       pos += strlen(kPrefix);
208       const size_t remainingLength = length - (pos - buf);
209       if (isEmojiSubtag(pos, remainingLength, "emoji", 5)) {
210         return EMSTYLE_EMOJI;
211       } else if (isEmojiSubtag(pos, remainingLength, "text", 4)) {
212         return EMSTYLE_TEXT;
213       } else if (isEmojiSubtag(pos, remainingLength, "default", 7)) {
214         return EMSTYLE_DEFAULT;
215       }
216     }
217   }
218 
219   // If no emoji subtag was provided, resolve the emoji style from script code.
220   if (script == SCRIPT_TAG('Z', 's', 'y', 'e')) {
221     return EMSTYLE_EMOJI;
222   } else if (script == SCRIPT_TAG('Z', 's', 'y', 'm')) {
223     return EMSTYLE_TEXT;
224   }
225 
226   return EMSTYLE_EMPTY;
227 }
228 
229 // static
scriptToSubScriptBits(uint32_t script)230 uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) {
231   uint8_t subScriptBits = 0u;
232   switch (script) {
233     case SCRIPT_TAG('B', 'o', 'p', 'o'):
234       subScriptBits = kBopomofoFlag;
235       break;
236     case SCRIPT_TAG('H', 'a', 'n', 'g'):
237       subScriptBits = kHangulFlag;
238       break;
239     case SCRIPT_TAG('H', 'a', 'n', 'b'):
240       // Bopomofo is almost exclusively used in Taiwan.
241       subScriptBits = kHanFlag | kBopomofoFlag;
242       break;
243     case SCRIPT_TAG('H', 'a', 'n', 'i'):
244       subScriptBits = kHanFlag;
245       break;
246     case SCRIPT_TAG('H', 'a', 'n', 's'):
247       subScriptBits = kHanFlag | kSimplifiedChineseFlag;
248       break;
249     case SCRIPT_TAG('H', 'a', 'n', 't'):
250       subScriptBits = kHanFlag | kTraditionalChineseFlag;
251       break;
252     case SCRIPT_TAG('H', 'i', 'r', 'a'):
253       subScriptBits = kHiraganaFlag;
254       break;
255     case SCRIPT_TAG('H', 'r', 'k', 't'):
256       subScriptBits = kKatakanaFlag | kHiraganaFlag;
257       break;
258     case SCRIPT_TAG('J', 'p', 'a', 'n'):
259       subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
260       break;
261     case SCRIPT_TAG('K', 'a', 'n', 'a'):
262       subScriptBits = kKatakanaFlag;
263       break;
264     case SCRIPT_TAG('K', 'o', 'r', 'e'):
265       subScriptBits = kHanFlag | kHangulFlag;
266       break;
267   }
268   return subScriptBits;
269 }
270 
getString() const271 std::string FontLanguage::getString() const {
272   if (isUnsupported()) {
273     return "und";
274   }
275   char buf[16];
276   size_t i = unpackLanguageOrRegion(mLanguage, buf, 'a', 'a');
277   if (mScript != 0) {
278     buf[i++] = '-';
279     buf[i++] = (mScript >> 24) & 0xFFu;
280     buf[i++] = (mScript >> 16) & 0xFFu;
281     buf[i++] = (mScript >> 8) & 0xFFu;
282     buf[i++] = mScript & 0xFFu;
283   }
284   if (mRegion != INVALID_CODE) {
285     buf[i++] = '-';
286     i += unpackLanguageOrRegion(mRegion, buf + i, 'A', '0');
287   }
288   return std::string(buf, i);
289 }
290 
isEqualScript(const FontLanguage & other) const291 bool FontLanguage::isEqualScript(const FontLanguage& other) const {
292   return other.mScript == mScript;
293 }
294 
295 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)296 bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
297   return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
298 }
299 
supportsHbScript(hb_script_t script) const300 bool FontLanguage::supportsHbScript(hb_script_t script) const {
301   static_assert(
302       SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'),
303       "The Minikin script and HarfBuzz hb_script_t have different encodings.");
304   if (script == mScript)
305     return true;
306   return supportsScript(mSubScriptBits, scriptToSubScriptBits(script));
307 }
308 
calcScoreFor(const FontLanguages & supported) const309 int FontLanguage::calcScoreFor(const FontLanguages& supported) const {
310   bool languageScriptMatch = false;
311   bool subtagMatch = false;
312   bool scriptMatch = false;
313 
314   for (size_t i = 0; i < supported.size(); ++i) {
315     if (mEmojiStyle != EMSTYLE_EMPTY &&
316         mEmojiStyle == supported[i].mEmojiStyle) {
317       subtagMatch = true;
318       if (mLanguage == supported[i].mLanguage) {
319         return 4;
320       }
321     }
322     if (isEqualScript(supported[i]) ||
323         supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
324       scriptMatch = true;
325       if (mLanguage == supported[i].mLanguage) {
326         languageScriptMatch = true;
327       }
328     }
329   }
330 
331   if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
332     scriptMatch = true;
333     if (mLanguage == supported[0].mLanguage &&
334         supported.isAllTheSameLanguage()) {
335       return 3;
336     }
337   }
338 
339   if (languageScriptMatch) {
340     return 3;
341   } else if (subtagMatch) {
342     return 2;
343   } else if (scriptMatch) {
344     return 1;
345   }
346   return 0;
347 }
348 
FontLanguages(std::vector<FontLanguage> && languages)349 FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages)
350     : mLanguages(std::move(languages)) {
351   if (mLanguages.empty()) {
352     return;
353   }
354 
355   const FontLanguage& lang = mLanguages[0];
356 
357   mIsAllTheSameLanguage = true;
358   mUnionOfSubScriptBits = lang.mSubScriptBits;
359   for (size_t i = 1; i < mLanguages.size(); ++i) {
360     mUnionOfSubScriptBits |= mLanguages[i].mSubScriptBits;
361     if (mIsAllTheSameLanguage && lang.mLanguage != mLanguages[i].mLanguage) {
362       mIsAllTheSameLanguage = false;
363     }
364   }
365 }
366 
367 #undef SCRIPT_TAG
368 }  // namespace minikin
369