1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "Locale.h"
18
19 #include <algorithm>
20
21 #include <hb.h>
22
23 #include "minikin/LocaleList.h"
24
25 #include "LocaleListCache.h"
26 #include "MinikinInternal.h"
27 #include "StringPiece.h"
28
29 namespace minikin {
30
// Mask selecting the low five bits. Each packed letter or digit of the language, region and script
// encodings in this file occupies one five-bit field.
constexpr uint32_t FIVE_BITS = (1u << 5) - 1u;
32
registerLocaleList(const std::string & locales)33 uint32_t registerLocaleList(const std::string& locales) {
34 return LocaleListCache::getId(locales);
35 }
36
getLocaleString(uint32_t localeId)37 std::string getLocaleString(uint32_t localeId) {
38 const LocaleList& localeList = LocaleListCache::getById(localeId);
39 std::string out;
40 for (size_t i = 0; i < localeList.size(); ++i) {
41 if (i != 0) {
42 out += ",";
43 }
44 out += localeList[i].getString();
45 }
46 return out;
47 }
48
// Returns true if |buf| (|bufLen| bytes long) starts with |subtag| and the match ends on a subtag
// boundary, i.e. at the end of the buffer or right before a NUL, '-' or '_'. Used to recognize
// extension values such as the emoji and line break styles.
static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    const bool startsWithSubtag = bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
    if (!startsWithSubtag) {
        return false;
    }
    if (bufLen == subtagLen) {
        return true;  // Exact match; there is no following character to inspect.
    }
    const char following = buf[subtagLen];
    return following == '\0' || following == '-' || following == '_';
}
61
62 // Pack the three letter code into 15 bits and stored to 16 bit integer. The highest bit is 0.
63 // For the region code, the letters must be all digits in three letter case, so the number of
64 // possible values are 10. For the language code, the letters must be all small alphabets, so the
65 // number of possible values are 26. Thus, 5 bits are sufficient for each case and we can pack the
66 // three letter language code or region code to 15 bits.
67 //
68 // In case of two letter code, use fullbit(0x1f) for the first letter instead.
packLanguageOrRegion(const StringPiece & in,uint8_t twoLetterBase,uint8_t threeLetterBase)69 static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
70 uint8_t threeLetterBase) {
71 if (in.length() == 2) {
72 return 0x7c00u | // 0x1fu << 10
73 (uint16_t)(in[0] - twoLetterBase) << 5 | (uint16_t)(in[1] - twoLetterBase);
74 } else {
75 return ((uint16_t)(in[0] - threeLetterBase) << 10) |
76 (uint16_t)(in[1] - threeLetterBase) << 5 | (uint16_t)(in[2] - threeLetterBase);
77 }
78 }
79
unpackLanguageOrRegion(uint16_t in,char * out,uint8_t twoLetterBase,uint8_t threeLetterBase)80 static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
81 uint8_t threeLetterBase) {
82 uint8_t first = (in >> 10) & FIVE_BITS;
83 uint8_t second = (in >> 5) & FIVE_BITS;
84 uint8_t third = in & FIVE_BITS;
85
86 if (first == 0x1f) {
87 out[0] = second + twoLetterBase;
88 out[1] = third + twoLetterBase;
89 return 2;
90 } else {
91 out[0] = first + threeLetterBase;
92 out[1] = second + threeLetterBase;
93 out[2] = third + threeLetterBase;
94 return 3;
95 }
96 }
97
packLanguage(const StringPiece & in)98 static uint16_t packLanguage(const StringPiece& in) {
99 return packLanguageOrRegion(in, 'a', 'a');
100 }
101
unpackLanguage(uint16_t in,char * out)102 static size_t unpackLanguage(uint16_t in, char* out) {
103 return unpackLanguageOrRegion(in, out, 'a', 'a');
104 }
105
// Packs a four-character script code into 20 bits, five bits per character. The first character is
// encoded relative to 'A' and the remaining three relative to 'a'.
constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
    const uint32_t first = static_cast<uint32_t>(c1 - 'A');
    const uint32_t second = static_cast<uint32_t>(c2 - 'a');
    const uint32_t third = static_cast<uint32_t>(c3 - 'a');
    const uint32_t fourth = static_cast<uint32_t>(c4 - 'a');
    return (first << 15) | (second << 10) | (third << 5) | fourth;
}
112
packScript(uint32_t script)113 constexpr uint32_t packScript(uint32_t script) {
114 return packScript(script >> 24, (script >> 16) & 0xff, (script >> 8) & 0xff, script & 0xff);
115 }
116
unpackScript(uint32_t packedScript)117 constexpr uint32_t unpackScript(uint32_t packedScript) {
118 constexpr char FIRST_LETTER_BASE = 'A';
119 constexpr char REST_LETTER_BASE = 'a';
120 const uint32_t first = (packedScript >> 15) + FIRST_LETTER_BASE;
121 const uint32_t second = ((packedScript >> 10) & FIVE_BITS) + REST_LETTER_BASE;
122 const uint32_t third = ((packedScript >> 5) & FIVE_BITS) + REST_LETTER_BASE;
123 const uint32_t fourth = (packedScript & FIVE_BITS) + REST_LETTER_BASE;
124
125 return first << 24 | second << 16 | third << 8 | fourth;
126 }
127
packRegion(const StringPiece & in)128 static uint16_t packRegion(const StringPiece& in) {
129 return packLanguageOrRegion(in, 'A', '0');
130 }
131
unpackRegion(uint16_t in,char * out)132 static size_t unpackRegion(uint16_t in, char* out) {
133 return unpackLanguageOrRegion(in, out, 'A', '0');
134 }
135
// Returns true if |c| is an ASCII lowercase letter.
static inline bool isLowercase(char c) {
    return !(c < 'a' || 'z' < c);
}
139
// Returns true if |c| is an ASCII uppercase letter.
static inline bool isUppercase(char c) {
    return !(c < 'A' || 'Z' < c);
}
143
// Returns true if |c| is an ASCII decimal digit.
static inline bool isDigit(char c) {
    return !(c < '0' || '9' < c);
}
147
148 // Returns true if the buffer is valid for language code.
isValidLanguageCode(const StringPiece & buffer)149 static inline bool isValidLanguageCode(const StringPiece& buffer) {
150 if (buffer.length() != 2 && buffer.length() != 3) return false;
151 if (!isLowercase(buffer[0])) return false;
152 if (!isLowercase(buffer[1])) return false;
153 if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
154 return true;
155 }
156
157 // Returns true if buffer is valid for script code. The length of buffer must be 4.
isValidScriptCode(const StringPiece & buffer)158 static inline bool isValidScriptCode(const StringPiece& buffer) {
159 return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
160 isLowercase(buffer[2]) && isLowercase(buffer[3]);
161 }
162
163 // Returns true if the buffer is valid for region code.
isValidRegionCode(const StringPiece & buffer)164 static inline bool isValidRegionCode(const StringPiece& buffer) {
165 return (buffer.size() == 2 && isUppercase(buffer[0]) && isUppercase(buffer[1])) ||
166 (buffer.size() == 3 && isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]));
167 }
168
169 // Parse BCP 47 language identifier into internal structure
Locale(const StringPiece & input)170 Locale::Locale(const StringPiece& input) : Locale() {
171 SplitIterator it(input, '-');
172
173 StringPiece language = it.next();
174 if (isValidLanguageCode(language)) {
175 mLanguage = packLanguage(language);
176 } else {
177 // We don't understand anything other than two-letter or three-letter
178 // language codes, so we skip parsing the rest of the string.
179 return;
180 }
181
182 if (!it.hasNext()) {
183 return; // Language code only.
184 }
185 StringPiece token = it.next();
186
187 if (isValidScriptCode(token)) {
188 mScript = packScript(token[0], token[1], token[2], token[3]);
189 mSubScriptBits = scriptToSubScriptBits(mScript);
190
191 if (!it.hasNext()) {
192 goto finalize; // No variant, emoji subtag and region code.
193 }
194 token = it.next();
195 }
196
197 if (isValidRegionCode(token)) {
198 mRegion = packRegion(token);
199
200 if (!it.hasNext()) {
201 goto finalize; // No variant or emoji subtag.
202 }
203 token = it.next();
204 }
205
206 if (language == "de") { // We are only interested in German variants.
207 if (token == "1901") {
208 mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
209 } else if (token == "1996") {
210 mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
211 }
212
213 if (mVariant != Variant::NO_VARIANT) {
214 if (!it.hasNext()) {
215 goto finalize; // No emoji subtag.
216 }
217
218 token = it.next();
219 }
220 }
221
222 resolveUnicodeExtension(input.data(), input.length());
223
224 finalize:
225 if (mEmojiStyle == EmojiStyle::EMPTY) {
226 mEmojiStyle = scriptToEmojiStyle(mScript);
227 }
228 }
229
resolveUnicodeExtension(const char * buf,size_t length)230 void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
231 static const char kPrefix[] = "-u-";
232 const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
233 if (pos != buf + length) {
234 pos += strlen(kPrefix);
235 const size_t remainingLength = length - (pos - buf);
236 mLBStyle = resolveLineBreakStyle(pos, remainingLength);
237 mEmojiStyle = resolveEmojiStyle(pos, remainingLength);
238 }
239 }
240
241 // static
242 // Lookup line break subtag and determine the line break style.
resolveLineBreakStyle(const char * buf,size_t length)243 LineBreakStyle Locale::resolveLineBreakStyle(const char* buf, size_t length) {
244 // 8 is the length of "-u-lb-loose", which is the shortest line break subtag,
245 // unnecessary comparison can be avoided if total length is smaller than 11.
246 const size_t kMinSubtagLength = 8;
247 if (length >= kMinSubtagLength) {
248 static const char kPrefix[] = "lb-";
249 const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
250 if (pos != buf + length) { // found
251 pos += strlen(kPrefix);
252 const size_t remainingLength = length - (pos - buf);
253 if (isSubtag(pos, remainingLength, "loose", 5)) {
254 return LineBreakStyle::LOOSE;
255 } else if (isSubtag(pos, remainingLength, "normal", 6)) {
256 return LineBreakStyle::NORMAL;
257 } else if (isSubtag(pos, remainingLength, "strict", 6)) {
258 return LineBreakStyle::STRICT;
259 }
260 }
261 }
262 return LineBreakStyle::EMPTY;
263 }
264
265 // static
266 // Lookup emoji subtag and determine the emoji style.
resolveEmojiStyle(const char * buf,size_t length)267 EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
268 // 7 is the length of "-u-em-text", which is the shortest emoji subtag,
269 // unnecessary comparison can be avoided if total length is smaller than 10.
270 const size_t kMinSubtagLength = 7;
271 if (length >= kMinSubtagLength) {
272 static const char kPrefix[] = "em-";
273 const char* pos = std::search(buf, buf + length, kPrefix, kPrefix + strlen(kPrefix));
274 if (pos != buf + length) { // found
275 pos += strlen(kPrefix);
276 const size_t remainingLength = length - (pos - buf);
277 if (isSubtag(pos, remainingLength, "emoji", 5)) {
278 return EmojiStyle::EMOJI;
279 } else if (isSubtag(pos, remainingLength, "text", 4)) {
280 return EmojiStyle::TEXT;
281 } else if (isSubtag(pos, remainingLength, "default", 7)) {
282 return EmojiStyle::DEFAULT;
283 }
284 }
285 }
286 return EmojiStyle::EMPTY;
287 }
288
scriptToEmojiStyle(uint32_t script)289 EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
290 // If no emoji subtag was provided, resolve the emoji style from script code.
291 if (script == packScript('Z', 's', 'y', 'e')) {
292 return EmojiStyle::EMOJI;
293 } else if (script == packScript('Z', 's', 'y', 'm')) {
294 return EmojiStyle::TEXT;
295 }
296 return EmojiStyle::EMPTY;
297 }
298
299 // static
scriptToSubScriptBits(uint32_t script)300 uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
301 uint8_t subScriptBits = 0u;
302 switch (script) {
303 case packScript('B', 'o', 'p', 'o'):
304 subScriptBits = kBopomofoFlag;
305 break;
306 case packScript('H', 'a', 'n', 'g'):
307 subScriptBits = kHangulFlag;
308 break;
309 case packScript('H', 'a', 'n', 'b'):
310 // Bopomofo is almost exclusively used in Taiwan.
311 subScriptBits = kHanFlag | kBopomofoFlag;
312 break;
313 case packScript('H', 'a', 'n', 'i'):
314 subScriptBits = kHanFlag;
315 break;
316 case packScript('H', 'a', 'n', 's'):
317 subScriptBits = kHanFlag | kSimplifiedChineseFlag;
318 break;
319 case packScript('H', 'a', 'n', 't'):
320 subScriptBits = kHanFlag | kTraditionalChineseFlag;
321 break;
322 case packScript('H', 'i', 'r', 'a'):
323 subScriptBits = kHiraganaFlag;
324 break;
325 case packScript('H', 'r', 'k', 't'):
326 subScriptBits = kKatakanaFlag | kHiraganaFlag;
327 break;
328 case packScript('J', 'p', 'a', 'n'):
329 subScriptBits = kHanFlag | kKatakanaFlag | kHiraganaFlag;
330 break;
331 case packScript('K', 'a', 'n', 'a'):
332 subScriptBits = kKatakanaFlag;
333 break;
334 case packScript('K', 'o', 'r', 'e'):
335 subScriptBits = kHanFlag | kHangulFlag;
336 break;
337 }
338 return subScriptBits;
339 }
340
getString() const341 std::string Locale::getString() const {
342 char buf[32] = {};
343 size_t i;
344 if (mLanguage == NO_LANGUAGE) {
345 buf[0] = 'u';
346 buf[1] = 'n';
347 buf[2] = 'd';
348 i = 3;
349 } else {
350 i = unpackLanguage(mLanguage, buf);
351 }
352 if (mScript != NO_SCRIPT) {
353 uint32_t rawScript = unpackScript(mScript);
354 buf[i++] = '-';
355 buf[i++] = (rawScript >> 24) & 0xFFu;
356 buf[i++] = (rawScript >> 16) & 0xFFu;
357 buf[i++] = (rawScript >> 8) & 0xFFu;
358 buf[i++] = rawScript & 0xFFu;
359 }
360 if (mRegion != NO_REGION) {
361 buf[i++] = '-';
362 i += unpackRegion(mRegion, buf + i);
363 }
364 if (mVariant != Variant::NO_VARIANT) {
365 buf[i++] = '-';
366 buf[i++] = '1';
367 buf[i++] = '9';
368 switch (mVariant) {
369 case Variant::GERMAN_1901_ORTHOGRAPHY:
370 buf[i++] = '0';
371 buf[i++] = '1';
372 break;
373 case Variant::GERMAN_1996_ORTHOGRAPHY:
374 buf[i++] = '9';
375 buf[i++] = '6';
376 break;
377 default:
378 MINIKIN_ASSERT(false, "Must not reached.");
379 }
380 }
381 // Add line break unicode extension.
382 if (mLBStyle != LineBreakStyle::EMPTY) {
383 buf[i++] = '-';
384 buf[i++] = 'u';
385 buf[i++] = '-';
386 buf[i++] = 'l';
387 buf[i++] = 'b';
388 buf[i++] = '-';
389 switch (mLBStyle) {
390 case LineBreakStyle::LOOSE:
391 buf[i++] = 'l';
392 buf[i++] = 'o';
393 buf[i++] = 'o';
394 buf[i++] = 's';
395 buf[i++] = 'e';
396 break;
397 case LineBreakStyle::NORMAL:
398 buf[i++] = 'n';
399 buf[i++] = 'o';
400 buf[i++] = 'r';
401 buf[i++] = 'm';
402 buf[i++] = 'a';
403 buf[i++] = 'l';
404 break;
405 case LineBreakStyle::STRICT:
406 buf[i++] = 's';
407 buf[i++] = 't';
408 buf[i++] = 'r';
409 buf[i++] = 'i';
410 buf[i++] = 'c';
411 buf[i++] = 't';
412 break;
413 default:
414 MINIKIN_ASSERT(false, "Must not reached.");
415 }
416 }
417 return std::string(buf, i);
418 }
419
getPartialLocale(SubtagBits bits) const420 Locale Locale::getPartialLocale(SubtagBits bits) const {
421 Locale subLocale;
422 if ((bits & SubtagBits::LANGUAGE) != SubtagBits::EMPTY) {
423 subLocale.mLanguage = mLanguage;
424 } else {
425 subLocale.mLanguage = packLanguage("und");
426 }
427 if ((bits & SubtagBits::SCRIPT) != SubtagBits::EMPTY) {
428 subLocale.mScript = mScript;
429 subLocale.mSubScriptBits = mSubScriptBits;
430 }
431 if ((bits & SubtagBits::REGION) != SubtagBits::EMPTY) {
432 subLocale.mRegion = mRegion;
433 }
434 if ((bits & SubtagBits::VARIANT) != SubtagBits::EMPTY) {
435 subLocale.mVariant = mVariant;
436 }
437 if ((bits & SubtagBits::EMOJI) != SubtagBits::EMPTY) {
438 subLocale.mEmojiStyle = mEmojiStyle;
439 }
440 return subLocale;
441 }
442
isEqualScript(const Locale & other) const443 bool Locale::isEqualScript(const Locale& other) const {
444 return other.mScript == mScript;
445 }
446
447 // static
supportsScript(uint8_t providedBits,uint8_t requestedBits)448 bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
449 return requestedBits != 0 && (providedBits & requestedBits) == requestedBits;
450 }
451
supportsHbScript(hb_script_t script) const452 bool Locale::supportsHbScript(hb_script_t script) const {
453 static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
454 "The Minikin script and HarfBuzz hb_script_t have different encodings.");
455 uint32_t packedScript = packScript(script);
456 if (packedScript == mScript) return true;
457 return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
458 }
459
calcScoreFor(const LocaleList & supported) const460 int Locale::calcScoreFor(const LocaleList& supported) const {
461 bool languageScriptMatch = false;
462 bool subtagMatch = false;
463 bool scriptMatch = false;
464
465 for (size_t i = 0; i < supported.size(); ++i) {
466 if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
467 subtagMatch = true;
468 if (mLanguage == supported[i].mLanguage) {
469 return 4;
470 }
471 }
472 if (isEqualScript(supported[i]) ||
473 supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
474 scriptMatch = true;
475 if (mLanguage == supported[i].mLanguage) {
476 languageScriptMatch = true;
477 }
478 }
479 }
480
481 if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
482 scriptMatch = true;
483 if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
484 return 3;
485 }
486 }
487
488 if (languageScriptMatch) {
489 return 3;
490 } else if (subtagMatch) {
491 return 2;
492 } else if (scriptMatch) {
493 return 1;
494 }
495 return 0;
496 }
497
buildHbLanguage(const Locale & locale)498 static hb_language_t buildHbLanguage(const Locale& locale) {
499 return locale.isSupported() ? hb_language_from_string(locale.getString().c_str(), -1)
500 : HB_LANGUAGE_INVALID;
501 }
502
LocaleList(std::vector<Locale> && locales)503 LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
504 mIsAllTheSameLocale = true;
505 mUnionOfSubScriptBits = 0u;
506 mHbLangs.reserve(mLocales.size());
507 mEmojiStyle = EmojiStyle::EMPTY;
508 const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
509 for (const Locale& locale : mLocales) {
510 mUnionOfSubScriptBits |= locale.mSubScriptBits;
511 if (mIsAllTheSameLocale && firstLanguage != locale.mLanguage) {
512 mIsAllTheSameLocale = false;
513 }
514 mHbLangs.push_back(buildHbLanguage(locale));
515 if (mEmojiStyle == EmojiStyle::EMPTY) {
516 mEmojiStyle = locale.getEmojiStyle();
517 }
518 }
519 }
520
521 } // namespace minikin
522