android-15.0.0_r1/s

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "Locale.h"

#include <algorithm>

#include <hb.h>

#include "minikin/LocaleList.h"

#include "LocaleListCache.h"
#include "MinikinInternal.h"
#include "StringPiece.h"

namespace minikin {

// Mask that keeps the low five bits of a value. The unpacking helpers in this file use it to
// extract one five-bit field (a single letter or digit) from a packed code.
constexpr uint32_t FIVE_BITS = 0x1f;

// Returns the identifier that LocaleListCache::getId() reports for the given locale list string.
uint32_t registerLocaleList(const std::string& locales) {
    const uint32_t localeListId = LocaleListCache::getId(locales);
    return localeListId;
}

std::string getLocaleString(uint32_t localeId) {
    const LocaleList& localeList = LocaleListCache::getById(localeId);
    std::string out;
    for (size_t i = 0; i < localeList.size(); ++i) {
        if (i != 0) {
            out += ",";
        }
        out += localeList[i].getString();
    }
    return out;
}

// Returns true if the first bufLen characters of buf start with the subtag (subtagLen characters)
// and the subtag ends there: either buf is exactly as long as the subtag, or the character that
// follows it is a terminator ('\0') or a subtag separator ('-' or '_').
static bool isSubtag(const char* buf, size_t bufLen, const char* subtag, size_t subtagLen) {
    const bool startsWithSubtag = bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
    if (!startsWithSubtag) {
        return false;  // too short, or the leading characters differ
    }
    if (bufLen == subtagLen) {
        return true;  // exact match, nothing follows the subtag
    }
    // bufLen > subtagLen here, so buf[subtagLen] is inside the buffer.
    switch (buf[subtagLen]) {
        case '\0':
        case '-':
        case '_':
            return true;
        default:
            return false;
    }
}

// Packs a two- or three-character language or region code into a 16-bit value using three
// five-bit fields (bits 14-10, 9-5 and 4-0). Every character is stored as its offset from a base
// character, so for valid input the highest bit of the result is 0.
//
// A three-character region code consists of digits only (10 possible values per character) and a
// language code of lowercase letters only (26 possible values per character), so five bits per
// character are sufficient in both cases.
//
// A two-character code only needs the two lower fields; its highest field is filled with all
// ones (0x1f), which marks the value as a two-character code. Any length other than 2 is handled
// as a three-character code.
static uint16_t packLanguageOrRegion(const StringPiece& in, uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
    if (in.length() != 2) {
        const uint16_t high = static_cast<uint16_t>(in[0] - threeLetterBase);
        const uint16_t middle = static_cast<uint16_t>(in[1] - threeLetterBase);
        const uint16_t low = static_cast<uint16_t>(in[2] - threeLetterBase);
        return (high << 10) | (middle << 5) | low;
    }

    const uint16_t middle = static_cast<uint16_t>(in[0] - twoLetterBase);
    const uint16_t low = static_cast<uint16_t>(in[1] - twoLetterBase);
    // 0x7c00 is 0x1f << 10: the all-ones marker in the highest field.
    return 0x7c00u | (middle << 5) | low;
}

// Splits a packed 16-bit code into its three five-bit fields and writes the characters they
// encode to out. If the highest field is all ones (0x1f) the value is a two-character code and
// the two lower fields are decoded relative to twoLetterBase; otherwise all three fields are
// decoded relative to threeLetterBase. Returns the number of characters written (2 or 3); no
// terminator is written.
static size_t unpackLanguageOrRegion(uint16_t in, char* out, uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
    const uint8_t high = (in >> 10) & FIVE_BITS;
    const uint8_t middle = (in >> 5) & FIVE_BITS;
    const uint8_t low = in & FIVE_BITS;

    const bool isThreeLetterCode = (high != 0x1f);
    if (isThreeLetterCode) {
        out[0] = high + threeLetterBase;
        out[1] = middle + threeLetterBase;
        out[2] = low + threeLetterBase;
        return 3;
    }

    out[0] = middle + twoLetterBase;
    out[1] = low + twoLetterBase;
    return 2;
}

// Packs a language code. Two- and three-letter codes both use 'a' as the base character.
static uint16_t packLanguage(const StringPiece& in) {
    constexpr uint8_t kLetterBase = 'a';
    return packLanguageOrRegion(in, kLetterBase, kLetterBase);
}

// Writes the language code stored in the packed value to out, using 'a' as the base character
// for both two- and three-letter codes. Returns the number of characters written.
static size_t unpackLanguage(uint16_t in, char* out) {
    constexpr uint8_t kLetterBase = 'a';
    return unpackLanguageOrRegion(in, out, kLetterBase, kLetterBase);
}

// Packs a four-character script code into an integer. The first character is stored as its
// offset from 'A' starting at bit 15; the remaining three are stored as offsets from 'a' in
// five-bit steps below it (bits 10, 5 and 0). No range check or masking is applied to the
// offsets.
constexpr uint32_t packScript(char c1, char c2, char c3, char c4) {
    constexpr char kFirstBase = 'A';
    constexpr char kRestBase = 'a';

    const uint32_t first = static_cast<uint32_t>(c1 - kFirstBase);
    const uint32_t second = static_cast<uint32_t>(c2 - kRestBase);
    const uint32_t third = static_cast<uint32_t>(c3 - kRestBase);
    const uint32_t fourth = static_cast<uint32_t>(c4 - kRestBase);

    return (first << 15) | (second << 10) | (third << 5) | fourth;
}

// Packs a script given as a 32-bit tag whose four bytes, from most to least significant, are the
// four characters of the script code.
constexpr uint32_t packScript(uint32_t script) {
    const uint32_t first = script >> 24;
    const uint32_t second = (script >> 16) & 0xff;
    const uint32_t third = (script >> 8) & 0xff;
    const uint32_t fourth = script & 0xff;
    return packScript(first, second, third, fourth);
}

// Expands a packed script into a 32-bit tag with one character per byte, first character in the
// most significant byte. The bits from bit 15 upwards are decoded relative to 'A'; the three
// five-bit fields below them are decoded relative to 'a'.
constexpr uint32_t unpackScript(uint32_t packedScript) {
    constexpr char kFirstBase = 'A';
    constexpr char kRestBase = 'a';

    // Build the tag one character at a time, shifting earlier characters up by one byte.
    uint32_t tag = (packedScript >> 15) + kFirstBase;
    tag = (tag << 8) | (((packedScript >> 10) & FIVE_BITS) + kRestBase);
    tag = (tag << 8) | (((packedScript >> 5) & FIVE_BITS) + kRestBase);
    tag = (tag << 8) | ((packedScript & FIVE_BITS) + kRestBase);
    return tag;
}

// Packs a region code. Two-character codes are stored relative to 'A', three-character codes
// relative to '0'.
static uint16_t packRegion(const StringPiece& in) {
    constexpr uint8_t kTwoLetterBase = 'A';
    constexpr uint8_t kThreeDigitBase = '0';
    return packLanguageOrRegion(in, kTwoLetterBase, kThreeDigitBase);
}

// Writes the region code stored in the packed value to out. Two-character codes are decoded
// relative to 'A', three-character codes relative to '0'. Returns the number of characters
// written.
static size_t unpackRegion(uint16_t in, char* out) {
    constexpr uint8_t kTwoLetterBase = 'A';
    constexpr uint8_t kThreeDigitBase = '0';
    return unpackLanguageOrRegion(in, out, kTwoLetterBase, kThreeDigitBase);
}

// Returns true if c is one of the characters 'a' through 'z'.
static inline bool isLowercase(char c) {
    return !(c < 'a' || c > 'z');
}

// Returns true if c is one of the characters 'A' through 'Z'.
static inline bool isUppercase(char c) {
    if (c < 'A') {
        return false;
    }
    return c <= 'Z';
}

// Returns true if c is one of the characters '0' through '9'.
static inline bool isDigit(char c) {
    return c >= '0' && c <= '9';
}

// Returns true if the buffer is valid for language code.
static inline bool isValidLanguageCode(const StringPiece& buffer) {
    if (buffer.length() != 2 && buffer.length() != 3) return false;
    if (!isLowercase(buffer[0])) return false;
    if (!isLowercase(buffer[1])) return false;
    if (buffer.length() == 3 && !isLowercase(buffer[2])) return false;
    return true;
}

// Returns true if buffer is valid for script code. The length of buffer must be 4.
static inline bool isValidScriptCode(const StringPiece& buffer) {
    return buffer.size() == 4 && isUppercase(buffer[0]) && isLowercase(buffer[1]) &&
           isLowercase(buffer[2]) && isLowercase(buffer[3]);
}

// Returns true if the buffer is a valid region code: either two uppercase letters or three
// digits.
static inline bool isValidRegionCode(const StringPiece& buffer) {
    switch (buffer.size()) {
        case 2:
            return isUppercase(buffer[0]) && isUppercase(buffer[1]);
        case 3:
            return isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]);
        default:
            return false;
    }
}

// Parse BCP 47 language identifier into internal structure.
//
// The input is split on '-' and consumed in a fixed order: language, optional script, optional
// region, optional German orthography variant. Every field starts from the state set up by the
// delegated default constructor and is overwritten only when the corresponding subtag is
// recognised. Afterwards the whole input is scanned for a "-u-" Unicode extension to pick up an
// emoji style; if none was set, the emoji style is derived from the script.
//
// If the first subtag is not a valid two- or three-letter lowercase language code, nothing else
// is parsed and the object stays in its default-constructed state.
Locale::Locale(const StringPiece& input) : Locale() {
    SplitIterator it(input, '-');

    StringPiece language = it.next();
    if (isValidLanguageCode(language)) {
        mLanguage = packLanguage(language);
    } else {
        // We don't understand anything other than two-letter or three-letter
        // language codes, so we skip parsing the rest of the string.
        return;
    }

    if (!it.hasNext()) {
        return;  // Language code only.
    }
    // `token` always holds the next subtag that has not been recognised yet. A check that does
    // not match leaves it untouched, so the following check looks at the same subtag.
    StringPiece token = it.next();

    if (isValidScriptCode(token)) {
        mScript = packScript(token[0], token[1], token[2], token[3]);
        mSubScriptBits = scriptToSubScriptBits(mScript);

        if (!it.hasNext()) {
            goto finalize;  // No variant, emoji subtag and region code.
        }
        token = it.next();
    }

    if (isValidRegionCode(token)) {
        mRegion = packRegion(token);

        if (!it.hasNext()) {
            goto finalize;  // No variant or emoji subtag.
        }
        token = it.next();
    }

    if (language == "de") {  // We are only interested in German variants.
        if (token == "1901") {
            mVariant = Variant::GERMAN_1901_ORTHOGRAPHY;
        } else if (token == "1996") {
            mVariant = Variant::GERMAN_1996_ORTHOGRAPHY;
        }

        if (mVariant != Variant::NO_VARIANT) {
            if (!it.hasNext()) {
                goto finalize;  // No emoji subtag.
            }

            // Advances past the variant. The value stored in `token` is not read again in this
            // function; the extension lookup below works on the whole input instead.
            token = it.next();
        }
    }

    // Searches the complete input string (not just the remaining subtags) for "-u-".
    resolveUnicodeExtension(input.data(), input.length());

finalize:
    // Reached both by falling through and from the early exits above. An emoji style taken from
    // the Unicode extension wins; otherwise fall back to what the script implies.
    if (mEmojiStyle == EmojiStyle::EMPTY) {
        mEmojiStyle = scriptToEmojiStyle(mScript);
    }
}

// Looks for the first "-u-" in the given buffer. When found, the text following it is handed to
// resolveEmojiStyle() and the result is stored in mEmojiStyle. When not found, mEmojiStyle is
// left unchanged.
void Locale::resolveUnicodeExtension(const char* buf, size_t length) {
    static const char kPrefix[] = "-u-";
    const size_t prefixLength = strlen(kPrefix);
    const char* const end = buf + length;

    const char* const match = std::search(buf, end, kPrefix, kPrefix + prefixLength);
    if (match == end) {
        return;  // no Unicode extension present
    }

    const char* const extension = match + prefixLength;
    const size_t extensionLength = static_cast<size_t>(end - extension);
    mEmojiStyle = resolveEmojiStyle(extension, extensionLength);
}

// static
// Looks up the emoji subtag in the given buffer and determines the emoji style. The buffer is
// searched for the first "em-"; the text following it must be "emoji", "text" or "default",
// ending at the end of the buffer or at a '\0', '-' or '_'. Returns EmojiStyle::EMPTY when no
// such subtag is found.
EmojiStyle Locale::resolveEmojiStyle(const char* buf, size_t length) {
    // 7 is the length of "em-text", the shortest key/value pair recognised here. A shorter
    // buffer cannot contain one, so the search is skipped.
    constexpr size_t kMinSubtagLength = 7;
    if (length < kMinSubtagLength) {
        return EmojiStyle::EMPTY;
    }

    static const char kPrefix[] = "em-";
    const size_t prefixLength = strlen(kPrefix);
    const char* const end = buf + length;

    const char* const match = std::search(buf, end, kPrefix, kPrefix + prefixLength);
    if (match == end) {
        return EmojiStyle::EMPTY;  // no "em-" key
    }

    const char* const value = match + prefixLength;
    const size_t valueLength = static_cast<size_t>(end - value);
    if (isSubtag(value, valueLength, "emoji", 5)) {
        return EmojiStyle::EMOJI;
    }
    if (isSubtag(value, valueLength, "text", 4)) {
        return EmojiStyle::TEXT;
    }
    if (isSubtag(value, valueLength, "default", 7)) {
        return EmojiStyle::DEFAULT;
    }
    return EmojiStyle::EMPTY;
}

// Derives an emoji style from a packed script code, for use when no emoji subtag was provided:
// "Zsye" maps to EmojiStyle::EMOJI, "Zsym" to EmojiStyle::TEXT, and anything else to
// EmojiStyle::EMPTY.
EmojiStyle Locale::scriptToEmojiStyle(uint32_t script) {
    switch (script) {
        case packScript('Z', 's', 'y', 'e'):
            return EmojiStyle::EMOJI;
        case packScript('Z', 's', 'y', 'm'):
            return EmojiStyle::TEXT;
        default:
            return EmojiStyle::EMPTY;
    }
}

// static
// Maps a packed script code to the combination of sub-script flags it covers. Scripts that are
// not listed below map to 0.
uint8_t Locale::scriptToSubScriptBits(uint32_t script) {
    switch (script) {
        case packScript('B', 'o', 'p', 'o'):
            return kBopomofoFlag;
        case packScript('H', 'a', 'n', 'g'):
            return kHangulFlag;
        case packScript('H', 'a', 'n', 'b'):
            // Bopomofo is almost exclusively used in Taiwan.
            return kHanFlag | kBopomofoFlag;
        case packScript('H', 'a', 'n', 'i'):
            return kHanFlag;
        case packScript('H', 'a', 'n', 's'):
            return kHanFlag | kSimplifiedChineseFlag;
        case packScript('H', 'a', 'n', 't'):
            return kHanFlag | kTraditionalChineseFlag;
        case packScript('H', 'i', 'r', 'a'):
            return kHiraganaFlag;
        case packScript('H', 'r', 'k', 't'):
            return kKatakanaFlag | kHiraganaFlag;
        case packScript('J', 'p', 'a', 'n'):
            return kHanFlag | kKatakanaFlag | kHiraganaFlag;
        case packScript('K', 'a', 'n', 'a'):
            return kKatakanaFlag;
        case packScript('K', 'o', 'r', 'e'):
            return kHanFlag | kHangulFlag;
        default:
            return 0u;
    }
}

// Returns the textual form of this locale as produced by buildLocaleString().
std::string Locale::getString() const {
    char buffer[32];
    const int length = buildLocaleString(buffer);

    std::string result;
    result.assign(buffer, length);
    return result;
}

// Returns the textual form of this locale followed by a Unicode extension describing the
// requested line break options. "-u" is appended when at least one of the two options is not
// None, then "-lb-<style>" for the line break style (loose, normal or strict) and
// "-lw-<style>" for the line break word style (phrase). Any other non-None value triggers
// MINIKIN_ASSERT after the key has already been appended.
std::string Locale::getStringWithLineBreakOption(LineBreakStyle lbStyle,
                                                 LineBreakWordStyle lbWordStyle) const {
    char buf[48];
    int len = buildLocaleString(buf);

    // Copies the characters of a NUL-terminated string to the end of buf (terminator excluded).
    const auto append = [&buf, &len](const char* text) {
        for (; *text != '\0'; ++text) {
            buf[len++] = *text;
        }
    };

    const bool hasLineBreakStyle = lbStyle != LineBreakStyle::None;
    const bool hasLineBreakWordStyle = lbWordStyle != LineBreakWordStyle::None;

    // Add line break unicode extension.
    if (hasLineBreakStyle || hasLineBreakWordStyle) {
        append("-u");
    }

    if (hasLineBreakStyle) {
        append("-lb-");
        switch (lbStyle) {
            case LineBreakStyle::Loose:
                append("loose");
                break;
            case LineBreakStyle::Normal:
                append("normal");
                break;
            case LineBreakStyle::Strict:
                append("strict");
                break;
            default:
                MINIKIN_ASSERT(false, "Must not reached.");
        }
    }

    if (hasLineBreakWordStyle) {
        append("-lw-");
        switch (lbWordStyle) {
            case LineBreakWordStyle::Phrase:
                append("phrase");
                break;
            default:
                MINIKIN_ASSERT(false, "Must not reached.");
        }
    }

    return std::string(buf, len);
}

// Writes the textual form of this locale to buf and returns the number of characters written.
// No terminator is written. The output is the language code ("und" when no language is set),
// followed by "-<script>", "-<region>" and "-1901"/"-1996" for each of those fields that is set.
// The caller must provide a buffer large enough for all of them.
int Locale::buildLocaleString(char* buf) const {
    size_t pos = 0;

    if (mLanguage != NO_LANGUAGE) {
        pos = unpackLanguage(mLanguage, buf);
    } else {
        static const char kUndetermined[] = {'u', 'n', 'd'};
        for (const char c : kUndetermined) {
            buf[pos++] = c;
        }
    }

    if (mScript != NO_SCRIPT) {
        const uint32_t scriptTag = unpackScript(mScript);
        buf[pos++] = '-';
        // Emit the four tag bytes from the most significant to the least significant one.
        for (int shift = 24; shift >= 0; shift -= 8) {
            buf[pos++] = (scriptTag >> shift) & 0xFFu;
        }
    }

    if (mRegion != NO_REGION) {
        buf[pos++] = '-';
        pos += unpackRegion(mRegion, buf + pos);
    }

    if (mVariant != Variant::NO_VARIANT) {
        // Both known variants start with "-19"; only the last two digits differ.
        buf[pos++] = '-';
        buf[pos++] = '1';
        buf[pos++] = '9';
        switch (mVariant) {
            case Variant::GERMAN_1901_ORTHOGRAPHY:
                buf[pos++] = '0';
                buf[pos++] = '1';
                break;
            case Variant::GERMAN_1996_ORTHOGRAPHY:
                buf[pos++] = '9';
                buf[pos++] = '6';
                break;
            default:
                MINIKIN_ASSERT(false, "Must not reached.");
        }
    }

    return pos;
}

// Returns a copy of this locale restricted to the subtags selected in bits. Fields that are not
// selected keep the value of a default-constructed Locale, except for the language, which is set
// to the packed code "und" when it is not selected. Selecting the script also copies the
// sub-script bits.
Locale Locale::getPartialLocale(SubtagBits bits) const {
    const auto isSelected = [bits](SubtagBits flag) {
        return (bits & flag) != SubtagBits::EMPTY;
    };

    Locale partial;
    partial.mLanguage = isSelected(SubtagBits::LANGUAGE) ? mLanguage : packLanguage("und");
    if (isSelected(SubtagBits::SCRIPT)) {
        partial.mScript = mScript;
        partial.mSubScriptBits = mSubScriptBits;
    }
    if (isSelected(SubtagBits::REGION)) {
        partial.mRegion = mRegion;
    }
    if (isSelected(SubtagBits::VARIANT)) {
        partial.mVariant = mVariant;
    }
    if (isSelected(SubtagBits::EMOJI)) {
        partial.mEmojiStyle = mEmojiStyle;
    }
    return partial;
}

// Returns true if this locale and the other one hold the same packed script value.
bool Locale::isEqualScript(const Locale& other) const {
    return mScript == other.mScript;
}

// static
// Returns true if at least one bit is requested and every requested bit is also set in the
// provided bits.
bool Locale::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
    if (requestedBits == 0) {
        return false;  // an empty request is never considered supported
    }
    const uint8_t missingBits = requestedBits & ~providedBits;
    return missingBits == 0;
}

// Returns true if this locale supports the script given as a four-character tag value (the
// static_assert checks that unpacked scripts use the same encoding as HB_TAG). The script is
// supported when it equals this locale's script or when this locale's sub-script bits cover all
// sub-script bits of the requested script.
bool Locale::supportsScript(uint32_t script) const {
    static_assert(unpackScript(packScript('J', 'p', 'a', 'n')) == HB_TAG('J', 'p', 'a', 'n'),
                  "The Minikin script and HarfBuzz hb_script_t have different encodings.");
    const uint32_t requested = packScript(script);
    return requested == mScript ||
           supportsScript(mSubScriptBits, scriptToSubScriptBits(requested));
}

bool Locale::supportsScript(char c1, char c2, char c3, char c4) const {
    uint32_t packedScript = packScript(c1, c2, c3, c4);
    if (packedScript == mScript) return true;
    return supportsScript(mSubScriptBits, scriptToSubScriptBits(packedScript));
}

int Locale::calcScoreFor(const LocaleList& supported) const {
    bool languageScriptMatch = false;
    bool subtagMatch = false;
    bool scriptMatch = false;

    for (size_t i = 0; i < supported.size(); ++i) {
        if (mEmojiStyle != EmojiStyle::EMPTY && mEmojiStyle == supported[i].mEmojiStyle) {
            subtagMatch = true;
            if (mLanguage == supported[i].mLanguage) {
                return 4;
            }
        }
        if (isEqualScript(supported[i]) ||
            supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
            scriptMatch = true;
            if (mLanguage == supported[i].mLanguage) {
                languageScriptMatch = true;
            }
        }
    }

    if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
        scriptMatch = true;
        if (mLanguage == supported[0].mLanguage && supported.isAllTheSameLocale()) {
            return 3;
        }
    }

    if (languageScriptMatch) {
        return 3;
    } else if (subtagMatch) {
        return 2;
    } else if (scriptMatch) {
        return 1;
    }
    return 0;
}

// Returns the HarfBuzz language created from the locale's string form, or HB_LANGUAGE_INVALID
// if the locale reports that it is not supported.
static hb_language_t buildHbLanguage(const Locale& locale) {
    if (!locale.isSupported()) {
        return HB_LANGUAGE_INVALID;
    }
    const std::string languageTag = locale.getString();
    return hb_language_from_string(languageTag.c_str(), -1);
}

// Takes over the given locales and precomputes the summary data of the list:
//  - the union of the sub-script bits of all locales,
//  - whether every locale has the same language as the first one,
//  - one HarfBuzz language per locale,
//  - the emoji style of the first locale whose emoji style is not EMPTY.
LocaleList::LocaleList(std::vector<Locale>&& locales) : mLocales(std::move(locales)) {
    mUnionOfSubScriptBits = 0u;
    mIsAllTheSameLocale = true;
    mEmojiStyle = EmojiStyle::EMPTY;
    mHbLangs.reserve(mLocales.size());

    const auto firstLanguage = mLocales.empty() ? NO_LANGUAGE : mLocales[0].mLanguage;
    for (size_t idx = 0; idx < mLocales.size(); ++idx) {
        const Locale& current = mLocales[idx];

        mUnionOfSubScriptBits |= current.mSubScriptBits;
        if (current.mLanguage != firstLanguage) {
            mIsAllTheSameLocale = false;
        }
        mHbLangs.push_back(buildHbLanguage(current));
        // Keep the first non-empty emoji style; later locales do not override it.
        if (mEmojiStyle == EmojiStyle::EMPTY) {
            mEmojiStyle = current.getEmojiStyle();
        }
    }
}

bool LocaleList::atLeastOneScriptMatch(const LocaleList& list) const {
    if ((mUnionOfSubScriptBits & list.mUnionOfSubScriptBits) != 0) {
        return true;
    }

    for (const Locale& myLocale : mLocales) {
        for (const Locale& otherLocale : list.mLocales) {
            if (myLocale.isEqualScript(otherLocale)) {
                return true;
            }
        }
    }

    return false;
}

bool LocaleList::hasScript(char c1, char c2, char c3, char c4) const {
    for (const Locale& locale : mLocales) {
        if (locale.supportsScript(c1, c2, c3, c4)) {
            return true;
        }
    }
    return false;
}

}  // namespace minikin