OpenHarmony-v3.2.3-Release/search

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Minikin"

#include "FontLanguage.h"

#include <hb.h>
#include <string.h>
#include <unicode/uloc.h>
#include <algorithm>

namespace minikin {

// Packs four character codes into one 32-bit tag, with c1 in the most
// significant byte and c4 in the least significant byte. With constant
// arguments the result is a constant expression.
#define SCRIPT_TAG(c1, c2, c3, c4)                                      \
  ((static_cast<uint32_t>(c1) << 24) | (static_cast<uint32_t>(c2) << 16) | \
   (static_cast<uint32_t>(c3) << 8) | static_cast<uint32_t>(c4))

// Returns true if the first bufLen bytes of buf begin with the subtagLen-byte
// string subtag and that match is a complete subtag, i.e. it is followed by
// the end of the buffer, a NUL byte, '-' or '_'.
static bool isEmojiSubtag(const char* buf,
                          size_t bufLen,
                          const char* subtag,
                          size_t subtagLen) {
  // A buffer shorter than the subtag cannot contain it; otherwise the leading
  // bytes have to match exactly.
  const bool startsWithSubtag =
      bufLen >= subtagLen && strncmp(buf, subtag, subtagLen) == 0;
  if (!startsWithSubtag) {
    return false;
  }
  if (bufLen == subtagLen) {
    return true;  // the subtag runs to the end of the buffer
  }
  // Otherwise the byte right after the match must terminate the subtag.
  switch (buf[subtagLen]) {
    case '\0':
    case '-':
    case '_':
      return true;
    default:
      return false;
  }
}

// Encodes a two- or three-character language or region code into a 16-bit
// integer using 5 bits per character; for valid codes only the low 15 bits
// are used and the highest bit is 0.
//
// Each character is stored as its offset from a base character. Language
// codes consist of lowercase letters (26 possible values) and three-character
// region codes consist of digits (10 possible values), so every offset fits
// in 5 bits.
//
// A two-character code fills the first 5-bit field with all ones (0x1f) as a
// marker and stores its characters in the remaining two fields. Any length
// other than 2 is encoded as a three-character code.
static uint16_t packLanguageOrRegion(const char* c,
                                     size_t length,
                                     uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
  const uint16_t kTwoLetterMarker = 0x1f;

  if (length != 2) {
    const unsigned first = static_cast<uint16_t>(c[0] - threeLetterBase);
    const unsigned second = static_cast<uint16_t>(c[1] - threeLetterBase);
    const unsigned third = static_cast<uint16_t>(c[2] - threeLetterBase);
    return static_cast<uint16_t>((first << 10) | (second << 5) | third);
  }

  const unsigned first = static_cast<uint16_t>(c[0] - twoLetterBase);
  const unsigned second = static_cast<uint16_t>(c[1] - twoLetterBase);
  return static_cast<uint16_t>((kTwoLetterMarker << 10) | (first << 5) |
                               second);
}

// Decodes a value made of three 5-bit fields into its two- or three-character
// code. The characters are written to out without a terminating NUL and the
// number of characters written (2 or 3) is returned.
//
// If the first field is all ones (0x1f) the value is a two-character code and
// twoLetterBase is added to the other two fields; otherwise threeLetterBase
// is added to all three fields.
static size_t unpackLanguageOrRegion(uint16_t in,
                                     char* out,
                                     uint8_t twoLetterBase,
                                     uint8_t threeLetterBase) {
  const uint8_t kFieldMask = 0x1f;
  const uint8_t high = (in >> 10) & kFieldMask;
  const uint8_t middle = (in >> 5) & kFieldMask;
  const uint8_t low = in & kFieldMask;

  const bool isTwoLetter = (high == kFieldMask);
  const uint8_t base = isTwoLetter ? twoLetterBase : threeLetterBase;

  size_t written = 0;
  if (!isTwoLetter) {
    out[written++] = high + base;
  }
  out[written++] = middle + base;
  out[written++] = low + base;
  return written;
}

// Scans buffer from startOffset and returns the index of the first '-' or '_'
// found before bufferLength. Returns bufferLength when there is no such
// delimiter (including when startOffset is not below bufferLength).
static size_t nextDelimiterIndex(const char* buffer,
                                 size_t bufferLength,
                                 size_t startOffset) {
  size_t pos = startOffset;
  while (pos < bufferLength && buffer[pos] != '-' && buffer[pos] != '_') {
    ++pos;
  }
  return pos < bufferLength ? pos : bufferLength;
}

// Returns true if c is one of the ASCII letters 'a'..'z'.
static inline bool isLowercase(char c) {
  // Characters below 'a' give a negative difference, which wraps to a large
  // unsigned value, so one comparison checks both ends of the range.
  return static_cast<unsigned>(c - 'a') <= static_cast<unsigned>('z' - 'a');
}

// Returns true if c is one of the ASCII letters 'A'..'Z'.
static inline bool isUppercase(char c) {
  // Characters below 'A' give a negative difference, which wraps to a large
  // unsigned value, so one comparison checks both ends of the range.
  return static_cast<unsigned>(c - 'A') <= static_cast<unsigned>('Z' - 'A');
}

// Returns true if c is one of the ASCII digits '0'..'9'.
static inline bool isDigit(char c) {
  // Characters below '0' give a negative difference, which wraps to a large
  // unsigned value, so one comparison checks both ends of the range.
  return static_cast<unsigned>(c - '0') <= static_cast<unsigned>('9' - '0');
}

// Returns true if the first `length` characters of buffer form a language
// code: exactly two or three characters, all lowercase ASCII letters.
static inline bool isValidLanguageCode(const char* buffer, size_t length) {
  if (length < 2 || length > 3) {
    return false;
  }
  for (size_t i = 0; i < length; ++i) {
    if (!isLowercase(buffer[i])) {
      return false;
    }
  }
  return true;
}

// Returns true if buffer holds a script code: one uppercase ASCII letter
// followed by three lowercase ASCII letters. Exactly four characters are
// read, so the caller must provide a buffer of at least that length.
static inline bool isValidScriptCode(const char* buffer) {
  if (!isUppercase(buffer[0])) {
    return false;
  }
  for (size_t i = 1; i < 4; ++i) {
    if (!isLowercase(buffer[i])) {
      return false;
    }
  }
  return true;
}

// Returns true if the first `length` characters of buffer form a region
// code: either two uppercase ASCII letters or three ASCII digits.
static inline bool isValidRegionCode(const char* buffer, size_t length) {
  switch (length) {
    case 2:
      return isUppercase(buffer[0]) && isUppercase(buffer[1]);
    case 3:
      return isDigit(buffer[0]) && isDigit(buffer[1]) && isDigit(buffer[2]);
    default:
      return false;
  }
}

// Builds a FontLanguage from the first `length` bytes of a BCP 47 style
// language identifier of the form language[-Script][-REGION]. Subtags may be
// separated by either '-' or '_'.
//
// If the leading subtag is not a two- or three-letter lowercase language
// code, nothing is parsed and the object keeps the state set up by the
// default constructor. Script and region subtags that fail validation are
// ignored.
FontLanguage::FontLanguage(const char* buf, size_t length) : FontLanguage() {
  const size_t languageEnd = nextDelimiterIndex(buf, length, 0);
  if (!isValidLanguageCode(buf, languageEnd)) {
    // We don't understand anything other than two-letter or three-letter
    // language codes, so the rest of the string is not parsed.
    return;
  }
  mLanguage = packLanguageOrRegion(buf, languageEnd, 'a', 'a');

  if (languageEnd == length) {
    // The identifier is a language code and nothing else.
    mHbLanguage = hb_language_from_string(getString().c_str(), -1);
    return;
  }

  size_t subtagStart = languageEnd + 1;
  size_t subtagEnd = nextDelimiterIndex(buf, length, subtagStart);
  size_t subtagLength = subtagEnd - subtagStart;

  if (subtagLength == 4) {
    // A four-character subtag is a candidate script code.
    const char* script = buf + subtagStart;
    if (isValidScriptCode(script)) {
      mScript = SCRIPT_TAG(script[0], script[1], script[2], script[3]);
      mSubScriptBits = scriptToSubScriptBits(mScript);
    }

    if (subtagEnd != length) {
      // Step over the four-character subtag to the one that follows it.
      subtagStart = subtagEnd + 1;
      subtagEnd = nextDelimiterIndex(buf, length, subtagStart);
      subtagLength = subtagEnd - subtagStart;
    }
    // Otherwise there is no further subtag: subtagLength is still 4, so the
    // region check below does not apply.
  }

  if (subtagLength == 2 || subtagLength == 3) {
    // A two- or three-character subtag is a candidate region code.
    const char* region = buf + subtagStart;
    if (isValidRegionCode(region, subtagLength)) {
      mRegion = packLanguageOrRegion(region, subtagLength, 'A', '0');
    }
  }

  // getString() reads the fields assigned above, so this must come last.
  mHbLanguage = hb_language_from_string(getString().c_str(), -1);
  mEmojiStyle = resolveEmojiStyle(buf, length, mScript);
}

// static
// Determines the emoji style requested by a language identifier.
//
// The first "-u-em-" sequence found in the first `length` bytes of buf takes
// priority: a following "emoji", "text" or "default" subtag selects
// EMSTYLE_EMOJI, EMSTYLE_TEXT or EMSTYLE_DEFAULT. If none of those matches,
// the script decides: Zsye gives EMSTYLE_EMOJI and Zsym gives EMSTYLE_TEXT.
// Everything else yields EMSTYLE_EMPTY.
FontLanguage::EmojiStyle FontLanguage::resolveEmojiStyle(const char* buf,
                                                         size_t length,
                                                         uint32_t script) {
  // "-u-em-text" is the shortest possible emoji subtag and is 10 characters
  // long, so shorter identifiers do not need to be searched at all.
  const size_t kMinSubtagLength = 10;
  if (length >= kMinSubtagLength) {
    static const char kPrefix[] = "-u-em-";
    const size_t prefixLength = strlen(kPrefix);
    const char* const end = buf + length;
    const char* const match =
        std::search(buf, end, kPrefix, kPrefix + prefixLength);
    if (match != end) {
      // Examine whatever follows the prefix.
      const char* const value = match + prefixLength;
      const size_t valueLength = static_cast<size_t>(end - value);
      if (isEmojiSubtag(value, valueLength, "emoji", 5)) {
        return EMSTYLE_EMOJI;
      }
      if (isEmojiSubtag(value, valueLength, "text", 4)) {
        return EMSTYLE_TEXT;
      }
      if (isEmojiSubtag(value, valueLength, "default", 7)) {
        return EMSTYLE_DEFAULT;
      }
    }
  }

  // No usable emoji subtag was given, so fall back to the script code.
  switch (script) {
    case SCRIPT_TAG('Z', 's', 'y', 'e'):
      return EMSTYLE_EMOJI;
    case SCRIPT_TAG('Z', 's', 'y', 'm'):
      return EMSTYLE_TEXT;
    default:
      return EMSTYLE_EMPTY;
  }
}

// static
// Translates a four-letter script tag into the combination of sub-script
// flags that the script covers. Scripts that are not listed map to 0.
uint8_t FontLanguage::scriptToSubScriptBits(uint32_t script) {
  switch (script) {
    case SCRIPT_TAG('B', 'o', 'p', 'o'):
      return kBopomofoFlag;
    case SCRIPT_TAG('H', 'a', 'n', 'g'):
      return kHangulFlag;
    case SCRIPT_TAG('H', 'a', 'n', 'b'):
      // Bopomofo is almost exclusively used in Taiwan.
      return kHanFlag | kBopomofoFlag;
    case SCRIPT_TAG('H', 'a', 'n', 'i'):
      return kHanFlag;
    case SCRIPT_TAG('H', 'a', 'n', 's'):
      return kHanFlag | kSimplifiedChineseFlag;
    case SCRIPT_TAG('H', 'a', 'n', 't'):
      return kHanFlag | kTraditionalChineseFlag;
    case SCRIPT_TAG('H', 'i', 'r', 'a'):
      return kHiraganaFlag;
    case SCRIPT_TAG('H', 'r', 'k', 't'):
      return kKatakanaFlag | kHiraganaFlag;
    case SCRIPT_TAG('J', 'p', 'a', 'n'):
      return kHanFlag | kKatakanaFlag | kHiraganaFlag;
    case SCRIPT_TAG('K', 'a', 'n', 'a'):
      return kKatakanaFlag;
    case SCRIPT_TAG('K', 'o', 'r', 'e'):
      return kHanFlag | kHangulFlag;
    default:
      return 0u;
  }
}

// Rebuilds the textual form "language[-Script][-REGION]" from the packed
// fields. Returns "und" when isUnsupported() is true. The script is emitted
// only when mScript is non-zero, and the region only when mRegion is not
// INVALID_CODE.
std::string FontLanguage::getString() const {
  if (isUnsupported()) {
    return "und";
  }

  // The longest output is 3 (language) + 1 + 4 (script) + 1 + 3 (region) = 12
  // characters, which fits in this buffer.
  char tag[16];
  size_t len = unpackLanguageOrRegion(mLanguage, tag, 'a', 'a');

  if (mScript != 0) {
    tag[len++] = '-';
    // Write the four script characters, most significant byte first.
    for (int shift = 24; shift >= 0; shift -= 8) {
      tag[len++] = (mScript >> shift) & 0xFFu;
    }
  }

  if (mRegion != INVALID_CODE) {
    tag[len++] = '-';
    len += unpackLanguageOrRegion(mRegion, tag + len, 'A', '0');
  }

  return std::string(tag, len);
}

bool FontLanguage::isEqualScript(const FontLanguage& other) const {
  return other.mScript == mScript;
}

// static
// Returns true if every flag set in requestedBits is also set in
// providedBits. An empty request (requestedBits == 0) is never supported.
bool FontLanguage::supportsScript(uint8_t providedBits, uint8_t requestedBits) {
  if (requestedBits == 0) {
    return false;
  }
  return (providedBits & requestedBits) == requestedBits;
}

// Returns true if this language covers the given HarfBuzz script: either the
// script equals this language's script tag, or all sub-script flags derived
// from the script are present in mSubScriptBits.
bool FontLanguage::supportsHbScript(hb_script_t script) const {
  // Comparing hb_script_t with mScript is only meaningful while both use the
  // same four-byte tag encoding.
  static_assert(
      SCRIPT_TAG('J', 'p', 'a', 'n') == HB_TAG('J', 'p', 'a', 'n'),
      "The Minikin script and HarfBuzz hb_script_t have different encodings.");
  return script == mScript ||
         supportsScript(mSubScriptBits, scriptToSubScriptBits(script));
}

int FontLanguage::calcScoreFor(const FontLanguages& supported) const {
  bool languageScriptMatch = false;
  bool subtagMatch = false;
  bool scriptMatch = false;

  for (size_t i = 0; i < supported.size(); ++i) {
    if (mEmojiStyle != EMSTYLE_EMPTY &&
        mEmojiStyle == supported[i].mEmojiStyle) {
      subtagMatch = true;
      if (mLanguage == supported[i].mLanguage) {
        return 4;
      }
    }
    if (isEqualScript(supported[i]) ||
        supportsScript(supported[i].mSubScriptBits, mSubScriptBits)) {
      scriptMatch = true;
      if (mLanguage == supported[i].mLanguage) {
        languageScriptMatch = true;
      }
    }
  }

  if (supportsScript(supported.getUnionOfSubScriptBits(), mSubScriptBits)) {
    scriptMatch = true;
    if (mLanguage == supported[0].mLanguage &&
        supported.isAllTheSameLanguage()) {
      return 3;
    }
  }

  if (languageScriptMatch) {
    return 3;
  } else if (subtagMatch) {
    return 2;
  } else if (scriptMatch) {
    return 1;
  }
  return 0;
}

// Takes ownership of `languages` and, for a non-empty list, precomputes two
// summaries: the union of all entries' sub-script bits and whether every
// entry has the same language code as the first one. For an empty list both
// summaries are left as they were initialized.
FontLanguages::FontLanguages(std::vector<FontLanguage>&& languages)
    : mLanguages(std::move(languages)) {
  if (mLanguages.empty()) {
    return;
  }

  const FontLanguage& first = mLanguages.front();
  mUnionOfSubScriptBits = first.mSubScriptBits;
  mIsAllTheSameLanguage = true;

  for (size_t idx = 1; idx < mLanguages.size(); ++idx) {
    const FontLanguage& current = mLanguages[idx];
    mUnionOfSubScriptBits |= current.mSubScriptBits;
    // Once an entry differs from the first one the flag stays false.
    mIsAllTheSameLanguage =
        mIsAllTheSameLanguage && first.mLanguage == current.mLanguage;
  }
}

#undef SCRIPT_TAG
}  // namespace minikin