// OpenHarmony-v3.2.3-Release/s

/*
 * Copyright (C) 2014 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <stdint.h>
#include <unicode/uchar.h>
#include <unicode/utf16.h>
#include <algorithm>

#include <minikin/Emoji.h>
#include <minikin/GraphemeBreak.h>
#include "MinikinInternal.h"
#include "utils/WindowsUtils.h"

namespace minikin {

// Returns the Grapheme_Cluster_Break property value for code point |c|, with
// a handful of manual overrides applied on top of what ICU reports:
//   * a curated set of characters that are defined as Control is reported as
//     Extend instead;
//   * THAI CHARACTER SARA AM is reported as Other.
// Every other code point gets ICU's UCHAR_GRAPHEME_CLUSTER_BREAK value.
int32_t tailoredGraphemeClusterBreak(uint32_t c) {
  // Individually listed overrides.
  switch (c) {
    case 0x00AD:  // SHY
    case 0x061C:  // ALM
    case 0x180E:  // MONGOLIAN VOWEL SEPARATOR
    case 0x200B:  // ZWSP
    case 0x200E:  // LRM
    case 0x200F:  // RLM
    case 0xFEFF:  // BOM
      return U_GCB_EXTEND;
    case 0x0E33:
      // THAI CHARACTER SARA AM is treated as a normal letter by most other
      // implementations: they allow a grapheme break before it.
      return U_GCB_OTHER;
    default:
      break;
  }

  // Contiguous ranges that are also treated as Extend.
  // LRE, RLE, PDF, LRO, RLO
  const bool isBidiEmbeddingControl = (c >= 0x202A && c <= 0x202E);
  // WJ, invisible math operators, LRI, RLI, FSI, PDI, and the deprecated
  // invisible format controls
  const bool isInvisibleFormatControl = (c >= 0x2060 && c <= 0x206F);
  // recently undeprecated tag characters in Plane 14
  const bool isPlane14Tag = (c >= 0xE0000 && c <= 0xE007F);
  if (isBidiEmbeddingControl || isInvisibleFormatControl || isPlane14Tag) {
    return U_GCB_EXTEND;
  }

  // No tailoring applies: defer to ICU.
  return u_getIntPropertyValue(c, UCHAR_GRAPHEME_CLUSTER_BREAK);
}

// Tells whether |c| is one of the code points whose IndicSyllabicCategory is
// Pure_Killer. The list is hard-coded from
// http://www.unicode.org/Public/9.0.0/ucd/IndicSyllabicCategory.txt
bool isPureKiller(uint32_t c) {
  switch (c) {
    case 0x0E3A:
    case 0x0E4E:
    case 0x0F84:
    case 0x103A:
    case 0x1714:
    case 0x1734:
    case 0x17D1:
    case 0x1BAA:
    case 0x1BF2:
    case 0x1BF3:
    case 0xA806:
    case 0xA953:
    case 0xABED:
    case 0x11134:
    case 0x112EA:
    case 0x1172B:
      return true;
    default:
      return false;
  }
}

// Decides whether a grapheme cluster boundary is allowed at |offset| in a
// UTF-16 buffer.
//
// Parameters:
//   advances - optional advances, indexed by code-unit position relative to
//              |start| (the entry consulted is advances[offset - start]). May
//              be nullptr, in which case the character at |offset| is treated
//              as having no advance.
//   buf      - UTF-16 code units, indexed with absolute positions.
//   start    - index in |buf| of the first code unit of the run.
//   count    - number of code units in the run.
//   offset   - absolute index in |buf| of the candidate boundary.
//
// Returns true if a break is allowed at |offset| and false if |offset| falls
// inside a (tailored) extended grapheme cluster. Any offset at or outside the
// edges of [start, start + count) reports a break.
bool GraphemeBreak::isGraphemeBreak(const float* advances,
                                    const uint16_t* buf,
                                    size_t start,
                                    size_t count,
                                    const size_t offset) {
  // This implementation closely follows Unicode Standard Annex #29 on
  // Unicode Text Segmentation (http://www.unicode.org/reports/tr29/),
  // implementing a tailored version of extended grapheme clusters.
  // The GB rules refer to section 3.1.1, Grapheme Cluster Boundary Rules.
  // The rules are checked in order; the first one that matches decides.

  // Rule GB1, sot ÷; Rule GB2, ÷ eot
  if (offset <= start || offset >= start + count) {
    return true;
  }
  if (U16_IS_TRAIL(buf[offset])) {
    // Don't break a surrogate pair, but a lone trailing surrogate is a break
    return !U16_IS_LEAD(buf[offset - 1]);
  }
  // c1 is the code point ending at |offset|, c2 the one starting at |offset|.
  // The U16_* macros move offset_back / offset_forward past the code point
  // they read, so offset_back ends up at the first code unit of c1.
  uint32_t c1 = 0;
  uint32_t c2 = 0;
  size_t offset_back = offset;
  size_t offset_forward = offset;
  U16_PREV(buf, start, offset_back, c1);
  U16_NEXT(buf, offset_forward, start + count, c2);
  int32_t p1 = tailoredGraphemeClusterBreak(c1);
  int32_t p2 = tailoredGraphemeClusterBreak(c2);
  // Rule GB3, CR x LF
  if (p1 == U_GCB_CR && p2 == U_GCB_LF) {
    return false;
  }
  // Rule GB4, (Control | CR | LF) ÷
  if (p1 == U_GCB_CONTROL || p1 == U_GCB_CR || p1 == U_GCB_LF) {
    return true;
  }
  // Rule GB5, ÷ (Control | CR | LF)
  if (p2 == U_GCB_CONTROL || p2 == U_GCB_CR || p2 == U_GCB_LF) {
    return true;
  }
  // Rule GB6, L x ( L | V | LV | LVT )
  if (p1 == U_GCB_L &&
      (p2 == U_GCB_L || p2 == U_GCB_V || p2 == U_GCB_LV || p2 == U_GCB_LVT)) {
    return false;
  }
  // Rule GB7, ( LV | V ) x ( V | T )
  if ((p1 == U_GCB_LV || p1 == U_GCB_V) && (p2 == U_GCB_V || p2 == U_GCB_T)) {
    return false;
  }
  // Rule GB8, ( LVT | T ) x T
  if ((p1 == U_GCB_LVT || p1 == U_GCB_T) && p2 == U_GCB_T) {
    return false;
  }
  // Rule GB9, x (Extend | ZWJ); Rule GB9a, x SpacingMark; Rule GB9b, Prepend x
  if (p2 == U_GCB_EXTEND || p2 == U_GCB_ZWJ || p2 == U_GCB_SPACING_MARK ||
      p1 == U_GCB_PREPEND) {
    return false;
  }

  // This is used to decide font-dependent grapheme clusters. If we don't have
  // the advance information, we become conservative in grapheme breaking and
  // assume that it has no advance.
  const bool c2_has_advance =
      (advances != nullptr && advances[offset - start] != 0.0);

  // All the following rules are font-dependent, in the way that if we know c2
  // has an advance, we definitely know that it cannot form a grapheme with the
  // character(s) before it. So we make the decision in favor of a grapheme
  // break early.
  if (c2_has_advance) {
    return true;
  }

  // Note: For Rule GB10 and GB11 below, we do not use the Unicode line breaking
  // properties for determining emoji-ness and carry our own data, because our
  // data could be more fresh than what ICU provides.
  //
  // Tailored version of Rule GB10, (E_Base | EBG) Extend* × E_Modifier.
  // The rule itself says do not break between emoji base and emoji modifiers,
  // skipping all Extend characters. Variation selectors are considered Extend,
  // so they are handled fine.
  //
  // We tailor this by requiring that an actual ligature is formed. If the font
  // doesn't form a ligature, we allow a break before the modifier.
  if (isEmojiModifier(c2)) {
    uint32_t c0 = c1;
    size_t offset_backback = offset_back;
    int32_t p0 = p1;
    if (p0 == U_GCB_EXTEND && offset_backback > start) {
      // skip over one Extend code point (e.g. an emoji variation selector);
      // note that only a single Extend is skipped here, not a whole run
      U16_PREV(buf, start, offset_backback, c0);
    }
    if (isEmojiBase(c0)) {
      return false;
    }
  }

  // Tailored version of Rule GB11, ZWJ × (Glue_After_Zwj | EBG)
  // We try to make emoji sequences with ZWJ a single grapheme cluster, but only
  // if they actually merge to one cluster. So we are more relaxed than the UAX
  // #29 rules in accepting any emoji character after the ZWJ, but are tighter
  // in that we only treat it as one cluster if a ligature is actually formed
  // and we also require the character before the ZWJ to also be an emoji.
  if (p1 == U_GCB_ZWJ && isEmoji(c2) && offset_back > start) {
    // look at character before ZWJ to see that both can participate in an
    // emoji zwj sequence
    uint32_t c0 = 0;
    size_t offset_backback = offset_back;
    U16_PREV(buf, start, offset_backback, c0);
    if (c0 == 0xFE0F && offset_backback > start) {
      // skip over emoji variation selector (U+FE0F)
      U16_PREV(buf, start, offset_backback, c0);
    }
    if (isEmoji(c0)) {
      return false;
    }
  }

  // Tailored version of Rule GB12 and Rule GB13 that look at even-odd cases.
  // sot   (RI RI)*  RI x RI
  // [^RI] (RI RI)*  RI x RI
  //
  // If we have font information, we have already returned a break above
  // whenever the second character has an advance; reaching this point means
  // it has no advance, which we take to mean a ligature was formed.
  // If we don't, we look back like UAX #29 recommends, but only up to 1000 code
  // units.
  if (p1 == U_GCB_REGIONAL_INDICATOR && p2 == U_GCB_REGIONAL_INDICATOR) {
    if (advances != nullptr) {
      // We have advances information. But if we are here, we already know c2
      // has no advance. So we should definitely disallow a break.
      return false;
    } else {
      // Look at up to 1000 code units, never going before |start|.
      const size_t lookback_barrier =
          std::max((ssize_t)start, (ssize_t)offset_back - 1000);
      size_t offset_backback = offset_back;
      // Walk backwards over the run of regional indicators that ends with c1.
      while (offset_backback > lookback_barrier) {
        uint32_t c0 = 0;
        U16_PREV(buf, lookback_barrier, offset_backback, c0);
        if (tailoredGraphemeClusterBreak(c0) != U_GCB_REGIONAL_INDICATOR) {
          // Stepped one code point too far: move forward again so that
          // offset_backback points at the first regional indicator of the run.
          offset_backback += U16_LENGTH(c0);
          break;
        }
      }
      // The number 4 comes from the number of code units in a whole flag.
      // A break is allowed only when the code units between the start of the
      // run and |offset| form a whole number of flags.
      return (offset - offset_backback) % 4 == 0;
    }
  }
  // Cluster Indic syllables together (tailoring of UAX #29).
  // Immediately after each virama (that is not just a pure killer) followed by
  // a letter, we disallow grapheme breaks (if we are here, we don't know about
  // advances, or we already know that c2 has no advance).
  if (u_getIntPropertyValue(c1, UCHAR_CANONICAL_COMBINING_CLASS) == 9  // virama
      && !isPureKiller(c1) &&
      u_getIntPropertyValue(c2, UCHAR_GENERAL_CATEGORY) == U_OTHER_LETTER) {
    return false;
  }
  // Rule GB999, Any ÷ Any
  return true;
}

// Moves a cursor position to a grapheme cluster boundary inside the run
// [start, start + count) of |buf|, according to |opt|:
//   AFTER        - step one code unit forward (unless already at the end of
//                  the run), then advance to the next boundary.
//   AT_OR_AFTER  - advance until |offset| is on a boundary.
//   BEFORE       - step one code unit back (unless already at the start of
//                  the run), then retreat to the previous boundary.
//   AT_OR_BEFORE - retreat until |offset| is on a boundary.
//   AT           - keep |offset| if it is on a boundary, otherwise yield
//                  (size_t)-1.
// Boundaries are decided by isGraphemeBreak() with the same |advances|, |buf|,
// |start| and |count|. Returns the resulting offset.
size_t GraphemeBreak::getTextRunCursor(const float* advances,
                                       const uint16_t* buf,
                                       size_t start,
                                       size_t count,
                                       size_t offset,
                                       MoveOpt opt) {
  if (opt == AT) {
    // Pure query: no movement, just validate the position.
    const bool onBoundary =
        isGraphemeBreak(advances, buf, start, count, offset);
    return onBoundary ? offset : static_cast<size_t>(-1);
  }

  const bool movingForward = (opt == AFTER || opt == AT_OR_AFTER);
  const bool movingBackward = (opt == BEFORE || opt == AT_OR_BEFORE);

  if (movingForward) {
    // The strict variant first leaves the current position.
    if (opt == AFTER && offset < start + count) {
      ++offset;
    }
    for (; !isGraphemeBreak(advances, buf, start, count, offset); ++offset) {
    }
  } else if (movingBackward) {
    // The strict variant first leaves the current position.
    if (opt == BEFORE && offset > start) {
      --offset;
    }
    for (; !isGraphemeBreak(advances, buf, start, count, offset); --offset) {
    }
  }
  return offset;
}

}  // namespace minikin