src/minikin/WordBreaker.cpp

/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#define LOG_TAG "Minikin"

#include <log/log.h>

#include <minikin/Emoji.h>
#include <minikin/Hyphenator.h>
#include <minikin/WordBreaker.h>
#include "MinikinInternal.h"

#include <unicode/uchar.h>
#include <unicode/utf16.h>

namespace minikin {

// Code points that receive special treatment when validating break positions.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;  // U+00AD SOFT HYPHEN
constexpr uint32_t CHAR_ZWJ = 0x200D;          // U+200D ZERO WIDTH JOINER

// libtxt extension: avoid the cost of initializing new ICU break iterators
// by constructing a global iterator using the default locale and then
// creating a clone for each WordBreaker instance.
//
// gLibtxtBreakIteratorInitFlag guards the one-time creation of
// gLibtxtDefaultBreakIterator, which happens lazily in
// WordBreaker::setLocale(). The prototype is never deleted by the code in
// this file.
static std::once_flag gLibtxtBreakIteratorInitFlag;
static icu::BreakIterator* gLibtxtDefaultBreakIterator = nullptr;

void WordBreaker::setLocale() {
  UErrorCode status = U_ZERO_ERROR;
  std::call_once(gLibtxtBreakIteratorInitFlag, [&status] {
    gLibtxtDefaultBreakIterator =
        icu::BreakIterator::createLineInstance(icu::Locale(), status);
  });
  mBreakIterator.reset(gLibtxtDefaultBreakIterator->clone());
  // TODO: handle failure status
  if (mText != nullptr) {
    mBreakIterator->setText(&mUText, status);
  }
  mIteratorWasReset = true;
}

void WordBreaker::setText(const uint16_t* data, size_t size) {
  mText = data;
  mTextSize = size;
  mIteratorWasReset = false;
  mLast = 0;
  mCurrent = 0;
  mScanOffset = 0;
  mInEmailOrUrl = false;
  UErrorCode status = U_ZERO_ERROR;
  utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size,
                   &status);
  mBreakIterator->setText(&mUText, status);
  mBreakIterator->first();
}

// Returns the offset of the most recently computed break (mCurrent). It is 0
// right after setText() and is updated by each call to next().
ssize_t WordBreaker::current() const {
  return mCurrent;
}

/**
 * Determine whether a line break at position i within the buffer buf is valid.
 * This represents customization beyond the ICU behavior, because plain ICU
 * provides some line break opportunities that we don't want.
 *
 * @param buf    UTF-16 text buffer.
 * @param bufEnd Number of UTF-16 code units in buf.
 * @param i      Candidate break offset, in code units from the start of buf.
 * @return true if a break at offset i is acceptable, false if it must be
 *         suppressed.
 *
 * Offsets at the start of the buffer (i == 0) or at/after its end
 * (i >= bufEnd) are accepted without looking at neighbouring characters,
 * because one side of the break has no code unit to inspect.
 **/
static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
  // U16_PREV reads buf[i - 1] and U16_NEXT reads buf[i] unconditionally, so
  // both ends of the buffer have to be excluded before using them.
  if (i == 0 || i >= bufEnd) {
    return true;
  }

  uint32_t codePoint;
  size_t prev_offset = i;
  U16_PREV(buf, 0, prev_offset, codePoint);
  // Do not break on hard or soft hyphens. These are handled by automatic
  // hyphenation.
  if (Hyphenator::isLineBreakingHyphen(codePoint) ||
      codePoint == CHAR_SOFT_HYPHEN) {
    // txt addition: Temporarily always break on hyphen. Changed from false to
    // true.
    return true;
  }
  // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA,
  // consonant>. This is to go around a bug in ICU line breaking:
  // http://bugs.icu-project.org/trac/ticket/12561. To avoid too much looking
  // around in the strings, we simply avoid breaking after any Myanmar virama,
  // where no line break could be imagined, since the Myanmar virama is a pure
  // stacker.
  if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
    return false;
  }

  uint32_t next_codepoint;
  size_t next_offset = i;
  U16_NEXT(buf, next_offset, bufEnd, next_codepoint);

  // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may
  // have fresher emoji data than ICU does.
  if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
    return false;
  }

  // Rule LB30b. We need to do this ourselves since we may have fresher emoji
  // data than ICU does.
  if (isEmojiModifier(next_codepoint)) {
    if (codePoint == 0xFE0F && prev_offset > 0) {
      // skip over emoji variation selector
      U16_PREV(buf, 0, prev_offset, codePoint);
    }
    if (isEmojiBase(codePoint)) {
      return false;
    }
  }
  return true;
}

// Advances the ICU iterator to the next break that we accept. This layers two
// things on top of ICU: re-seeking from mCurrent after the iterator has been
// reset or repositioned, and skipping break opportunities that isBreakValid()
// rejects. Returns icu::BreakIterator::DONE when ICU has no further breaks.
int32_t WordBreaker::iteratorNext() {
  for (;;) {
    int32_t candidate;
    if (!mIteratorWasReset) {
      candidate = mBreakIterator->next();
    } else {
      // The iterator position is stale; restart just after our own position.
      candidate = mBreakIterator->following(mCurrent);
      mIteratorWasReset = false;
    }

    if (candidate == icu::BreakIterator::DONE) {
      return candidate;
    }
    // The end of the text is always accepted without further checks.
    if (static_cast<size_t>(candidate) == mTextSize) {
      return candidate;
    }
    if (isBreakValid(mText, mTextSize, candidate)) {
      return candidate;
    }
  }
}

// Returns true for the characters after which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
  switch (c) {
    case ':':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}

// Returns true for the characters before which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses.
static bool breakBefore(uint16_t c) {
  switch (c) {
    case '~':
    case '.':
    case ',':
    case '-':
    case '_':
    case '?':
    case '#':
    case '%':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}

// States of the forward scan that recognizes email addresses ("@") and URLs
// ("://"). The numeric order matters: the scanner moves from SAW_COLON to
// SAW_COLON_SLASH to SAW_COLON_SLASH_SLASH by adding one to the current state.
enum ScanState {
  START = 0,                  // nothing recognized yet
  SAW_AT = 1,                 // found '@'
  SAW_COLON = 2,              // found ':'
  SAW_COLON_SLASH = 3,        // found ":/"
  SAW_COLON_SLASH_SLASH = 4,  // found "://"
};

// Scans forward from mLast over a run of printable, non-space ASCII and
// decides whether that run looks like an email address (contains '@') or a
// URL (contains "://"). Updates mInEmailOrUrl and records the end of the
// scanned run in mScanOffset.
void WordBreaker::detectEmailOrUrl() {
  // Only rescan once the previous break has reached or passed the end of the
  // last scanned run.
  if (mLast < mScanOffset) {
    return;
  }

  ScanState state = START;
  size_t pos = mLast;
  for (; pos < mTextSize; ++pos) {
    const uint16_t unit = mText[pos];
    // scan only ASCII characters, stop at space
    if (unit <= ' ' || unit > 0x007E) {
      break;
    }
    switch (state) {
      case START:
        if (unit == '@') {
          state = SAW_AT;
        } else if (unit == ':') {
          state = SAW_COLON;
        }
        break;
      case SAW_COLON:
        // a slash advances towards "://"; anything else starts over
        state = (unit == '/') ? SAW_COLON_SLASH : START;
        break;
      case SAW_COLON_SLASH:
        state = (unit == '/') ? SAW_COLON_SLASH_SLASH : START;
        break;
      case SAW_AT:
      case SAW_COLON_SLASH_SLASH:
        // terminal states: keep them until the end of the run
        break;
    }
  }

  const bool looksLikeEmailOrUrl =
      (state == SAW_AT || state == SAW_COLON_SLASH_SLASH);
  if (looksLikeEmailOrUrl) {
    if (!mBreakIterator->isBoundary(pos)) {
      // If there are combining marks or such at the end of the URL or the
      // email address, consider them a part of the URL or the email, and skip
      // to the next actual boundary.
      pos = mBreakIterator->following(pos);
    }
    // The ICU iterator was moved by the boundary queries above.
    mIteratorWasReset = true;
  }
  mInEmailOrUrl = looksLikeEmailOrUrl;
  mScanOffset = pos;
}

// Finds the next break offset inside an email address or URL, using the
// special rules from the Chicago Manual of Style (16th ed.). The search
// starts right after mLast and never goes past mScanOffset; if no rule
// fires, the returned offset is where the loop stopped.
ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
  uint16_t prevChar = mText[mLast];
  ssize_t pos = mLast + 1;
  while (pos < mScanOffset) {
    if (breakAfter(prevChar)) {
      return pos;
    }
    // break after double slash
    if (prevChar == '/' && pos >= mLast + 2 && mText[pos - 2] == '/') {
      return pos;
    }

    const uint16_t curChar = mText[pos];
    // never break after hyphen
    if (prevChar != '-') {
      if (breakBefore(curChar)) {
        return pos;
      }
      // break before single slash, i.e. one that neither follows nor
      // precedes another slash
      if (curChar == '/' && prevChar != '/' &&
          !(pos + 1 < mScanOffset && mText[pos + 1] == '/')) {
        return pos;
      }
    }

    prevChar = curChar;
    ++pos;
  }
  return pos;
}

// Advances to the next break and returns its offset. The previous break is
// remembered in mLast. Runs detected as email addresses or URLs use the
// dedicated break rules; everything else goes through the ICU iterator.
ssize_t WordBreaker::next() {
  mLast = mCurrent;
  detectEmailOrUrl();
  mCurrent = mInEmailOrUrl ? findNextBreakInEmailOrUrl()
                           : static_cast<ssize_t>(iteratorNext());
  return mCurrent;
}

// Returns the offset where the word in [mLast, mCurrent) starts, after
// skipping leading punctuation. Inside an email address or URL this is simply
// mLast.
ssize_t WordBreaker::wordStart() const {
  ssize_t start = mLast;
  if (mInEmailOrUrl) {
    return start;
  }
  while (start < mCurrent) {
    ssize_t afterChar = start;
    UChar32 cp;
    U16_NEXT(mText, afterChar, mCurrent, cp);
    // strip leading punctuation, defined as OP and QU line breaking classes,
    // see UAX #14
    const int32_t lineBreakClass = u_getIntPropertyValue(cp, UCHAR_LINE_BREAK);
    const bool isLeadingPunctuation =
        (lineBreakClass == U_LB_OPEN_PUNCTUATION ||
         lineBreakClass == U_LB_QUOTATION);
    if (!isLeadingPunctuation) {
      return start;
    }
    start = afterChar;
  }
  return start;
}

// Returns the offset where the word in [mLast, mCurrent) ends, after dropping
// trailing spaces and punctuation. Inside an email address or URL this is
// mLast, the same value wordStart() returns in that case.
ssize_t WordBreaker::wordEnd() const {
  if (mInEmailOrUrl) {
    return mLast;
  }
  ssize_t end = mCurrent;
  while (end > mLast) {
    ssize_t beforeChar = end;
    UChar32 cp;
    U16_PREV(mText, mLast, beforeChar, cp);
    // strip trailing space and punctuation
    const bool isStrippable =
        (U_GET_GC_MASK(cp) & (U_GC_ZS_MASK | U_GC_P_MASK)) != 0;
    if (!isStrippable) {
      break;
    }
    end = beforeChar;
  }
  return end;
}

// Returns a penalty for breaking at the current position: 1 when the break
// falls inside an email address or URL (before the end of the scanned run),
// 0 otherwise.
int WordBreaker::breakBadness() const {
  if (mInEmailOrUrl && mCurrent < mScanOffset) {
    return 1;
  }
  return 0;
}

// Detaches the breaker from its text: closes the UText wrapper and forgets
// the buffer pointer.
void WordBreaker::finish() {
  // Note: calling utext_close multiple times is safe
  utext_close(&mUText);
  mText = nullptr;
}

}  // namespace minikin