1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "WordBreaker.h"
18
19 #include <list>
20 #include <map>
21
22 #include <unicode/ubrk.h>
23 #include <unicode/uchar.h>
24 #include <unicode/utf16.h>
25
26 #include "minikin/Emoji.h"
27 #include "minikin/Hyphenator.h"
28
29 #include "Locale.h"
30 #include "MinikinInternal.h"
31
32 namespace minikin {
33
34 namespace {
createNewIterator(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)35 static std::unique_ptr<BreakIterator> createNewIterator(const Locale& locale,
36 LineBreakStyle lbStyle,
37 LineBreakWordStyle lbWordStyle) {
38 MINIKIN_ASSERT(lbStyle != LineBreakStyle::Auto,
39 "LineBreakStyle::Auto must be resolved beforehand.");
40 MINIKIN_ASSERT(lbWordStyle != LineBreakWordStyle::Auto,
41 "LineBreakWordStyle::Auto must be resolved beforehand.");
42
43 // TODO: handle failure status
44 if (lbStyle == LineBreakStyle::NoBreak) {
45 return std::make_unique<NoBreakBreakIterator>();
46 } else {
47 UErrorCode status = U_ZERO_ERROR;
48 char localeID[ULOC_FULLNAME_CAPACITY] = {};
49 uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(),
50 localeID, ULOC_FULLNAME_CAPACITY, nullptr, &status);
51 IcuUbrkUniquePtr icuBrkPtr(
52 ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status));
53 return std::make_unique<ICUBreakIterator>(std::move(icuBrkPtr));
54 }
55 }
56 } // namespace
57
setText(UText * text,size_t)58 void ICUBreakIterator::setText(UText* text, size_t) {
59 UErrorCode status = U_ZERO_ERROR;
60 ubrk_setUText(mBreaker.get(), text, &status);
61 }
62
isBoundary(int32_t i)63 bool ICUBreakIterator::isBoundary(int32_t i) {
64 return ubrk_isBoundary(mBreaker.get(), i);
65 }
66
following(size_t i)67 int32_t ICUBreakIterator::following(size_t i) {
68 return ubrk_following(mBreaker.get(), i);
69 }
70
next()71 int32_t ICUBreakIterator::next() {
72 return ubrk_next(mBreaker.get());
73 }
74
acquire(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)75 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
76 LineBreakStyle lbStyle,
77 LineBreakWordStyle lbWordStyle) {
78 if (lbStyle == LineBreakStyle::Auto) {
79 lbStyle = locale.supportsScript('J', 'p', 'a', 'n') ? LineBreakStyle::Strict
80 : LineBreakStyle::None;
81 }
82
83 const uint64_t id = locale.getIdentifier();
84 std::lock_guard<std::mutex> lock(mMutex);
85 for (auto i = mPool.begin(); i != mPool.end(); i++) {
86 if (i->localeId == id && i->lbStyle == lbStyle && i->lbWordStyle == lbWordStyle) {
87 Slot slot = std::move(*i);
88 mPool.erase(i);
89 return slot;
90 }
91 }
92
93 // Not found in pool. Create new one.
94 return {id, lbStyle, lbWordStyle, createNewIterator(locale, lbStyle, lbWordStyle)};
95 }
96
release(ICULineBreakerPool::Slot && slot)97 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
98 if (slot.breaker.get() == nullptr) {
99 return; // Already released slot. Do nothing.
100 }
101 std::lock_guard<std::mutex> lock(mMutex);
102 if (mPool.size() >= MAX_POOL_SIZE) {
103 // Pool is full. Move to local variable, so that the given slot will be released when the
104 // variable leaves the scope.
105 Slot localSlot = std::move(slot);
106 return;
107 }
108 mPool.push_front(std::move(slot));
109 }
110
WordBreaker()111 WordBreaker::WordBreaker()
112 : mPool(&ICULineBreakerPoolImpl::getInstance()), mUText(nullptr, &utext_close) {}
113
WordBreaker(ICULineBreakerPool * pool)114 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool), mUText(nullptr, &utext_close) {}
115
followingWithLocale(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle,size_t from)116 ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
117 LineBreakWordStyle lbWordStyle, size_t from) {
118 if (!mUText) {
119 return mCurrent;
120 }
121 mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
122 MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
123 // TODO: handle failure status
124 mIcuBreaker.breaker->setText(mUText.get(), mTextSize);
125 if (mInEmailOrUrl) {
126 // Note:
127 // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
128 // The email/URL detection doesn't support following() functionality, so that we can't
129 // restart from the specific position. This means following() can not be supported in
130 // general, but keeping old email/URL context works for LineBreaker since it just wants to
131 // re-calculate the next break point with the new locale.
132 } else {
133 mCurrent = mLast = mScanOffset = from;
134 next();
135 }
136 return mCurrent;
137 }
138
setText(const uint16_t * data,size_t size)139 void WordBreaker::setText(const uint16_t* data, size_t size) {
140 mText = data;
141 mTextSize = size;
142 mLast = 0;
143 mCurrent = 0;
144 mScanOffset = 0;
145 mInEmailOrUrl = false;
146 UErrorCode status = U_ZERO_ERROR;
147 mUText.reset(utext_openUChars(nullptr, reinterpret_cast<const UChar*>(data), size, &status));
148 }
149
current() const150 ssize_t WordBreaker::current() const {
151 return mCurrent;
152 }
153
154 /**
155 * Determine whether a line break at position i within the buffer buf is valid. This
156 * represents customization beyond the ICU behavior, because plain ICU provides some
157 * line break opportunities that we don't want.
158 **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)159 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
160 const size_t position = static_cast<size_t>(i);
161 if (i == UBRK_DONE || position == bufEnd) {
162 // If the iterator reaches the end, treat as break.
163 return true;
164 }
165 uint32_t codePoint;
166 size_t prev_offset = position;
167 U16_PREV(buf, 0, prev_offset, codePoint);
168 // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
169 if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
170 return false;
171 }
172 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
173 // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
174 // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
175 // where no line break could be imagined, since the Myanmar virama is a pure stacker.
176 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
177 return false;
178 }
179
180 uint32_t next_codepoint;
181 size_t next_offset = position;
182 U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
183
184 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
185 // emoji data than ICU does.
186 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
187 return false;
188 }
189
190 // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
191 if (isEmojiModifier(next_codepoint)) {
192 if (codePoint == 0xFE0F && prev_offset > 0) {
193 // skip over emoji variation selector
194 U16_PREV(buf, 0, prev_offset, codePoint);
195 }
196 if (isEmojiBase(codePoint)) {
197 return false;
198 }
199 }
200 return true;
201 }
202
203 // Customized iteratorNext that takes care of both resets and our modifications
204 // to ICU's behavior.
iteratorNext()205 int32_t WordBreaker::iteratorNext() {
206 int32_t result = mIcuBreaker.breaker->following(mCurrent);
207 while (!isValidBreak(mText, mTextSize, result)) {
208 result = mIcuBreaker.breaker->next();
209 }
210 return result;
211 }
212
// Chicago Manual of Style recommends breaking after these characters in URLs and email
// addresses: colon, equals sign and ampersand.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
217
// Chicago Manual of Style recommends breaking before these characters in URLs and email
// addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
223
// States of the email/URL scanner in WordBreaker::detectEmailOrUrl(). SAW_COLON,
// SAW_COLON_SLASH and SAW_COLON_SLASH_SLASH have consecutive values, so each additional '/'
// after the colon corresponds to the next enumerator.
enum ScanState {
    START = 0,                  // initial state; also re-entered when ':' is not followed by "//"
    SAW_AT = 1,                 // '@' seen
    SAW_COLON = 2,              // ':' seen
    SAW_COLON_SLASH = 3,        // ":/" seen
    SAW_COLON_SLASH_SLASH = 4,  // "://" seen
};
231
detectEmailOrUrl()232 void WordBreaker::detectEmailOrUrl() {
233 // scan forward from current ICU position for email address or URL
234 if (mLast >= mScanOffset) {
235 ScanState state = START;
236 size_t i;
237 for (i = mLast; i < mTextSize; i++) {
238 uint16_t c = mText[i];
239 // scan only ASCII characters, stop at space
240 if (!(' ' < c && c <= 0x007E)) {
241 break;
242 }
243 if (state == START && c == '@') {
244 state = SAW_AT;
245 } else if (state == START && c == ':') {
246 state = SAW_COLON;
247 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
248 if (c == '/') {
249 state = static_cast<ScanState>((int)state + 1); // next state adds a slash
250 } else {
251 state = START;
252 }
253 }
254 }
255 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
256 if (!mIcuBreaker.breaker->isBoundary(i)) {
257 // If there are combining marks or such at the end of the URL or the email address,
258 // consider them a part of the URL or the email, and skip to the next actual
259 // boundary.
260 i = mIcuBreaker.breaker->following(i);
261 }
262 mInEmailOrUrl = true;
263 } else {
264 mInEmailOrUrl = false;
265 }
266 mScanOffset = i;
267 }
268 }
269
findNextBreakInEmailOrUrl()270 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
271 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
272 uint16_t lastChar = mText[mLast];
273 ssize_t i;
274 for (i = mLast + 1; i < mScanOffset; i++) {
275 if (breakAfter(lastChar)) {
276 break;
277 }
278 // break after double slash
279 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
280 break;
281 }
282 const uint16_t thisChar = mText[i];
283 // never break after hyphen
284 if (lastChar != '-') {
285 if (breakBefore(thisChar)) {
286 break;
287 }
288 // break before single slash
289 if (thisChar == '/' && lastChar != '/' &&
290 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
291 break;
292 }
293 }
294 lastChar = thisChar;
295 }
296 return i;
297 }
298
next()299 ssize_t WordBreaker::next() {
300 mLast = mCurrent;
301
302 detectEmailOrUrl();
303 if (mInEmailOrUrl) {
304 mCurrent = findNextBreakInEmailOrUrl();
305 } else { // Business as usual
306 mCurrent = (ssize_t)iteratorNext();
307 }
308 return mCurrent;
309 }
310
wordStart() const311 ssize_t WordBreaker::wordStart() const {
312 if (mInEmailOrUrl) {
313 return mLast;
314 }
315 ssize_t result = mLast;
316 while (result < mCurrent) {
317 UChar32 c;
318 ssize_t ix = result;
319 U16_NEXT(mText, ix, mCurrent, c);
320 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
321 // strip leading punctuation, defined as OP and QU line breaking classes,
322 // see UAX #14
323 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
324 break;
325 }
326 result = ix;
327 }
328 return result;
329 }
330
wordEnd() const331 ssize_t WordBreaker::wordEnd() const {
332 if (mInEmailOrUrl) {
333 return mLast;
334 }
335 ssize_t result = mCurrent;
336 while (result > mLast) {
337 UChar32 c;
338 ssize_t ix = result;
339 U16_PREV(mText, mLast, ix, c);
340 const int32_t gc_mask = U_GET_GC_MASK(c);
341 // strip trailing spaces, punctuation and control characters
342 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
343 break;
344 }
345 result = ix;
346 }
347 return result;
348 }
349
breakBadness() const350 int WordBreaker::breakBadness() const {
351 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
352 }
353
finish()354 void WordBreaker::finish() {
355 mText = nullptr;
356 mUText.reset();
357 mPool->release(std::move(mIcuBreaker));
358 }
359
360 } // namespace minikin
361