1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "WordBreaker.h"
18
19 #include <list>
20 #include <map>
21
22 #include <unicode/ubrk.h>
23 #include <unicode/uchar.h>
24 #include <unicode/utf16.h>
25
26 #include "minikin/Emoji.h"
27 #include "minikin/Hyphenator.h"
28
29 #include "Locale.h"
30 #include "MinikinInternal.h"
31
32 namespace minikin {
33
34 namespace {
createNewIterator(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)35 static UBreakIterator* createNewIterator(const Locale& locale, LineBreakStyle lbStyle,
36 LineBreakWordStyle lbWordStyle) {
37 // TODO: handle failure status
38 UErrorCode status = U_ZERO_ERROR;
39 char localeID[ULOC_FULLNAME_CAPACITY] = {};
40 uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(), localeID,
41 ULOC_FULLNAME_CAPACITY, nullptr, &status);
42 return ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status);
43 }
44 } // namespace
45
acquire(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)46 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
47 LineBreakStyle lbStyle,
48 LineBreakWordStyle lbWordStyle) {
49 const uint64_t id = locale.getIdentifier();
50 std::lock_guard<std::mutex> lock(mMutex);
51 for (auto i = mPool.begin(); i != mPool.end(); i++) {
52 if (i->localeId == id && i->lbStyle == lbStyle && i->lbWordStyle == lbWordStyle) {
53 Slot slot = std::move(*i);
54 mPool.erase(i);
55 return slot;
56 }
57 }
58
59 // Not found in pool. Create new one.
60 return {id, lbStyle, lbWordStyle,
61 IcuUbrkUniquePtr(createNewIterator(locale, lbStyle, lbWordStyle))};
62 }
63
release(ICULineBreakerPool::Slot && slot)64 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
65 if (slot.breaker.get() == nullptr) {
66 return; // Already released slot. Do nothing.
67 }
68 std::lock_guard<std::mutex> lock(mMutex);
69 if (mPool.size() >= MAX_POOL_SIZE) {
70 // Pool is full. Move to local variable, so that the given slot will be released when the
71 // variable leaves the scope.
72 Slot localSlot = std::move(slot);
73 return;
74 }
75 mPool.push_front(std::move(slot));
76 }
77
WordBreaker()78 WordBreaker::WordBreaker() : mPool(&ICULineBreakerPoolImpl::getInstance()) {}
79
WordBreaker(ICULineBreakerPool * pool)80 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool) {}
81
followingWithLocale(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle,size_t from)82 ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
83 LineBreakWordStyle lbWordStyle, size_t from) {
84 mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
85 UErrorCode status = U_ZERO_ERROR;
86 MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
87 // TODO: handle failure status
88 ubrk_setUText(mIcuBreaker.breaker.get(), &mUText, &status);
89 if (mInEmailOrUrl) {
90 // Note:
91 // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
92 // The email/URL detection doesn't support following() functionality, so that we can't
93 // restart from the specific position. This means following() can not be supported in
94 // general, but keeping old email/URL context works for LineBreaker since it just wants to
95 // re-calculate the next break point with the new locale.
96 } else {
97 mCurrent = mLast = mScanOffset = from;
98 next();
99 }
100 return mCurrent;
101 }
102
setText(const uint16_t * data,size_t size)103 void WordBreaker::setText(const uint16_t* data, size_t size) {
104 mText = data;
105 mTextSize = size;
106 mLast = 0;
107 mCurrent = 0;
108 mScanOffset = 0;
109 mInEmailOrUrl = false;
110 UErrorCode status = U_ZERO_ERROR;
111 utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size, &status);
112 }
113
current() const114 ssize_t WordBreaker::current() const {
115 return mCurrent;
116 }
117
118 /**
119 * Determine whether a line break at position i within the buffer buf is valid. This
120 * represents customization beyond the ICU behavior, because plain ICU provides some
121 * line break opportunities that we don't want.
122 **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)123 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
124 const size_t position = static_cast<size_t>(i);
125 if (i == UBRK_DONE || position == bufEnd) {
126 // If the iterator reaches the end, treat as break.
127 return true;
128 }
129 uint32_t codePoint;
130 size_t prev_offset = position;
131 U16_PREV(buf, 0, prev_offset, codePoint);
132 // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
133 if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
134 return false;
135 }
136 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
137 // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
138 // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
139 // where no line break could be imagined, since the Myanmar virama is a pure stacker.
140 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
141 return false;
142 }
143
144 uint32_t next_codepoint;
145 size_t next_offset = position;
146 U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
147
148 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
149 // emoji data than ICU does.
150 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
151 return false;
152 }
153
154 // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
155 if (isEmojiModifier(next_codepoint)) {
156 if (codePoint == 0xFE0F && prev_offset > 0) {
157 // skip over emoji variation selector
158 U16_PREV(buf, 0, prev_offset, codePoint);
159 }
160 if (isEmojiBase(codePoint)) {
161 return false;
162 }
163 }
164 return true;
165 }
166
167 // Customized iteratorNext that takes care of both resets and our modifications
168 // to ICU's behavior.
iteratorNext()169 int32_t WordBreaker::iteratorNext() {
170 int32_t result = ubrk_following(mIcuBreaker.breaker.get(), mCurrent);
171 while (!isValidBreak(mText, mTextSize, result)) {
172 result = ubrk_next(mIcuBreaker.breaker.get());
173 }
174 return result;
175 }
176
// Returns true for the characters after which the Chicago Manual of Style recommends breaking
// in URLs and email addresses: colon, equals sign and ampersand.
static bool breakAfter(uint16_t c) {
    switch (c) {
        case ':':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
181
// Returns true for the characters before which the Chicago Manual of Style recommends breaking
// in URLs and email addresses.
static bool breakBefore(uint16_t c) {
    switch (c) {
        case '~':
        case '.':
        case ',':
        case '-':
        case '_':
        case '?':
        case '#':
        case '%':
        case '=':
        case '&':
            return true;
        default:
            return false;
    }
}
187
// States of the scanner that looks for an email address ('@') or a URL ("://") in a run of
// ASCII characters. The three colon states are consecutive on purpose: the scanner moves from
// one to the next by adding 1 each time another slash is seen.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // saw '@'
    SAW_COLON = 2,              // saw ':'
    SAW_COLON_SLASH = 3,        // saw ":/"
    SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
195
detectEmailOrUrl()196 void WordBreaker::detectEmailOrUrl() {
197 // scan forward from current ICU position for email address or URL
198 if (mLast >= mScanOffset) {
199 ScanState state = START;
200 size_t i;
201 for (i = mLast; i < mTextSize; i++) {
202 uint16_t c = mText[i];
203 // scan only ASCII characters, stop at space
204 if (!(' ' < c && c <= 0x007E)) {
205 break;
206 }
207 if (state == START && c == '@') {
208 state = SAW_AT;
209 } else if (state == START && c == ':') {
210 state = SAW_COLON;
211 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
212 if (c == '/') {
213 state = static_cast<ScanState>((int)state + 1); // next state adds a slash
214 } else {
215 state = START;
216 }
217 }
218 }
219 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
220 if (!ubrk_isBoundary(mIcuBreaker.breaker.get(), i)) {
221 // If there are combining marks or such at the end of the URL or the email address,
222 // consider them a part of the URL or the email, and skip to the next actual
223 // boundary.
224 i = ubrk_following(mIcuBreaker.breaker.get(), i);
225 }
226 mInEmailOrUrl = true;
227 } else {
228 mInEmailOrUrl = false;
229 }
230 mScanOffset = i;
231 }
232 }
233
findNextBreakInEmailOrUrl()234 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
235 // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
236 uint16_t lastChar = mText[mLast];
237 ssize_t i;
238 for (i = mLast + 1; i < mScanOffset; i++) {
239 if (breakAfter(lastChar)) {
240 break;
241 }
242 // break after double slash
243 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
244 break;
245 }
246 const uint16_t thisChar = mText[i];
247 // never break after hyphen
248 if (lastChar != '-') {
249 if (breakBefore(thisChar)) {
250 break;
251 }
252 // break before single slash
253 if (thisChar == '/' && lastChar != '/' &&
254 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
255 break;
256 }
257 }
258 lastChar = thisChar;
259 }
260 return i;
261 }
262
next()263 ssize_t WordBreaker::next() {
264 mLast = mCurrent;
265
266 detectEmailOrUrl();
267 if (mInEmailOrUrl) {
268 mCurrent = findNextBreakInEmailOrUrl();
269 } else { // Business as usual
270 mCurrent = (ssize_t)iteratorNext();
271 }
272 return mCurrent;
273 }
274
wordStart() const275 ssize_t WordBreaker::wordStart() const {
276 if (mInEmailOrUrl) {
277 return mLast;
278 }
279 ssize_t result = mLast;
280 while (result < mCurrent) {
281 UChar32 c;
282 ssize_t ix = result;
283 U16_NEXT(mText, ix, mCurrent, c);
284 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
285 // strip leading punctuation, defined as OP and QU line breaking classes,
286 // see UAX #14
287 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
288 break;
289 }
290 result = ix;
291 }
292 return result;
293 }
294
wordEnd() const295 ssize_t WordBreaker::wordEnd() const {
296 if (mInEmailOrUrl) {
297 return mLast;
298 }
299 ssize_t result = mCurrent;
300 while (result > mLast) {
301 UChar32 c;
302 ssize_t ix = result;
303 U16_PREV(mText, mLast, ix, c);
304 const int32_t gc_mask = U_GET_GC_MASK(c);
305 // strip trailing spaces, punctuation and control characters
306 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
307 break;
308 }
309 result = ix;
310 }
311 return result;
312 }
313
breakBadness() const314 int WordBreaker::breakBadness() const {
315 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
316 }
317
finish()318 void WordBreaker::finish() {
319 mText = nullptr;
320 // Note: calling utext_close multiply is safe
321 utext_close(&mUText);
322 mPool->release(std::move(mIcuBreaker));
323 }
324
325 } // namespace minikin
326