/*
 * Copyright (C) 2015 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
16 
17 #include "WordBreaker.h"
18 
19 #include <list>
20 #include <map>
21 
22 #include <unicode/ubrk.h>
23 #include <unicode/uchar.h>
24 #include <unicode/utf16.h>
25 
26 #include "minikin/Emoji.h"
27 #include "minikin/Hyphenator.h"
28 
29 #include "Locale.h"
30 #include "MinikinInternal.h"
31 
32 namespace minikin {
33 
34 namespace {
createNewIterator(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)35 static UBreakIterator* createNewIterator(const Locale& locale, LineBreakStyle lbStyle,
36                                          LineBreakWordStyle lbWordStyle) {
37     // TODO: handle failure status
38     UErrorCode status = U_ZERO_ERROR;
39     char localeID[ULOC_FULLNAME_CAPACITY] = {};
40     uloc_forLanguageTag(locale.getStringWithLineBreakOption(lbStyle, lbWordStyle).c_str(), localeID,
41                         ULOC_FULLNAME_CAPACITY, nullptr, &status);
42     return ubrk_open(UBreakIteratorType::UBRK_LINE, localeID, nullptr, 0, &status);
43 }
44 }  // namespace
45 
acquire(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle)46 ICULineBreakerPool::Slot ICULineBreakerPoolImpl::acquire(const Locale& locale,
47                                                          LineBreakStyle lbStyle,
48                                                          LineBreakWordStyle lbWordStyle) {
49     const uint64_t id = locale.getIdentifier();
50     std::lock_guard<std::mutex> lock(mMutex);
51     for (auto i = mPool.begin(); i != mPool.end(); i++) {
52         if (i->localeId == id && i->lbStyle == lbStyle && i->lbWordStyle == lbWordStyle) {
53             Slot slot = std::move(*i);
54             mPool.erase(i);
55             return slot;
56         }
57     }
58 
59     // Not found in pool. Create new one.
60     return {id, lbStyle, lbWordStyle,
61             IcuUbrkUniquePtr(createNewIterator(locale, lbStyle, lbWordStyle))};
62 }
63 
release(ICULineBreakerPool::Slot && slot)64 void ICULineBreakerPoolImpl::release(ICULineBreakerPool::Slot&& slot) {
65     if (slot.breaker.get() == nullptr) {
66         return;  // Already released slot. Do nothing.
67     }
68     std::lock_guard<std::mutex> lock(mMutex);
69     if (mPool.size() >= MAX_POOL_SIZE) {
70         // Pool is full. Move to local variable, so that the given slot will be released when the
71         // variable leaves the scope.
72         Slot localSlot = std::move(slot);
73         return;
74     }
75     mPool.push_front(std::move(slot));
76 }
77 
WordBreaker()78 WordBreaker::WordBreaker()
79         : mPool(&ICULineBreakerPoolImpl::getInstance()), mUText(nullptr, &utext_close) {}
80 
WordBreaker(ICULineBreakerPool * pool)81 WordBreaker::WordBreaker(ICULineBreakerPool* pool) : mPool(pool), mUText(nullptr, &utext_close) {}
82 
followingWithLocale(const Locale & locale,LineBreakStyle lbStyle,LineBreakWordStyle lbWordStyle,size_t from)83 ssize_t WordBreaker::followingWithLocale(const Locale& locale, LineBreakStyle lbStyle,
84                                          LineBreakWordStyle lbWordStyle, size_t from) {
85     if (!mUText) {
86         return mCurrent;
87     }
88     mIcuBreaker = mPool->acquire(locale, lbStyle, lbWordStyle);
89     UErrorCode status = U_ZERO_ERROR;
90     MINIKIN_ASSERT(mText != nullptr, "setText must be called first");
91     // TODO: handle failure status
92     ubrk_setUText(mIcuBreaker.breaker.get(), mUText.get(), &status);
93     if (mInEmailOrUrl) {
94         // Note:
95         // Don't reset mCurrent, mLast, or mScanOffset for keeping email/URL context.
96         // The email/URL detection doesn't support following() functionality, so that we can't
97         // restart from the specific position. This means following() can not be supported in
98         // general, but keeping old email/URL context works for LineBreaker since it just wants to
99         // re-calculate the next break point with the new locale.
100     } else {
101         mCurrent = mLast = mScanOffset = from;
102         next();
103     }
104     return mCurrent;
105 }
106 
setText(const uint16_t * data,size_t size)107 void WordBreaker::setText(const uint16_t* data, size_t size) {
108     mText = data;
109     mTextSize = size;
110     mLast = 0;
111     mCurrent = 0;
112     mScanOffset = 0;
113     mInEmailOrUrl = false;
114     UErrorCode status = U_ZERO_ERROR;
115     mUText.reset(utext_openUChars(nullptr, reinterpret_cast<const UChar*>(data), size, &status));
116 }
117 
current() const118 ssize_t WordBreaker::current() const {
119     return mCurrent;
120 }
121 
122 /**
123  * Determine whether a line break at position i within the buffer buf is valid. This
124  * represents customization beyond the ICU behavior, because plain ICU provides some
125  * line break opportunities that we don't want.
126  **/
isValidBreak(const uint16_t * buf,size_t bufEnd,int32_t i)127 static bool isValidBreak(const uint16_t* buf, size_t bufEnd, int32_t i) {
128     const size_t position = static_cast<size_t>(i);
129     if (i == UBRK_DONE || position == bufEnd) {
130         // If the iterator reaches the end, treat as break.
131         return true;
132     }
133     uint32_t codePoint;
134     size_t prev_offset = position;
135     U16_PREV(buf, 0, prev_offset, codePoint);
136     // Do not break on hard or soft hyphens. These are handled by automatic hyphenation.
137     if (Hyphenator::isLineBreakingHyphen(codePoint) || codePoint == CHAR_SOFT_HYPHEN) {
138         return false;
139     }
140     // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA, consonant>. This is to go
141     // around a bug in ICU line breaking: http://bugs.icu-project.org/trac/ticket/12561. To avoid
142     // too much looking around in the strings, we simply avoid breaking after any Myanmar virama,
143     // where no line break could be imagined, since the Myanmar virama is a pure stacker.
144     if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
145         return false;
146     }
147 
148     uint32_t next_codepoint;
149     size_t next_offset = position;
150     U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
151 
152     // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may have fresher
153     // emoji data than ICU does.
154     if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
155         return false;
156     }
157 
158     // Rule LB30b. We need to this ourselves since we may have fresher emoji data than ICU does.
159     if (isEmojiModifier(next_codepoint)) {
160         if (codePoint == 0xFE0F && prev_offset > 0) {
161             // skip over emoji variation selector
162             U16_PREV(buf, 0, prev_offset, codePoint);
163         }
164         if (isEmojiBase(codePoint)) {
165             return false;
166         }
167     }
168     return true;
169 }
170 
171 // Customized iteratorNext that takes care of both resets and our modifications
172 // to ICU's behavior.
iteratorNext()173 int32_t WordBreaker::iteratorNext() {
174     int32_t result = ubrk_following(mIcuBreaker.breaker.get(), mCurrent);
175     while (!isValidBreak(mText, mTextSize, result)) {
176         result = ubrk_next(mIcuBreaker.breaker.get());
177     }
178     return result;
179 }
180 
// Chicago Manual of Style recommends breaking after these characters in URLs and email addresses.
// Returns true when the UTF-16 code unit c is ':', '=' or '&'.
static bool breakAfter(uint16_t c) {
    return c == ':' || c == '=' || c == '&';
}
185 
// Chicago Manual of Style recommends breaking before these characters in URLs and email addresses.
// Returns true when the UTF-16 code unit c is one of: ~ . , - _ ? # % = &
static bool breakBefore(uint16_t c) {
    return c == '~' || c == '.' || c == ',' || c == '-' || c == '_' || c == '?' || c == '#' ||
           c == '%' || c == '=' || c == '&';
}
191 
// States of the scanner in WordBreaker::detectEmailOrUrl().
//
// The values are spelled out because detectEmailOrUrl() moves from SAW_COLON to SAW_COLON_SLASH
// and on to SAW_COLON_SLASH_SLASH by adding one to the current state, so those three must stay
// consecutive and in this order.
enum ScanState {
    START = 0,                  // nothing of interest seen yet
    SAW_AT = 1,                 // an '@' was seen
    SAW_COLON = 2,              // a ':' was seen
    SAW_COLON_SLASH = 3,        // ':' directly followed by '/'
    SAW_COLON_SLASH_SLASH = 4,  // ':' directly followed by "//"
};
199 
detectEmailOrUrl()200 void WordBreaker::detectEmailOrUrl() {
201     // scan forward from current ICU position for email address or URL
202     if (mLast >= mScanOffset) {
203         ScanState state = START;
204         size_t i;
205         for (i = mLast; i < mTextSize; i++) {
206             uint16_t c = mText[i];
207             // scan only ASCII characters, stop at space
208             if (!(' ' < c && c <= 0x007E)) {
209                 break;
210             }
211             if (state == START && c == '@') {
212                 state = SAW_AT;
213             } else if (state == START && c == ':') {
214                 state = SAW_COLON;
215             } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
216                 if (c == '/') {
217                     state = static_cast<ScanState>((int)state + 1);  // next state adds a slash
218                 } else {
219                     state = START;
220                 }
221             }
222         }
223         if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
224             if (!ubrk_isBoundary(mIcuBreaker.breaker.get(), i)) {
225                 // If there are combining marks or such at the end of the URL or the email address,
226                 // consider them a part of the URL or the email, and skip to the next actual
227                 // boundary.
228                 i = ubrk_following(mIcuBreaker.breaker.get(), i);
229             }
230             mInEmailOrUrl = true;
231         } else {
232             mInEmailOrUrl = false;
233         }
234         mScanOffset = i;
235     }
236 }
237 
findNextBreakInEmailOrUrl()238 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
239     // special rules for email addresses and URL's as per Chicago Manual of Style (16th ed.)
240     uint16_t lastChar = mText[mLast];
241     ssize_t i;
242     for (i = mLast + 1; i < mScanOffset; i++) {
243         if (breakAfter(lastChar)) {
244             break;
245         }
246         // break after double slash
247         if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
248             break;
249         }
250         const uint16_t thisChar = mText[i];
251         // never break after hyphen
252         if (lastChar != '-') {
253             if (breakBefore(thisChar)) {
254                 break;
255             }
256             // break before single slash
257             if (thisChar == '/' && lastChar != '/' &&
258                 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
259                 break;
260             }
261         }
262         lastChar = thisChar;
263     }
264     return i;
265 }
266 
next()267 ssize_t WordBreaker::next() {
268     mLast = mCurrent;
269 
270     detectEmailOrUrl();
271     if (mInEmailOrUrl) {
272         mCurrent = findNextBreakInEmailOrUrl();
273     } else {  // Business as usual
274         mCurrent = (ssize_t)iteratorNext();
275     }
276     return mCurrent;
277 }
278 
wordStart() const279 ssize_t WordBreaker::wordStart() const {
280     if (mInEmailOrUrl) {
281         return mLast;
282     }
283     ssize_t result = mLast;
284     while (result < mCurrent) {
285         UChar32 c;
286         ssize_t ix = result;
287         U16_NEXT(mText, ix, mCurrent, c);
288         const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
289         // strip leading punctuation, defined as OP and QU line breaking classes,
290         // see UAX #14
291         if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
292             break;
293         }
294         result = ix;
295     }
296     return result;
297 }
298 
wordEnd() const299 ssize_t WordBreaker::wordEnd() const {
300     if (mInEmailOrUrl) {
301         return mLast;
302     }
303     ssize_t result = mCurrent;
304     while (result > mLast) {
305         UChar32 c;
306         ssize_t ix = result;
307         U16_PREV(mText, mLast, ix, c);
308         const int32_t gc_mask = U_GET_GC_MASK(c);
309         // strip trailing spaces, punctuation and control characters
310         if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK | U_GC_CC_MASK)) == 0) {
311             break;
312         }
313         result = ix;
314     }
315     return result;
316 }
317 
breakBadness() const318 int WordBreaker::breakBadness() const {
319     return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
320 }
321 
finish()322 void WordBreaker::finish() {
323     mText = nullptr;
324     mUText.reset();
325     mPool->release(std::move(mIcuBreaker));
326 }
327 
328 }  // namespace minikin
329