1 /*
2 * Copyright (C) 2015 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #define LOG_TAG "Minikin"
18
19 #include <log/log.h>
20
21 #include <minikin/Emoji.h>
22 #include <minikin/Hyphenator.h>
23 #include <minikin/WordBreaker.h>
24 #include "MinikinInternal.h"
25
26 #include <unicode/uchar.h>
27 #include <unicode/utf16.h>
28
29 namespace minikin {
30
// Code points that isBreakValid() compares against directly.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;  // U+00AD SOFT HYPHEN
constexpr uint32_t CHAR_ZWJ = 0x200D;          // U+200D ZERO WIDTH JOINER
33
34 // libtxt extension: avoid the cost of initializing new ICU break iterators
35 // by constructing a global iterator using the default locale and then
36 // creating a clone for each WordBreaker instance.
37 static std::once_flag gLibtxtBreakIteratorInitFlag;
38 static icu::BreakIterator* gLibtxtDefaultBreakIterator = nullptr;
39
setLocale()40 void WordBreaker::setLocale() {
41 UErrorCode status = U_ZERO_ERROR;
42 std::call_once(gLibtxtBreakIteratorInitFlag, [&status] {
43 gLibtxtDefaultBreakIterator =
44 icu::BreakIterator::createLineInstance(icu::Locale(), status);
45 });
46 mBreakIterator.reset(gLibtxtDefaultBreakIterator->clone());
47 // TODO: handle failure status
48 if (mText != nullptr) {
49 mBreakIterator->setText(&mUText, status);
50 }
51 mIteratorWasReset = true;
52 }
53
setText(const uint16_t * data,size_t size)54 void WordBreaker::setText(const uint16_t* data, size_t size) {
55 mText = data;
56 mTextSize = size;
57 mIteratorWasReset = false;
58 mLast = 0;
59 mCurrent = 0;
60 mScanOffset = 0;
61 mInEmailOrUrl = false;
62 UErrorCode status = U_ZERO_ERROR;
63 utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size,
64 &status);
65 mBreakIterator->setText(&mUText, status);
66 mBreakIterator->first();
67 }
68
current() const69 ssize_t WordBreaker::current() const {
70 return mCurrent;
71 }
72
73 /**
74 * Determine whether a line break at position i within the buffer buf is valid.
75 *This represents customization beyond the ICU behavior, because plain ICU
76 *provides some line break opportunities that we don't want.
77 **/
isBreakValid(const uint16_t * buf,size_t bufEnd,size_t i)78 static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
79 uint32_t codePoint;
80 size_t prev_offset = i;
81 U16_PREV(buf, 0, prev_offset, codePoint);
82 // Do not break on hard or soft hyphens. These are handled by automatic
83 // hyphenation.
84 if (Hyphenator::isLineBreakingHyphen(codePoint) ||
85 codePoint == CHAR_SOFT_HYPHEN) {
86 // txt addition: Temporarily always break on hyphen. Changed from false to
87 // true.
88 return true;
89 }
90 // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA,
91 // consonant>. This is to go around a bug in ICU line breaking:
92 // http://bugs.icu-project.org/trac/ticket/12561. To avoid too much looking
93 // around in the strings, we simply avoid breaking after any Myanmar virama,
94 // where no line break could be imagined, since the Myanmar virama is a pure
95 // stacker.
96 if (codePoint == 0x1039) { // MYANMAR SIGN VIRAMA
97 return false;
98 }
99
100 uint32_t next_codepoint;
101 size_t next_offset = i;
102 U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
103
104 // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may
105 // have fresher emoji data than ICU does.
106 if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
107 return false;
108 }
109
110 // Rule LB30b. We need to this ourselves since we may have fresher emoji data
111 // than ICU does.
112 if (isEmojiModifier(next_codepoint)) {
113 if (codePoint == 0xFE0F && prev_offset > 0) {
114 // skip over emoji variation selector
115 U16_PREV(buf, 0, prev_offset, codePoint);
116 }
117 if (isEmojiBase(codePoint)) {
118 return false;
119 }
120 }
121 return true;
122 }
123
124 // Customized iteratorNext that takes care of both resets and our modifications
125 // to ICU's behavior.
iteratorNext()126 int32_t WordBreaker::iteratorNext() {
127 int32_t result;
128 do {
129 if (mIteratorWasReset) {
130 result = mBreakIterator->following(mCurrent);
131 mIteratorWasReset = false;
132 } else {
133 result = mBreakIterator->next();
134 }
135 } while (!(result == icu::BreakIterator::DONE ||
136 (size_t)result == mTextSize ||
137 isBreakValid(mText, mTextSize, result)));
138 return result;
139 }
140
// Returns true for the characters after which the Chicago Manual of Style
// recommends breaking in URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
  switch (c) {
    case ':':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
146
// Returns true for the characters before which the Chicago Manual of Style
// recommends breaking in URLs and email addresses:
// '~', '.', ',', '-', '_', '?', '#', '%', '=' and '&'.
static bool breakBefore(uint16_t c) {
  switch (c) {
    case '~':
    case '.':
    case ',':
    case '-':
    case '_':
    case '?':
    case '#':
    case '%':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
153
// States of the scanner in WordBreaker::detectEmailOrUrl(). The three
// SAW_COLON* states are consecutive, one per character matched so far.
enum ScanState {
  START = 0,                  // nothing of interest seen yet
  SAW_AT = 1,                 // an '@' was seen
  SAW_COLON = 2,              // ":" was just seen
  SAW_COLON_SLASH = 3,        // ":/" was just seen
  SAW_COLON_SLASH_SLASH = 4,  // "://" was seen
};
161
detectEmailOrUrl()162 void WordBreaker::detectEmailOrUrl() {
163 // scan forward from current ICU position for email address or URL
164 if (mLast >= mScanOffset) {
165 ScanState state = START;
166 size_t i;
167 for (i = mLast; i < mTextSize; i++) {
168 uint16_t c = mText[i];
169 // scan only ASCII characters, stop at space
170 if (!(' ' < c && c <= 0x007E)) {
171 break;
172 }
173 if (state == START && c == '@') {
174 state = SAW_AT;
175 } else if (state == START && c == ':') {
176 state = SAW_COLON;
177 } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
178 if (c == '/') {
179 state = static_cast<ScanState>((int)state +
180 1); // next state adds a slash
181 } else {
182 state = START;
183 }
184 }
185 }
186 if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
187 if (!mBreakIterator->isBoundary(i)) {
188 // If there are combining marks or such at the end of the URL or the
189 // email address, consider them a part of the URL or the email, and skip
190 // to the next actual boundary.
191 i = mBreakIterator->following(i);
192 }
193 mInEmailOrUrl = true;
194 mIteratorWasReset = true;
195 } else {
196 mInEmailOrUrl = false;
197 }
198 mScanOffset = i;
199 }
200 }
201
findNextBreakInEmailOrUrl()202 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
203 // special rules for email addresses and URL's as per Chicago Manual of Style
204 // (16th ed.)
205 uint16_t lastChar = mText[mLast];
206 ssize_t i;
207 for (i = mLast + 1; i < mScanOffset; i++) {
208 if (breakAfter(lastChar)) {
209 break;
210 }
211 // break after double slash
212 if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
213 break;
214 }
215 const uint16_t thisChar = mText[i];
216 // never break after hyphen
217 if (lastChar != '-') {
218 if (breakBefore(thisChar)) {
219 break;
220 }
221 // break before single slash
222 if (thisChar == '/' && lastChar != '/' &&
223 !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
224 break;
225 }
226 }
227 lastChar = thisChar;
228 }
229 return i;
230 }
231
next()232 ssize_t WordBreaker::next() {
233 mLast = mCurrent;
234
235 detectEmailOrUrl();
236 if (mInEmailOrUrl) {
237 mCurrent = findNextBreakInEmailOrUrl();
238 } else { // Business as usual
239 mCurrent = (ssize_t)iteratorNext();
240 }
241 return mCurrent;
242 }
243
wordStart() const244 ssize_t WordBreaker::wordStart() const {
245 if (mInEmailOrUrl) {
246 return mLast;
247 }
248 ssize_t result = mLast;
249 while (result < mCurrent) {
250 UChar32 c;
251 ssize_t ix = result;
252 U16_NEXT(mText, ix, mCurrent, c);
253 const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
254 // strip leading punctuation, defined as OP and QU line breaking classes,
255 // see UAX #14
256 if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
257 break;
258 }
259 result = ix;
260 }
261 return result;
262 }
263
wordEnd() const264 ssize_t WordBreaker::wordEnd() const {
265 if (mInEmailOrUrl) {
266 return mLast;
267 }
268 ssize_t result = mCurrent;
269 while (result > mLast) {
270 UChar32 c;
271 ssize_t ix = result;
272 U16_PREV(mText, mLast, ix, c);
273 const int32_t gc_mask = U_GET_GC_MASK(c);
274 // strip trailing space and punctuation
275 if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
276 break;
277 }
278 result = ix;
279 }
280 return result;
281 }
282
breakBadness() const283 int WordBreaker::breakBadness() const {
284 return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
285 }
286
finish()287 void WordBreaker::finish() {
288 mText = nullptr;
289 // Note: calling utext_close multiply is safe
290 utext_close(&mUText);
291 }
292
293 } // namespace minikin
294