• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2015 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #define LOG_TAG "Minikin"
18 
19 #include <log/log.h>
20 
21 #include <minikin/Emoji.h>
22 #include <minikin/Hyphenator.h>
23 #include <minikin/WordBreaker.h>
24 #include "MinikinInternal.h"
25 
26 #include <unicode/uchar.h>
27 #include <unicode/utf16.h>
28 
29 namespace minikin {
30 
// U+00AD SOFT HYPHEN.
constexpr uint32_t CHAR_SOFT_HYPHEN = 0x00AD;
// U+200D ZERO WIDTH JOINER.
constexpr uint32_t CHAR_ZWJ = 0x200D;
33 
34 // libtxt extension: avoid the cost of initializing new ICU break iterators
35 // by constructing a global iterator using the default locale and then
36 // creating a clone for each WordBreaker instance.
37 static std::once_flag gLibtxtBreakIteratorInitFlag;
38 static icu::BreakIterator* gLibtxtDefaultBreakIterator = nullptr;
39 
setLocale()40 void WordBreaker::setLocale() {
41   UErrorCode status = U_ZERO_ERROR;
42   std::call_once(gLibtxtBreakIteratorInitFlag, [&status] {
43     gLibtxtDefaultBreakIterator =
44         icu::BreakIterator::createLineInstance(icu::Locale(), status);
45   });
46   mBreakIterator.reset(gLibtxtDefaultBreakIterator->clone());
47   // TODO: handle failure status
48   if (mText != nullptr) {
49     mBreakIterator->setText(&mUText, status);
50   }
51   mIteratorWasReset = true;
52 }
53 
setText(const uint16_t * data,size_t size)54 void WordBreaker::setText(const uint16_t* data, size_t size) {
55   mText = data;
56   mTextSize = size;
57   mIteratorWasReset = false;
58   mLast = 0;
59   mCurrent = 0;
60   mScanOffset = 0;
61   mInEmailOrUrl = false;
62   UErrorCode status = U_ZERO_ERROR;
63   utext_openUChars(&mUText, reinterpret_cast<const UChar*>(data), size,
64                    &status);
65   mBreakIterator->setText(&mUText, status);
66   mBreakIterator->first();
67 }
68 
current() const69 ssize_t WordBreaker::current() const {
70   return mCurrent;
71 }
72 
73 /**
74  * Determine whether a line break at position i within the buffer buf is valid.
75  *This represents customization beyond the ICU behavior, because plain ICU
76  *provides some line break opportunities that we don't want.
77  **/
isBreakValid(const uint16_t * buf,size_t bufEnd,size_t i)78 static bool isBreakValid(const uint16_t* buf, size_t bufEnd, size_t i) {
79   uint32_t codePoint;
80   size_t prev_offset = i;
81   U16_PREV(buf, 0, prev_offset, codePoint);
82   // Do not break on hard or soft hyphens. These are handled by automatic
83   // hyphenation.
84   if (Hyphenator::isLineBreakingHyphen(codePoint) ||
85       codePoint == CHAR_SOFT_HYPHEN) {
86     // txt addition: Temporarily always break on hyphen. Changed from false to
87     // true.
88     return true;
89   }
90   // For Myanmar kinzi sequences, created by <consonant, ASAT, VIRAMA,
91   // consonant>. This is to go around a bug in ICU line breaking:
92   // http://bugs.icu-project.org/trac/ticket/12561. To avoid too much looking
93   // around in the strings, we simply avoid breaking after any Myanmar virama,
94   // where no line break could be imagined, since the Myanmar virama is a pure
95   // stacker.
96   if (codePoint == 0x1039) {  // MYANMAR SIGN VIRAMA
97     return false;
98   }
99 
100   uint32_t next_codepoint;
101   size_t next_offset = i;
102   U16_NEXT(buf, next_offset, bufEnd, next_codepoint);
103 
104   // Rule LB8 for Emoji ZWJ sequences. We need to do this ourselves since we may
105   // have fresher emoji data than ICU does.
106   if (codePoint == CHAR_ZWJ && isEmoji(next_codepoint)) {
107     return false;
108   }
109 
110   // Rule LB30b. We need to this ourselves since we may have fresher emoji data
111   // than ICU does.
112   if (isEmojiModifier(next_codepoint)) {
113     if (codePoint == 0xFE0F && prev_offset > 0) {
114       // skip over emoji variation selector
115       U16_PREV(buf, 0, prev_offset, codePoint);
116     }
117     if (isEmojiBase(codePoint)) {
118       return false;
119     }
120   }
121   return true;
122 }
123 
124 // Customized iteratorNext that takes care of both resets and our modifications
125 // to ICU's behavior.
iteratorNext()126 int32_t WordBreaker::iteratorNext() {
127   int32_t result;
128   do {
129     if (mIteratorWasReset) {
130       result = mBreakIterator->following(mCurrent);
131       mIteratorWasReset = false;
132     } else {
133       result = mBreakIterator->next();
134     }
135   } while (!(result == icu::BreakIterator::DONE ||
136              (size_t)result == mTextSize ||
137              isBreakValid(mText, mTextSize, result)));
138   return result;
139 }
140 
// Returns true for the characters after which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses: ':', '=' and '&'.
static bool breakAfter(uint16_t c) {
  switch (c) {
    case ':':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
146 
// Returns true for the characters before which the Chicago Manual of Style
// recommends breaking inside URLs and email addresses.
static bool breakBefore(uint16_t c) {
  switch (c) {
    case '~':
    case '.':
    case ',':
    case '-':
    case '_':
    case '?':
    case '#':
    case '%':
    case '=':
    case '&':
      return true;
    default:
      return false;
  }
}
153 
// States of the email/URL detector in WordBreaker::detectEmailOrUrl(). The
// three colon states must stay consecutive and in this order: the detector
// moves from one to the next by adding 1 for each '/' it sees.
enum ScanState {
  START = 0,                  // nothing of interest seen yet
  SAW_AT = 1,                 // saw '@'
  SAW_COLON = 2,              // saw ':'
  SAW_COLON_SLASH = 3,        // saw ":/"
  SAW_COLON_SLASH_SLASH = 4,  // saw "://"
};
161 
detectEmailOrUrl()162 void WordBreaker::detectEmailOrUrl() {
163   // scan forward from current ICU position for email address or URL
164   if (mLast >= mScanOffset) {
165     ScanState state = START;
166     size_t i;
167     for (i = mLast; i < mTextSize; i++) {
168       uint16_t c = mText[i];
169       // scan only ASCII characters, stop at space
170       if (!(' ' < c && c <= 0x007E)) {
171         break;
172       }
173       if (state == START && c == '@') {
174         state = SAW_AT;
175       } else if (state == START && c == ':') {
176         state = SAW_COLON;
177       } else if (state == SAW_COLON || state == SAW_COLON_SLASH) {
178         if (c == '/') {
179           state = static_cast<ScanState>((int)state +
180                                          1);  // next state adds a slash
181         } else {
182           state = START;
183         }
184       }
185     }
186     if (state == SAW_AT || state == SAW_COLON_SLASH_SLASH) {
187       if (!mBreakIterator->isBoundary(i)) {
188         // If there are combining marks or such at the end of the URL or the
189         // email address, consider them a part of the URL or the email, and skip
190         // to the next actual boundary.
191         i = mBreakIterator->following(i);
192       }
193       mInEmailOrUrl = true;
194       mIteratorWasReset = true;
195     } else {
196       mInEmailOrUrl = false;
197     }
198     mScanOffset = i;
199   }
200 }
201 
findNextBreakInEmailOrUrl()202 ssize_t WordBreaker::findNextBreakInEmailOrUrl() {
203   // special rules for email addresses and URL's as per Chicago Manual of Style
204   // (16th ed.)
205   uint16_t lastChar = mText[mLast];
206   ssize_t i;
207   for (i = mLast + 1; i < mScanOffset; i++) {
208     if (breakAfter(lastChar)) {
209       break;
210     }
211     // break after double slash
212     if (lastChar == '/' && i >= mLast + 2 && mText[i - 2] == '/') {
213       break;
214     }
215     const uint16_t thisChar = mText[i];
216     // never break after hyphen
217     if (lastChar != '-') {
218       if (breakBefore(thisChar)) {
219         break;
220       }
221       // break before single slash
222       if (thisChar == '/' && lastChar != '/' &&
223           !(i + 1 < mScanOffset && mText[i + 1] == '/')) {
224         break;
225       }
226     }
227     lastChar = thisChar;
228   }
229   return i;
230 }
231 
next()232 ssize_t WordBreaker::next() {
233   mLast = mCurrent;
234 
235   detectEmailOrUrl();
236   if (mInEmailOrUrl) {
237     mCurrent = findNextBreakInEmailOrUrl();
238   } else {  // Business as usual
239     mCurrent = (ssize_t)iteratorNext();
240   }
241   return mCurrent;
242 }
243 
wordStart() const244 ssize_t WordBreaker::wordStart() const {
245   if (mInEmailOrUrl) {
246     return mLast;
247   }
248   ssize_t result = mLast;
249   while (result < mCurrent) {
250     UChar32 c;
251     ssize_t ix = result;
252     U16_NEXT(mText, ix, mCurrent, c);
253     const int32_t lb = u_getIntPropertyValue(c, UCHAR_LINE_BREAK);
254     // strip leading punctuation, defined as OP and QU line breaking classes,
255     // see UAX #14
256     if (!(lb == U_LB_OPEN_PUNCTUATION || lb == U_LB_QUOTATION)) {
257       break;
258     }
259     result = ix;
260   }
261   return result;
262 }
263 
wordEnd() const264 ssize_t WordBreaker::wordEnd() const {
265   if (mInEmailOrUrl) {
266     return mLast;
267   }
268   ssize_t result = mCurrent;
269   while (result > mLast) {
270     UChar32 c;
271     ssize_t ix = result;
272     U16_PREV(mText, mLast, ix, c);
273     const int32_t gc_mask = U_GET_GC_MASK(c);
274     // strip trailing space and punctuation
275     if ((gc_mask & (U_GC_ZS_MASK | U_GC_P_MASK)) == 0) {
276       break;
277     }
278     result = ix;
279   }
280   return result;
281 }
282 
breakBadness() const283 int WordBreaker::breakBadness() const {
284   return (mInEmailOrUrl && mCurrent < mScanOffset) ? 1 : 0;
285 }
286 
finish()287 void WordBreaker::finish() {
288   mText = nullptr;
289   // Note: calling utext_close multiply is safe
290   utext_close(&mUText);
291 }
292 
293 }  // namespace minikin
294