1 /*
2 * Copyright 2020 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "include/core/SkString.h"
9 #include "include/core/SkTypes.h"
10 #include "include/private/SkBitmaskEnum.h"
11 #include "include/private/SkMutex.h"
12 #include "include/private/SkOnce.h"
13 #include "include/private/SkTArray.h"
14 #include "include/private/SkTemplates.h"
15 #include "include/private/SkTo.h"
16 #include "modules/skunicode/include/SkUnicode.h"
17 #include "modules/skunicode/src/SkUnicode_icu.h"
18 #include "modules/skunicode/src/SkUnicode_icu_bidi.h"
19 #include "src/utils/SkUTF.h"
20 #include "include/private/SkTHash.h"
21 #include <unicode/umachine.h>
22 #include <functional>
23 #include <string>
24 #include <utility>
25 #include <vector>
26
27 #if defined(SK_USING_THIRD_PARTY_ICU)
28 #include "SkLoadICU.h"
29 #endif
30
ICULib()31 static const SkICULib* ICULib() {
32 static const auto gICU = SkLoadICULib();
33
34 return gICU.get();
35 }
36
// sk_* wrappers for ICU funcs
//
// For every ICU function name handed to SKICU_FUNC this defines a variadic template
// sk_<funcname>() that perfectly forwards its arguments to the f_<funcname> member of the
// table returned by ICULib(). The declared return type is whatever calling <funcname>
// directly with the same arguments would yield.
#define SKICU_FUNC(funcname)                                                                \
    template <typename... Args>                                                             \
    auto sk_##funcname(Args&&... args) -> decltype(funcname(std::forward<Args>(args)...)) { \
        return ICULib()->f_##funcname(std::forward<Args>(args)...);                         \
    }                                                                                       \

// Emit one wrapper per function listed by SKICU_EMIT_FUNCS (defined outside this file --
// presumably in SkUnicode_icu.h; TODO confirm), then retire the helper macro.
SKICU_EMIT_FUNCS
#undef SKICU_FUNC
46
errorName(UErrorCode status)47 const char* SkUnicode_IcuBidi::errorName(UErrorCode status) {
48 return sk_u_errorName(status);
49 }
50
bidi_close(UBiDi * bidi)51 void SkUnicode_IcuBidi::bidi_close(UBiDi* bidi) {
52 sk_ubidi_close(bidi);
53 }
bidi_getDirection(const UBiDi * bidi)54 UBiDiDirection SkUnicode_IcuBidi::bidi_getDirection(const UBiDi* bidi) {
55 return sk_ubidi_getDirection(bidi);
56 }
bidi_getLength(const UBiDi * bidi)57 SkBidiIterator::Position SkUnicode_IcuBidi::bidi_getLength(const UBiDi* bidi) {
58 return sk_ubidi_getLength(bidi);
59 }
bidi_getLevelAt(const UBiDi * bidi,int pos)60 SkBidiIterator::Level SkUnicode_IcuBidi::bidi_getLevelAt(const UBiDi* bidi, int pos) {
61 return sk_ubidi_getLevelAt(bidi, pos);
62 }
bidi_openSized(int32_t maxLength,int32_t maxRunCount,UErrorCode * pErrorCode)63 UBiDi* SkUnicode_IcuBidi::bidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode* pErrorCode) {
64 return sk_ubidi_openSized(maxLength, maxRunCount, pErrorCode);
65 }
bidi_setPara(UBiDi * bidi,const UChar * text,int32_t length,UBiDiLevel paraLevel,UBiDiLevel * embeddingLevels,UErrorCode * status)66 void SkUnicode_IcuBidi::bidi_setPara(UBiDi* bidi,
67 const UChar* text,
68 int32_t length,
69 UBiDiLevel paraLevel,
70 UBiDiLevel* embeddingLevels,
71 UErrorCode* status) {
72 return sk_ubidi_setPara(bidi, text, length, paraLevel, embeddingLevels, status);
73 }
bidi_reorderVisual(const SkUnicode::BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])74 void SkUnicode_IcuBidi::bidi_reorderVisual(const SkUnicode::BidiLevel runLevels[],
75 int levelsCount,
76 int32_t logicalFromVisual[]) {
77 sk_ubidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
78 }
79
/**
 * Clones `bi` with whichever clone entry point the loaded ICU table provides:
 * f_ubrk_clone_ when it is set, otherwise f_ubrk_safeClone_. At least one of the two
 * must be present (asserted).
 */
static inline UBreakIterator* sk_ubrk_clone(const UBreakIterator* bi, UErrorCode* status) {
    const SkICULib* lib = ICULib();
    SkASSERT(lib->f_ubrk_clone_ || lib->f_ubrk_safeClone_);

    if (lib->f_ubrk_clone_) {
        return lib->f_ubrk_clone_(bi, status);
    }
    // Fallback entry point: no caller-provided buffer is passed (both buffer args are null).
    return lib->f_ubrk_safeClone_(bi, nullptr, nullptr, status);
}
87
utext_close_wrapper(UText * ut)88 static UText* utext_close_wrapper(UText* ut) {
89 return sk_utext_close(ut);
90 }
ubrk_close_wrapper(UBreakIterator * bi)91 static void ubrk_close_wrapper(UBreakIterator* bi) {
92 sk_ubrk_close(bi);
93 }
94
95 using ICUUText = std::unique_ptr<UText, SkFunctionWrapper<decltype(utext_close),
96 utext_close_wrapper>>;
97 using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionWrapper<decltype(ubrk_close),
98 ubrk_close_wrapper>>;
99 /** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */
utf8_next(const char ** ptr,const char * end)100 static inline SkUnichar utf8_next(const char** ptr, const char* end) {
101 SkUnichar val = SkUTF::NextUTF8(ptr, end);
102 return val < 0 ? 0xFFFD : val;
103 }
104
convertType(SkUnicode::BreakType type)105 static UBreakIteratorType convertType(SkUnicode::BreakType type) {
106 switch (type) {
107 case SkUnicode::BreakType::kLines: return UBRK_LINE;
108 case SkUnicode::BreakType::kGraphemes: return UBRK_CHARACTER;
109 case SkUnicode::BreakType::kWords: return UBRK_WORD;
110 default:
111 return UBRK_CHARACTER;
112 }
113 }
114
115 class SkBreakIterator_icu : public SkBreakIterator {
116 ICUBreakIterator fBreakIterator;
117 Position fLastResult;
118 public:
SkBreakIterator_icu(ICUBreakIterator iter)119 explicit SkBreakIterator_icu(ICUBreakIterator iter)
120 : fBreakIterator(std::move(iter))
121 , fLastResult(0) {}
first()122 Position first() override { return fLastResult = sk_ubrk_first(fBreakIterator.get()); }
current()123 Position current() override { return fLastResult = sk_ubrk_current(fBreakIterator.get()); }
next()124 Position next() override { return fLastResult = sk_ubrk_next(fBreakIterator.get()); }
status()125 Status status() override { return sk_ubrk_getRuleStatus(fBreakIterator.get()); }
isDone()126 bool isDone() override { return fLastResult == UBRK_DONE; }
127
setText(const char utftext8[],int utf8Units)128 bool setText(const char utftext8[], int utf8Units) override {
129 UErrorCode status = U_ZERO_ERROR;
130 ICUUText text(sk_utext_openUTF8(nullptr, &utftext8[0], utf8Units, &status));
131
132 if (U_FAILURE(status)) {
133 SkDEBUGF("Break error: %s", sk_u_errorName(status));
134 return false;
135 }
136 SkASSERT(text);
137 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
138 if (U_FAILURE(status)) {
139 SkDEBUGF("Break error: %s", sk_u_errorName(status));
140 return false;
141 }
142 fLastResult = 0;
143 return true;
144 }
setText(const char16_t utftext16[],int utf16Units)145 bool setText(const char16_t utftext16[], int utf16Units) override {
146 UErrorCode status = U_ZERO_ERROR;
147 ICUUText text(sk_utext_openUChars(nullptr, reinterpret_cast<const UChar*>(&utftext16[0]),
148 utf16Units, &status));
149
150 if (U_FAILURE(status)) {
151 SkDEBUGF("Break error: %s", sk_u_errorName(status));
152 return false;
153 }
154 SkASSERT(text);
155 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
156 if (U_FAILURE(status)) {
157 SkDEBUGF("Break error: %s", sk_u_errorName(status));
158 return false;
159 }
160 fLastResult = 0;
161 return true;
162 }
163 };
164
165 class SkIcuBreakIteratorCache {
166 SkTHashMap<SkUnicode::BreakType, ICUBreakIterator> fBreakCache;
167 SkMutex fBreakCacheMutex;
168
169 public:
get()170 static SkIcuBreakIteratorCache& get() {
171 static SkIcuBreakIteratorCache instance;
172 return instance;
173 }
174
makeBreakIterator(SkUnicode::BreakType type)175 ICUBreakIterator makeBreakIterator(SkUnicode::BreakType type) {
176 UErrorCode status = U_ZERO_ERROR;
177 ICUBreakIterator* cachedIterator;
178 {
179 SkAutoMutexExclusive lock(fBreakCacheMutex);
180 cachedIterator = fBreakCache.find(type);
181 if (!cachedIterator) {
182 ICUBreakIterator newIterator(sk_ubrk_open(convertType(type), sk_uloc_getDefault(),
183 nullptr, 0, &status));
184 if (U_FAILURE(status)) {
185 SkDEBUGF("Break error: %s", sk_u_errorName(status));
186 } else {
187 cachedIterator = fBreakCache.set(type, std::move(newIterator));
188 }
189 }
190 }
191 ICUBreakIterator iterator;
192 if (cachedIterator) {
193 iterator.reset(sk_ubrk_clone(cachedIterator->get(), &status));
194 if (U_FAILURE(status)) {
195 SkDEBUGF("Break error: %s", sk_u_errorName(status));
196 }
197 }
198 return iterator;
199 }
200 };
201
202 class SkUnicode_icu : public SkUnicode {
203
copy()204 std::unique_ptr<SkUnicode> copy() override {
205 return std::make_unique<SkUnicode_icu>();
206 }
207
extractWords(uint16_t utf16[],int utf16Units,const char * locale,std::vector<Position> * words)208 static bool extractWords(uint16_t utf16[], int utf16Units, const char* locale, std::vector<Position>* words) {
209
210 UErrorCode status = U_ZERO_ERROR;
211
212 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(BreakType::kWords);
213 if (!iterator) {
214 SkDEBUGF("Break error: %s", sk_u_errorName(status));
215 return false;
216 }
217 SkASSERT(iterator);
218
219 ICUUText utf16UText(sk_utext_openUChars(nullptr, (UChar*)utf16, utf16Units, &status));
220 if (U_FAILURE(status)) {
221 SkDEBUGF("Break error: %s", sk_u_errorName(status));
222 return false;
223 }
224
225 sk_ubrk_setUText(iterator.get(), utf16UText.get(), &status);
226 if (U_FAILURE(status)) {
227 SkDEBUGF("Break error: %s", sk_u_errorName(status));
228 return false;
229 }
230
231 // Get the words
232 int32_t pos = sk_ubrk_first(iterator.get());
233 while (pos != UBRK_DONE) {
234 words->emplace_back(pos);
235 pos = sk_ubrk_next(iterator.get());
236 }
237
238 return true;
239 }
240
extractPositions(const char utf8[],int utf8Units,BreakType type,std::function<void (int,int)> setBreak)241 static bool extractPositions
242 (const char utf8[], int utf8Units, BreakType type, std::function<void(int, int)> setBreak) {
243
244 UErrorCode status = U_ZERO_ERROR;
245 ICUUText text(sk_utext_openUTF8(nullptr, &utf8[0], utf8Units, &status));
246
247 if (U_FAILURE(status)) {
248 SkDEBUGF("Break error: %s", sk_u_errorName(status));
249 return false;
250 }
251 SkASSERT(text);
252
253 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type);
254 if (!iterator) {
255 return false;
256 }
257
258 sk_ubrk_setUText(iterator.get(), text.get(), &status);
259 if (U_FAILURE(status)) {
260 SkDEBUGF("Break error: %s", sk_u_errorName(status));
261 return false;
262 }
263
264 auto iter = iterator.get();
265 int32_t pos = sk_ubrk_first(iter);
266 while (pos != UBRK_DONE) {
267 int s = type == SkUnicode::BreakType::kLines
268 ? UBRK_LINE_SOFT
269 : sk_ubrk_getRuleStatus(iter);
270 setBreak(pos, s);
271 pos = sk_ubrk_next(iter);
272 }
273
274 if (type == SkUnicode::BreakType::kLines) {
275 // This is a workaround for https://bugs.chromium.org/p/skia/issues/detail?id=10715
276 // (ICU line break iterator does not work correctly on Thai text with new lines)
277 // So, we only use the iterator to collect soft line breaks and
278 // scan the text for all hard line breaks ourselves
279 const char* end = utf8 + utf8Units;
280 const char* ch = utf8;
281 while (ch < end) {
282 auto unichar = utf8_next(&ch, end);
283 if (isHardLineBreak(unichar)) {
284 setBreak(ch - utf8, UBRK_LINE_HARD);
285 }
286 }
287 }
288 return true;
289 }
290
isControl(SkUnichar utf8)291 static bool isControl(SkUnichar utf8) {
292 return sk_u_iscntrl(utf8);
293 }
294
isWhitespace(SkUnichar utf8)295 static bool isWhitespace(SkUnichar utf8) {
296 return sk_u_isWhitespace(utf8);
297 }
298
isSpace(SkUnichar utf8)299 static bool isSpace(SkUnichar utf8) {
300 return sk_u_isspace(utf8);
301 }
302
isTabulation(SkUnichar utf8)303 static bool isTabulation(SkUnichar utf8) {
304 return utf8 == '\t';
305 }
306
isHardBreak(SkUnichar utf8)307 static bool isHardBreak(SkUnichar utf8) {
308 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK);
309 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK;
310 }
311
312 public:
~SkUnicode_icu()313 ~SkUnicode_icu() override { }
makeBidiIterator(const uint16_t text[],int count,SkBidiIterator::Direction dir)314 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
315 SkBidiIterator::Direction dir) override {
316 return SkUnicode::makeBidiIterator(text, count, dir);
317 }
makeBidiIterator(const char text[],int count,SkBidiIterator::Direction dir)318 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
319 int count,
320 SkBidiIterator::Direction dir) override {
321 return SkUnicode::makeBidiIterator(text, count, dir);
322 }
makeBreakIterator(const char locale[],BreakType breakType)323 std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
324 BreakType breakType) override {
325 UErrorCode status = U_ZERO_ERROR;
326 ICUBreakIterator iterator(sk_ubrk_open(convertType(breakType), locale, nullptr, 0,
327 &status));
328 if (U_FAILURE(status)) {
329 SkDEBUGF("Break error: %s", sk_u_errorName(status));
330 return nullptr;
331 }
332 return std::unique_ptr<SkBreakIterator>(new SkBreakIterator_icu(std::move(iterator)));
333 }
makeBreakIterator(BreakType breakType)334 std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override {
335 return makeBreakIterator(sk_uloc_getDefault(), breakType);
336 }
337
isHardLineBreak(SkUnichar utf8)338 static bool isHardLineBreak(SkUnichar utf8) {
339 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK);
340 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK;
341 }
342
toUpper(const SkString & str)343 SkString toUpper(const SkString& str) override {
344 // Convert to UTF16 since that's what ICU wants.
345 auto str16 = SkUnicode::convertUtf8ToUtf16(str.c_str(), str.size());
346
347 UErrorCode icu_err = U_ZERO_ERROR;
348 const auto upper16len = sk_u_strToUpper(nullptr, 0, (UChar*)(str16.c_str()), str16.size(),
349 nullptr, &icu_err);
350 if (icu_err != U_BUFFER_OVERFLOW_ERROR || upper16len <= 0) {
351 return SkString();
352 }
353
354 SkAutoSTArray<128, uint16_t> upper16(upper16len);
355 icu_err = U_ZERO_ERROR;
356 sk_u_strToUpper((UChar*)(upper16.get()), SkToS32(upper16.size()),
357 (UChar*)(str16.c_str()), str16.size(),
358 nullptr, &icu_err);
359 SkASSERT(!U_FAILURE(icu_err));
360
361 // ... and back to utf8 'cause that's what we want.
362 return convertUtf16ToUtf8((char16_t*)upper16.get(), upper16.size());
363 }
364
getBidiRegions(const char utf8[],int utf8Units,TextDirection dir,std::vector<BidiRegion> * results)365 bool getBidiRegions(const char utf8[],
366 int utf8Units,
367 TextDirection dir,
368 std::vector<BidiRegion>* results) override {
369 return SkUnicode::extractBidi(utf8, utf8Units, dir, results);
370 }
371
getWords(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)372 bool getWords(const char utf8[], int utf8Units, const char* locale, std::vector<Position>* results) override {
373
374 // Convert to UTF16 since we want the results in utf16
375 auto utf16 = convertUtf8ToUtf16(utf8, utf8Units);
376 return SkUnicode_icu::extractWords((uint16_t*)utf16.c_str(), utf16.size(), locale, results);
377 }
378
computeCodeUnitFlags(char utf8[],int utf8Units,bool replaceTabs,SkTArray<SkUnicode::CodeUnitFlags,true> * results)379 bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs,
380 SkTArray<SkUnicode::CodeUnitFlags, true>* results) override {
381 results->reset();
382 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
383
384 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, [&](int pos,
385 int status) {
386 (*results)[pos] |= status == UBRK_LINE_HARD
387 ? CodeUnitFlags::kHardLineBreakBefore
388 : CodeUnitFlags::kSoftLineBreakBefore;
389 });
390
391 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, [&](int pos,
392 int status) {
393 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
394 });
395
396 const char* current = utf8;
397 const char* end = utf8 + utf8Units;
398 while (current < end) {
399 auto before = current - utf8;
400 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
401 if (unichar < 0) unichar = 0xFFFD;
402 auto after = current - utf8;
403 if (replaceTabs && SkUnicode_icu::isTabulation(unichar)) {
404 results->at(before) |= SkUnicode::kTabulation;
405 if (replaceTabs) {
406 unichar = ' ';
407 utf8[before] = ' ';
408 }
409 }
410 for (auto i = before; i < after; ++i) {
411 if (SkUnicode_icu::isSpace(unichar)) {
412 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
413 }
414 if (SkUnicode_icu::isWhitespace(unichar)) {
415 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
416 }
417 if (SkUnicode_icu::isControl(unichar)) {
418 results->at(i) |= SkUnicode::kControl;
419 }
420 }
421 }
422
423 return true;
424 }
425
computeCodeUnitFlags(char16_t utf16[],int utf16Units,bool replaceTabs,SkTArray<SkUnicode::CodeUnitFlags,true> * results)426 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
427 SkTArray<SkUnicode::CodeUnitFlags, true>* results) override {
428 results->reset();
429 results->push_back_n(utf16Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
430
431 // Get white spaces
432 this->forEachCodepoint((char16_t*)&utf16[0], utf16Units,
433 [results, replaceTabs, &utf16](SkUnichar unichar, int32_t start, int32_t end) {
434 for (auto i = start; i < end; ++i) {
435 if (replaceTabs && SkUnicode_icu::isTabulation(unichar)) {
436 results->at(i) |= SkUnicode::kTabulation;
437 if (replaceTabs) {
438 unichar = ' ';
439 utf16[start] = ' ';
440 }
441 }
442 if (SkUnicode_icu::isSpace(unichar)) {
443 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
444 }
445 if (SkUnicode_icu::isWhitespace(unichar)) {
446 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
447 }
448 if (SkUnicode_icu::isControl(unichar)) {
449 results->at(i) |= SkUnicode::kControl;
450 }
451 }
452 });
453 // Get graphemes
454 this->forEachBreak((char16_t*)&utf16[0],
455 utf16Units,
456 SkUnicode::BreakType::kGraphemes,
457 [results](SkBreakIterator::Position pos, SkBreakIterator::Status) {
458 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
459 });
460 // Get line breaks
461 this->forEachBreak(
462 (char16_t*)&utf16[0],
463 utf16Units,
464 SkUnicode::BreakType::kLines,
465 [results](SkBreakIterator::Position pos, SkBreakIterator::Status status) {
466 if (status ==
467 (SkBreakIterator::Status)SkUnicode::LineBreakType::kHardLineBreak) {
468 // Hard line breaks clears off all the other flags
469 // TODO: Treat \n as a formatting mark and do not pass it to SkShaper
470 (*results)[pos-1] = CodeUnitFlags::kHardLineBreakBefore;
471 } else {
472 (*results)[pos] |= CodeUnitFlags::kSoftLineBreakBefore;
473 }
474 });
475
476 return true;
477 }
478
reorderVisual(const BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])479 void reorderVisual(const BidiLevel runLevels[],
480 int levelsCount,
481 int32_t logicalFromVisual[]) override {
482 SkUnicode_IcuBidi::bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
483 }
484 };
485
MakeIcuBasedUnicode()486 std::unique_ptr<SkUnicode> SkUnicode::MakeIcuBasedUnicode() {
487 #if defined(SK_USING_THIRD_PARTY_ICU)
488 if (!SkLoadICU()) {
489 static SkOnce once;
490 once([] { SkDEBUGF("SkLoadICU() failed!\n"); });
491 return nullptr;
492 }
493 #endif
494
495 return ICULib()
496 ? std::make_unique<SkUnicode_icu>()
497 : nullptr;
498 }
499