1 /*
2 * Copyright 2020 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7
8 #include "include/core/SkString.h"
9 #include "include/core/SkTypes.h"
10 #include "include/private/SkBitmaskEnum.h"
11 #include "include/private/base/SkDebug.h"
12 #include "include/private/base/SkMutex.h"
13 #include "include/private/base/SkOnce.h"
14 #include "include/private/base/SkTArray.h"
15 #include "include/private/base/SkTemplates.h"
16 #include "include/private/base/SkTo.h"
17 #include "modules/skunicode/include/SkUnicode.h"
18 #include "modules/skunicode/src/SkUnicode_icu.h"
19 #include "modules/skunicode/src/SkUnicode_icu_bidi.h"
20 #include "src/base/SkUTF.h"
21 #include "src/core/SkTHash.h"
22 #include <unicode/umachine.h>
23 #include <functional>
24 #include <string>
25 #include <utility>
26 #include <vector>
27
28 #if defined(SK_USING_THIRD_PARTY_ICU)
29 #include "SkLoadICU.h"
30 #endif
31
32 using namespace skia_private;
33
ICULib()34 static const SkICULib* ICULib() {
35 static const auto gICU = SkLoadICULib();
36
37 return gICU.get();
38 }
39
40 // sk_* wrappers for ICU funcs
41 #define SKICU_FUNC(funcname) \
42 template <typename... Args> \
43 auto sk_##funcname(Args&&... args) -> decltype(funcname(std::forward<Args>(args)...)) { \
44 return ICULib()->f_##funcname(std::forward<Args>(args)...); \
45 } \
46
47 SKICU_EMIT_FUNCS
48 #undef SKICU_FUNC
49
errorName(UErrorCode status)50 const char* SkUnicode_IcuBidi::errorName(UErrorCode status) {
51 return sk_u_errorName(status);
52 }
53
bidi_close(UBiDi * bidi)54 void SkUnicode_IcuBidi::bidi_close(UBiDi* bidi) {
55 sk_ubidi_close(bidi);
56 }
bidi_getDirection(const UBiDi * bidi)57 UBiDiDirection SkUnicode_IcuBidi::bidi_getDirection(const UBiDi* bidi) {
58 return sk_ubidi_getDirection(bidi);
59 }
bidi_getLength(const UBiDi * bidi)60 SkBidiIterator::Position SkUnicode_IcuBidi::bidi_getLength(const UBiDi* bidi) {
61 return sk_ubidi_getLength(bidi);
62 }
bidi_getLevelAt(const UBiDi * bidi,int pos)63 SkBidiIterator::Level SkUnicode_IcuBidi::bidi_getLevelAt(const UBiDi* bidi, int pos) {
64 return sk_ubidi_getLevelAt(bidi, pos);
65 }
bidi_openSized(int32_t maxLength,int32_t maxRunCount,UErrorCode * pErrorCode)66 UBiDi* SkUnicode_IcuBidi::bidi_openSized(int32_t maxLength, int32_t maxRunCount, UErrorCode* pErrorCode) {
67 return sk_ubidi_openSized(maxLength, maxRunCount, pErrorCode);
68 }
bidi_setPara(UBiDi * bidi,const UChar * text,int32_t length,UBiDiLevel paraLevel,UBiDiLevel * embeddingLevels,UErrorCode * status)69 void SkUnicode_IcuBidi::bidi_setPara(UBiDi* bidi,
70 const UChar* text,
71 int32_t length,
72 UBiDiLevel paraLevel,
73 UBiDiLevel* embeddingLevels,
74 UErrorCode* status) {
75 return sk_ubidi_setPara(bidi, text, length, paraLevel, embeddingLevels, status);
76 }
bidi_reorderVisual(const SkUnicode::BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])77 void SkUnicode_IcuBidi::bidi_reorderVisual(const SkUnicode::BidiLevel runLevels[],
78 int levelsCount,
79 int32_t logicalFromVisual[]) {
80 sk_ubidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
81 }
82
/**
 * Clones `bi` through whichever clone entry point the ICU table provides: f_ubrk_clone_ when
 * it is set, otherwise f_ubrk_safeClone_ (called with a null stack buffer and null size).
 * At least one of the two must be present; this is asserted in debug builds only.
 */
static inline UBreakIterator* sk_ubrk_clone(const UBreakIterator* bi, UErrorCode* status) {
    const auto* lib = ICULib();
    SkASSERT(lib->f_ubrk_clone_ || lib->f_ubrk_safeClone_);
    if (lib->f_ubrk_clone_) {
        return lib->f_ubrk_clone_(bi, status);
    }
    return lib->f_ubrk_safeClone_(bi, nullptr, nullptr, status);
}
90
utext_close_wrapper(UText * ut)91 static UText* utext_close_wrapper(UText* ut) {
92 return sk_utext_close(ut);
93 }
ubrk_close_wrapper(UBreakIterator * bi)94 static void ubrk_close_wrapper(UBreakIterator* bi) {
95 sk_ubrk_close(bi);
96 }
97
98 using ICUUText = std::unique_ptr<UText, SkFunctionObject<utext_close_wrapper>>;
99 using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionObject<ubrk_close_wrapper>>;
100 /** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */
utf8_next(const char ** ptr,const char * end)101 static inline SkUnichar utf8_next(const char** ptr, const char* end) {
102 SkUnichar val = SkUTF::NextUTF8(ptr, end);
103 return val < 0 ? 0xFFFD : val;
104 }
105
convertType(SkUnicode::BreakType type)106 static UBreakIteratorType convertType(SkUnicode::BreakType type) {
107 switch (type) {
108 case SkUnicode::BreakType::kLines: return UBRK_LINE;
109 case SkUnicode::BreakType::kGraphemes: return UBRK_CHARACTER;
110 case SkUnicode::BreakType::kWords: return UBRK_WORD;
111 default:
112 return UBRK_CHARACTER;
113 }
114 }
115
116 class SkBreakIterator_icu : public SkBreakIterator {
117 ICUBreakIterator fBreakIterator;
118 Position fLastResult;
119 public:
SkBreakIterator_icu(ICUBreakIterator iter)120 explicit SkBreakIterator_icu(ICUBreakIterator iter)
121 : fBreakIterator(std::move(iter))
122 , fLastResult(0) {}
first()123 Position first() override { return fLastResult = sk_ubrk_first(fBreakIterator.get()); }
current()124 Position current() override { return fLastResult = sk_ubrk_current(fBreakIterator.get()); }
next()125 Position next() override { return fLastResult = sk_ubrk_next(fBreakIterator.get()); }
status()126 Status status() override { return sk_ubrk_getRuleStatus(fBreakIterator.get()); }
isDone()127 bool isDone() override { return fLastResult == UBRK_DONE; }
128
setText(const char utftext8[],int utf8Units)129 bool setText(const char utftext8[], int utf8Units) override {
130 UErrorCode status = U_ZERO_ERROR;
131 ICUUText text(sk_utext_openUTF8(nullptr, &utftext8[0], utf8Units, &status));
132
133 if (U_FAILURE(status)) {
134 SkDEBUGF("Break error: %s", sk_u_errorName(status));
135 return false;
136 }
137 SkASSERT(text);
138 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
139 if (U_FAILURE(status)) {
140 SkDEBUGF("Break error: %s", sk_u_errorName(status));
141 return false;
142 }
143 fLastResult = 0;
144 return true;
145 }
setText(const char16_t utftext16[],int utf16Units)146 bool setText(const char16_t utftext16[], int utf16Units) override {
147 UErrorCode status = U_ZERO_ERROR;
148 ICUUText text(sk_utext_openUChars(nullptr, reinterpret_cast<const UChar*>(&utftext16[0]),
149 utf16Units, &status));
150
151 if (U_FAILURE(status)) {
152 SkDEBUGF("Break error: %s", sk_u_errorName(status));
153 return false;
154 }
155 SkASSERT(text);
156 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
157 if (U_FAILURE(status)) {
158 SkDEBUGF("Break error: %s", sk_u_errorName(status));
159 return false;
160 }
161 fLastResult = 0;
162 return true;
163 }
164 };
165
166 class SkIcuBreakIteratorCache {
167 SkTHashMap<SkUnicode::BreakType, ICUBreakIterator> fBreakCache;
168 SkMutex fBreakCacheMutex;
169
170 public:
get()171 static SkIcuBreakIteratorCache& get() {
172 static SkIcuBreakIteratorCache instance;
173 return instance;
174 }
175
makeBreakIterator(SkUnicode::BreakType type)176 ICUBreakIterator makeBreakIterator(SkUnicode::BreakType type) {
177 UErrorCode status = U_ZERO_ERROR;
178 ICUBreakIterator* cachedIterator;
179 {
180 SkAutoMutexExclusive lock(fBreakCacheMutex);
181 cachedIterator = fBreakCache.find(type);
182 if (!cachedIterator) {
183 ICUBreakIterator newIterator(sk_ubrk_open(convertType(type), sk_uloc_getDefault(),
184 nullptr, 0, &status));
185 if (U_FAILURE(status)) {
186 SkDEBUGF("Break error: %s", sk_u_errorName(status));
187 } else {
188 cachedIterator = fBreakCache.set(type, std::move(newIterator));
189 }
190 }
191 }
192 ICUBreakIterator iterator;
193 if (cachedIterator) {
194 iterator.reset(sk_ubrk_clone(cachedIterator->get(), &status));
195 if (U_FAILURE(status)) {
196 SkDEBUGF("Break error: %s", sk_u_errorName(status));
197 }
198 }
199 return iterator;
200 }
201 };
202
203 class SkUnicode_icu : public SkUnicode {
204
copy()205 std::unique_ptr<SkUnicode> copy() override {
206 return std::make_unique<SkUnicode_icu>();
207 }
208
extractWords(uint16_t utf16[],int utf16Units,const char * locale,std::vector<Position> * words)209 static bool extractWords(uint16_t utf16[], int utf16Units, const char* locale, std::vector<Position>* words) {
210
211 UErrorCode status = U_ZERO_ERROR;
212
213 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(BreakType::kWords);
214 if (!iterator) {
215 SkDEBUGF("Break error: %s", sk_u_errorName(status));
216 return false;
217 }
218 SkASSERT(iterator);
219
220 ICUUText utf16UText(sk_utext_openUChars(nullptr, (UChar*)utf16, utf16Units, &status));
221 if (U_FAILURE(status)) {
222 SkDEBUGF("Break error: %s", sk_u_errorName(status));
223 return false;
224 }
225
226 sk_ubrk_setUText(iterator.get(), utf16UText.get(), &status);
227 if (U_FAILURE(status)) {
228 SkDEBUGF("Break error: %s", sk_u_errorName(status));
229 return false;
230 }
231
232 // Get the words
233 int32_t pos = sk_ubrk_first(iterator.get());
234 while (pos != UBRK_DONE) {
235 words->emplace_back(pos);
236 pos = sk_ubrk_next(iterator.get());
237 }
238
239 return true;
240 }
241
extractPositions(const char utf8[],int utf8Units,BreakType type,std::function<void (int,int)> setBreak)242 static bool extractPositions
243 (const char utf8[], int utf8Units, BreakType type, std::function<void(int, int)> setBreak) {
244
245 UErrorCode status = U_ZERO_ERROR;
246 ICUUText text(sk_utext_openUTF8(nullptr, &utf8[0], utf8Units, &status));
247
248 if (U_FAILURE(status)) {
249 SkDEBUGF("Break error: %s", sk_u_errorName(status));
250 return false;
251 }
252 SkASSERT(text);
253
254 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type);
255 if (!iterator) {
256 return false;
257 }
258
259 sk_ubrk_setUText(iterator.get(), text.get(), &status);
260 if (U_FAILURE(status)) {
261 SkDEBUGF("Break error: %s", sk_u_errorName(status));
262 return false;
263 }
264
265 auto iter = iterator.get();
266 int32_t pos = sk_ubrk_first(iter);
267 while (pos != UBRK_DONE) {
268 int s = type == SkUnicode::BreakType::kLines
269 ? UBRK_LINE_SOFT
270 : sk_ubrk_getRuleStatus(iter);
271 setBreak(pos, s);
272 pos = sk_ubrk_next(iter);
273 }
274
275 if (type == SkUnicode::BreakType::kLines) {
276 // This is a workaround for https://bugs.chromium.org/p/skia/issues/detail?id=10715
277 // (ICU line break iterator does not work correctly on Thai text with new lines)
278 // So, we only use the iterator to collect soft line breaks and
279 // scan the text for all hard line breaks ourselves
280 const char* end = utf8 + utf8Units;
281 const char* ch = utf8;
282 while (ch < end) {
283 auto unichar = utf8_next(&ch, end);
284 if (isHardLineBreak(unichar)) {
285 setBreak(ch - utf8, UBRK_LINE_HARD);
286 }
287 }
288 }
289 return true;
290 }
291
isControl(SkUnichar utf8)292 static bool isControl(SkUnichar utf8) {
293 return sk_u_iscntrl(utf8);
294 }
295
isWhitespace(SkUnichar utf8)296 static bool isWhitespace(SkUnichar utf8) {
297 return sk_u_isWhitespace(utf8);
298 }
299
isSpace(SkUnichar utf8)300 static bool isSpace(SkUnichar utf8) {
301 return sk_u_isspace(utf8);
302 }
303
isTabulation(SkUnichar utf8)304 static bool isTabulation(SkUnichar utf8) {
305 return utf8 == '\t';
306 }
307
isHardBreak(SkUnichar utf8)308 static bool isHardBreak(SkUnichar utf8) {
309 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK);
310 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK;
311 }
312
313 public:
~SkUnicode_icu()314 ~SkUnicode_icu() override { }
makeBidiIterator(const uint16_t text[],int count,SkBidiIterator::Direction dir)315 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
316 SkBidiIterator::Direction dir) override {
317 return SkUnicode::makeBidiIterator(text, count, dir);
318 }
makeBidiIterator(const char text[],int count,SkBidiIterator::Direction dir)319 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
320 int count,
321 SkBidiIterator::Direction dir) override {
322 return SkUnicode::makeBidiIterator(text, count, dir);
323 }
makeBreakIterator(const char locale[],BreakType breakType)324 std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
325 BreakType breakType) override {
326 UErrorCode status = U_ZERO_ERROR;
327 ICUBreakIterator iterator(sk_ubrk_open(convertType(breakType), locale, nullptr, 0,
328 &status));
329 if (U_FAILURE(status)) {
330 SkDEBUGF("Break error: %s", sk_u_errorName(status));
331 return nullptr;
332 }
333 return std::unique_ptr<SkBreakIterator>(new SkBreakIterator_icu(std::move(iterator)));
334 }
makeBreakIterator(BreakType breakType)335 std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType breakType) override {
336 return makeBreakIterator(sk_uloc_getDefault(), breakType);
337 }
338
isHardLineBreak(SkUnichar utf8)339 static bool isHardLineBreak(SkUnichar utf8) {
340 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK);
341 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK;
342 }
343
toUpper(const SkString & str)344 SkString toUpper(const SkString& str) override {
345 // Convert to UTF16 since that's what ICU wants.
346 auto str16 = SkUnicode::convertUtf8ToUtf16(str.c_str(), str.size());
347
348 UErrorCode icu_err = U_ZERO_ERROR;
349 const auto upper16len = sk_u_strToUpper(nullptr, 0, (UChar*)(str16.c_str()), str16.size(),
350 nullptr, &icu_err);
351 if (icu_err != U_BUFFER_OVERFLOW_ERROR || upper16len <= 0) {
352 return SkString();
353 }
354
355 AutoSTArray<128, uint16_t> upper16(upper16len);
356 icu_err = U_ZERO_ERROR;
357 sk_u_strToUpper((UChar*)(upper16.get()), SkToS32(upper16.size()),
358 (UChar*)(str16.c_str()), str16.size(),
359 nullptr, &icu_err);
360 SkASSERT(!U_FAILURE(icu_err));
361
362 // ... and back to utf8 'cause that's what we want.
363 return convertUtf16ToUtf8((char16_t*)upper16.get(), upper16.size());
364 }
365
getBidiRegions(const char utf8[],int utf8Units,TextDirection dir,std::vector<BidiRegion> * results)366 bool getBidiRegions(const char utf8[],
367 int utf8Units,
368 TextDirection dir,
369 std::vector<BidiRegion>* results) override {
370 return SkUnicode::extractBidi(utf8, utf8Units, dir, results);
371 }
372
getWords(const char utf8[],int utf8Units,const char * locale,std::vector<Position> * results)373 bool getWords(const char utf8[], int utf8Units, const char* locale, std::vector<Position>* results) override {
374
375 // Convert to UTF16 since we want the results in utf16
376 auto utf16 = convertUtf8ToUtf16(utf8, utf8Units);
377 return SkUnicode_icu::extractWords((uint16_t*)utf16.c_str(), utf16.size(), locale, results);
378 }
379
computeCodeUnitFlags(char utf8[],int utf8Units,bool replaceTabs,SkTArray<SkUnicode::CodeUnitFlags,true> * results)380 bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs,
381 SkTArray<SkUnicode::CodeUnitFlags, true>* results) override {
382 results->clear();
383 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
384
385 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, [&](int pos,
386 int status) {
387 (*results)[pos] |= status == UBRK_LINE_HARD
388 ? CodeUnitFlags::kHardLineBreakBefore
389 : CodeUnitFlags::kSoftLineBreakBefore;
390 });
391
392 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, [&](int pos,
393 int status) {
394 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
395 });
396
397 const char* current = utf8;
398 const char* end = utf8 + utf8Units;
399 while (current < end) {
400 auto before = current - utf8;
401 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
402 if (unichar < 0) unichar = 0xFFFD;
403 auto after = current - utf8;
404 if (replaceTabs && SkUnicode_icu::isTabulation(unichar)) {
405 results->at(before) |= SkUnicode::kTabulation;
406 if (replaceTabs) {
407 unichar = ' ';
408 utf8[before] = ' ';
409 }
410 }
411 for (auto i = before; i < after; ++i) {
412 if (SkUnicode_icu::isSpace(unichar)) {
413 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
414 }
415 if (SkUnicode_icu::isWhitespace(unichar)) {
416 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
417 }
418 if (SkUnicode_icu::isControl(unichar)) {
419 results->at(i) |= SkUnicode::kControl;
420 }
421 }
422 }
423
424 return true;
425 }
426
computeCodeUnitFlags(char16_t utf16[],int utf16Units,bool replaceTabs,SkTArray<SkUnicode::CodeUnitFlags,true> * results)427 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
428 SkTArray<SkUnicode::CodeUnitFlags, true>* results) override {
429 results->clear();
430 results->push_back_n(utf16Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
431
432 // Get white spaces
433 this->forEachCodepoint((char16_t*)&utf16[0], utf16Units,
434 [results, replaceTabs, &utf16](SkUnichar unichar, int32_t start, int32_t end) {
435 for (auto i = start; i < end; ++i) {
436 if (replaceTabs && SkUnicode_icu::isTabulation(unichar)) {
437 results->at(i) |= SkUnicode::kTabulation;
438 if (replaceTabs) {
439 unichar = ' ';
440 utf16[start] = ' ';
441 }
442 }
443 if (SkUnicode_icu::isSpace(unichar)) {
444 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
445 }
446 if (SkUnicode_icu::isWhitespace(unichar)) {
447 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
448 }
449 if (SkUnicode_icu::isControl(unichar)) {
450 results->at(i) |= SkUnicode::kControl;
451 }
452 }
453 });
454 // Get graphemes
455 this->forEachBreak((char16_t*)&utf16[0],
456 utf16Units,
457 SkUnicode::BreakType::kGraphemes,
458 [results](SkBreakIterator::Position pos, SkBreakIterator::Status) {
459 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
460 });
461 // Get line breaks
462 this->forEachBreak(
463 (char16_t*)&utf16[0],
464 utf16Units,
465 SkUnicode::BreakType::kLines,
466 [results](SkBreakIterator::Position pos, SkBreakIterator::Status status) {
467 if (status ==
468 (SkBreakIterator::Status)SkUnicode::LineBreakType::kHardLineBreak) {
469 // Hard line breaks clears off all the other flags
470 // TODO: Treat \n as a formatting mark and do not pass it to SkShaper
471 (*results)[pos-1] = CodeUnitFlags::kHardLineBreakBefore;
472 } else {
473 (*results)[pos] |= CodeUnitFlags::kSoftLineBreakBefore;
474 }
475 });
476
477 return true;
478 }
479
reorderVisual(const BidiLevel runLevels[],int levelsCount,int32_t logicalFromVisual[])480 void reorderVisual(const BidiLevel runLevels[],
481 int levelsCount,
482 int32_t logicalFromVisual[]) override {
483 SkUnicode_IcuBidi::bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
484 }
485 };
486
MakeIcuBasedUnicode()487 std::unique_ptr<SkUnicode> SkUnicode::MakeIcuBasedUnicode() {
488 #if defined(SK_USING_THIRD_PARTY_ICU)
489 if (!SkLoadICU()) {
490 static SkOnce once;
491 once([] { SkDEBUGF("SkLoadICU() failed!\n"); });
492 return nullptr;
493 }
494 #endif
495
496 return ICULib()
497 ? std::make_unique<SkUnicode_icu>()
498 : nullptr;
499 }
500