1 /*
2 * Copyright 2020 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7 #include "modules/skunicode/include/SkUnicode_icu.h"
8
9 #include "include/core/SkRefCnt.h"
10 #include "include/core/SkString.h"
11 #include "include/core/SkTypes.h"
12 #include "include/private/base/SkDebug.h"
13 #include "include/private/base/SkMutex.h"
14 #include "include/private/base/SkSpan_impl.h"
15 #include "include/private/base/SkTArray.h"
16 #include "include/private/base/SkTemplates.h"
17 #include "include/private/base/SkTo.h"
18 #include "modules/skunicode/include/SkUnicode.h"
19 #include "modules/skunicode/src/SkBidiFactory_icu_full.h"
20 #include "modules/skunicode/src/SkUnicode_icu_bidi.h"
21 #include "modules/skunicode/src/SkUnicode_icupriv.h"
22 #include "src/base/SkBitmaskEnum.h"
23 #include "src/base/SkUTF.h"
24 #include "src/core/SkChecksum.h"
25 #include "src/core/SkTHash.h"
26
27 #include <unicode/ubrk.h>
28 #include <unicode/uchar.h>
29 #include <unicode/uloc.h>
30 #include <unicode/umachine.h>
31 #include <unicode/utext.h>
32 #include <unicode/utypes.h>
33
34 #include <cstdint>
35 #include <cstring>
36 #include <functional>
37 #include <memory>
38 #include <string>
39 #include <utility>
40 #include <vector>
41
42 #ifdef ENABLE_TEXT_ENHANCE
43 #include <unordered_set>
44 #endif
45 #if defined(SK_USING_THIRD_PARTY_ICU) && defined(SK_BUILD_FOR_WIN)
46 #include "SkLoadICU.h"
47 #include "include/private/base/SkOnce.h"
48 #endif
49
50 using namespace skia_private;
51
52 #ifdef ENABLE_DRAWING_ADAPTER
53 namespace SkiaRsText {
54 #endif
SkGetICULib()55 const SkICULib* SkGetICULib() {
56 static const auto gICU = SkLoadICULib();
57 return gICU.get();
58 }
59
60 // sk_* wrappers for ICU funcs
61 #define SKICU_FUNC(funcname) \
62 template <typename... Args> \
63 auto sk_##funcname(Args&&... args) -> decltype(funcname(std::forward<Args>(args)...)) { \
64 return SkGetICULib()->f_##funcname(std::forward<Args>(args)...); \
65 } \
66
67 SKICU_EMIT_FUNCS
68 #undef SKICU_FUNC
69
/**
 * Clones a break iterator through whichever clone entry point the loaded ICU
 * table provides.
 *
 * f_ubrk_clone_ is used when it is set; otherwise the call falls back to
 * f_ubrk_safeClone_ with no caller-supplied stack buffer. At least one of the
 * two must be present (asserted in debug builds).
 *
 * @param bi      iterator to clone
 * @param status  ICU in/out error code, passed straight through
 * @return the value returned by the selected ICU clone function
 */
static inline UBreakIterator* sk_ubrk_clone(const UBreakIterator* bi, UErrorCode* status) {
    const SkICULib* lib = SkGetICULib();
    SkASSERT(lib->f_ubrk_clone_ || lib->f_ubrk_safeClone_);
    if (lib->f_ubrk_clone_) {
        return lib->f_ubrk_clone_(bi, status);
    }
    return lib->f_ubrk_safeClone_(bi, nullptr, nullptr, status);
}
77
utext_close_wrapper(UText * ut)78 static UText* utext_close_wrapper(UText* ut) {
79 return sk_utext_close(ut);
80 }
ubrk_close_wrapper(UBreakIterator * bi)81 static void ubrk_close_wrapper(UBreakIterator* bi) {
82 sk_ubrk_close(bi);
83 }
84
85 using ICUUText = std::unique_ptr<UText, SkFunctionObject<utext_close_wrapper>>;
86 using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionObject<ubrk_close_wrapper>>;
87 /** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */
utf8_next(const char ** ptr,const char * end)88 static inline SkUnichar utf8_next(const char** ptr, const char* end) {
89 SkUnichar val = SkUTF::NextUTF8(ptr, end);
90 return val < 0 ? 0xFFFD : val;
91 }
92
convertType(SkUnicode::BreakType type)93 static UBreakIteratorType convertType(SkUnicode::BreakType type) {
94 switch (type) {
95 case SkUnicode::BreakType::kLines: return UBRK_LINE;
96 case SkUnicode::BreakType::kGraphemes: return UBRK_CHARACTER;
97 case SkUnicode::BreakType::kWords: return UBRK_WORD;
98 case SkUnicode::BreakType::kSentences:
99 return UBRK_SENTENCE;
100 default:
101 return UBRK_CHARACTER;
102 }
103 }
104
105 class SkBreakIterator_icu : public SkBreakIterator {
106 ICUBreakIterator fBreakIterator;
107 Position fLastResult;
108 public:
SkBreakIterator_icu(ICUBreakIterator iter)109 explicit SkBreakIterator_icu(ICUBreakIterator iter)
110 : fBreakIterator(std::move(iter))
111 , fLastResult(0) {}
first()112 Position first() override { return fLastResult = sk_ubrk_first(fBreakIterator.get()); }
current()113 Position current() override { return fLastResult = sk_ubrk_current(fBreakIterator.get()); }
next()114 Position next() override { return fLastResult = sk_ubrk_next(fBreakIterator.get()); }
status()115 Status status() override { return sk_ubrk_getRuleStatus(fBreakIterator.get()); }
isDone()116 bool isDone() override { return fLastResult == UBRK_DONE; }
117
setText(const char utftext8[],int utf8Units)118 bool setText(const char utftext8[], int utf8Units) override {
119 UErrorCode status = U_ZERO_ERROR;
120 ICUUText text(sk_utext_openUTF8(nullptr, &utftext8[0], utf8Units, &status));
121
122 if (U_FAILURE(status)) {
123 SkDEBUGF("Break error: %s", sk_u_errorName(status));
124 return false;
125 }
126 SkASSERT(text);
127 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
128 if (U_FAILURE(status)) {
129 SkDEBUGF("Break error: %s", sk_u_errorName(status));
130 return false;
131 }
132 fLastResult = 0;
133 return true;
134 }
setText(const char16_t utftext16[],int utf16Units)135 bool setText(const char16_t utftext16[], int utf16Units) override {
136 UErrorCode status = U_ZERO_ERROR;
137 ICUUText text(sk_utext_openUChars(nullptr, reinterpret_cast<const UChar*>(&utftext16[0]),
138 utf16Units, &status));
139
140 if (U_FAILURE(status)) {
141 SkDEBUGF("Break error: %s", sk_u_errorName(status));
142 return false;
143 }
144 SkASSERT(text);
145 sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
146 if (U_FAILURE(status)) {
147 SkDEBUGF("Break error: %s", sk_u_errorName(status));
148 return false;
149 }
150 fLastResult = 0;
151 return true;
152 }
153 };
154
155 class SkIcuBreakIteratorCache final {
156 struct Request final {
RequestSkiaRsText::SkIcuBreakIteratorCache::Request157 Request(SkUnicode::BreakType type, const char* icuLocale)
158 : fType(type)
159 , fIcuLocale(icuLocale)
160 , hash(SkGoodHash()(type) ^ SkGoodHash()(fIcuLocale))
161 {}
162 const SkUnicode::BreakType fType;
163 const SkString fIcuLocale;
164 const uint32_t hash;
165 struct Hash {
operator ()SkiaRsText::SkIcuBreakIteratorCache::Request::Hash166 uint32_t operator()(const Request& key) const {
167 return key.hash;
168 }
169 };
operator ==SkiaRsText::SkIcuBreakIteratorCache::Request170 bool operator==(const Request& that) const {
171 return this->fType == that.fType && this->fIcuLocale == that.fIcuLocale;
172 }
173 };
174 /* Every holder of this class is referencing the same (logical) break iterator.
175 * Due to caching, the actual break iterator may come and go.
176 */
177 class BreakIteratorRef final {
178 public:
BreakIteratorRef(ICUBreakIterator iter)179 BreakIteratorRef(ICUBreakIterator iter) : breakIterator(iter.release()), fRefCnt(1) {
180 ++Instances;
181 }
182 BreakIteratorRef(SkRefCntBase&&) = delete;
183 BreakIteratorRef(const SkRefCntBase&) = delete;
184 BreakIteratorRef& operator=(SkRefCntBase&&) = delete;
185 BreakIteratorRef& operator=(const SkRefCntBase&) = delete;
~BreakIteratorRef()186 ~BreakIteratorRef() {
187 if (breakIterator) {
188 ubrk_close_wrapper(breakIterator);
189 }
190 }
191
ref() const192 void ref() const {
193 SkASSERT(fRefCnt > 0);
194 ++fRefCnt;
195 }
unref() const196 void unref() const {
197 SkASSERT(fRefCnt > 0);
198 if (1 == fRefCnt--) {
199 delete this;
200 --Instances;
201 }
202 }
203
204 UBreakIterator* breakIterator;
GetInstanceCount()205 static int32_t GetInstanceCount() { return Instances; }
206 private:
207 mutable int32_t fRefCnt;
208 static int32_t Instances;
209 };
210 THashMap<Request, sk_sp<BreakIteratorRef>, Request::Hash> fRequestCache;
211 SkMutex fCacheMutex;
212
purgeIfNeeded()213 void purgeIfNeeded() {
214 // If there are too many requests remove some (oldest first?)
215 // This may free some break iterators
216 if (fRequestCache.count() > 100) {
217 // remove the oldest requests
218 fRequestCache.reset();
219 }
220 // If there are still too many break iterators remove some (oldest first?)
221 if (BreakIteratorRef::GetInstanceCount() > 4) {
222 // delete the oldest break iterators and set the references to nullptr
223 for (auto&& [key, value] : fRequestCache) {
224 if (value->breakIterator) {
225 sk_ubrk_close(value->breakIterator);
226 value->breakIterator = nullptr;
227 }
228 }
229 }
230 }
231
232 public:
get()233 static SkIcuBreakIteratorCache& get() {
234 static SkIcuBreakIteratorCache instance;
235 return instance;
236 }
237
makeBreakIterator(SkUnicode::BreakType type,const char * bcp47)238 ICUBreakIterator makeBreakIterator(SkUnicode::BreakType type, const char* bcp47) {
239 SkAutoMutexExclusive lock(fCacheMutex);
240 UErrorCode status = U_ZERO_ERROR;
241
242 // Get ICU locale for BCP47 langtag
243 char localeIDStorage[ULOC_FULLNAME_CAPACITY];
244 const char* localeID = nullptr;
245 if (bcp47) {
246 sk_uloc_forLanguageTag(bcp47, localeIDStorage, ULOC_FULLNAME_CAPACITY, nullptr, &status);
247 if (U_FAILURE(status)) {
248 SkDEBUGF("Break error could not get language tag: %s", sk_u_errorName(status));
249 } else if (localeIDStorage[0]) {
250 localeID = localeIDStorage;
251 }
252 }
253 if (!localeID) {
254 localeID = sk_uloc_getDefault();
255 }
256
257 auto make = [](const Request& request) -> UBreakIterator* {
258 UErrorCode status = U_ZERO_ERROR;
259 UBreakIterator* bi = sk_ubrk_open(convertType(request.fType),
260 request.fIcuLocale.c_str(),
261 nullptr, 0, &status);
262 if (U_FAILURE(status)) {
263 SkDEBUGF("Break error: %s", sk_u_errorName(status));
264 }
265 return bi;
266 };
267
268 auto clone = [](const UBreakIterator* existing) -> ICUBreakIterator {
269 if (!existing) {
270 return nullptr;
271 }
272
273 UErrorCode status = U_ZERO_ERROR;
274 ICUBreakIterator clone(sk_ubrk_clone(existing, &status));
275 if (U_FAILURE(status)) {
276 SkDEBUGF("Break error: %s", sk_u_errorName(status));
277 }
278 return clone;
279 };
280
281 Request request(type, localeID);
282
283 // See if this request is already in the cache
284 const sk_sp<BreakIteratorRef>* ref = fRequestCache.find(request);
285 if (ref) {
286 // See if the breakIterator needs to be re-created
287 if (!(*ref)->breakIterator) {
288 (*ref)->breakIterator = make(request);
289 }
290 return clone((*ref)->breakIterator);
291 }
292
293 // This request was not in the cache, create an iterator.
294 ICUBreakIterator newIter(make(request));
295 if (!newIter) {
296 return nullptr;
297 }
298
299 sk_sp<BreakIteratorRef> newRef;
300
301 // Check if the new iterator is a duplicate
302 // Android doesn't expose ubrk_getLocaleByType so there is no means of de-duplicating.
303 // ubrk_getAvailable seems like it should work, but the implementation is just every locale.
304 if (SkGetICULib()->f_ubrk_getLocaleByType) {
305 const char* actualLocale = SkGetICULib()->f_ubrk_getLocaleByType(
306 newIter.get(), ULOC_ACTUAL_LOCALE, &status);
307 // Android doesn't expose ubrk_getLocaleByType so a wrapper may return an error.
308 if (!U_FAILURE(status)) {
309 if (!actualLocale) {
310 actualLocale = "";
311 }
312 // If the actual locale is the same as the requested locale we know there is no entry.
313 if (strcmp(actualLocale, localeID) != 0) {
314 Request actualRequest(type, actualLocale);
315 const sk_sp<BreakIteratorRef>* actualRef = fRequestCache.find(actualRequest);
316 if (actualRef) {
317 if (!(*actualRef)->breakIterator) {
318 (*actualRef)->breakIterator = newIter.release();
319 }
320 actualRef = fRequestCache.set(request, *actualRef);
321 return clone((*actualRef)->breakIterator);
322 } else {
323 this->purgeIfNeeded();
324 newRef = sk_make_sp<BreakIteratorRef>(std::move(newIter));
325 fRequestCache.set(actualRequest, newRef);
326 }
327 }
328 }
329 }
330
331 if (!newRef) {
332 this->purgeIfNeeded();
333 newRef = sk_make_sp<BreakIteratorRef>(std::move(newIter));
334 }
335 fRequestCache.set(request, newRef);
336
337 return clone(newRef->breakIterator);
338 }
339 };
340 /*static*/ int32_t SkIcuBreakIteratorCache::BreakIteratorRef::Instances{0};
341
342 class SkUnicode_icu : public SkUnicode {
343
extractWords(uint16_t utf16[],int utf16Units,const char * locale,std::vector<Position> * words)344 static bool extractWords(uint16_t utf16[], int utf16Units, const char* locale,
345 std::vector<Position>* words) {
346
347 UErrorCode status = U_ZERO_ERROR;
348
349 const BreakType type = BreakType::kWords;
350 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type, locale);
351 if (!iterator) {
352 SkDEBUGF("Break error: %s", sk_u_errorName(status));
353 return false;
354 }
355 SkASSERT(iterator);
356
357 ICUUText utf16UText(sk_utext_openUChars(nullptr, (UChar*)utf16, utf16Units, &status));
358 if (U_FAILURE(status)) {
359 SkDEBUGF("Break error: %s", sk_u_errorName(status));
360 return false;
361 }
362
363 sk_ubrk_setUText(iterator.get(), utf16UText.get(), &status);
364 if (U_FAILURE(status)) {
365 SkDEBUGF("Break error: %s", sk_u_errorName(status));
366 return false;
367 }
368
369 // Get the words
370 int32_t pos = sk_ubrk_first(iterator.get());
371 while (pos != UBRK_DONE) {
372 words->emplace_back(pos);
373 pos = sk_ubrk_next(iterator.get());
374 }
375
376 return true;
377 }
378
379 #ifdef ENABLE_TEXT_ENHANCE
extractPositions(const char utf8[],int utf8Units,BreakType type,const char locale[],std::function<void (int,int)> setBreak)380 static bool extractPositions(const char utf8[], int utf8Units, BreakType type,
381 const char locale[], std::function<void(int, int)> setBreak) {
382 #else
383 static bool extractPositions(const char utf8[], int utf8Units,
384 BreakType type, const char* locale,
385 const std::function<void(int, int)>& setBreak) {
386 #endif
387
388 UErrorCode status = U_ZERO_ERROR;
389 ICUUText text(sk_utext_openUTF8(nullptr, &utf8[0], utf8Units, &status));
390 if (U_FAILURE(status)) {
391 SkDEBUGF("Break error: %s", sk_u_errorName(status));
392 return false;
393 }
394 SkASSERT(text);
395
396 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type, locale);
397 if (!iterator) {
398 return false;
399 }
400
401 sk_ubrk_setUText(iterator.get(), text.get(), &status);
402 if (U_FAILURE(status)) {
403 SkDEBUGF("Break error: %s", sk_u_errorName(status));
404 return false;
405 }
406
407 auto iter = iterator.get();
408 int32_t pos = sk_ubrk_first(iter);
409 while (pos != UBRK_DONE) {
410 int s = type == SkUnicode::BreakType::kLines
411 ? UBRK_LINE_SOFT
412 : sk_ubrk_getRuleStatus(iter);
413 setBreak(pos, s);
414 pos = sk_ubrk_next(iter);
415 }
416
417 if (type == SkUnicode::BreakType::kLines) {
418 // This is a workaround for https://bugs.chromium.org/p/skia/issues/detail?id=10715
419 // (ICU line break iterator does not work correctly on Thai text with new lines)
420 // So, we only use the iterator to collect soft line breaks and
421 // scan the text for all hard line breaks ourselves
422 const char* end = utf8 + utf8Units;
423 const char* ch = utf8;
424 while (ch < end) {
425 auto unichar = utf8_next(&ch, end);
426 if (SkUnicode_icu::isHardLineBreak(unichar)) {
427 setBreak(ch - utf8, UBRK_LINE_HARD);
428 }
429 }
430 }
431 return true;
432 }
433
434 bool isControl(SkUnichar utf8) override {
435 return sk_u_iscntrl(utf8);
436 }
437
438 bool isWhitespace(SkUnichar utf8) override {
439 return sk_u_isWhitespace(utf8);
440 }
441
442 bool isSpace(SkUnichar utf8) override {
443 return sk_u_isspace(utf8);
444 }
445
446 bool isHardBreak(SkUnichar utf8) override {
447 return SkUnicode_icu::isHardLineBreak(utf8);
448 }
449
450 bool isEmoji(SkUnichar unichar) override {
451 return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI);
452 }
453
454 bool isEmojiComponent(SkUnichar unichar) override {
455 return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI_COMPONENT);
456 }
457
458 bool isEmojiModifierBase(SkUnichar unichar) override {
459 return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI_MODIFIER_BASE);
460 }
461
462 bool isEmojiModifier(SkUnichar unichar) override {
463 return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI_MODIFIER);
464 }
465
466 bool isRegionalIndicator(SkUnichar unichar) override {
467 return sk_u_hasBinaryProperty(unichar, UCHAR_REGIONAL_INDICATOR);
468 }
469
470 bool isIdeographic(SkUnichar unichar) override {
471 return sk_u_hasBinaryProperty(unichar, UCHAR_IDEOGRAPHIC);
472 }
473
474 bool isTabulation(SkUnichar utf8) override {
475 return utf8 == '\t';
476 }
477
478 static bool isHardLineBreak(SkUnichar utf8) {
479 auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK);
480 return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK;
481 }
482
483 #ifdef ENABLE_TEXT_ENHANCE
484 static bool isPunctuation(SkUnichar unichar)
485 {
486 if (sk_u_ispunct(unichar)) {
487 return true;
488 }
489 static constexpr std::array<std::pair<SkUnichar, SkUnichar>, 13> ranges{{
490 {0x0021, 0x002F}, // ASCII punctuation (e.g., ! " # $ % & ' ( ) * + , - . /)
491 {0x003A, 0x0040}, // ASCII punctuation (e.g., : ; < = > ? @)
492 {0x005B, 0x0060}, // ASCII punctuation (e.g., [ \ ] ^ _ `)
493 {0x007B, 0x007E}, // ASCII punctuation (e.g., { | } ~)
494 {0x2000, 0x206F}, // Common punctuation (Chinese & English)
495 {0xFF00, 0xFFEF}, // Full-width characters and symbols
496 {0x2E00, 0x2E7F}, // Supplemental punctuation (e.g., ancient)
497 {0x3001, 0x3003}, // CJK punctuation (e.g., Chinese comma)
498 {0xFF01, 0xFF0F}, // Full-width ASCII punctuation (0x21-0x2F)
499 {0xFF1A, 0xFF20}, // Full-width ASCII punctuation (0x3A-0x40)
500 {0xFF3B, 0xFF40}, // Full-width ASCII punctuation (0x5B-0x60)
501 {0xFF5B, 0xFF65}, // Other full-width punctuation (e.g., quotes)
502 }};
503 for (auto range : ranges) {
504 if (range.first <= unichar && unichar <= range.second) {
505 return true;
506 }
507 }
508 return false;
509 }
510 static bool isEllipsis(SkUnichar unichar) { return (unichar == 0x2026 || unichar == 0x002E); }
511 static bool isGraphemeExtend(SkUnichar unichar) {
512 return sk_u_hasBinaryProperty(unichar, UCHAR_GRAPHEME_EXTEND);
513 }
514 static bool isCustomSoftBreak(SkUnichar unichar) {
515 // ‘ “ ( [ { < « — – • – – $ £ € + = × \ % ° # * @ _ § © ®
516 static const std::unordered_set<SkUnichar> kBreakTriggerCodePoints {
517 0x2018, 0x201C, 0x0028, 0x005B, 0x007B, 0x003C, 0x00AB, 0x2014, 0x2013,
518 0x2022, 0x0024, 0x00A3, 0x20AC, 0x002B, 0x003D, 0x00D7, 0x005C, 0x0025,
519 0x00B0, 0x0023, 0x002A, 0x0040, 0x005F, 0x00A7, 0x00A9, 0x00AE
520 };
521
522 return kBreakTriggerCodePoints.count(unichar) > 0;
523 }
524 #endif
525
526 public:
527 ~SkUnicode_icu() override { }
528 std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
529 SkBidiIterator::Direction dir) override {
530 return fBidiFact->MakeIterator(text, count, dir);
531 }
532 std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
533 int count,
534 SkBidiIterator::Direction dir) override {
535 return fBidiFact->MakeIterator(text, count, dir);
536 }
537 std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
538 BreakType type) override {
539 ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type, locale);
540 if (!iterator) {
541 return nullptr;
542 }
543 return std::unique_ptr<SkBreakIterator>(new SkBreakIterator_icu(std::move(iterator)));
544 }
545 std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType type) override {
546 return makeBreakIterator(sk_uloc_getDefault(), type);
547 }
548
549 SkString toUpper(const SkString& str) override {
550 return this->toUpper(str, nullptr);
551 }
552
553 SkString toUpper(const SkString& str, const char* locale) override {
554 // Convert to UTF16 since that's what ICU wants.
555 auto str16 = SkUnicode::convertUtf8ToUtf16(str.c_str(), str.size());
556
557 UErrorCode icu_err = U_ZERO_ERROR;
558 const auto upper16len = sk_u_strToUpper(nullptr, 0, (UChar*)(str16.c_str()), str16.size(),
559 locale, &icu_err);
560 if (icu_err != U_BUFFER_OVERFLOW_ERROR || upper16len <= 0) {
561 return SkString();
562 }
563
564 AutoSTArray<128, uint16_t> upper16(upper16len);
565 icu_err = U_ZERO_ERROR;
566 sk_u_strToUpper((UChar*)(upper16.get()), SkToS32(upper16.size()),
567 (UChar*)(str16.c_str()), str16.size(),
568 locale, &icu_err);
569 SkASSERT(!U_FAILURE(icu_err));
570
571 // ... and back to utf8 'cause that's what we want.
572 return convertUtf16ToUtf8((char16_t*)upper16.get(), upper16.size());
573 }
574
575 bool getBidiRegions(const char utf8[],
576 int utf8Units,
577 TextDirection dir,
578 std::vector<BidiRegion>* results) override {
579 return fBidiFact->ExtractBidi(utf8, utf8Units, dir, results);
580 }
581
582 bool getWords(const char utf8[], int utf8Units, const char* locale,
583 std::vector<Position>* results) override {
584
585 // Convert to UTF16 since we want the results in utf16
586 auto utf16 = convertUtf8ToUtf16(utf8, utf8Units);
587 return SkUnicode_icu::extractWords((uint16_t*)utf16.c_str(), utf16.size(), locale, results);
588 }
589
590 bool getUtf8Words(const char utf8[],
591 int utf8Units,
592 const char* locale,
593 std::vector<Position>* results) override {
594 // Convert to UTF16 since we want the results in utf16
595 auto utf16 = convertUtf8ToUtf16(utf8, utf8Units);
596 std::vector<Position> utf16Results;
597 if (!SkUnicode_icu::extractWords(
598 (uint16_t*)utf16.c_str(), utf16.size(), locale, &utf16Results)) {
599 return false;
600 }
601
602 std::vector<Position> mapping;
603 SkSpan<const char> text(utf8, utf8Units);
604 SkUnicode::extractUtfConversionMapping(
605 text, [&](size_t index) { mapping.emplace_back(index); }, [&](size_t index) {});
606
607 for (auto i16 : utf16Results) {
608 results->emplace_back(mapping[i16]);
609 }
610 return true;
611 }
612
613 bool getSentences(const char utf8[],
614 int utf8Units,
615 const char* locale,
616 std::vector<SkUnicode::Position>* results) override {
617 SkUnicode_icu::extractPositions(
618 utf8, utf8Units, BreakType::kSentences, nullptr,
619 [&](int pos, int status) {
620 results->emplace_back(pos);
621 });
622 return true;
623 }
624
625 #ifdef ENABLE_TEXT_ENHANCE
626 void processPunctuationAndEllipsis(skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results,
627 int i, SkUnichar unichar) {
628 if (SkUnicode_icu::isPunctuation(unichar)) {
629 results->at(i) |= SkUnicode::kPunctuation;
630 }
631 if (SkUnicode_icu::isEllipsis(unichar)) {
632 results->at(i) |= SkUnicode::kEllipsis;
633 }
634 if (SkUnicode_icu::isCustomSoftBreak(unichar)) {
635 results->at(i) |= SkUnicode::kSoftLineBreakBefore;
636 }
637 }
638
639 bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs, const char locale[],
640 TArray<SkUnicode::CodeUnitFlags, true>* results) override {
641 results->clear();
642 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
643
644 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, locale,
645 [&](int pos, int status) {
646 (*results)[pos] |= status == UBRK_LINE_HARD ? CodeUnitFlags::kHardLineBreakBefore :
647 CodeUnitFlags::kSoftLineBreakBefore;
648 });
649
650 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, locale,
651 [&](int pos, int status) {
652 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
653 });
654
655 const char* current = utf8;
656 const char* end = utf8 + utf8Units;
657 while (current < end) {
658 auto before = current - utf8;
659 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
660 if (unichar < 0) unichar = 0xFFFD;
661 auto after = current - utf8;
662 if (replaceTabs && this->isTabulation(unichar)) {
663 results->at(before) |= SkUnicode::kTabulation;
664 if (replaceTabs) {
665 unichar = ' ';
666 utf8[before] = ' ';
667 }
668 }
669 for (auto i = before; i < after; ++i) {
670 if (this->isSpace(unichar)) {
671 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
672 }
673 if (this->isWhitespace(unichar)) {
674 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
675 }
676 if (this->isControl(unichar)) {
677 results->at(i) |= SkUnicode::kControl;
678 }
679 if (this->isIdeographic(unichar)) {
680 results->at(i) |= SkUnicode::kIdeographic;
681 }
682 processPunctuationAndEllipsis(results, i, unichar);
683 }
684
685 if (SkUnicode_icu::isGraphemeExtend(unichar)) {
686 // Current unichar is a combining one.
687 results->at(before) |= SkUnicode::kCombine;
688 }
689 }
690
691 return true;
692 }
693 #else
694 bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs,
695 TArray<SkUnicode::CodeUnitFlags, true>* results) override {
696 results->clear();
697 results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
698
699 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, nullptr, // TODO: locale
700 [&](int pos, int status) {
701 (*results)[pos] |= status == UBRK_LINE_HARD
702 ? CodeUnitFlags::kHardLineBreakBefore
703 : CodeUnitFlags::kSoftLineBreakBefore;
704 });
705
706 SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, nullptr, //TODO
707 [&](int pos, int status) {
708 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
709 });
710
711 const char* current = utf8;
712 const char* end = utf8 + utf8Units;
713 while (current < end) {
714 auto before = current - utf8;
715 SkUnichar unichar = SkUTF::NextUTF8(¤t, end);
716 if (unichar < 0) unichar = 0xFFFD;
717 auto after = current - utf8;
718 if (replaceTabs && this->isTabulation(unichar)) {
719 results->at(before) |= SkUnicode::kTabulation;
720 if (replaceTabs) {
721 unichar = ' ';
722 utf8[before] = ' ';
723 }
724 }
725 for (auto i = before; i < after; ++i) {
726 if (this->isSpace(unichar)) {
727 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
728 }
729 if (this->isWhitespace(unichar)) {
730 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
731 }
732 if (this->isControl(unichar)) {
733 results->at(i) |= SkUnicode::kControl;
734 }
735 if (this->isIdeographic(unichar)) {
736 results->at(i) |= SkUnicode::kIdeographic;
737 }
738 }
739 }
740
741 return true;
742 }
743 #endif
744
745 #ifdef ENABLE_TEXT_ENHANCE
746 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs, const char locale[],
747 #else
748 bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
749 #endif
750 TArray<SkUnicode::CodeUnitFlags, true>* results) override {
751 results->clear();
752 results->push_back_n(utf16Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
753
754 // Get white spaces
755 this->forEachCodepoint((char16_t*)&utf16[0], utf16Units,
756 [this, results, replaceTabs, &utf16](SkUnichar unichar, int32_t start, int32_t end) {
757 for (auto i = start; i < end; ++i) {
758 if (replaceTabs && this->isTabulation(unichar)) {
759 results->at(i) |= SkUnicode::kTabulation;
760 if (replaceTabs) {
761 unichar = ' ';
762 utf16[start] = ' ';
763 }
764 }
765 if (this->isSpace(unichar)) {
766 results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
767 }
768 if (this->isWhitespace(unichar)) {
769 results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
770 }
771 if (this->isControl(unichar)) {
772 results->at(i) |= SkUnicode::kControl;
773 }
774 }
775 });
776 // Get graphemes
777 this->forEachBreak((char16_t*)&utf16[0],
778 utf16Units,
779 SkUnicode::BreakType::kGraphemes,
780 #ifdef ENABLE_TEXT_ENHANCE
781 locale,
782 #endif
783 [results](SkBreakIterator::Position pos, SkBreakIterator::Status) {
784 (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
785 });
786 // Get line breaks
787 this->forEachBreak(
788 (char16_t*)&utf16[0],
789 utf16Units,
790 SkUnicode::BreakType::kLines,
791 #ifdef ENABLE_TEXT_ENHANCE
792 locale,
793 #endif
794 [results](SkBreakIterator::Position pos, SkBreakIterator::Status status) {
795 if (status ==
796 (SkBreakIterator::Status)SkUnicode::LineBreakType::kHardLineBreak) {
797 // Hard line breaks clears off all the other flags
798 // TODO: Treat \n as a formatting mark and do not pass it to SkShaper
799 (*results)[pos-1] = CodeUnitFlags::kHardLineBreakBefore;
800 } else {
801 (*results)[pos] |= CodeUnitFlags::kSoftLineBreakBefore;
802 }
803 });
804
805 return true;
806 }
807
808 void reorderVisual(const BidiLevel runLevels[],
809 int levelsCount,
810 int32_t logicalFromVisual[]) override {
811 fBidiFact->bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
812 }
813
814 private:
815 sk_sp<SkBidiFactory> fBidiFact = sk_make_sp<SkBidiICUFactory>();
816 };
817
818 namespace SkUnicodes::ICU {
Make()819 sk_sp<SkUnicode> Make() {
820 // We haven't yet created a way to encode the ICU data for assembly on Windows,
821 // so we use a helper library to load icudtl.dat from the harddrive.
822 #if defined(SK_USING_THIRD_PARTY_ICU) && defined(SK_BUILD_FOR_WIN)
823 if (!SkLoadICU()) {
824 static SkOnce once;
825 once([] { SkDEBUGF("SkLoadICU() failed!\n"); });
826 return nullptr;
827 }
828 #endif
829 if (SkGetICULib()) {
830 return sk_make_sp<SkUnicode_icu>();
831 }
832 return nullptr;
833 }
834 } // namespace SkUnicodes::ICU
835 #ifdef ENABLE_DRAWING_ADAPTER
836 }
837 #endif // ENABLE_DRAWING_ADAPTER