• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 * Copyright 2020 Google Inc.
3 *
4 * Use of this source code is governed by a BSD-style license that can be
5 * found in the LICENSE file.
6 */
7 #include "modules/skunicode/include/SkUnicode_icu.h"
8 
9 #include "include/core/SkRefCnt.h"
10 #include "include/core/SkString.h"
11 #include "include/core/SkTypes.h"
12 #include "include/private/base/SkDebug.h"
13 #include "include/private/base/SkMutex.h"
14 #include "include/private/base/SkSpan_impl.h"
15 #include "include/private/base/SkTArray.h"
16 #include "include/private/base/SkTemplates.h"
17 #include "include/private/base/SkTo.h"
18 #include "modules/skunicode/include/SkUnicode.h"
19 #include "modules/skunicode/src/SkBidiFactory_icu_full.h"
20 #include "modules/skunicode/src/SkUnicode_icu_bidi.h"
21 #include "modules/skunicode/src/SkUnicode_icupriv.h"
22 #include "src/base/SkBitmaskEnum.h"
23 #include "src/base/SkUTF.h"
24 #include "src/core/SkChecksum.h"
25 #include "src/core/SkTHash.h"
26 
27 #include <unicode/ubrk.h>
28 #include <unicode/uchar.h>
29 #include <unicode/uloc.h>
30 #include <unicode/umachine.h>
31 #include <unicode/utext.h>
32 #include <unicode/utypes.h>
33 
34 #include <cstdint>
35 #include <cstring>
36 #include <functional>
37 #include <memory>
38 #include <string>
39 #include <utility>
40 #include <vector>
41 
42 #ifdef ENABLE_TEXT_ENHANCE
43 #include <unordered_set>
44 #endif
45 #if defined(SK_USING_THIRD_PARTY_ICU) && defined(SK_BUILD_FOR_WIN)
46 #include "SkLoadICU.h"
47 #include "include/private/base/SkOnce.h"
48 #endif
49 
50 using namespace skia_private;
51 
52 #ifdef ENABLE_DRAWING_ADAPTER
53 namespace SkiaRsText {
54 #endif
SkGetICULib()55 const SkICULib* SkGetICULib() {
56     static const auto gICU = SkLoadICULib();
57     return gICU.get();
58 }
59 
60 // sk_* wrappers for ICU funcs
61 #define SKICU_FUNC(funcname)                                                                \
62     template <typename... Args>                                                             \
63     auto sk_##funcname(Args&&... args) -> decltype(funcname(std::forward<Args>(args)...)) { \
64         return SkGetICULib()->f_##funcname(std::forward<Args>(args)...);                    \
65     }                                                                                       \
66 
67 SKICU_EMIT_FUNCS
68 #undef SKICU_FUNC
69 
/**
 * Clones `bi` through whichever clone entry point the loaded ICU table provides.
 * f_ubrk_clone_ is preferred; when it is null, f_ubrk_safeClone_ is called with null
 * stack-buffer arguments. At least one of the two must be present (asserted).
 */
static inline UBreakIterator* sk_ubrk_clone(const UBreakIterator* bi, UErrorCode* status) {
    const SkICULib* lib = SkGetICULib();
    SkASSERT(lib->f_ubrk_clone_ || lib->f_ubrk_safeClone_);
    if (lib->f_ubrk_clone_) {
        return lib->f_ubrk_clone_(bi, status);
    }
    return lib->f_ubrk_safeClone_(bi, nullptr, nullptr, status);
}
77 
utext_close_wrapper(UText * ut)78 static UText* utext_close_wrapper(UText* ut) {
79     return sk_utext_close(ut);
80 }
ubrk_close_wrapper(UBreakIterator * bi)81 static void ubrk_close_wrapper(UBreakIterator* bi) {
82     sk_ubrk_close(bi);
83 }
84 
85 using ICUUText = std::unique_ptr<UText, SkFunctionObject<utext_close_wrapper>>;
86 using ICUBreakIterator = std::unique_ptr<UBreakIterator, SkFunctionObject<ubrk_close_wrapper>>;
87 /** Replaces invalid utf-8 sequences with REPLACEMENT CHARACTER U+FFFD. */
utf8_next(const char ** ptr,const char * end)88 static inline SkUnichar utf8_next(const char** ptr, const char* end) {
89     SkUnichar val = SkUTF::NextUTF8(ptr, end);
90     return val < 0 ? 0xFFFD : val;
91 }
92 
convertType(SkUnicode::BreakType type)93 static UBreakIteratorType convertType(SkUnicode::BreakType type) {
94     switch (type) {
95         case SkUnicode::BreakType::kLines: return UBRK_LINE;
96         case SkUnicode::BreakType::kGraphemes: return UBRK_CHARACTER;
97         case SkUnicode::BreakType::kWords: return UBRK_WORD;
98         case SkUnicode::BreakType::kSentences:
99             return UBRK_SENTENCE;
100         default:
101             return UBRK_CHARACTER;
102     }
103 }
104 
105 class SkBreakIterator_icu : public SkBreakIterator {
106     ICUBreakIterator fBreakIterator;
107     Position fLastResult;
108  public:
SkBreakIterator_icu(ICUBreakIterator iter)109     explicit SkBreakIterator_icu(ICUBreakIterator iter)
110             : fBreakIterator(std::move(iter))
111             , fLastResult(0) {}
first()112     Position first() override { return fLastResult = sk_ubrk_first(fBreakIterator.get()); }
current()113     Position current() override { return fLastResult = sk_ubrk_current(fBreakIterator.get()); }
next()114     Position next() override { return fLastResult = sk_ubrk_next(fBreakIterator.get()); }
status()115     Status status() override { return sk_ubrk_getRuleStatus(fBreakIterator.get()); }
isDone()116     bool isDone() override { return fLastResult == UBRK_DONE; }
117 
setText(const char utftext8[],int utf8Units)118     bool setText(const char utftext8[], int utf8Units) override {
119         UErrorCode status = U_ZERO_ERROR;
120         ICUUText text(sk_utext_openUTF8(nullptr, &utftext8[0], utf8Units, &status));
121 
122         if (U_FAILURE(status)) {
123             SkDEBUGF("Break error: %s", sk_u_errorName(status));
124             return false;
125         }
126         SkASSERT(text);
127         sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
128         if (U_FAILURE(status)) {
129             SkDEBUGF("Break error: %s", sk_u_errorName(status));
130             return false;
131         }
132         fLastResult = 0;
133         return true;
134     }
setText(const char16_t utftext16[],int utf16Units)135     bool setText(const char16_t utftext16[], int utf16Units) override {
136         UErrorCode status = U_ZERO_ERROR;
137         ICUUText text(sk_utext_openUChars(nullptr, reinterpret_cast<const UChar*>(&utftext16[0]),
138                                           utf16Units, &status));
139 
140         if (U_FAILURE(status)) {
141             SkDEBUGF("Break error: %s", sk_u_errorName(status));
142             return false;
143         }
144         SkASSERT(text);
145         sk_ubrk_setUText(fBreakIterator.get(), text.get(), &status);
146         if (U_FAILURE(status)) {
147             SkDEBUGF("Break error: %s", sk_u_errorName(status));
148             return false;
149         }
150         fLastResult = 0;
151         return true;
152     }
153 };
154 
155 class SkIcuBreakIteratorCache final {
156     struct Request final {
RequestSkiaRsText::SkIcuBreakIteratorCache::Request157         Request(SkUnicode::BreakType type, const char* icuLocale)
158             : fType(type)
159             , fIcuLocale(icuLocale)
160             , hash(SkGoodHash()(type) ^ SkGoodHash()(fIcuLocale))
161         {}
162         const SkUnicode::BreakType fType;
163         const SkString fIcuLocale;
164         const uint32_t hash;
165         struct Hash {
operator ()SkiaRsText::SkIcuBreakIteratorCache::Request::Hash166             uint32_t operator()(const Request& key) const {
167                 return key.hash;
168             }
169         };
operator ==SkiaRsText::SkIcuBreakIteratorCache::Request170         bool operator==(const Request& that) const {
171             return this->fType == that.fType && this->fIcuLocale == that.fIcuLocale;
172         }
173     };
174     /* Every holder of this class is referencing the same (logical) break iterator.
175      * Due to caching, the actual break iterator may come and go.
176      */
177     class BreakIteratorRef final {
178     public:
BreakIteratorRef(ICUBreakIterator iter)179         BreakIteratorRef(ICUBreakIterator iter) : breakIterator(iter.release()), fRefCnt(1) {
180             ++Instances;
181         }
182         BreakIteratorRef(SkRefCntBase&&) = delete;
183         BreakIteratorRef(const SkRefCntBase&) = delete;
184         BreakIteratorRef& operator=(SkRefCntBase&&) = delete;
185         BreakIteratorRef& operator=(const SkRefCntBase&) = delete;
~BreakIteratorRef()186         ~BreakIteratorRef() {
187             if (breakIterator) {
188                 ubrk_close_wrapper(breakIterator);
189             }
190         }
191 
ref() const192         void ref() const {
193             SkASSERT(fRefCnt > 0);
194             ++fRefCnt;
195         }
unref() const196         void unref() const {
197             SkASSERT(fRefCnt > 0);
198             if (1 == fRefCnt--) {
199                 delete this;
200                 --Instances;
201             }
202         }
203 
204         UBreakIterator* breakIterator;
GetInstanceCount()205         static int32_t GetInstanceCount() { return Instances; }
206     private:
207         mutable int32_t fRefCnt;
208         static int32_t Instances;
209     };
210     THashMap<Request, sk_sp<BreakIteratorRef>, Request::Hash> fRequestCache;
211     SkMutex fCacheMutex;
212 
purgeIfNeeded()213     void purgeIfNeeded() {
214         // If there are too many requests remove some (oldest first?)
215         // This may free some break iterators
216         if (fRequestCache.count() > 100) {
217             // remove the oldest requests
218             fRequestCache.reset();
219         }
220         // If there are still too many break iterators remove some (oldest first?)
221         if (BreakIteratorRef::GetInstanceCount() > 4) {
222             // delete the oldest break iterators and set the references to nullptr
223             for (auto&& [key, value] : fRequestCache) {
224                 if (value->breakIterator) {
225                     sk_ubrk_close(value->breakIterator);
226                     value->breakIterator = nullptr;
227                 }
228             }
229         }
230     }
231 
232  public:
get()233     static SkIcuBreakIteratorCache& get() {
234         static SkIcuBreakIteratorCache instance;
235         return instance;
236     }
237 
makeBreakIterator(SkUnicode::BreakType type,const char * bcp47)238     ICUBreakIterator makeBreakIterator(SkUnicode::BreakType type, const char* bcp47) {
239         SkAutoMutexExclusive lock(fCacheMutex);
240         UErrorCode status = U_ZERO_ERROR;
241 
242         // Get ICU locale for BCP47 langtag
243         char localeIDStorage[ULOC_FULLNAME_CAPACITY];
244         const char* localeID = nullptr;
245         if (bcp47) {
246             sk_uloc_forLanguageTag(bcp47, localeIDStorage, ULOC_FULLNAME_CAPACITY, nullptr, &status);
247             if (U_FAILURE(status)) {
248                 SkDEBUGF("Break error could not get language tag: %s", sk_u_errorName(status));
249             } else if (localeIDStorage[0]) {
250                 localeID = localeIDStorage;
251             }
252         }
253         if (!localeID) {
254             localeID = sk_uloc_getDefault();
255         }
256 
257         auto make = [](const Request& request) -> UBreakIterator* {
258             UErrorCode status = U_ZERO_ERROR;
259             UBreakIterator* bi = sk_ubrk_open(convertType(request.fType),
260                                               request.fIcuLocale.c_str(),
261                                               nullptr, 0, &status);
262             if (U_FAILURE(status)) {
263                 SkDEBUGF("Break error: %s", sk_u_errorName(status));
264             }
265             return bi;
266         };
267 
268         auto clone = [](const UBreakIterator* existing) -> ICUBreakIterator {
269             if (!existing) {
270                 return nullptr;
271             }
272 
273             UErrorCode status = U_ZERO_ERROR;
274             ICUBreakIterator clone(sk_ubrk_clone(existing, &status));
275             if (U_FAILURE(status)) {
276                 SkDEBUGF("Break error: %s", sk_u_errorName(status));
277             }
278             return clone;
279         };
280 
281         Request request(type, localeID);
282 
283         // See if this request is already in the cache
284         const sk_sp<BreakIteratorRef>* ref = fRequestCache.find(request);
285         if (ref) {
286             // See if the breakIterator needs to be re-created
287             if (!(*ref)->breakIterator) {
288                 (*ref)->breakIterator = make(request);
289             }
290             return clone((*ref)->breakIterator);
291         }
292 
293         // This request was not in the cache, create an iterator.
294         ICUBreakIterator newIter(make(request));
295         if (!newIter) {
296             return nullptr;
297         }
298 
299         sk_sp<BreakIteratorRef> newRef;
300 
301         // Check if the new iterator is a duplicate
302         // Android doesn't expose ubrk_getLocaleByType so there is no means of de-duplicating.
303         // ubrk_getAvailable seems like it should work, but the implementation is just every locale.
304         if (SkGetICULib()->f_ubrk_getLocaleByType) {
305             const char* actualLocale = SkGetICULib()->f_ubrk_getLocaleByType(
306                                            newIter.get(), ULOC_ACTUAL_LOCALE, &status);
307             // Android doesn't expose ubrk_getLocaleByType so a wrapper may return an error.
308             if (!U_FAILURE(status)) {
309                 if (!actualLocale) {
310                     actualLocale = "";
311                 }
312                 // If the actual locale is the same as the requested locale we know there is no entry.
313                 if (strcmp(actualLocale, localeID) != 0) {
314                     Request actualRequest(type, actualLocale);
315                     const sk_sp<BreakIteratorRef>* actualRef = fRequestCache.find(actualRequest);
316                     if (actualRef) {
317                         if (!(*actualRef)->breakIterator) {
318                             (*actualRef)->breakIterator = newIter.release();
319                         }
320                         actualRef = fRequestCache.set(request, *actualRef);
321                         return clone((*actualRef)->breakIterator);
322                     } else {
323                         this->purgeIfNeeded();
324                         newRef = sk_make_sp<BreakIteratorRef>(std::move(newIter));
325                         fRequestCache.set(actualRequest, newRef);
326                     }
327                 }
328             }
329         }
330 
331         if (!newRef) {
332             this->purgeIfNeeded();
333             newRef = sk_make_sp<BreakIteratorRef>(std::move(newIter));
334         }
335         fRequestCache.set(request, newRef);
336 
337         return clone(newRef->breakIterator);
338     }
339 };
340 /*static*/ int32_t SkIcuBreakIteratorCache::BreakIteratorRef::Instances{0};
341 
342 class SkUnicode_icu : public SkUnicode {
343 
extractWords(uint16_t utf16[],int utf16Units,const char * locale,std::vector<Position> * words)344     static bool extractWords(uint16_t utf16[], int utf16Units, const char* locale,
345                              std::vector<Position>* words) {
346 
347         UErrorCode status = U_ZERO_ERROR;
348 
349         const BreakType type = BreakType::kWords;
350         ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type, locale);
351         if (!iterator) {
352             SkDEBUGF("Break error: %s", sk_u_errorName(status));
353             return false;
354         }
355         SkASSERT(iterator);
356 
357         ICUUText utf16UText(sk_utext_openUChars(nullptr, (UChar*)utf16, utf16Units, &status));
358         if (U_FAILURE(status)) {
359             SkDEBUGF("Break error: %s", sk_u_errorName(status));
360             return false;
361         }
362 
363         sk_ubrk_setUText(iterator.get(), utf16UText.get(), &status);
364         if (U_FAILURE(status)) {
365             SkDEBUGF("Break error: %s", sk_u_errorName(status));
366             return false;
367         }
368 
369         // Get the words
370         int32_t pos = sk_ubrk_first(iterator.get());
371         while (pos != UBRK_DONE) {
372             words->emplace_back(pos);
373             pos = sk_ubrk_next(iterator.get());
374         }
375 
376         return true;
377     }
378 
379 #ifdef ENABLE_TEXT_ENHANCE
extractPositions(const char utf8[],int utf8Units,BreakType type,const char locale[],std::function<void (int,int)> setBreak)380     static bool extractPositions(const char utf8[], int utf8Units, BreakType type,
381         const char locale[], std::function<void(int, int)> setBreak) {
382 #else
383     static bool extractPositions(const char utf8[], int utf8Units,
384                                  BreakType type, const char* locale,
385                                  const std::function<void(int, int)>& setBreak) {
386 #endif
387 
388         UErrorCode status = U_ZERO_ERROR;
389         ICUUText text(sk_utext_openUTF8(nullptr, &utf8[0], utf8Units, &status));
390         if (U_FAILURE(status)) {
391             SkDEBUGF("Break error: %s", sk_u_errorName(status));
392             return false;
393         }
394         SkASSERT(text);
395 
396         ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type, locale);
397         if (!iterator) {
398             return false;
399         }
400 
401         sk_ubrk_setUText(iterator.get(), text.get(), &status);
402         if (U_FAILURE(status)) {
403             SkDEBUGF("Break error: %s", sk_u_errorName(status));
404             return false;
405         }
406 
407         auto iter = iterator.get();
408         int32_t pos = sk_ubrk_first(iter);
409         while (pos != UBRK_DONE) {
410             int s = type == SkUnicode::BreakType::kLines
411                         ? UBRK_LINE_SOFT
412                         : sk_ubrk_getRuleStatus(iter);
413             setBreak(pos, s);
414             pos = sk_ubrk_next(iter);
415         }
416 
417         if (type == SkUnicode::BreakType::kLines) {
418             // This is a workaround for https://bugs.chromium.org/p/skia/issues/detail?id=10715
419             // (ICU line break iterator does not work correctly on Thai text with new lines)
420             // So, we only use the iterator to collect soft line breaks and
421             // scan the text for all hard line breaks ourselves
422             const char* end = utf8 + utf8Units;
423             const char* ch = utf8;
424             while (ch < end) {
425                 auto unichar = utf8_next(&ch, end);
426                 if (SkUnicode_icu::isHardLineBreak(unichar)) {
427                     setBreak(ch - utf8, UBRK_LINE_HARD);
428                 }
429             }
430         }
431         return true;
432     }
433 
434     bool isControl(SkUnichar utf8) override {
435         return sk_u_iscntrl(utf8);
436     }
437 
438     bool isWhitespace(SkUnichar utf8) override {
439         return sk_u_isWhitespace(utf8);
440     }
441 
442     bool isSpace(SkUnichar utf8) override {
443         return sk_u_isspace(utf8);
444     }
445 
446     bool isHardBreak(SkUnichar utf8) override {
447         return SkUnicode_icu::isHardLineBreak(utf8);
448     }
449 
450     bool isEmoji(SkUnichar unichar) override {
451         return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI);
452     }
453 
454     bool isEmojiComponent(SkUnichar unichar) override {
455         return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI_COMPONENT);
456     }
457 
458     bool isEmojiModifierBase(SkUnichar unichar) override {
459         return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI_MODIFIER_BASE);
460     }
461 
462     bool isEmojiModifier(SkUnichar unichar) override {
463         return sk_u_hasBinaryProperty(unichar, UCHAR_EMOJI_MODIFIER);
464     }
465 
466     bool isRegionalIndicator(SkUnichar unichar) override {
467         return sk_u_hasBinaryProperty(unichar, UCHAR_REGIONAL_INDICATOR);
468     }
469 
470     bool isIdeographic(SkUnichar unichar) override {
471         return sk_u_hasBinaryProperty(unichar, UCHAR_IDEOGRAPHIC);
472     }
473 
474     bool isTabulation(SkUnichar utf8) override {
475         return utf8 == '\t';
476     }
477 
478     static bool isHardLineBreak(SkUnichar utf8) {
479         auto property = sk_u_getIntPropertyValue(utf8, UCHAR_LINE_BREAK);
480         return property == U_LB_LINE_FEED || property == U_LB_MANDATORY_BREAK;
481     }
482 
483 #ifdef ENABLE_TEXT_ENHANCE
484     static bool isPunctuation(SkUnichar unichar)
485     {
486         if (sk_u_ispunct(unichar)) {
487             return true;
488         }
489         static constexpr std::array<std::pair<SkUnichar, SkUnichar>, 13> ranges{{
490                 {0x0021, 0x002F},  // ASCII punctuation (e.g., ! " # $ % & ' ( ) * + , - . /)
491                 {0x003A, 0x0040},  // ASCII punctuation (e.g., : ; < = > ? @)
492                 {0x005B, 0x0060},  // ASCII punctuation (e.g., [ \ ] ^ _ `)
493                 {0x007B, 0x007E},  // ASCII punctuation (e.g., { | } ~)
494                 {0x2000, 0x206F},  // Common punctuation (Chinese & English)
495                 {0xFF00, 0xFFEF},  // Full-width characters and symbols
496                 {0x2E00, 0x2E7F},  // Supplemental punctuation (e.g., ancient)
497                 {0x3001, 0x3003},  // CJK punctuation (e.g., Chinese comma)
498                 {0xFF01, 0xFF0F},  // Full-width ASCII punctuation (0x21-0x2F)
499                 {0xFF1A, 0xFF20},  // Full-width ASCII punctuation (0x3A-0x40)
500                 {0xFF3B, 0xFF40},  // Full-width ASCII punctuation (0x5B-0x60)
501                 {0xFF5B, 0xFF65},  // Other full-width punctuation (e.g., quotes)
502         }};
503         for (auto range : ranges) {
504             if (range.first <= unichar && unichar <= range.second) {
505                 return true;
506             }
507         }
508         return false;
509     }
510     static bool isEllipsis(SkUnichar unichar) { return (unichar == 0x2026 || unichar == 0x002E); }
511     static bool isGraphemeExtend(SkUnichar unichar) {
512         return sk_u_hasBinaryProperty(unichar, UCHAR_GRAPHEME_EXTEND);
513     }
514     static bool isCustomSoftBreak(SkUnichar unichar) {
515         // ‘ “ ( [ { < « — – • – – $ £ € + = × \ % ° # * @ _ § © ®
516         static const std::unordered_set<SkUnichar> kBreakTriggerCodePoints {
517             0x2018, 0x201C, 0x0028, 0x005B, 0x007B, 0x003C, 0x00AB, 0x2014, 0x2013,
518             0x2022, 0x0024, 0x00A3, 0x20AC, 0x002B, 0x003D, 0x00D7, 0x005C, 0x0025,
519             0x00B0, 0x0023, 0x002A, 0x0040, 0x005F, 0x00A7, 0x00A9, 0x00AE
520         };
521 
522         return kBreakTriggerCodePoints.count(unichar) > 0;
523     }
524 #endif
525 
526 public:
527     ~SkUnicode_icu() override { }
528     std::unique_ptr<SkBidiIterator> makeBidiIterator(const uint16_t text[], int count,
529                                                      SkBidiIterator::Direction dir) override {
530         return fBidiFact->MakeIterator(text, count, dir);
531     }
532     std::unique_ptr<SkBidiIterator> makeBidiIterator(const char text[],
533                                                      int count,
534                                                      SkBidiIterator::Direction dir) override {
535         return fBidiFact->MakeIterator(text, count, dir);
536     }
537     std::unique_ptr<SkBreakIterator> makeBreakIterator(const char locale[],
538                                                        BreakType type) override {
539         ICUBreakIterator iterator = SkIcuBreakIteratorCache::get().makeBreakIterator(type, locale);
540         if (!iterator) {
541             return nullptr;
542         }
543         return std::unique_ptr<SkBreakIterator>(new SkBreakIterator_icu(std::move(iterator)));
544     }
545     std::unique_ptr<SkBreakIterator> makeBreakIterator(BreakType type) override {
546         return makeBreakIterator(sk_uloc_getDefault(), type);
547     }
548 
549     SkString toUpper(const SkString& str) override {
550         return this->toUpper(str, nullptr);
551     }
552 
553     SkString toUpper(const SkString& str, const char* locale) override {
554         // Convert to UTF16 since that's what ICU wants.
555         auto str16 = SkUnicode::convertUtf8ToUtf16(str.c_str(), str.size());
556 
557         UErrorCode icu_err = U_ZERO_ERROR;
558         const auto upper16len = sk_u_strToUpper(nullptr, 0, (UChar*)(str16.c_str()), str16.size(),
559                                                 locale, &icu_err);
560         if (icu_err != U_BUFFER_OVERFLOW_ERROR || upper16len <= 0) {
561             return SkString();
562         }
563 
564         AutoSTArray<128, uint16_t> upper16(upper16len);
565         icu_err = U_ZERO_ERROR;
566         sk_u_strToUpper((UChar*)(upper16.get()), SkToS32(upper16.size()),
567                         (UChar*)(str16.c_str()), str16.size(),
568                         locale, &icu_err);
569         SkASSERT(!U_FAILURE(icu_err));
570 
571         // ... and back to utf8 'cause that's what we want.
572         return convertUtf16ToUtf8((char16_t*)upper16.get(), upper16.size());
573     }
574 
575     bool getBidiRegions(const char utf8[],
576                         int utf8Units,
577                         TextDirection dir,
578                         std::vector<BidiRegion>* results) override {
579         return fBidiFact->ExtractBidi(utf8, utf8Units, dir, results);
580     }
581 
582     bool getWords(const char utf8[], int utf8Units, const char* locale,
583                   std::vector<Position>* results) override {
584 
585         // Convert to UTF16 since we want the results in utf16
586         auto utf16 = convertUtf8ToUtf16(utf8, utf8Units);
587         return SkUnicode_icu::extractWords((uint16_t*)utf16.c_str(), utf16.size(), locale, results);
588     }
589 
590     bool getUtf8Words(const char utf8[],
591                       int utf8Units,
592                       const char* locale,
593                       std::vector<Position>* results) override {
594         // Convert to UTF16 since we want the results in utf16
595         auto utf16 = convertUtf8ToUtf16(utf8, utf8Units);
596         std::vector<Position> utf16Results;
597         if (!SkUnicode_icu::extractWords(
598                     (uint16_t*)utf16.c_str(), utf16.size(), locale, &utf16Results)) {
599             return false;
600         }
601 
602         std::vector<Position> mapping;
603         SkSpan<const char> text(utf8, utf8Units);
604         SkUnicode::extractUtfConversionMapping(
605                 text, [&](size_t index) { mapping.emplace_back(index); }, [&](size_t index) {});
606 
607         for (auto i16 : utf16Results) {
608             results->emplace_back(mapping[i16]);
609         }
610         return true;
611     }
612 
613     bool getSentences(const char utf8[],
614                       int utf8Units,
615                       const char* locale,
616                       std::vector<SkUnicode::Position>* results) override {
617         SkUnicode_icu::extractPositions(
618                 utf8, utf8Units, BreakType::kSentences, nullptr,
619                 [&](int pos, int status) {
620                     results->emplace_back(pos);
621                 });
622         return true;
623     }
624 
625 #ifdef ENABLE_TEXT_ENHANCE
626     void processPunctuationAndEllipsis(skia_private::TArray<SkUnicode::CodeUnitFlags, true>* results,
627         int i, SkUnichar unichar) {
628         if (SkUnicode_icu::isPunctuation(unichar)) {
629             results->at(i) |= SkUnicode::kPunctuation;
630         }
631         if (SkUnicode_icu::isEllipsis(unichar)) {
632             results->at(i) |= SkUnicode::kEllipsis;
633         }
634         if (SkUnicode_icu::isCustomSoftBreak(unichar)) {
635             results->at(i) |= SkUnicode::kSoftLineBreakBefore;
636         }
637     }
638 
639     bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs, const char locale[],
640                               TArray<SkUnicode::CodeUnitFlags, true>* results) override {
641         results->clear();
642         results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
643 
644         SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, locale,
645                                         [&](int pos, int status) {
646             (*results)[pos] |= status == UBRK_LINE_HARD ? CodeUnitFlags::kHardLineBreakBefore :
647                 CodeUnitFlags::kSoftLineBreakBefore;
648         });
649 
650         SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, locale,
651                                         [&](int pos, int status) {
652             (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
653         });
654 
655         const char* current = utf8;
656         const char* end = utf8 + utf8Units;
657         while (current < end) {
658             auto before = current - utf8;
659             SkUnichar unichar = SkUTF::NextUTF8(&current, end);
660             if (unichar < 0) unichar = 0xFFFD;
661             auto after = current - utf8;
662             if (replaceTabs && this->isTabulation(unichar)) {
663                 results->at(before) |= SkUnicode::kTabulation;
664                 if (replaceTabs) {
665                     unichar = ' ';
666                     utf8[before] = ' ';
667                 }
668             }
669             for (auto i = before; i < after; ++i) {
670                 if (this->isSpace(unichar)) {
671                     results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
672                 }
673                 if (this->isWhitespace(unichar)) {
674                     results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
675                 }
676                 if (this->isControl(unichar)) {
677                     results->at(i) |= SkUnicode::kControl;
678                 }
679                 if (this->isIdeographic(unichar)) {
680                     results->at(i) |= SkUnicode::kIdeographic;
681                 }
682                 processPunctuationAndEllipsis(results, i, unichar);
683             }
684 
685             if (SkUnicode_icu::isGraphemeExtend(unichar)) {
686                 // Current unichar is a combining one.
687                 results->at(before) |= SkUnicode::kCombine;
688             }
689         }
690 
691         return true;
692     }
693 #else
694     bool computeCodeUnitFlags(char utf8[], int utf8Units, bool replaceTabs,
695                               TArray<SkUnicode::CodeUnitFlags, true>* results) override {
696         results->clear();
697         results->push_back_n(utf8Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
698 
699         SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kLines, nullptr, // TODO: locale
700                                         [&](int pos, int status) {
701             (*results)[pos] |= status == UBRK_LINE_HARD
702                                        ? CodeUnitFlags::kHardLineBreakBefore
703                                        : CodeUnitFlags::kSoftLineBreakBefore;
704         });
705 
706         SkUnicode_icu::extractPositions(utf8, utf8Units, BreakType::kGraphemes, nullptr, //TODO
707                                         [&](int pos, int status) {
708             (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
709         });
710 
711         const char* current = utf8;
712         const char* end = utf8 + utf8Units;
713         while (current < end) {
714             auto before = current - utf8;
715             SkUnichar unichar = SkUTF::NextUTF8(&current, end);
716             if (unichar < 0) unichar = 0xFFFD;
717             auto after = current - utf8;
718             if (replaceTabs && this->isTabulation(unichar)) {
719                 results->at(before) |= SkUnicode::kTabulation;
720                 if (replaceTabs) {
721                     unichar = ' ';
722                     utf8[before] = ' ';
723                 }
724             }
725             for (auto i = before; i < after; ++i) {
726                 if (this->isSpace(unichar)) {
727                     results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
728                 }
729                 if (this->isWhitespace(unichar)) {
730                     results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
731                 }
732                 if (this->isControl(unichar)) {
733                     results->at(i) |= SkUnicode::kControl;
734                 }
735                 if (this->isIdeographic(unichar)) {
736                     results->at(i) |= SkUnicode::kIdeographic;
737                 }
738             }
739         }
740 
741         return true;
742     }
743 #endif
744 
745 #ifdef ENABLE_TEXT_ENHANCE
746     bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs, const char locale[],
747 #else
748     bool computeCodeUnitFlags(char16_t utf16[], int utf16Units, bool replaceTabs,
749 #endif
750                           TArray<SkUnicode::CodeUnitFlags, true>* results) override {
751         results->clear();
752         results->push_back_n(utf16Units + 1, CodeUnitFlags::kNoCodeUnitFlag);
753 
754         // Get white spaces
755         this->forEachCodepoint((char16_t*)&utf16[0], utf16Units,
756            [this, results, replaceTabs, &utf16](SkUnichar unichar, int32_t start, int32_t end) {
757                 for (auto i = start; i < end; ++i) {
758                     if (replaceTabs && this->isTabulation(unichar)) {
759                         results->at(i) |= SkUnicode::kTabulation;
760                     if (replaceTabs) {
761                             unichar = ' ';
762                             utf16[start] = ' ';
763                         }
764                     }
765                     if (this->isSpace(unichar)) {
766                         results->at(i) |= SkUnicode::kPartOfIntraWordBreak;
767                     }
768                     if (this->isWhitespace(unichar)) {
769                         results->at(i) |= SkUnicode::kPartOfWhiteSpaceBreak;
770                     }
771                     if (this->isControl(unichar)) {
772                         results->at(i) |= SkUnicode::kControl;
773                     }
774                 }
775            });
776         // Get graphemes
777         this->forEachBreak((char16_t*)&utf16[0],
778                            utf16Units,
779                            SkUnicode::BreakType::kGraphemes,
780 #ifdef ENABLE_TEXT_ENHANCE
781                            locale,
782 #endif
783                            [results](SkBreakIterator::Position pos, SkBreakIterator::Status) {
784                                (*results)[pos] |= CodeUnitFlags::kGraphemeStart;
785                            });
786         // Get line breaks
787         this->forEachBreak(
788                 (char16_t*)&utf16[0],
789                 utf16Units,
790                 SkUnicode::BreakType::kLines,
791 #ifdef ENABLE_TEXT_ENHANCE
792                 locale,
793 #endif
794                 [results](SkBreakIterator::Position pos, SkBreakIterator::Status status) {
795                     if (status ==
796                         (SkBreakIterator::Status)SkUnicode::LineBreakType::kHardLineBreak) {
797                         // Hard line breaks clears off all the other flags
798                         // TODO: Treat \n as a formatting mark and do not pass it to SkShaper
799                         (*results)[pos-1] = CodeUnitFlags::kHardLineBreakBefore;
800                     } else {
801                         (*results)[pos] |= CodeUnitFlags::kSoftLineBreakBefore;
802                     }
803                 });
804 
805         return true;
806     }
807 
808     void reorderVisual(const BidiLevel runLevels[],
809                        int levelsCount,
810                        int32_t logicalFromVisual[]) override {
811         fBidiFact->bidi_reorderVisual(runLevels, levelsCount, logicalFromVisual);
812     }
813 
814 private:
815     sk_sp<SkBidiFactory> fBidiFact = sk_make_sp<SkBidiICUFactory>();
816 };
817 
818 namespace SkUnicodes::ICU {
Make()819 sk_sp<SkUnicode> Make() {
820     // We haven't yet created a way to encode the ICU data for assembly on Windows,
821     // so we use a helper library to load icudtl.dat from the harddrive.
822 #if defined(SK_USING_THIRD_PARTY_ICU) && defined(SK_BUILD_FOR_WIN)
823     if (!SkLoadICU()) {
824         static SkOnce once;
825         once([] { SkDEBUGF("SkLoadICU() failed!\n"); });
826         return nullptr;
827     }
828 #endif
829     if (SkGetICULib()) {
830         return sk_make_sp<SkUnicode_icu>();
831     }
832     return nullptr;
833 }
834 }  // namespace SkUnicodes::ICU
835 #ifdef ENABLE_DRAWING_ADAPTER
836 }
837 #endif // ENABLE_DRAWING_ADAPTER