• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 Google LLC.
2 // Use of this source code is governed by a BSD-style license that can be found in the LICENSE file.
3 
4 #include "experimental/editor/word_boundaries.h"
5 
6 #include <unicode/brkiter.h>
7 #include <unicode/unistr.h>
8 
9 #include <memory>
10 
GetUtf8WordBoundaries(const char * begin,size_t byteCount,const char * locale)11 std::vector<bool> GetUtf8WordBoundaries(const char* begin, size_t byteCount, const char* locale) {
12     static constexpr UBreakIteratorType kIteratorType = UBRK_WORD;
13     struct UTextCloser {
14         void operator()(UText* p) { (void)utext_close(p); }
15     };
16     struct UBreakCloser {
17         void operator()(UBreakIterator* p) { (void)ubrk_close(p); }
18     };
19 
20     std::vector<bool> result;
21     if (0 == byteCount) {
22         return result;
23     }
24     result.resize(byteCount);
25 
26     UText utf8UText = UTEXT_INITIALIZER;
27     UErrorCode errorCode = U_ZERO_ERROR;
28     (void)utext_openUTF8(&utf8UText, begin, byteCount, &errorCode);
29     std::unique_ptr<UText, UTextCloser> autoclose1(&utf8UText);
30     if (U_FAILURE(errorCode)) {
31         return result;
32     }
33     UBreakIterator* iter = ubrk_open(kIteratorType, locale, nullptr, 0, &errorCode);
34     std::unique_ptr<UBreakIterator, UBreakCloser> autoclose2(iter);
35     if (U_FAILURE(errorCode)) {
36         return result;
37     }
38     ubrk_setUText(iter, &utf8UText, &errorCode);
39     if (U_FAILURE(errorCode)) {
40         return result;
41     }
42     int pos = ubrk_first(iter);
43     while (pos != icu::BreakIterator::DONE) {
44         if ((unsigned)pos < (unsigned)byteCount) {
45             result[pos] = true;
46         }
47         pos = ubrk_next(iter);
48     }
49     return result;
50 }
51