• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/break_iterator.h"
6 
7 #include <stdint.h>
8 
9 #include "base/logging.h"
10 #include "third_party/icu/source/common/unicode/ubrk.h"
11 #include "third_party/icu/source/common/unicode/uchar.h"
12 #include "third_party/icu/source/common/unicode/ustring.h"
13 
14 namespace base {
15 namespace i18n {
16 
// Sentinel used for "no valid position": every bit of size_t set, i.e. the
// same value as static_cast<size_t>(-1).
const size_t npos = ~static_cast<size_t>(0);
18 
BreakIterator(const StringPiece16 & str,BreakType break_type)19 BreakIterator::BreakIterator(const StringPiece16& str, BreakType break_type)
20     : iter_(nullptr),
21       string_(str),
22       break_type_(break_type),
23       prev_(npos),
24       pos_(0) {}
25 
BreakIterator(const StringPiece16 & str,const string16 & rules)26 BreakIterator::BreakIterator(const StringPiece16& str, const string16& rules)
27     : iter_(nullptr),
28       string_(str),
29       rules_(rules),
30       break_type_(RULE_BASED),
31       prev_(npos),
32       pos_(0) {}
33 
~BreakIterator()34 BreakIterator::~BreakIterator() {
35   if (iter_)
36     ubrk_close(static_cast<UBreakIterator*>(iter_));
37 }
38 
Init()39 bool BreakIterator::Init() {
40   UErrorCode status = U_ZERO_ERROR;
41   UParseError parse_error;
42   UBreakIteratorType break_type;
43   switch (break_type_) {
44     case BREAK_CHARACTER:
45       break_type = UBRK_CHARACTER;
46       break;
47     case BREAK_WORD:
48       break_type = UBRK_WORD;
49       break;
50     case BREAK_LINE:
51     case BREAK_NEWLINE:
52     case RULE_BASED: // (Keep compiler happy, break_type not used in this case)
53       break_type = UBRK_LINE;
54       break;
55     default:
56       NOTREACHED() << "invalid break_type_";
57       return false;
58   }
59   if (break_type_ == RULE_BASED) {
60     iter_ = ubrk_openRules(rules_.c_str(),
61                            static_cast<int32_t>(rules_.length()),
62                            string_.data(),
63                            static_cast<int32_t>(string_.size()),
64                            &parse_error,
65                            &status);
66     if (U_FAILURE(status)) {
67       NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
68           << parse_error.line << ", offset " << parse_error.offset;
69     }
70   } else {
71     iter_ = ubrk_open(break_type, nullptr, string_.data(),
72                       static_cast<int32_t>(string_.size()), &status);
73     if (U_FAILURE(status)) {
74       NOTREACHED() << "ubrk_open failed for type " << break_type
75           << " with error " << status;
76     }
77   }
78 
79   if (U_FAILURE(status)) {
80     return false;
81   }
82 
83   // Move the iterator to the beginning of the string.
84   ubrk_first(static_cast<UBreakIterator*>(iter_));
85   return true;
86 }
87 
Advance()88 bool BreakIterator::Advance() {
89   int32_t pos;
90   int32_t status;
91   prev_ = pos_;
92   switch (break_type_) {
93     case BREAK_CHARACTER:
94     case BREAK_WORD:
95     case BREAK_LINE:
96     case RULE_BASED:
97       pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
98       if (pos == UBRK_DONE) {
99         pos_ = npos;
100         return false;
101       }
102       pos_ = static_cast<size_t>(pos);
103       return true;
104     case BREAK_NEWLINE:
105       do {
106         pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
107         if (pos == UBRK_DONE)
108           break;
109         pos_ = static_cast<size_t>(pos);
110         status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
111       } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
112       if (pos == UBRK_DONE && prev_ == pos_) {
113         pos_ = npos;
114         return false;
115       }
116       return true;
117     default:
118       NOTREACHED() << "invalid break_type_";
119       return false;
120   }
121 }
122 
SetText(const base::char16 * text,const size_t length)123 bool BreakIterator::SetText(const base::char16* text, const size_t length) {
124   UErrorCode status = U_ZERO_ERROR;
125   ubrk_setText(static_cast<UBreakIterator*>(iter_),
126                text, length, &status);
127   pos_ = 0;  // implicit when ubrk_setText is done
128   prev_ = npos;
129   if (U_FAILURE(status)) {
130     NOTREACHED() << "ubrk_setText failed";
131     return false;
132   }
133   string_ = StringPiece16(text, length);
134   return true;
135 }
136 
IsWord() const137 bool BreakIterator::IsWord() const {
138   return GetWordBreakStatus() == IS_WORD_BREAK;
139 }
140 
GetWordBreakStatus() const141 BreakIterator::WordBreakStatus BreakIterator::GetWordBreakStatus() const {
142   int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
143   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
144     return IS_LINE_OR_CHAR_BREAK;
145   // In ICU 60, trying to advance past the end of the text does not change
146   // |status| so that |pos_| has to be checked as well as |status|.
147   // See http://bugs.icu-project.org/trac/ticket/13447 .
148   return (status == UBRK_WORD_NONE || pos_ == npos) ? IS_SKIPPABLE_WORD
149                                                     : IS_WORD_BREAK;
150 }
151 
IsEndOfWord(size_t position) const152 bool BreakIterator::IsEndOfWord(size_t position) const {
153   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
154     return false;
155 
156   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
157   UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
158   int32_t status = ubrk_getRuleStatus(iter);
159   return (!!boundary && status != UBRK_WORD_NONE);
160 }
161 
IsStartOfWord(size_t position) const162 bool BreakIterator::IsStartOfWord(size_t position) const {
163   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
164     return false;
165 
166   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
167   UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
168   ubrk_next(iter);
169   int32_t next_status = ubrk_getRuleStatus(iter);
170   return (!!boundary && next_status != UBRK_WORD_NONE);
171 }
172 
IsGraphemeBoundary(size_t position) const173 bool BreakIterator::IsGraphemeBoundary(size_t position) const {
174   if (break_type_ != BREAK_CHARACTER)
175     return false;
176 
177   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
178   return !!ubrk_isBoundary(iter, static_cast<int32_t>(position));
179 }
180 
GetString() const181 string16 BreakIterator::GetString() const {
182   return GetStringPiece().as_string();
183 }
184 
GetStringPiece() const185 StringPiece16 BreakIterator::GetStringPiece() const {
186   DCHECK(prev_ != npos && pos_ != npos);
187   return string_.substr(prev_, pos_ - prev_);
188 }
189 
190 }  // namespace i18n
191 }  // namespace base
192