• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright (c) 2011 The Chromium Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "base/i18n/break_iterator.h"
6 
7 #include "base/logging.h"
8 #include "third_party/icu/source/common/unicode/ubrk.h"
9 #include "third_party/icu/source/common/unicode/uchar.h"
10 #include "third_party/icu/source/common/unicode/ustring.h"
11 
12 namespace base {
13 namespace i18n {
14 
// Sentinel meaning "no position": the largest value representable in size_t.
// Used below as the initial value of prev_ and as the value of pos_ once
// iteration has run off the end of the text.
const size_t npos = ~static_cast<size_t>(0);
16 
BreakIterator(const string16 & str,BreakType break_type)17 BreakIterator::BreakIterator(const string16& str, BreakType break_type)
18     : iter_(NULL),
19       string_(str),
20       break_type_(break_type),
21       prev_(npos),
22       pos_(0) {
23 }
24 
BreakIterator(const string16 & str,const string16 & rules)25 BreakIterator::BreakIterator(const string16& str, const string16& rules)
26     : iter_(NULL),
27       string_(str),
28       rules_(rules),
29       break_type_(RULE_BASED),
30       prev_(npos),
31       pos_(0) {
32 }
33 
~BreakIterator()34 BreakIterator::~BreakIterator() {
35   if (iter_)
36     ubrk_close(static_cast<UBreakIterator*>(iter_));
37 }
38 
Init()39 bool BreakIterator::Init() {
40   UErrorCode status = U_ZERO_ERROR;
41   UParseError parse_error;
42   UBreakIteratorType break_type;
43   switch (break_type_) {
44     case BREAK_CHARACTER:
45       break_type = UBRK_CHARACTER;
46       break;
47     case BREAK_WORD:
48       break_type = UBRK_WORD;
49       break;
50     case BREAK_LINE:
51     case BREAK_NEWLINE:
52     case RULE_BASED: // (Keep compiler happy, break_type not used in this case)
53       break_type = UBRK_LINE;
54       break;
55     default:
56       NOTREACHED() << "invalid break_type_";
57       return false;
58   }
59   if (break_type_ == RULE_BASED) {
60     iter_ = ubrk_openRules(rules_.c_str(),
61                            static_cast<int32_t>(rules_.length()),
62                            string_.data(),
63                            static_cast<int32_t>(string_.size()),
64                            &parse_error,
65                            &status);
66     if (U_FAILURE(status)) {
67       NOTREACHED() << "ubrk_openRules failed to parse rule string at line "
68           << parse_error.line << ", offset " << parse_error.offset;
69     }
70   } else {
71     iter_ = ubrk_open(break_type,
72                       NULL,
73                       string_.data(),
74                       static_cast<int32_t>(string_.size()),
75                       &status);
76     if (U_FAILURE(status)) {
77       NOTREACHED() << "ubrk_open failed";
78     }
79   }
80 
81   if (U_FAILURE(status)) {
82     return false;
83   }
84 
85   // Move the iterator to the beginning of the string.
86   ubrk_first(static_cast<UBreakIterator*>(iter_));
87   return true;
88 }
89 
Advance()90 bool BreakIterator::Advance() {
91   int32_t pos;
92   int32_t status;
93   prev_ = pos_;
94   switch (break_type_) {
95     case BREAK_CHARACTER:
96     case BREAK_WORD:
97     case BREAK_LINE:
98     case RULE_BASED:
99       pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
100       if (pos == UBRK_DONE) {
101         pos_ = npos;
102         return false;
103       }
104       pos_ = static_cast<size_t>(pos);
105       return true;
106     case BREAK_NEWLINE:
107       do {
108         pos = ubrk_next(static_cast<UBreakIterator*>(iter_));
109         if (pos == UBRK_DONE)
110           break;
111         pos_ = static_cast<size_t>(pos);
112         status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
113       } while (status >= UBRK_LINE_SOFT && status < UBRK_LINE_SOFT_LIMIT);
114       if (pos == UBRK_DONE && prev_ == pos_) {
115         pos_ = npos;
116         return false;
117       }
118       return true;
119     default:
120       NOTREACHED() << "invalid break_type_";
121       return false;
122   }
123 }
124 
SetText(const base::char16 * text,const size_t length)125 bool BreakIterator::SetText(const base::char16* text, const size_t length) {
126   UErrorCode status = U_ZERO_ERROR;
127   ubrk_setText(static_cast<UBreakIterator*>(iter_),
128                text, length, &status);
129   pos_ = 0;  // implicit when ubrk_setText is done
130   prev_ = npos;
131   if (U_FAILURE(status)) {
132     NOTREACHED() << "ubrk_setText failed";
133     return false;
134   }
135   return true;
136 }
137 
IsWord() const138 bool BreakIterator::IsWord() const {
139   int32_t status = ubrk_getRuleStatus(static_cast<UBreakIterator*>(iter_));
140   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
141     return false;
142   return status != UBRK_WORD_NONE;
143 }
144 
IsEndOfWord(size_t position) const145 bool BreakIterator::IsEndOfWord(size_t position) const {
146   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
147     return false;
148 
149   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
150   UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
151   int32_t status = ubrk_getRuleStatus(iter);
152   return (!!boundary && status != UBRK_WORD_NONE);
153 }
154 
IsStartOfWord(size_t position) const155 bool BreakIterator::IsStartOfWord(size_t position) const {
156   if (break_type_ != BREAK_WORD && break_type_ != RULE_BASED)
157     return false;
158 
159   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
160   UBool boundary = ubrk_isBoundary(iter, static_cast<int32_t>(position));
161   ubrk_next(iter);
162   int32_t next_status = ubrk_getRuleStatus(iter);
163   return (!!boundary && next_status != UBRK_WORD_NONE);
164 }
165 
IsGraphemeBoundary(size_t position) const166 bool BreakIterator::IsGraphemeBoundary(size_t position) const {
167   if (break_type_ != BREAK_CHARACTER)
168     return false;
169 
170   UBreakIterator* iter = static_cast<UBreakIterator*>(iter_);
171   return !!ubrk_isBoundary(iter, static_cast<int32_t>(position));
172 }
173 
GetString() const174 string16 BreakIterator::GetString() const {
175   DCHECK(prev_ != npos && pos_ != npos);
176   return string_.substr(prev_, pos_ - prev_);
177 }
178 
179 }  // namespace i18n
180 }  // namespace base
181