• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2017 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "util/utf8/unilib-icu.h"
18 
19 #include <utility>
20 
21 namespace libtextclassifier2 {
22 
ParseInt32(const UnicodeText & text,int * result) const23 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
24   UErrorCode status = U_ZERO_ERROR;
25   UNumberFormat* format_alias =
26       unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status);
27   if (U_FAILURE(status)) {
28     return false;
29   }
30   icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8(
31       icu::StringPiece(text.data(), text.size_bytes()));
32   int parse_index = 0;
33   const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(),
34                                    utf8_string.length(), &parse_index, &status);
35   *result = integer;
36   unum_close(format_alias);
37   if (U_FAILURE(status) || parse_index != utf8_string.length()) {
38     return false;
39   }
40   return true;
41 }
42 
IsOpeningBracket(char32 codepoint) const43 bool UniLib::IsOpeningBracket(char32 codepoint) const {
44   return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
45          U_BPT_OPEN;
46 }
47 
IsClosingBracket(char32 codepoint) const48 bool UniLib::IsClosingBracket(char32 codepoint) const {
49   return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
50          U_BPT_CLOSE;
51 }
52 
IsWhitespace(char32 codepoint) const53 bool UniLib::IsWhitespace(char32 codepoint) const {
54   return u_isWhitespace(codepoint);
55 }
56 
IsDigit(char32 codepoint) const57 bool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); }
58 
IsUpper(char32 codepoint) const59 bool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); }
60 
ToLower(char32 codepoint) const61 char32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); }
62 
GetPairedBracket(char32 codepoint) const63 char32 UniLib::GetPairedBracket(char32 codepoint) const {
64   return u_getBidiPairedBracket(codepoint);
65 }
66 
RegexMatcher(icu::RegexPattern * pattern,icu::UnicodeString text)67 UniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern,
68                                    icu::UnicodeString text)
69     : text_(std::move(text)),
70       last_find_offset_(0),
71       last_find_offset_codepoints_(0),
72       last_find_offset_dirty_(true) {
73   UErrorCode status = U_ZERO_ERROR;
74   matcher_.reset(pattern->matcher(text_, status));
75   if (U_FAILURE(status)) {
76     matcher_.reset(nullptr);
77   }
78 }
79 
Matcher(const UnicodeText & input) const80 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
81     const UnicodeText& input) const {
82   return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher(
83       pattern_.get(), icu::UnicodeString::fromUTF8(
84                           icu::StringPiece(input.data(), input.size_bytes()))));
85 }
86 
// Out-of-class definitions of the status constants. They carry no
// initializer here, so the values are presumably given at the in-class
// declarations; such definitions are required before C++17 whenever a static
// constexpr data member is ODR-used.
constexpr int UniLib::RegexMatcher::kError;
constexpr int UniLib::RegexMatcher::kNoError;
89 
Matches(int * status) const90 bool UniLib::RegexMatcher::Matches(int* status) const {
91   if (!matcher_) {
92     *status = kError;
93     return false;
94   }
95 
96   UErrorCode icu_status = U_ZERO_ERROR;
97   const bool result = matcher_->matches(/*startIndex=*/0, icu_status);
98   if (U_FAILURE(icu_status)) {
99     *status = kError;
100     return false;
101   }
102   *status = kNoError;
103   return result;
104 }
105 
ApproximatelyMatches(int * status)106 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
107   if (!matcher_) {
108     *status = kError;
109     return false;
110   }
111 
112   matcher_->reset();
113   *status = kNoError;
114   if (!Find(status) || *status != kNoError) {
115     return false;
116   }
117   const int found_start = Start(status);
118   if (*status != kNoError) {
119     return false;
120   }
121   const int found_end = End(status);
122   if (*status != kNoError) {
123     return false;
124   }
125   if (found_start != 0 || found_end != text_.countChar32()) {
126     return false;
127   }
128   return true;
129 }
130 
UpdateLastFindOffset() const131 bool UniLib::RegexMatcher::UpdateLastFindOffset() const {
132   if (!last_find_offset_dirty_) {
133     return true;
134   }
135 
136   // Update the position of the match.
137   UErrorCode icu_status = U_ZERO_ERROR;
138   const int find_offset = matcher_->start(0, icu_status);
139   if (U_FAILURE(icu_status)) {
140     return false;
141   }
142   last_find_offset_codepoints_ +=
143       text_.countChar32(last_find_offset_, find_offset - last_find_offset_);
144   last_find_offset_ = find_offset;
145   last_find_offset_dirty_ = false;
146 
147   return true;
148 }
149 
Find(int * status)150 bool UniLib::RegexMatcher::Find(int* status) {
151   if (!matcher_) {
152     *status = kError;
153     return false;
154   }
155   UErrorCode icu_status = U_ZERO_ERROR;
156   const bool result = matcher_->find(icu_status);
157   if (U_FAILURE(icu_status)) {
158     *status = kError;
159     return false;
160   }
161 
162   last_find_offset_dirty_ = true;
163   *status = kNoError;
164   return result;
165 }
166 
Start(int * status) const167 int UniLib::RegexMatcher::Start(int* status) const {
168   return Start(/*group_idx=*/0, status);
169 }
170 
Start(int group_idx,int * status) const171 int UniLib::RegexMatcher::Start(int group_idx, int* status) const {
172   if (!matcher_ || !UpdateLastFindOffset()) {
173     *status = kError;
174     return kError;
175   }
176 
177   UErrorCode icu_status = U_ZERO_ERROR;
178   const int result = matcher_->start(group_idx, icu_status);
179   if (U_FAILURE(icu_status)) {
180     *status = kError;
181     return kError;
182   }
183   *status = kNoError;
184 
185   // If the group didn't participate in the match the result is -1 and is
186   // incompatible with the caching logic bellow.
187   if (result == -1) {
188     return -1;
189   }
190 
191   return last_find_offset_codepoints_ +
192          text_.countChar32(/*start=*/last_find_offset_,
193                            /*length=*/result - last_find_offset_);
194 }
195 
End(int * status) const196 int UniLib::RegexMatcher::End(int* status) const {
197   return End(/*group_idx=*/0, status);
198 }
199 
End(int group_idx,int * status) const200 int UniLib::RegexMatcher::End(int group_idx, int* status) const {
201   if (!matcher_ || !UpdateLastFindOffset()) {
202     *status = kError;
203     return kError;
204   }
205   UErrorCode icu_status = U_ZERO_ERROR;
206   const int result = matcher_->end(group_idx, icu_status);
207   if (U_FAILURE(icu_status)) {
208     *status = kError;
209     return kError;
210   }
211   *status = kNoError;
212 
213   // If the group didn't participate in the match the result is -1 and is
214   // incompatible with the caching logic bellow.
215   if (result == -1) {
216     return -1;
217   }
218 
219   return last_find_offset_codepoints_ +
220          text_.countChar32(/*start=*/last_find_offset_,
221                            /*length=*/result - last_find_offset_);
222 }
223 
Group(int * status) const224 UnicodeText UniLib::RegexMatcher::Group(int* status) const {
225   return Group(/*group_idx=*/0, status);
226 }
227 
Group(int group_idx,int * status) const228 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
229   if (!matcher_) {
230     *status = kError;
231     return UTF8ToUnicodeText("", /*do_copy=*/false);
232   }
233   std::string result = "";
234   UErrorCode icu_status = U_ZERO_ERROR;
235   const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status);
236   if (U_FAILURE(icu_status)) {
237     *status = kError;
238     return UTF8ToUnicodeText("", /*do_copy=*/false);
239   }
240   result_icu.toUTF8String(result);
241   *status = kNoError;
242   return UTF8ToUnicodeText(result, /*do_copy=*/true);
243 }
244 
// Out-of-class definition of the end-of-iteration sentinel returned by
// Next(). It carries no initializer here, so the value is presumably given at
// the in-class declaration; such a definition is required before C++17
// whenever a static constexpr data member is ODR-used.
constexpr int UniLib::BreakIterator::kDone;
246 
BreakIterator(const UnicodeText & text)247 UniLib::BreakIterator::BreakIterator(const UnicodeText& text)
248     : text_(icu::UnicodeString::fromUTF8(
249           icu::StringPiece(text.data(), text.size_bytes()))),
250       last_break_index_(0),
251       last_unicode_index_(0) {
252   icu::ErrorCode status;
253   break_iterator_.reset(
254       icu::BreakIterator::createWordInstance(icu::Locale("en"), status));
255   if (!status.isSuccess()) {
256     break_iterator_.reset();
257     return;
258   }
259   break_iterator_->setText(text_);
260 }
261 
Next()262 int UniLib::BreakIterator::Next() {
263   const int break_index = break_iterator_->next();
264   if (break_index == icu::BreakIterator::DONE) {
265     return BreakIterator::kDone;
266   }
267   last_unicode_index_ +=
268       text_.countChar32(last_break_index_, break_index - last_break_index_);
269   last_break_index_ = break_index;
270   return last_unicode_index_;
271 }
272 
CreateRegexPattern(const UnicodeText & regex) const273 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
274     const UnicodeText& regex) const {
275   UErrorCode status = U_ZERO_ERROR;
276   std::unique_ptr<icu::RegexPattern> pattern(
277       icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece(
278                                      regex.data(), regex.size_bytes())),
279                                  /*flags=*/UREGEX_MULTILINE, status));
280   if (U_FAILURE(status) || !pattern) {
281     return nullptr;
282   }
283   return std::unique_ptr<UniLib::RegexPattern>(
284       new UniLib::RegexPattern(std::move(pattern)));
285 }
286 
CreateBreakIterator(const UnicodeText & text) const287 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
288     const UnicodeText& text) const {
289   return std::unique_ptr<UniLib::BreakIterator>(
290       new UniLib::BreakIterator(text));
291 }
292 
293 }  // namespace libtextclassifier2
294