1 /*
2 * Copyright (C) 2017 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "util/utf8/unilib-icu.h"
18
19 #include <utility>
20
21 namespace libtextclassifier2 {
22
ParseInt32(const UnicodeText & text,int * result) const23 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
24 UErrorCode status = U_ZERO_ERROR;
25 UNumberFormat* format_alias =
26 unum_open(UNUM_DECIMAL, nullptr, 0, "en_US_POSIX", nullptr, &status);
27 if (U_FAILURE(status)) {
28 return false;
29 }
30 icu::UnicodeString utf8_string = icu::UnicodeString::fromUTF8(
31 icu::StringPiece(text.data(), text.size_bytes()));
32 int parse_index = 0;
33 const int32 integer = unum_parse(format_alias, utf8_string.getBuffer(),
34 utf8_string.length(), &parse_index, &status);
35 *result = integer;
36 unum_close(format_alias);
37 if (U_FAILURE(status) || parse_index != utf8_string.length()) {
38 return false;
39 }
40 return true;
41 }
42
IsOpeningBracket(char32 codepoint) const43 bool UniLib::IsOpeningBracket(char32 codepoint) const {
44 return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
45 U_BPT_OPEN;
46 }
47
IsClosingBracket(char32 codepoint) const48 bool UniLib::IsClosingBracket(char32 codepoint) const {
49 return u_getIntPropertyValue(codepoint, UCHAR_BIDI_PAIRED_BRACKET_TYPE) ==
50 U_BPT_CLOSE;
51 }
52
IsWhitespace(char32 codepoint) const53 bool UniLib::IsWhitespace(char32 codepoint) const {
54 return u_isWhitespace(codepoint);
55 }
56
IsDigit(char32 codepoint) const57 bool UniLib::IsDigit(char32 codepoint) const { return u_isdigit(codepoint); }
58
IsUpper(char32 codepoint) const59 bool UniLib::IsUpper(char32 codepoint) const { return u_isupper(codepoint); }
60
ToLower(char32 codepoint) const61 char32 UniLib::ToLower(char32 codepoint) const { return u_tolower(codepoint); }
62
GetPairedBracket(char32 codepoint) const63 char32 UniLib::GetPairedBracket(char32 codepoint) const {
64 return u_getBidiPairedBracket(codepoint);
65 }
66
RegexMatcher(icu::RegexPattern * pattern,icu::UnicodeString text)67 UniLib::RegexMatcher::RegexMatcher(icu::RegexPattern* pattern,
68 icu::UnicodeString text)
69 : text_(std::move(text)),
70 last_find_offset_(0),
71 last_find_offset_codepoints_(0),
72 last_find_offset_dirty_(true) {
73 UErrorCode status = U_ZERO_ERROR;
74 matcher_.reset(pattern->matcher(text_, status));
75 if (U_FAILURE(status)) {
76 matcher_.reset(nullptr);
77 }
78 }
79
Matcher(const UnicodeText & input) const80 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
81 const UnicodeText& input) const {
82 return std::unique_ptr<UniLib::RegexMatcher>(new UniLib::RegexMatcher(
83 pattern_.get(), icu::UnicodeString::fromUTF8(
84 icu::StringPiece(input.data(), input.size_bytes()))));
85 }
86
87 constexpr int UniLib::RegexMatcher::kError;
88 constexpr int UniLib::RegexMatcher::kNoError;
89
Matches(int * status) const90 bool UniLib::RegexMatcher::Matches(int* status) const {
91 if (!matcher_) {
92 *status = kError;
93 return false;
94 }
95
96 UErrorCode icu_status = U_ZERO_ERROR;
97 const bool result = matcher_->matches(/*startIndex=*/0, icu_status);
98 if (U_FAILURE(icu_status)) {
99 *status = kError;
100 return false;
101 }
102 *status = kNoError;
103 return result;
104 }
105
ApproximatelyMatches(int * status)106 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
107 if (!matcher_) {
108 *status = kError;
109 return false;
110 }
111
112 matcher_->reset();
113 *status = kNoError;
114 if (!Find(status) || *status != kNoError) {
115 return false;
116 }
117 const int found_start = Start(status);
118 if (*status != kNoError) {
119 return false;
120 }
121 const int found_end = End(status);
122 if (*status != kNoError) {
123 return false;
124 }
125 if (found_start != 0 || found_end != text_.countChar32()) {
126 return false;
127 }
128 return true;
129 }
130
UpdateLastFindOffset() const131 bool UniLib::RegexMatcher::UpdateLastFindOffset() const {
132 if (!last_find_offset_dirty_) {
133 return true;
134 }
135
136 // Update the position of the match.
137 UErrorCode icu_status = U_ZERO_ERROR;
138 const int find_offset = matcher_->start(0, icu_status);
139 if (U_FAILURE(icu_status)) {
140 return false;
141 }
142 last_find_offset_codepoints_ +=
143 text_.countChar32(last_find_offset_, find_offset - last_find_offset_);
144 last_find_offset_ = find_offset;
145 last_find_offset_dirty_ = false;
146
147 return true;
148 }
149
Find(int * status)150 bool UniLib::RegexMatcher::Find(int* status) {
151 if (!matcher_) {
152 *status = kError;
153 return false;
154 }
155 UErrorCode icu_status = U_ZERO_ERROR;
156 const bool result = matcher_->find(icu_status);
157 if (U_FAILURE(icu_status)) {
158 *status = kError;
159 return false;
160 }
161
162 last_find_offset_dirty_ = true;
163 *status = kNoError;
164 return result;
165 }
166
Start(int * status) const167 int UniLib::RegexMatcher::Start(int* status) const {
168 return Start(/*group_idx=*/0, status);
169 }
170
Start(int group_idx,int * status) const171 int UniLib::RegexMatcher::Start(int group_idx, int* status) const {
172 if (!matcher_ || !UpdateLastFindOffset()) {
173 *status = kError;
174 return kError;
175 }
176
177 UErrorCode icu_status = U_ZERO_ERROR;
178 const int result = matcher_->start(group_idx, icu_status);
179 if (U_FAILURE(icu_status)) {
180 *status = kError;
181 return kError;
182 }
183 *status = kNoError;
184
185 // If the group didn't participate in the match the result is -1 and is
186 // incompatible with the caching logic bellow.
187 if (result == -1) {
188 return -1;
189 }
190
191 return last_find_offset_codepoints_ +
192 text_.countChar32(/*start=*/last_find_offset_,
193 /*length=*/result - last_find_offset_);
194 }
195
End(int * status) const196 int UniLib::RegexMatcher::End(int* status) const {
197 return End(/*group_idx=*/0, status);
198 }
199
End(int group_idx,int * status) const200 int UniLib::RegexMatcher::End(int group_idx, int* status) const {
201 if (!matcher_ || !UpdateLastFindOffset()) {
202 *status = kError;
203 return kError;
204 }
205 UErrorCode icu_status = U_ZERO_ERROR;
206 const int result = matcher_->end(group_idx, icu_status);
207 if (U_FAILURE(icu_status)) {
208 *status = kError;
209 return kError;
210 }
211 *status = kNoError;
212
213 // If the group didn't participate in the match the result is -1 and is
214 // incompatible with the caching logic bellow.
215 if (result == -1) {
216 return -1;
217 }
218
219 return last_find_offset_codepoints_ +
220 text_.countChar32(/*start=*/last_find_offset_,
221 /*length=*/result - last_find_offset_);
222 }
223
Group(int * status) const224 UnicodeText UniLib::RegexMatcher::Group(int* status) const {
225 return Group(/*group_idx=*/0, status);
226 }
227
Group(int group_idx,int * status) const228 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
229 if (!matcher_) {
230 *status = kError;
231 return UTF8ToUnicodeText("", /*do_copy=*/false);
232 }
233 std::string result = "";
234 UErrorCode icu_status = U_ZERO_ERROR;
235 const icu::UnicodeString result_icu = matcher_->group(group_idx, icu_status);
236 if (U_FAILURE(icu_status)) {
237 *status = kError;
238 return UTF8ToUnicodeText("", /*do_copy=*/false);
239 }
240 result_icu.toUTF8String(result);
241 *status = kNoError;
242 return UTF8ToUnicodeText(result, /*do_copy=*/true);
243 }
244
245 constexpr int UniLib::BreakIterator::kDone;
246
BreakIterator(const UnicodeText & text)247 UniLib::BreakIterator::BreakIterator(const UnicodeText& text)
248 : text_(icu::UnicodeString::fromUTF8(
249 icu::StringPiece(text.data(), text.size_bytes()))),
250 last_break_index_(0),
251 last_unicode_index_(0) {
252 icu::ErrorCode status;
253 break_iterator_.reset(
254 icu::BreakIterator::createWordInstance(icu::Locale("en"), status));
255 if (!status.isSuccess()) {
256 break_iterator_.reset();
257 return;
258 }
259 break_iterator_->setText(text_);
260 }
261
Next()262 int UniLib::BreakIterator::Next() {
263 const int break_index = break_iterator_->next();
264 if (break_index == icu::BreakIterator::DONE) {
265 return BreakIterator::kDone;
266 }
267 last_unicode_index_ +=
268 text_.countChar32(last_break_index_, break_index - last_break_index_);
269 last_break_index_ = break_index;
270 return last_unicode_index_;
271 }
272
CreateRegexPattern(const UnicodeText & regex) const273 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
274 const UnicodeText& regex) const {
275 UErrorCode status = U_ZERO_ERROR;
276 std::unique_ptr<icu::RegexPattern> pattern(
277 icu::RegexPattern::compile(icu::UnicodeString::fromUTF8(icu::StringPiece(
278 regex.data(), regex.size_bytes())),
279 /*flags=*/UREGEX_MULTILINE, status));
280 if (U_FAILURE(status) || !pattern) {
281 return nullptr;
282 }
283 return std::unique_ptr<UniLib::RegexPattern>(
284 new UniLib::RegexPattern(std::move(pattern)));
285 }
286
CreateBreakIterator(const UnicodeText & text) const287 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
288 const UnicodeText& text) const {
289 return std::unique_ptr<UniLib::BreakIterator>(
290 new UniLib::BreakIterator(text));
291 }
292
293 } // namespace libtextclassifier2
294