1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unilib-javaicu.h"
18
19 #include <math.h>
20
21 #include <cassert>
22 #include <cctype>
23 #include <map>
24
25 #include "utils/base/logging.h"
26 #include "utils/base/statusor.h"
27 #include "utils/java/jni-base.h"
28 #include "utils/java/string_utils.h"
29 #include "utils/utf8/unicodetext.h"
30 #include "utils/utf8/unilib-common.h"
31
32 namespace libtextclassifier3 {
33
// Default construction is not supported for this implementation: it logs at
// FATAL severity because a JniCache is required (see the other constructor).
UniLibBase::UniLibBase() {
  TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
}
37
UniLibBase(const std::shared_ptr<JniCache> & jni_cache)38 UniLibBase::UniLibBase(const std::shared_ptr<JniCache>& jni_cache)
39 : jni_cache_(jni_cache) {}
40
IsOpeningBracket(char32 codepoint) const41 bool UniLibBase::IsOpeningBracket(char32 codepoint) const {
42 return libtextclassifier3::IsOpeningBracket(codepoint);
43 }
44
IsClosingBracket(char32 codepoint) const45 bool UniLibBase::IsClosingBracket(char32 codepoint) const {
46 return libtextclassifier3::IsClosingBracket(codepoint);
47 }
48
IsWhitespace(char32 codepoint) const49 bool UniLibBase::IsWhitespace(char32 codepoint) const {
50 return libtextclassifier3::IsWhitespace(codepoint);
51 }
52
IsDigit(char32 codepoint) const53 bool UniLibBase::IsDigit(char32 codepoint) const {
54 return libtextclassifier3::IsDigit(codepoint);
55 }
56
IsLower(char32 codepoint) const57 bool UniLibBase::IsLower(char32 codepoint) const {
58 return libtextclassifier3::IsLower(codepoint);
59 }
60
IsUpper(char32 codepoint) const61 bool UniLibBase::IsUpper(char32 codepoint) const {
62 return libtextclassifier3::IsUpper(codepoint);
63 }
64
IsPunctuation(char32 codepoint) const65 bool UniLibBase::IsPunctuation(char32 codepoint) const {
66 return libtextclassifier3::IsPunctuation(codepoint);
67 }
68
ToLower(char32 codepoint) const69 char32 UniLibBase::ToLower(char32 codepoint) const {
70 return libtextclassifier3::ToLower(codepoint);
71 }
72
ToUpper(char32 codepoint) const73 char32 UniLibBase::ToUpper(char32 codepoint) const {
74 return libtextclassifier3::ToUpper(codepoint);
75 }
76
GetPairedBracket(char32 codepoint) const77 char32 UniLibBase::GetPairedBracket(char32 codepoint) const {
78 return libtextclassifier3::GetPairedBracket(codepoint);
79 }
80
81 // -----------------------------------------------------------------------------
82 // Implementations that call out to JVM. Behold the beauty.
83 // -----------------------------------------------------------------------------
84
ParseInt32(const UnicodeText & text,int32 * result) const85 bool UniLibBase::ParseInt32(const UnicodeText& text, int32* result) const {
86 return ParseInt(text, result);
87 }
88
ParseInt64(const UnicodeText & text,int64 * result) const89 bool UniLibBase::ParseInt64(const UnicodeText& text, int64* result) const {
90 return ParseInt(text, result);
91 }
92
ParseDouble(const UnicodeText & text,double * result) const93 bool UniLibBase::ParseDouble(const UnicodeText& text, double* result) const {
94 if (!jni_cache_) {
95 return false;
96 }
97
98 JNIEnv* env = jni_cache_->GetEnv();
99 auto it_dot = text.begin();
100 for (; it_dot != text.end() && !IsDot(*it_dot); it_dot++) {
101 }
102
103 int64 integer_part;
104 if (!ParseInt(UnicodeText::Substring(text.begin(), it_dot, /*do_copy=*/false),
105 &integer_part)) {
106 return false;
107 }
108
109 int64 fractional_part = 0;
110 if (it_dot != text.end()) {
111 std::string fractional_part_str =
112 UnicodeText::UTF8Substring(++it_dot, text.end());
113 TC3_ASSIGN_OR_RETURN_FALSE(
114 const ScopedLocalRef<jstring> fractional_text_java,
115 jni_cache_->ConvertToJavaString(fractional_part_str));
116 TC3_ASSIGN_OR_RETURN_FALSE(
117 fractional_part,
118 JniHelper::CallStaticIntMethod<int64>(
119 env, jni_cache_->integer_class.get(), jni_cache_->integer_parse_int,
120 fractional_text_java.get()));
121 }
122
123 double factional_part_double = fractional_part;
124 while (factional_part_double >= 1) {
125 factional_part_double /= 10;
126 }
127 *result = integer_part + factional_part_double;
128
129 return true;
130 }
131
CreateRegexPattern(const UnicodeText & regex) const132 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateRegexPattern(
133 const UnicodeText& regex) const {
134 return std::unique_ptr<UniLibBase::RegexPattern>(
135 new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
136 }
137
CreateLazyRegexPattern(const UnicodeText & regex) const138 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateLazyRegexPattern(
139 const UnicodeText& regex) const {
140 return std::unique_ptr<UniLibBase::RegexPattern>(
141 new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
142 }
143
RegexPattern(const JniCache * jni_cache,const UnicodeText & pattern,bool lazy)144 UniLibBase::RegexPattern::RegexPattern(const JniCache* jni_cache,
145 const UnicodeText& pattern, bool lazy)
146 : jni_cache_(jni_cache),
147 pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
148 initialized_(false),
149 initialization_failure_(false),
150 pattern_text_(pattern) {
151 if (!lazy) {
152 LockedInitializeIfNotAlready();
153 }
154 }
155
LockedInitializeIfNotAlready() const156 Status UniLibBase::RegexPattern::LockedInitializeIfNotAlready() const {
157 std::lock_guard<std::mutex> guard(mutex_);
158 if (initialized_ || initialization_failure_) {
159 return Status::OK;
160 }
161
162 if (jni_cache_) {
163 JNIEnv* jenv = jni_cache_->GetEnv();
164 initialization_failure_ = true;
165 TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> regex_java,
166 jni_cache_->ConvertToJavaString(pattern_text_));
167 TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobject> pattern,
168 JniHelper::CallStaticObjectMethod(
169 jenv, jni_cache_->pattern_class.get(),
170 jni_cache_->pattern_compile, regex_java.get()));
171 pattern_ = MakeGlobalRef(pattern.get(), jenv, jni_cache_->jvm);
172 if (pattern_ == nullptr) {
173 return Status::UNKNOWN;
174 }
175
176 initialization_failure_ = false;
177 initialized_ = true;
178 pattern_text_.clear(); // We don't need this anymore.
179 }
180 return Status::OK;
181 }
182
// Out-of-line definitions for the RegexMatcher status constants.
constexpr int UniLibBase::RegexMatcher::kError;
constexpr int UniLibBase::RegexMatcher::kNoError;
185
Matcher(const UnicodeText & context) const186 std::unique_ptr<UniLibBase::RegexMatcher> UniLibBase::RegexPattern::Matcher(
187 const UnicodeText& context) const {
188 LockedInitializeIfNotAlready(); // Possibly lazy initialization.
189 if (initialization_failure_) {
190 return nullptr;
191 }
192
193 if (jni_cache_) {
194 JNIEnv* env = jni_cache_->GetEnv();
195 const StatusOr<ScopedLocalRef<jstring>> status_or_context_java =
196 jni_cache_->ConvertToJavaString(context);
197 if (!status_or_context_java.ok() || !status_or_context_java.ValueOrDie()) {
198 return nullptr;
199 }
200 const StatusOr<ScopedLocalRef<jobject>> status_or_matcher =
201 JniHelper::CallObjectMethod(env, pattern_.get(),
202 jni_cache_->pattern_matcher,
203 status_or_context_java.ValueOrDie().get());
204 if (jni_cache_->ExceptionCheckAndClear() || !status_or_matcher.ok() ||
205 !status_or_matcher.ValueOrDie()) {
206 return nullptr;
207 }
208 return std::unique_ptr<UniLibBase::RegexMatcher>(new RegexMatcher(
209 jni_cache_,
210 MakeGlobalRef(status_or_matcher.ValueOrDie().get(), env,
211 jni_cache_->jvm),
212 MakeGlobalRef(status_or_context_java.ValueOrDie().get(), env,
213 jni_cache_->jvm)));
214 } else {
215 // NOTE: A valid object needs to be created here to pass the interface
216 // tests.
217 return std::unique_ptr<UniLibBase::RegexMatcher>(
218 new RegexMatcher(jni_cache_, {}, {}));
219 }
220 }
221
RegexMatcher(const JniCache * jni_cache,ScopedGlobalRef<jobject> matcher,ScopedGlobalRef<jstring> text)222 UniLibBase::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
223 ScopedGlobalRef<jobject> matcher,
224 ScopedGlobalRef<jstring> text)
225 : jni_cache_(jni_cache),
226 matcher_(std::move(matcher)),
227 text_(std::move(text)) {}
228
Matches(int * status) const229 bool UniLibBase::RegexMatcher::Matches(int* status) const {
230 if (jni_cache_) {
231 *status = kNoError;
232 const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
233 matcher_.get(), jni_cache_->matcher_matches);
234 if (jni_cache_->ExceptionCheckAndClear()) {
235 *status = kError;
236 return false;
237 }
238 return result;
239 } else {
240 *status = kError;
241 return false;
242 }
243 }
244
ApproximatelyMatches(int * status)245 bool UniLibBase::RegexMatcher::ApproximatelyMatches(int* status) {
246 *status = kNoError;
247
248 jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
249 jni_cache_->matcher_reset);
250 if (jni_cache_->ExceptionCheckAndClear()) {
251 *status = kError;
252 return kError;
253 }
254
255 if (!Find(status) || *status != kNoError) {
256 return false;
257 }
258
259 const int found_start = jni_cache_->GetEnv()->CallIntMethod(
260 matcher_.get(), jni_cache_->matcher_start_idx, 0);
261 if (jni_cache_->ExceptionCheckAndClear()) {
262 *status = kError;
263 return kError;
264 }
265
266 const int found_end = jni_cache_->GetEnv()->CallIntMethod(
267 matcher_.get(), jni_cache_->matcher_end_idx, 0);
268 if (jni_cache_->ExceptionCheckAndClear()) {
269 *status = kError;
270 return kError;
271 }
272
273 int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
274 text_.get(), jni_cache_->string_length);
275 if (jni_cache_->ExceptionCheckAndClear()) {
276 *status = kError;
277 return false;
278 }
279
280 if (found_start != 0 || found_end != context_length_bmp) {
281 return false;
282 }
283
284 return true;
285 }
286
UpdateLastFindOffset() const287 bool UniLibBase::RegexMatcher::UpdateLastFindOffset() const {
288 if (!last_find_offset_dirty_) {
289 return true;
290 }
291
292 const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
293 matcher_.get(), jni_cache_->matcher_start_idx, 0);
294 if (jni_cache_->ExceptionCheckAndClear()) {
295 return false;
296 }
297
298 const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
299 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
300 find_offset);
301 if (jni_cache_->ExceptionCheckAndClear()) {
302 return false;
303 }
304
305 last_find_offset_codepoints_ += codepoint_count;
306 last_find_offset_ = find_offset;
307 last_find_offset_dirty_ = false;
308
309 return true;
310 }
311
Find(int * status)312 bool UniLibBase::RegexMatcher::Find(int* status) {
313 if (jni_cache_) {
314 const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
315 matcher_.get(), jni_cache_->matcher_find);
316 if (jni_cache_->ExceptionCheckAndClear()) {
317 *status = kError;
318 return false;
319 }
320
321 last_find_offset_dirty_ = true;
322 *status = kNoError;
323 return result;
324 } else {
325 *status = kError;
326 return false;
327 }
328 }
329
Start(int * status) const330 int UniLibBase::RegexMatcher::Start(int* status) const {
331 return Start(/*group_idx=*/0, status);
332 }
333
Start(int group_idx,int * status) const334 int UniLibBase::RegexMatcher::Start(int group_idx, int* status) const {
335 if (jni_cache_) {
336 *status = kNoError;
337
338 if (!UpdateLastFindOffset()) {
339 *status = kError;
340 return kError;
341 }
342
343 const int java_index = jni_cache_->GetEnv()->CallIntMethod(
344 matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
345 if (jni_cache_->ExceptionCheckAndClear()) {
346 *status = kError;
347 return kError;
348 }
349
350 // If the group didn't participate in the match the index is -1.
351 if (java_index == -1) {
352 return -1;
353 }
354
355 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
356 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
357 java_index);
358 if (jni_cache_->ExceptionCheckAndClear()) {
359 *status = kError;
360 return kError;
361 }
362
363 return unicode_index + last_find_offset_codepoints_;
364 } else {
365 *status = kError;
366 return kError;
367 }
368 }
369
End(int * status) const370 int UniLibBase::RegexMatcher::End(int* status) const {
371 return End(/*group_idx=*/0, status);
372 }
373
End(int group_idx,int * status) const374 int UniLibBase::RegexMatcher::End(int group_idx, int* status) const {
375 if (jni_cache_) {
376 *status = kNoError;
377
378 if (!UpdateLastFindOffset()) {
379 *status = kError;
380 return kError;
381 }
382
383 const int java_index = jni_cache_->GetEnv()->CallIntMethod(
384 matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
385 if (jni_cache_->ExceptionCheckAndClear()) {
386 *status = kError;
387 return kError;
388 }
389
390 // If the group didn't participate in the match the index is -1.
391 if (java_index == -1) {
392 return -1;
393 }
394
395 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
396 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
397 java_index);
398 if (jni_cache_->ExceptionCheckAndClear()) {
399 *status = kError;
400 return kError;
401 }
402
403 return unicode_index + last_find_offset_codepoints_;
404 } else {
405 *status = kError;
406 return kError;
407 }
408 }
409
Group(int * status) const410 UnicodeText UniLibBase::RegexMatcher::Group(int* status) const {
411 if (jni_cache_) {
412 JNIEnv* jenv = jni_cache_->GetEnv();
413 StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
414 JniHelper::CallObjectMethod<jstring>(jenv, matcher_.get(),
415 jni_cache_->matcher_group);
416
417 if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok() ||
418 !status_or_java_result.ValueOrDie()) {
419 *status = kError;
420 return UTF8ToUnicodeText("", /*do_copy=*/false);
421 }
422
423 std::string result;
424 if (!JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get(),
425 &result)) {
426 *status = kError;
427 return UTF8ToUnicodeText("", /*do_copy=*/false);
428 }
429 *status = kNoError;
430 return UTF8ToUnicodeText(result, /*do_copy=*/true);
431 } else {
432 *status = kError;
433 return UTF8ToUnicodeText("", /*do_copy=*/false);
434 }
435 }
436
Group(int group_idx,int * status) const437 UnicodeText UniLibBase::RegexMatcher::Group(int group_idx, int* status) const {
438 if (jni_cache_) {
439 JNIEnv* jenv = jni_cache_->GetEnv();
440
441 StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
442 JniHelper::CallObjectMethod<jstring>(
443 jenv, matcher_.get(), jni_cache_->matcher_group_idx, group_idx);
444 if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok()) {
445 *status = kError;
446 TC3_LOG(ERROR) << "Exception occurred";
447 return UTF8ToUnicodeText("", /*do_copy=*/false);
448 }
449
450 // java_result is nullptr when the group did not participate in the match.
451 // For these cases other UniLib implementations return empty string, and
452 // the participation can be checked by checking if Start() == -1.
453 if (!status_or_java_result.ValueOrDie()) {
454 *status = kNoError;
455 return UTF8ToUnicodeText("", /*do_copy=*/false);
456 }
457
458 std::string result;
459 if (!JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get(),
460 &result)) {
461 *status = kError;
462 return UTF8ToUnicodeText("", /*do_copy=*/false);
463 }
464 *status = kNoError;
465 return UTF8ToUnicodeText(result, /*do_copy=*/true);
466 } else {
467 *status = kError;
468 return UTF8ToUnicodeText("", /*do_copy=*/false);
469 }
470 }
471
// Out-of-line definition for the value Next() returns when iteration ends.
constexpr int UniLibBase::BreakIterator::kDone;
473
BreakIterator(const JniCache * jni_cache,const UnicodeText & text)474 UniLibBase::BreakIterator::BreakIterator(const JniCache* jni_cache,
475 const UnicodeText& text)
476 : jni_cache_(jni_cache),
477 text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
478 iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
479 last_break_index_(0),
480 last_unicode_index_(0) {
481 if (jni_cache_) {
482 JNIEnv* jenv = jni_cache_->GetEnv();
483 StatusOr<ScopedLocalRef<jstring>> status_or_text =
484 jni_cache_->ConvertToJavaString(text);
485 if (!status_or_text.ok()) {
486 return;
487 }
488 text_ =
489 MakeGlobalRef(status_or_text.ValueOrDie().get(), jenv, jni_cache->jvm);
490 if (!text_) {
491 return;
492 }
493
494 StatusOr<ScopedLocalRef<jobject>> status_or_iterator =
495 JniHelper::CallStaticObjectMethod(
496 jenv, jni_cache->breakiterator_class.get(),
497 jni_cache->breakiterator_getwordinstance,
498 jni_cache->locale_us.get());
499 if (!status_or_iterator.ok()) {
500 return;
501 }
502 iterator_ = MakeGlobalRef(status_or_iterator.ValueOrDie().get(), jenv,
503 jni_cache->jvm);
504 if (!iterator_) {
505 return;
506 }
507 JniHelper::CallVoidMethod(jenv, iterator_.get(),
508 jni_cache->breakiterator_settext, text_.get());
509 }
510 }
511
Next()512 int UniLibBase::BreakIterator::Next() {
513 if (jni_cache_) {
514 const int break_index = jni_cache_->GetEnv()->CallIntMethod(
515 iterator_.get(), jni_cache_->breakiterator_next);
516 if (jni_cache_->ExceptionCheckAndClear() ||
517 break_index == BreakIterator::kDone) {
518 return BreakIterator::kDone;
519 }
520
521 const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
522 text_.get(), jni_cache_->string_code_point_count, last_break_index_,
523 break_index);
524 if (jni_cache_->ExceptionCheckAndClear()) {
525 return BreakIterator::kDone;
526 }
527
528 last_break_index_ = break_index;
529 return last_unicode_index_ += token_unicode_length;
530 }
531 return BreakIterator::kDone;
532 }
533
CreateBreakIterator(const UnicodeText & text) const534 std::unique_ptr<UniLibBase::BreakIterator> UniLibBase::CreateBreakIterator(
535 const UnicodeText& text) const {
536 return std::unique_ptr<UniLibBase::BreakIterator>(
537 new UniLibBase::BreakIterator(jni_cache_.get(), text));
538 }
539
540 } // namespace libtextclassifier3
541