1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unilib-javaicu.h"
18
19 #include <math.h>
20
21 #include <cassert>
22 #include <cctype>
23 #include <map>
24
25 #include "utils/base/logging.h"
26 #include "utils/base/statusor.h"
27 #include "utils/java/jni-base.h"
28 #include "utils/java/jni-helper.h"
29 #include "utils/utf8/unicodetext.h"
30
31 namespace libtextclassifier3 {
32
// Default construction is not supported by the Java ICU backed UniLib: the
// constructor unconditionally logs a FATAL message asking for a JniCache.
UniLibBase::UniLibBase() {
  TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
}
36
// Constructs a UniLib that keeps a shared reference to `jni_cache`. The
// JVM-backed methods in this file reach Java through that cache.
UniLibBase::UniLibBase(const std::shared_ptr<JniCache>& jni_cache)
    : jni_cache_(jni_cache) {}
39
IsOpeningBracket(char32 codepoint) const40 bool UniLibBase::IsOpeningBracket(char32 codepoint) const {
41 return libtextclassifier3::IsOpeningBracket(codepoint);
42 }
43
IsClosingBracket(char32 codepoint) const44 bool UniLibBase::IsClosingBracket(char32 codepoint) const {
45 return libtextclassifier3::IsClosingBracket(codepoint);
46 }
47
IsWhitespace(char32 codepoint) const48 bool UniLibBase::IsWhitespace(char32 codepoint) const {
49 return libtextclassifier3::IsWhitespace(codepoint);
50 }
51
IsDigit(char32 codepoint) const52 bool UniLibBase::IsDigit(char32 codepoint) const {
53 return libtextclassifier3::IsDigit(codepoint);
54 }
55
IsLower(char32 codepoint) const56 bool UniLibBase::IsLower(char32 codepoint) const {
57 return libtextclassifier3::IsLower(codepoint);
58 }
59
IsUpper(char32 codepoint) const60 bool UniLibBase::IsUpper(char32 codepoint) const {
61 return libtextclassifier3::IsUpper(codepoint);
62 }
63
IsPunctuation(char32 codepoint) const64 bool UniLibBase::IsPunctuation(char32 codepoint) const {
65 return libtextclassifier3::IsPunctuation(codepoint);
66 }
67
ToLower(char32 codepoint) const68 char32 UniLibBase::ToLower(char32 codepoint) const {
69 return libtextclassifier3::ToLower(codepoint);
70 }
71
ToUpper(char32 codepoint) const72 char32 UniLibBase::ToUpper(char32 codepoint) const {
73 return libtextclassifier3::ToUpper(codepoint);
74 }
75
GetPairedBracket(char32 codepoint) const76 char32 UniLibBase::GetPairedBracket(char32 codepoint) const {
77 return libtextclassifier3::GetPairedBracket(codepoint);
78 }
79
80 // -----------------------------------------------------------------------------
81 // Implementations that call out to JVM. Behold the beauty.
82 // -----------------------------------------------------------------------------
83
Length(const UnicodeText & text) const84 StatusOr<int32> UniLibBase::Length(const UnicodeText& text) const {
85 TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> text_java,
86 jni_cache_->ConvertToJavaString(text));
87
88 JNIEnv* jenv = jni_cache_->GetEnv();
89 TC3_ASSIGN_OR_RETURN(int utf16_length,
90 JniHelper::CallIntMethod(jenv, text_java.get(),
91 jni_cache_->string_length));
92
93 return JniHelper::CallIntMethod(jenv, text_java.get(),
94 jni_cache_->string_code_point_count, 0,
95 utf16_length);
96 }
97
ParseInt32(const UnicodeText & text,int32 * result) const98 bool UniLibBase::ParseInt32(const UnicodeText& text, int32* result) const {
99 return ParseInt(text, result);
100 }
101
ParseInt64(const UnicodeText & text,int64 * result) const102 bool UniLibBase::ParseInt64(const UnicodeText& text, int64* result) const {
103 return ParseInt(text, result);
104 }
105
ParseDouble(const UnicodeText & text,double * result) const106 bool UniLibBase::ParseDouble(const UnicodeText& text, double* result) const {
107 if (!jni_cache_) {
108 return false;
109 }
110
111 auto it_dot = text.begin();
112 for (; it_dot != text.end() && !IsDot(*it_dot); it_dot++) {
113 }
114
115 int32 integer_part;
116 if (!ParseInt(UnicodeText::Substring(text.begin(), it_dot, /*do_copy=*/false),
117 &integer_part)) {
118 return false;
119 }
120
121 int32 fractional_part = 0;
122 if (it_dot != text.end()) {
123 if (!ParseInt(
124 UnicodeText::Substring(++it_dot, text.end(), /*do_copy=*/false),
125 &fractional_part)) {
126 return false;
127 }
128 }
129
130 double factional_part_double = fractional_part;
131 while (factional_part_double >= 1) {
132 factional_part_double /= 10;
133 }
134 *result = integer_part + factional_part_double;
135
136 return true;
137 }
138
CreateRegexPattern(const UnicodeText & regex) const139 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateRegexPattern(
140 const UnicodeText& regex) const {
141 return std::unique_ptr<UniLibBase::RegexPattern>(
142 new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
143 }
144
CreateLazyRegexPattern(const UnicodeText & regex) const145 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateLazyRegexPattern(
146 const UnicodeText& regex) const {
147 return std::unique_ptr<UniLibBase::RegexPattern>(
148 new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
149 }
150
RegexPattern(const JniCache * jni_cache,const UnicodeText & pattern,bool lazy)151 UniLibBase::RegexPattern::RegexPattern(const JniCache* jni_cache,
152 const UnicodeText& pattern, bool lazy)
153 : jni_cache_(jni_cache),
154 pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
155 initialized_(false),
156 initialization_failure_(false),
157 pattern_text_(pattern) {
158 if (!lazy) {
159 LockedInitializeIfNotAlready();
160 }
161 }
162
LockedInitializeIfNotAlready() const163 Status UniLibBase::RegexPattern::LockedInitializeIfNotAlready() const {
164 std::lock_guard<std::mutex> guard(mutex_);
165 if (initialized_ || initialization_failure_) {
166 return Status::OK;
167 }
168
169 if (jni_cache_) {
170 JNIEnv* jenv = jni_cache_->GetEnv();
171 initialization_failure_ = true;
172 TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> regex_java,
173 jni_cache_->ConvertToJavaString(pattern_text_));
174 TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobject> pattern,
175 JniHelper::CallStaticObjectMethod(
176 jenv, jni_cache_->pattern_class.get(),
177 jni_cache_->pattern_compile, regex_java.get()));
178 pattern_ = MakeGlobalRef(pattern.get(), jenv, jni_cache_->jvm);
179 if (pattern_ == nullptr) {
180 return Status::UNKNOWN;
181 }
182
183 initialization_failure_ = false;
184 initialized_ = true;
185 pattern_text_.clear(); // We don't need this anymore.
186 }
187 return Status::OK;
188 }
189
// Namespace-scope definitions of the RegexMatcher status constants. Their
// values are presumably given at the in-class declarations, which are not
// part of this file.
constexpr int UniLibBase::RegexMatcher::kError;
constexpr int UniLibBase::RegexMatcher::kNoError;
192
Matcher(const UnicodeText & context) const193 std::unique_ptr<UniLibBase::RegexMatcher> UniLibBase::RegexPattern::Matcher(
194 const UnicodeText& context) const {
195 LockedInitializeIfNotAlready(); // Possibly lazy initialization.
196 if (initialization_failure_) {
197 return nullptr;
198 }
199
200 if (jni_cache_) {
201 JNIEnv* env = jni_cache_->GetEnv();
202 const StatusOr<ScopedLocalRef<jstring>> status_or_context_java =
203 jni_cache_->ConvertToJavaString(context);
204 if (!status_or_context_java.ok() || !status_or_context_java.ValueOrDie()) {
205 return nullptr;
206 }
207 const StatusOr<ScopedLocalRef<jobject>> status_or_matcher =
208 JniHelper::CallObjectMethod(env, pattern_.get(),
209 jni_cache_->pattern_matcher,
210 status_or_context_java.ValueOrDie().get());
211 if (jni_cache_->ExceptionCheckAndClear() || !status_or_matcher.ok() ||
212 !status_or_matcher.ValueOrDie()) {
213 return nullptr;
214 }
215 return std::unique_ptr<UniLibBase::RegexMatcher>(new RegexMatcher(
216 jni_cache_,
217 MakeGlobalRef(status_or_matcher.ValueOrDie().get(), env,
218 jni_cache_->jvm),
219 MakeGlobalRef(status_or_context_java.ValueOrDie().get(), env,
220 jni_cache_->jvm)));
221 } else {
222 // NOTE: A valid object needs to be created here to pass the interface
223 // tests.
224 return std::unique_ptr<UniLibBase::RegexMatcher>(
225 new RegexMatcher(jni_cache_, {}, {}));
226 }
227 }
228
// Stores the (possibly null) JniCache pointer and moves the references to the
// Java matcher object and to the matched text into the members.
UniLibBase::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
                                       ScopedGlobalRef<jobject> matcher,
                                       ScopedGlobalRef<jstring> text)
    : jni_cache_(jni_cache),
      matcher_(std::move(matcher)),
      text_(std::move(text)) {}
235
Matches(int * status) const236 bool UniLibBase::RegexMatcher::Matches(int* status) const {
237 if (jni_cache_) {
238 *status = kNoError;
239 const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
240 matcher_.get(), jni_cache_->matcher_matches);
241 if (jni_cache_->ExceptionCheckAndClear()) {
242 *status = kError;
243 return false;
244 }
245 return result;
246 } else {
247 *status = kError;
248 return false;
249 }
250 }
251
ApproximatelyMatches(int * status)252 bool UniLibBase::RegexMatcher::ApproximatelyMatches(int* status) {
253 *status = kNoError;
254
255 jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
256 jni_cache_->matcher_reset);
257 if (jni_cache_->ExceptionCheckAndClear()) {
258 *status = kError;
259 return kError;
260 }
261
262 if (!Find(status) || *status != kNoError) {
263 return false;
264 }
265
266 const int found_start = jni_cache_->GetEnv()->CallIntMethod(
267 matcher_.get(), jni_cache_->matcher_start_idx, 0);
268 if (jni_cache_->ExceptionCheckAndClear()) {
269 *status = kError;
270 return kError;
271 }
272
273 const int found_end = jni_cache_->GetEnv()->CallIntMethod(
274 matcher_.get(), jni_cache_->matcher_end_idx, 0);
275 if (jni_cache_->ExceptionCheckAndClear()) {
276 *status = kError;
277 return kError;
278 }
279
280 int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
281 text_.get(), jni_cache_->string_length);
282 if (jni_cache_->ExceptionCheckAndClear()) {
283 *status = kError;
284 return false;
285 }
286
287 if (found_start != 0 || found_end != context_length_bmp) {
288 return false;
289 }
290
291 return true;
292 }
293
UpdateLastFindOffset() const294 bool UniLibBase::RegexMatcher::UpdateLastFindOffset() const {
295 if (!last_find_offset_dirty_) {
296 return true;
297 }
298
299 const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
300 matcher_.get(), jni_cache_->matcher_start_idx, 0);
301 if (jni_cache_->ExceptionCheckAndClear()) {
302 return false;
303 }
304
305 const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
306 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
307 find_offset);
308 if (jni_cache_->ExceptionCheckAndClear()) {
309 return false;
310 }
311
312 last_find_offset_codepoints_ += codepoint_count;
313 last_find_offset_ = find_offset;
314 last_find_offset_dirty_ = false;
315
316 return true;
317 }
318
Find(int * status)319 bool UniLibBase::RegexMatcher::Find(int* status) {
320 if (jni_cache_) {
321 const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
322 matcher_.get(), jni_cache_->matcher_find);
323 if (jni_cache_->ExceptionCheckAndClear()) {
324 *status = kError;
325 return false;
326 }
327
328 last_find_offset_dirty_ = true;
329 *status = kNoError;
330 return result;
331 } else {
332 *status = kError;
333 return false;
334 }
335 }
336
Start(int * status) const337 int UniLibBase::RegexMatcher::Start(int* status) const {
338 return Start(/*group_idx=*/0, status);
339 }
340
Start(int group_idx,int * status) const341 int UniLibBase::RegexMatcher::Start(int group_idx, int* status) const {
342 if (jni_cache_) {
343 *status = kNoError;
344
345 if (!UpdateLastFindOffset()) {
346 *status = kError;
347 return kError;
348 }
349
350 const int java_index = jni_cache_->GetEnv()->CallIntMethod(
351 matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
352 if (jni_cache_->ExceptionCheckAndClear()) {
353 *status = kError;
354 return kError;
355 }
356
357 // If the group didn't participate in the match the index is -1.
358 if (java_index == -1) {
359 return -1;
360 }
361
362 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
363 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
364 java_index);
365 if (jni_cache_->ExceptionCheckAndClear()) {
366 *status = kError;
367 return kError;
368 }
369
370 return unicode_index + last_find_offset_codepoints_;
371 } else {
372 *status = kError;
373 return kError;
374 }
375 }
376
End(int * status) const377 int UniLibBase::RegexMatcher::End(int* status) const {
378 return End(/*group_idx=*/0, status);
379 }
380
End(int group_idx,int * status) const381 int UniLibBase::RegexMatcher::End(int group_idx, int* status) const {
382 if (jni_cache_) {
383 *status = kNoError;
384
385 if (!UpdateLastFindOffset()) {
386 *status = kError;
387 return kError;
388 }
389
390 const int java_index = jni_cache_->GetEnv()->CallIntMethod(
391 matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
392 if (jni_cache_->ExceptionCheckAndClear()) {
393 *status = kError;
394 return kError;
395 }
396
397 // If the group didn't participate in the match the index is -1.
398 if (java_index == -1) {
399 return -1;
400 }
401
402 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
403 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
404 java_index);
405 if (jni_cache_->ExceptionCheckAndClear()) {
406 *status = kError;
407 return kError;
408 }
409
410 return unicode_index + last_find_offset_codepoints_;
411 } else {
412 *status = kError;
413 return kError;
414 }
415 }
416
Group(int * status) const417 UnicodeText UniLibBase::RegexMatcher::Group(int* status) const {
418 if (jni_cache_) {
419 JNIEnv* jenv = jni_cache_->GetEnv();
420 StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
421 JniHelper::CallObjectMethod<jstring>(jenv, matcher_.get(),
422 jni_cache_->matcher_group);
423
424 if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok() ||
425 !status_or_java_result.ValueOrDie()) {
426 *status = kError;
427 return UTF8ToUnicodeText("", /*do_copy=*/false);
428 }
429
430 StatusOr<std::string> status_or_result =
431 JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get());
432 if (!status_or_result.ok()) {
433 *status = kError;
434 return UTF8ToUnicodeText("", /*do_copy=*/false);
435 }
436 *status = kNoError;
437 return UTF8ToUnicodeText(status_or_result.ValueOrDie(), /*do_copy=*/true);
438 } else {
439 *status = kError;
440 return UTF8ToUnicodeText("", /*do_copy=*/false);
441 }
442 }
443
Group(int group_idx,int * status) const444 UnicodeText UniLibBase::RegexMatcher::Group(int group_idx, int* status) const {
445 if (jni_cache_) {
446 JNIEnv* jenv = jni_cache_->GetEnv();
447
448 StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
449 JniHelper::CallObjectMethod<jstring>(
450 jenv, matcher_.get(), jni_cache_->matcher_group_idx, group_idx);
451 if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok()) {
452 *status = kError;
453 TC3_LOG(ERROR) << "Exception occurred";
454 return UTF8ToUnicodeText("", /*do_copy=*/false);
455 }
456
457 // java_result is nullptr when the group did not participate in the match.
458 // For these cases other UniLib implementations return empty string, and
459 // the participation can be checked by checking if Start() == -1.
460 if (!status_or_java_result.ValueOrDie()) {
461 *status = kNoError;
462 return UTF8ToUnicodeText("", /*do_copy=*/false);
463 }
464
465 StatusOr<std::string> status_or_result =
466 JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get());
467 if (!status_or_result.ok()) {
468 *status = kError;
469 return UTF8ToUnicodeText("", /*do_copy=*/false);
470 }
471 *status = kNoError;
472 return UTF8ToUnicodeText(status_or_result.ValueOrDie(), /*do_copy=*/true);
473 } else {
474 *status = kError;
475 return UTF8ToUnicodeText("", /*do_copy=*/false);
476 }
477 }
478
// Namespace-scope definition of the BreakIterator end marker. Its value is
// presumably given at the in-class declaration, which is not part of this
// file.
constexpr int UniLibBase::BreakIterator::kDone;
480
BreakIterator(const JniCache * jni_cache,const UnicodeText & text)481 UniLibBase::BreakIterator::BreakIterator(const JniCache* jni_cache,
482 const UnicodeText& text)
483 : jni_cache_(jni_cache),
484 text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
485 iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
486 last_break_index_(0),
487 last_unicode_index_(0) {
488 if (jni_cache_) {
489 JNIEnv* jenv = jni_cache_->GetEnv();
490 StatusOr<ScopedLocalRef<jstring>> status_or_text =
491 jni_cache_->ConvertToJavaString(text);
492 if (!status_or_text.ok()) {
493 return;
494 }
495 text_ =
496 MakeGlobalRef(status_or_text.ValueOrDie().get(), jenv, jni_cache->jvm);
497 if (!text_) {
498 return;
499 }
500
501 StatusOr<ScopedLocalRef<jobject>> status_or_iterator =
502 JniHelper::CallStaticObjectMethod(
503 jenv, jni_cache->breakiterator_class.get(),
504 jni_cache->breakiterator_getwordinstance,
505 jni_cache->locale_us.get());
506 if (!status_or_iterator.ok()) {
507 return;
508 }
509 iterator_ = MakeGlobalRef(status_or_iterator.ValueOrDie().get(), jenv,
510 jni_cache->jvm);
511 if (!iterator_) {
512 return;
513 }
514 JniHelper::CallVoidMethod(jenv, iterator_.get(),
515 jni_cache->breakiterator_settext, text_.get());
516 }
517 }
518
Next()519 int UniLibBase::BreakIterator::Next() {
520 if (jni_cache_) {
521 const int break_index = jni_cache_->GetEnv()->CallIntMethod(
522 iterator_.get(), jni_cache_->breakiterator_next);
523 if (jni_cache_->ExceptionCheckAndClear() ||
524 break_index == BreakIterator::kDone) {
525 return BreakIterator::kDone;
526 }
527
528 const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
529 text_.get(), jni_cache_->string_code_point_count, last_break_index_,
530 break_index);
531 if (jni_cache_->ExceptionCheckAndClear()) {
532 return BreakIterator::kDone;
533 }
534
535 last_break_index_ = break_index;
536 return last_unicode_index_ += token_unicode_length;
537 }
538 return BreakIterator::kDone;
539 }
540
CreateBreakIterator(const UnicodeText & text) const541 std::unique_ptr<UniLibBase::BreakIterator> UniLibBase::CreateBreakIterator(
542 const UnicodeText& text) const {
543 return std::unique_ptr<UniLibBase::BreakIterator>(
544 new UniLibBase::BreakIterator(jni_cache_.get(), text));
545 }
546
547 } // namespace libtextclassifier3
548