• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (C) 2018 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include "utils/utf8/unilib-javaicu.h"
18 
19 #include <math.h>
20 
21 #include <cassert>
22 #include <cctype>
23 #include <map>
24 
25 #include "utils/base/logging.h"
26 #include "utils/base/statusor.h"
27 #include "utils/java/jni-base.h"
28 #include "utils/java/string_utils.h"
29 #include "utils/utf8/unicodetext.h"
30 #include "utils/utf8/unilib-common.h"
31 
32 namespace libtextclassifier3 {
33 
UniLibBase()34 UniLibBase::UniLibBase() {
35   TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
36 }
37 
UniLibBase(const std::shared_ptr<JniCache> & jni_cache)38 UniLibBase::UniLibBase(const std::shared_ptr<JniCache>& jni_cache)
39     : jni_cache_(jni_cache) {}
40 
IsOpeningBracket(char32 codepoint) const41 bool UniLibBase::IsOpeningBracket(char32 codepoint) const {
42   return libtextclassifier3::IsOpeningBracket(codepoint);
43 }
44 
IsClosingBracket(char32 codepoint) const45 bool UniLibBase::IsClosingBracket(char32 codepoint) const {
46   return libtextclassifier3::IsClosingBracket(codepoint);
47 }
48 
IsWhitespace(char32 codepoint) const49 bool UniLibBase::IsWhitespace(char32 codepoint) const {
50   return libtextclassifier3::IsWhitespace(codepoint);
51 }
52 
IsDigit(char32 codepoint) const53 bool UniLibBase::IsDigit(char32 codepoint) const {
54   return libtextclassifier3::IsDigit(codepoint);
55 }
56 
IsLower(char32 codepoint) const57 bool UniLibBase::IsLower(char32 codepoint) const {
58   return libtextclassifier3::IsLower(codepoint);
59 }
60 
IsUpper(char32 codepoint) const61 bool UniLibBase::IsUpper(char32 codepoint) const {
62   return libtextclassifier3::IsUpper(codepoint);
63 }
64 
IsPunctuation(char32 codepoint) const65 bool UniLibBase::IsPunctuation(char32 codepoint) const {
66   return libtextclassifier3::IsPunctuation(codepoint);
67 }
68 
ToLower(char32 codepoint) const69 char32 UniLibBase::ToLower(char32 codepoint) const {
70   return libtextclassifier3::ToLower(codepoint);
71 }
72 
ToUpper(char32 codepoint) const73 char32 UniLibBase::ToUpper(char32 codepoint) const {
74   return libtextclassifier3::ToUpper(codepoint);
75 }
76 
GetPairedBracket(char32 codepoint) const77 char32 UniLibBase::GetPairedBracket(char32 codepoint) const {
78   return libtextclassifier3::GetPairedBracket(codepoint);
79 }
80 
81 // -----------------------------------------------------------------------------
82 // Implementations that call out to JVM. Behold the beauty.
83 // -----------------------------------------------------------------------------
84 
ParseInt32(const UnicodeText & text,int32 * result) const85 bool UniLibBase::ParseInt32(const UnicodeText& text, int32* result) const {
86   return ParseInt(text, result);
87 }
88 
ParseInt64(const UnicodeText & text,int64 * result) const89 bool UniLibBase::ParseInt64(const UnicodeText& text, int64* result) const {
90   return ParseInt(text, result);
91 }
92 
ParseDouble(const UnicodeText & text,double * result) const93 bool UniLibBase::ParseDouble(const UnicodeText& text, double* result) const {
94   if (!jni_cache_) {
95     return false;
96   }
97 
98   JNIEnv* env = jni_cache_->GetEnv();
99   auto it_dot = text.begin();
100   for (; it_dot != text.end() && !IsDot(*it_dot); it_dot++) {
101   }
102 
103   int64 integer_part;
104   if (!ParseInt(UnicodeText::Substring(text.begin(), it_dot, /*do_copy=*/false),
105                 &integer_part)) {
106     return false;
107   }
108 
109   int64 fractional_part = 0;
110   if (it_dot != text.end()) {
111     std::string fractional_part_str =
112         UnicodeText::UTF8Substring(++it_dot, text.end());
113     TC3_ASSIGN_OR_RETURN_FALSE(
114         const ScopedLocalRef<jstring> fractional_text_java,
115         jni_cache_->ConvertToJavaString(fractional_part_str));
116     TC3_ASSIGN_OR_RETURN_FALSE(
117         fractional_part,
118         JniHelper::CallStaticIntMethod<int64>(
119             env, jni_cache_->integer_class.get(), jni_cache_->integer_parse_int,
120             fractional_text_java.get()));
121   }
122 
123   double factional_part_double = fractional_part;
124   while (factional_part_double >= 1) {
125     factional_part_double /= 10;
126   }
127   *result = integer_part + factional_part_double;
128 
129   return true;
130 }
131 
CreateRegexPattern(const UnicodeText & regex) const132 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateRegexPattern(
133     const UnicodeText& regex) const {
134   return std::unique_ptr<UniLibBase::RegexPattern>(
135       new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
136 }
137 
CreateLazyRegexPattern(const UnicodeText & regex) const138 std::unique_ptr<UniLibBase::RegexPattern> UniLibBase::CreateLazyRegexPattern(
139     const UnicodeText& regex) const {
140   return std::unique_ptr<UniLibBase::RegexPattern>(
141       new UniLibBase::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
142 }
143 
RegexPattern(const JniCache * jni_cache,const UnicodeText & pattern,bool lazy)144 UniLibBase::RegexPattern::RegexPattern(const JniCache* jni_cache,
145                                        const UnicodeText& pattern, bool lazy)
146     : jni_cache_(jni_cache),
147       pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
148       initialized_(false),
149       initialization_failure_(false),
150       pattern_text_(pattern) {
151   if (!lazy) {
152     LockedInitializeIfNotAlready();
153   }
154 }
155 
LockedInitializeIfNotAlready() const156 Status UniLibBase::RegexPattern::LockedInitializeIfNotAlready() const {
157   std::lock_guard<std::mutex> guard(mutex_);
158   if (initialized_ || initialization_failure_) {
159     return Status::OK;
160   }
161 
162   if (jni_cache_) {
163     JNIEnv* jenv = jni_cache_->GetEnv();
164     initialization_failure_ = true;
165     TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jstring> regex_java,
166                          jni_cache_->ConvertToJavaString(pattern_text_));
167     TC3_ASSIGN_OR_RETURN(ScopedLocalRef<jobject> pattern,
168                          JniHelper::CallStaticObjectMethod(
169                              jenv, jni_cache_->pattern_class.get(),
170                              jni_cache_->pattern_compile, regex_java.get()));
171     pattern_ = MakeGlobalRef(pattern.get(), jenv, jni_cache_->jvm);
172     if (pattern_ == nullptr) {
173       return Status::UNKNOWN;
174     }
175 
176     initialization_failure_ = false;
177     initialized_ = true;
178     pattern_text_.clear();  // We don't need this anymore.
179   }
180   return Status::OK;
181 }
182 
183 constexpr int UniLibBase::RegexMatcher::kError;
184 constexpr int UniLibBase::RegexMatcher::kNoError;
185 
Matcher(const UnicodeText & context) const186 std::unique_ptr<UniLibBase::RegexMatcher> UniLibBase::RegexPattern::Matcher(
187     const UnicodeText& context) const {
188   LockedInitializeIfNotAlready();  // Possibly lazy initialization.
189   if (initialization_failure_) {
190     return nullptr;
191   }
192 
193   if (jni_cache_) {
194     JNIEnv* env = jni_cache_->GetEnv();
195     const StatusOr<ScopedLocalRef<jstring>> status_or_context_java =
196         jni_cache_->ConvertToJavaString(context);
197     if (!status_or_context_java.ok() || !status_or_context_java.ValueOrDie()) {
198       return nullptr;
199     }
200     const StatusOr<ScopedLocalRef<jobject>> status_or_matcher =
201         JniHelper::CallObjectMethod(env, pattern_.get(),
202                                     jni_cache_->pattern_matcher,
203                                     status_or_context_java.ValueOrDie().get());
204     if (jni_cache_->ExceptionCheckAndClear() || !status_or_matcher.ok() ||
205         !status_or_matcher.ValueOrDie()) {
206       return nullptr;
207     }
208     return std::unique_ptr<UniLibBase::RegexMatcher>(new RegexMatcher(
209         jni_cache_,
210         MakeGlobalRef(status_or_matcher.ValueOrDie().get(), env,
211                       jni_cache_->jvm),
212         MakeGlobalRef(status_or_context_java.ValueOrDie().get(), env,
213                       jni_cache_->jvm)));
214   } else {
215     // NOTE: A valid object needs to be created here to pass the interface
216     // tests.
217     return std::unique_ptr<UniLibBase::RegexMatcher>(
218         new RegexMatcher(jni_cache_, {}, {}));
219   }
220 }
221 
RegexMatcher(const JniCache * jni_cache,ScopedGlobalRef<jobject> matcher,ScopedGlobalRef<jstring> text)222 UniLibBase::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
223                                        ScopedGlobalRef<jobject> matcher,
224                                        ScopedGlobalRef<jstring> text)
225     : jni_cache_(jni_cache),
226       matcher_(std::move(matcher)),
227       text_(std::move(text)) {}
228 
Matches(int * status) const229 bool UniLibBase::RegexMatcher::Matches(int* status) const {
230   if (jni_cache_) {
231     *status = kNoError;
232     const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
233         matcher_.get(), jni_cache_->matcher_matches);
234     if (jni_cache_->ExceptionCheckAndClear()) {
235       *status = kError;
236       return false;
237     }
238     return result;
239   } else {
240     *status = kError;
241     return false;
242   }
243 }
244 
ApproximatelyMatches(int * status)245 bool UniLibBase::RegexMatcher::ApproximatelyMatches(int* status) {
246   *status = kNoError;
247 
248   jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
249                                          jni_cache_->matcher_reset);
250   if (jni_cache_->ExceptionCheckAndClear()) {
251     *status = kError;
252     return kError;
253   }
254 
255   if (!Find(status) || *status != kNoError) {
256     return false;
257   }
258 
259   const int found_start = jni_cache_->GetEnv()->CallIntMethod(
260       matcher_.get(), jni_cache_->matcher_start_idx, 0);
261   if (jni_cache_->ExceptionCheckAndClear()) {
262     *status = kError;
263     return kError;
264   }
265 
266   const int found_end = jni_cache_->GetEnv()->CallIntMethod(
267       matcher_.get(), jni_cache_->matcher_end_idx, 0);
268   if (jni_cache_->ExceptionCheckAndClear()) {
269     *status = kError;
270     return kError;
271   }
272 
273   int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
274       text_.get(), jni_cache_->string_length);
275   if (jni_cache_->ExceptionCheckAndClear()) {
276     *status = kError;
277     return false;
278   }
279 
280   if (found_start != 0 || found_end != context_length_bmp) {
281     return false;
282   }
283 
284   return true;
285 }
286 
UpdateLastFindOffset() const287 bool UniLibBase::RegexMatcher::UpdateLastFindOffset() const {
288   if (!last_find_offset_dirty_) {
289     return true;
290   }
291 
292   const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
293       matcher_.get(), jni_cache_->matcher_start_idx, 0);
294   if (jni_cache_->ExceptionCheckAndClear()) {
295     return false;
296   }
297 
298   const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
299       text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
300       find_offset);
301   if (jni_cache_->ExceptionCheckAndClear()) {
302     return false;
303   }
304 
305   last_find_offset_codepoints_ += codepoint_count;
306   last_find_offset_ = find_offset;
307   last_find_offset_dirty_ = false;
308 
309   return true;
310 }
311 
Find(int * status)312 bool UniLibBase::RegexMatcher::Find(int* status) {
313   if (jni_cache_) {
314     const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
315         matcher_.get(), jni_cache_->matcher_find);
316     if (jni_cache_->ExceptionCheckAndClear()) {
317       *status = kError;
318       return false;
319     }
320 
321     last_find_offset_dirty_ = true;
322     *status = kNoError;
323     return result;
324   } else {
325     *status = kError;
326     return false;
327   }
328 }
329 
Start(int * status) const330 int UniLibBase::RegexMatcher::Start(int* status) const {
331   return Start(/*group_idx=*/0, status);
332 }
333 
Start(int group_idx,int * status) const334 int UniLibBase::RegexMatcher::Start(int group_idx, int* status) const {
335   if (jni_cache_) {
336     *status = kNoError;
337 
338     if (!UpdateLastFindOffset()) {
339       *status = kError;
340       return kError;
341     }
342 
343     const int java_index = jni_cache_->GetEnv()->CallIntMethod(
344         matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
345     if (jni_cache_->ExceptionCheckAndClear()) {
346       *status = kError;
347       return kError;
348     }
349 
350     // If the group didn't participate in the match the index is -1.
351     if (java_index == -1) {
352       return -1;
353     }
354 
355     const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
356         text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
357         java_index);
358     if (jni_cache_->ExceptionCheckAndClear()) {
359       *status = kError;
360       return kError;
361     }
362 
363     return unicode_index + last_find_offset_codepoints_;
364   } else {
365     *status = kError;
366     return kError;
367   }
368 }
369 
End(int * status) const370 int UniLibBase::RegexMatcher::End(int* status) const {
371   return End(/*group_idx=*/0, status);
372 }
373 
End(int group_idx,int * status) const374 int UniLibBase::RegexMatcher::End(int group_idx, int* status) const {
375   if (jni_cache_) {
376     *status = kNoError;
377 
378     if (!UpdateLastFindOffset()) {
379       *status = kError;
380       return kError;
381     }
382 
383     const int java_index = jni_cache_->GetEnv()->CallIntMethod(
384         matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
385     if (jni_cache_->ExceptionCheckAndClear()) {
386       *status = kError;
387       return kError;
388     }
389 
390     // If the group didn't participate in the match the index is -1.
391     if (java_index == -1) {
392       return -1;
393     }
394 
395     const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
396         text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
397         java_index);
398     if (jni_cache_->ExceptionCheckAndClear()) {
399       *status = kError;
400       return kError;
401     }
402 
403     return unicode_index + last_find_offset_codepoints_;
404   } else {
405     *status = kError;
406     return kError;
407   }
408 }
409 
Group(int * status) const410 UnicodeText UniLibBase::RegexMatcher::Group(int* status) const {
411   if (jni_cache_) {
412     JNIEnv* jenv = jni_cache_->GetEnv();
413     StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
414         JniHelper::CallObjectMethod<jstring>(jenv, matcher_.get(),
415                                              jni_cache_->matcher_group);
416 
417     if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok() ||
418         !status_or_java_result.ValueOrDie()) {
419       *status = kError;
420       return UTF8ToUnicodeText("", /*do_copy=*/false);
421     }
422 
423     std::string result;
424     if (!JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get(),
425                              &result)) {
426       *status = kError;
427       return UTF8ToUnicodeText("", /*do_copy=*/false);
428     }
429     *status = kNoError;
430     return UTF8ToUnicodeText(result, /*do_copy=*/true);
431   } else {
432     *status = kError;
433     return UTF8ToUnicodeText("", /*do_copy=*/false);
434   }
435 }
436 
Group(int group_idx,int * status) const437 UnicodeText UniLibBase::RegexMatcher::Group(int group_idx, int* status) const {
438   if (jni_cache_) {
439     JNIEnv* jenv = jni_cache_->GetEnv();
440 
441     StatusOr<ScopedLocalRef<jstring>> status_or_java_result =
442         JniHelper::CallObjectMethod<jstring>(
443             jenv, matcher_.get(), jni_cache_->matcher_group_idx, group_idx);
444     if (jni_cache_->ExceptionCheckAndClear() || !status_or_java_result.ok()) {
445       *status = kError;
446       TC3_LOG(ERROR) << "Exception occurred";
447       return UTF8ToUnicodeText("", /*do_copy=*/false);
448     }
449 
450     // java_result is nullptr when the group did not participate in the match.
451     // For these cases other UniLib implementations return empty string, and
452     // the participation can be checked by checking if Start() == -1.
453     if (!status_or_java_result.ValueOrDie()) {
454       *status = kNoError;
455       return UTF8ToUnicodeText("", /*do_copy=*/false);
456     }
457 
458     std::string result;
459     if (!JStringToUtf8String(jenv, status_or_java_result.ValueOrDie().get(),
460                              &result)) {
461       *status = kError;
462       return UTF8ToUnicodeText("", /*do_copy=*/false);
463     }
464     *status = kNoError;
465     return UTF8ToUnicodeText(result, /*do_copy=*/true);
466   } else {
467     *status = kError;
468     return UTF8ToUnicodeText("", /*do_copy=*/false);
469   }
470 }
471 
472 constexpr int UniLibBase::BreakIterator::kDone;
473 
BreakIterator(const JniCache * jni_cache,const UnicodeText & text)474 UniLibBase::BreakIterator::BreakIterator(const JniCache* jni_cache,
475                                          const UnicodeText& text)
476     : jni_cache_(jni_cache),
477       text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
478       iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
479       last_break_index_(0),
480       last_unicode_index_(0) {
481   if (jni_cache_) {
482     JNIEnv* jenv = jni_cache_->GetEnv();
483     StatusOr<ScopedLocalRef<jstring>> status_or_text =
484         jni_cache_->ConvertToJavaString(text);
485     if (!status_or_text.ok()) {
486       return;
487     }
488     text_ =
489         MakeGlobalRef(status_or_text.ValueOrDie().get(), jenv, jni_cache->jvm);
490     if (!text_) {
491       return;
492     }
493 
494     StatusOr<ScopedLocalRef<jobject>> status_or_iterator =
495         JniHelper::CallStaticObjectMethod(
496             jenv, jni_cache->breakiterator_class.get(),
497             jni_cache->breakiterator_getwordinstance,
498             jni_cache->locale_us.get());
499     if (!status_or_iterator.ok()) {
500       return;
501     }
502     iterator_ = MakeGlobalRef(status_or_iterator.ValueOrDie().get(), jenv,
503                               jni_cache->jvm);
504     if (!iterator_) {
505       return;
506     }
507     JniHelper::CallVoidMethod(jenv, iterator_.get(),
508                               jni_cache->breakiterator_settext, text_.get());
509   }
510 }
511 
Next()512 int UniLibBase::BreakIterator::Next() {
513   if (jni_cache_) {
514     const int break_index = jni_cache_->GetEnv()->CallIntMethod(
515         iterator_.get(), jni_cache_->breakiterator_next);
516     if (jni_cache_->ExceptionCheckAndClear() ||
517         break_index == BreakIterator::kDone) {
518       return BreakIterator::kDone;
519     }
520 
521     const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
522         text_.get(), jni_cache_->string_code_point_count, last_break_index_,
523         break_index);
524     if (jni_cache_->ExceptionCheckAndClear()) {
525       return BreakIterator::kDone;
526     }
527 
528     last_break_index_ = break_index;
529     return last_unicode_index_ += token_unicode_length;
530   }
531   return BreakIterator::kDone;
532 }
533 
CreateBreakIterator(const UnicodeText & text) const534 std::unique_ptr<UniLibBase::BreakIterator> UniLibBase::CreateBreakIterator(
535     const UnicodeText& text) const {
536   return std::unique_ptr<UniLibBase::BreakIterator>(
537       new UniLibBase::BreakIterator(jni_cache_.get(), text));
538 }
539 
540 }  // namespace libtextclassifier3
541