/* * Copyright (C) 2018 The Android Open Source Project * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ // An implementation of Unilib that uses Android Java interfaces via JNI. The // performance critical ops have been re-implemented in C++. // Specifically, this class must be compatible with API level 14 (ICS). #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ #include #include #include // NOLINT #include #include "utils/base/integral_types.h" #include "utils/java/jni-base.h" #include "utils/java/jni-cache.h" #include "utils/java/jni-helper.h" #include "utils/utf8/unicodetext.h" #include "utils/utf8/unilib-common.h" namespace libtextclassifier3 { class UniLibBase { public: UniLibBase(); explicit UniLibBase(const std::shared_ptr& jni_cache); bool ParseInt32(const UnicodeText& text, int32* result) const; bool ParseInt64(const UnicodeText& text, int64* result) const; bool ParseDouble(const UnicodeText& text, double* result) const; bool IsOpeningBracket(char32 codepoint) const; bool IsClosingBracket(char32 codepoint) const; bool IsWhitespace(char32 codepoint) const; bool IsDigit(char32 codepoint) const; bool IsLower(char32 codepoint) const; bool IsUpper(char32 codepoint) const; bool IsPunctuation(char32 codepoint) const; char32 ToLower(char32 codepoint) const; char32 ToUpper(char32 codepoint) const; char32 GetPairedBracket(char32 codepoint) const; StatusOr Length(const UnicodeText& text) const; // Forward declaration for friend. class RegexPattern; class RegexMatcher { public: static constexpr int kError = -1; static constexpr int kNoError = 0; // Checks whether the input text matches the pattern exactly. bool Matches(int* status) const; // Approximate Matches() implementation implemented using Find(). It uses // the first Find() result and then checks that it spans the whole input. // NOTE: Unlike Matches() it can result in false negatives. // NOTE: Resets the matcher, so the current Find() state will be lost. bool ApproximatelyMatches(int* status); // Finds occurrences of the pattern in the input text. // Can be called repeatedly to find all occurrences. A call will update // internal state, so that 'Start', 'End' and 'Group' can be called to get // information about the match. // NOTE: Any call to ApproximatelyMatches() in between Find() calls will // modify the state. bool Find(int* status); // Gets the start offset of the last match (from 'Find'). // Sets status to 'kError' if 'Find' // was not called previously. int Start(int* status) const; // Gets the start offset of the specified group of the last match. // (from 'Find'). // Sets status to 'kError' if an invalid group was specified or if 'Find' // was not called previously. int Start(int group_idx, int* status) const; // Gets the end offset of the last match (from 'Find'). // Sets status to 'kError' if 'Find' // was not called previously. int End(int* status) const; // Gets the end offset of the specified group of the last match. // (from 'Find'). // Sets status to 'kError' if an invalid group was specified or if 'Find' // was not called previously. int End(int group_idx, int* status) const; // Gets the text of the last match (from 'Find'). // Sets status to 'kError' if 'Find' was not called previously. UnicodeText Group(int* status) const; // Gets the text of the specified group of the last match (from 'Find'). // Sets status to 'kError' if an invalid group was specified or if 'Find' // was not called previously. UnicodeText Group(int group_idx, int* status) const; // Returns the matched text (the 0th capturing group). std::string Text() const { StatusOr status_or_result = JStringToUtf8String(jni_cache_->GetEnv(), text_.get()); if (!status_or_result.ok()) { TC3_LOG(ERROR) << "JStringToUtf8String failed."; return ""; } return status_or_result.ValueOrDie(); } private: friend class RegexPattern; RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef matcher, ScopedGlobalRef text); bool UpdateLastFindOffset() const; const JniCache* jni_cache_; ScopedGlobalRef matcher_; ScopedGlobalRef text_; mutable int last_find_offset_ = 0; mutable int last_find_offset_codepoints_ = 0; mutable bool last_find_offset_dirty_ = true; }; class RegexPattern { public: std::unique_ptr Matcher(const UnicodeText& context) const; private: friend class UniLibBase; RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern, bool lazy); Status LockedInitializeIfNotAlready() const; const JniCache* jni_cache_; // These members need to be mutable because of the lazy initialization. // NOTE: The Matcher method first ensures (using a lock) that the // initialization was attempted (by using LockedInitializeIfNotAlready) and // then can access them without locking. mutable std::mutex mutex_; mutable ScopedGlobalRef pattern_; mutable bool initialized_; mutable bool initialization_failure_; mutable UnicodeText pattern_text_; }; class BreakIterator { public: int Next(); static constexpr int kDone = -1; private: friend class UniLibBase; BreakIterator(const JniCache* jni_cache, const UnicodeText& text); const JniCache* jni_cache_; ScopedGlobalRef text_; ScopedGlobalRef iterator_; int last_break_index_; int last_unicode_index_; }; std::unique_ptr CreateRegexPattern( const UnicodeText& regex) const; std::unique_ptr CreateLazyRegexPattern( const UnicodeText& regex) const; std::unique_ptr CreateBreakIterator( const UnicodeText& text) const; private: template bool ParseInt(const UnicodeText& text, T* result) const; std::shared_ptr jni_cache_; }; template bool UniLibBase::ParseInt(const UnicodeText& text, T* result) const { if (!jni_cache_) { return false; } // Avoid throwing exceptions when the text is unlikely to be a number. int32 result32 = 0; if (!PassesIntPreChesks(text, result32)) { return false; } JNIEnv* env = jni_cache_->GetEnv(); TC3_ASSIGN_OR_RETURN_FALSE(const ScopedLocalRef text_java, jni_cache_->ConvertToJavaString(text)); TC3_ASSIGN_OR_RETURN_FALSE( *result, JniHelper::CallStaticIntMethod( env, /*print_exception_on_error=*/false, jni_cache_->integer_class.get(), jni_cache_->integer_parse_int, text_java.get())); return true; } } // namespace libtextclassifier3 #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_