1 /* 2 * Copyright (C) 2018 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 // An implementation of Unilib that uses Android Java interfaces via JNI. The 18 // performance critical ops have been re-implemented in C++. 19 // Specifically, this class must be compatible with API level 14 (ICS). 20 21 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ 22 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ 23 24 #include <jni.h> 25 #include <memory> 26 #include <mutex> // NOLINT 27 #include <string> 28 29 #include "utils/base/integral_types.h" 30 #include "utils/java/jni-cache.h" 31 #include "utils/java/scoped_global_ref.h" 32 #include "utils/java/scoped_local_ref.h" 33 #include "utils/java/string_utils.h" 34 #include "utils/utf8/unicodetext.h" 35 36 namespace libtextclassifier3 { 37 38 class UniLib { 39 public: 40 UniLib(); 41 explicit UniLib(const std::shared_ptr<JniCache>& jni_cache); 42 43 bool ParseInt32(const UnicodeText& text, int* result) const; 44 bool IsOpeningBracket(char32 codepoint) const; 45 bool IsClosingBracket(char32 codepoint) const; 46 bool IsWhitespace(char32 codepoint) const; 47 bool IsDigit(char32 codepoint) const; 48 bool IsUpper(char32 codepoint) const; 49 50 char32 ToLower(char32 codepoint) const; 51 char32 GetPairedBracket(char32 codepoint) const; 52 53 // Forward declaration for friend. 54 class RegexPattern; 55 56 class RegexMatcher { 57 public: 58 static constexpr int kError = -1; 59 static constexpr int kNoError = 0; 60 61 // Checks whether the input text matches the pattern exactly. 62 bool Matches(int* status) const; 63 64 // Approximate Matches() implementation implemented using Find(). It uses 65 // the first Find() result and then checks that it spans the whole input. 66 // NOTE: Unlike Matches() it can result in false negatives. 67 // NOTE: Resets the matcher, so the current Find() state will be lost. 68 bool ApproximatelyMatches(int* status); 69 70 // Finds occurrences of the pattern in the input text. 71 // Can be called repeatedly to find all occurences. A call will update 72 // internal state, so that 'Start', 'End' and 'Group' can be called to get 73 // information about the match. 74 // NOTE: Any call to ApproximatelyMatches() in between Find() calls will 75 // modify the state. 76 bool Find(int* status); 77 78 // Gets the start offset of the last match (from 'Find'). 79 // Sets status to 'kError' if 'Find' 80 // was not called previously. 81 int Start(int* status) const; 82 83 // Gets the start offset of the specified group of the last match. 84 // (from 'Find'). 85 // Sets status to 'kError' if an invalid group was specified or if 'Find' 86 // was not called previously. 87 int Start(int group_idx, int* status) const; 88 89 // Gets the end offset of the last match (from 'Find'). 90 // Sets status to 'kError' if 'Find' 91 // was not called previously. 92 int End(int* status) const; 93 94 // Gets the end offset of the specified group of the last match. 95 // (from 'Find'). 96 // Sets status to 'kError' if an invalid group was specified or if 'Find' 97 // was not called previously. 98 int End(int group_idx, int* status) const; 99 100 // Gets the text of the last match (from 'Find'). 101 // Sets status to 'kError' if 'Find' was not called previously. 102 UnicodeText Group(int* status) const; 103 104 // Gets the text of the specified group of the last match (from 'Find'). 105 // Sets status to 'kError' if an invalid group was specified or if 'Find' 106 // was not called previously. 107 UnicodeText Group(int group_idx, int* status) const; 108 109 // Returns the matched text (the 0th capturing group). Text()110 std::string Text() const { 111 ScopedStringChars text_str = 112 GetScopedStringChars(jni_cache_->GetEnv(), text_.get()); 113 return text_str.get(); 114 } 115 116 private: 117 friend class RegexPattern; 118 RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher, 119 ScopedGlobalRef<jstring> text); 120 bool UpdateLastFindOffset() const; 121 122 const JniCache* jni_cache_; 123 ScopedGlobalRef<jobject> matcher_; 124 ScopedGlobalRef<jstring> text_; 125 mutable int last_find_offset_ = 0; 126 mutable int last_find_offset_codepoints_ = 0; 127 mutable bool last_find_offset_dirty_ = true; 128 }; 129 130 class RegexPattern { 131 public: 132 std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const; 133 134 private: 135 friend class UniLib; 136 RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern, 137 bool lazy); 138 void LockedInitializeIfNotAlready() const; 139 140 const JniCache* jni_cache_; 141 142 // These members need to be mutable because of the lazy initialization. 143 // NOTE: The Matcher method first ensures (using a lock) that the 144 // initialization was attempted (by using LockedInitializeIfNotAlready) and 145 // then can access them without locking. 146 mutable std::mutex mutex_; 147 mutable ScopedGlobalRef<jobject> pattern_; 148 mutable bool initialized_; 149 mutable bool initialization_failure_; 150 mutable UnicodeText pattern_text_; 151 }; 152 153 class BreakIterator { 154 public: 155 int Next(); 156 157 static constexpr int kDone = -1; 158 159 private: 160 friend class UniLib; 161 BreakIterator(const JniCache* jni_cache, const UnicodeText& text); 162 163 const JniCache* jni_cache_; 164 ScopedGlobalRef<jstring> text_; 165 ScopedGlobalRef<jobject> iterator_; 166 int last_break_index_; 167 int last_unicode_index_; 168 }; 169 170 std::unique_ptr<RegexPattern> CreateRegexPattern( 171 const UnicodeText& regex) const; 172 std::unique_ptr<RegexPattern> CreateLazyRegexPattern( 173 const UnicodeText& regex) const; 174 std::unique_ptr<BreakIterator> CreateBreakIterator( 175 const UnicodeText& text) const; 176 177 private: 178 std::shared_ptr<JniCache> jni_cache_; 179 }; 180 181 } // namespace libtextclassifier3 182 183 #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_ 184