1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 // An implementation of Unilib that uses Android Java interfaces via JNI. The
18 // performance critical ops have been re-implemented in C++.
19 // Specifically, this class must be compatible with API level 14 (ICS).
20
21 #ifndef LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
22 #define LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
23
24 #include <jni.h>
25
26 #include <memory>
27 #include <mutex> // NOLINT
28 #include <string>
29
30 #include "utils/base/integral_types.h"
31 #include "utils/java/jni-base.h"
32 #include "utils/java/jni-cache.h"
33 #include "utils/java/jni-helper.h"
34 #include "utils/utf8/unicodetext.h"
35 #include "utils/utf8/unilib-common.h"
36
37 namespace libtextclassifier3 {
38
39 class UniLibBase {
40 public:
41 UniLibBase();
42 explicit UniLibBase(const std::shared_ptr<JniCache>& jni_cache);
43
44 bool ParseInt32(const UnicodeText& text, int32* result) const;
45 bool ParseInt64(const UnicodeText& text, int64* result) const;
46 bool ParseDouble(const UnicodeText& text, double* result) const;
47
48 bool IsOpeningBracket(char32 codepoint) const;
49 bool IsClosingBracket(char32 codepoint) const;
50 bool IsWhitespace(char32 codepoint) const;
51 bool IsDigit(char32 codepoint) const;
52 bool IsLower(char32 codepoint) const;
53 bool IsUpper(char32 codepoint) const;
54 bool IsPunctuation(char32 codepoint) const;
55
56 char32 ToLower(char32 codepoint) const;
57 char32 ToUpper(char32 codepoint) const;
58 char32 GetPairedBracket(char32 codepoint) const;
59
60 StatusOr<int32> Length(const UnicodeText& text) const;
61
62 // Forward declaration for friend.
63 class RegexPattern;
64
65 class RegexMatcher {
66 public:
67 static constexpr int kError = -1;
68 static constexpr int kNoError = 0;
69
70 // Checks whether the input text matches the pattern exactly.
71 bool Matches(int* status) const;
72
73 // Approximate Matches() implementation implemented using Find(). It uses
74 // the first Find() result and then checks that it spans the whole input.
75 // NOTE: Unlike Matches() it can result in false negatives.
76 // NOTE: Resets the matcher, so the current Find() state will be lost.
77 bool ApproximatelyMatches(int* status);
78
79 // Finds occurrences of the pattern in the input text.
80 // Can be called repeatedly to find all occurrences. A call will update
81 // internal state, so that 'Start', 'End' and 'Group' can be called to get
82 // information about the match.
83 // NOTE: Any call to ApproximatelyMatches() in between Find() calls will
84 // modify the state.
85 bool Find(int* status);
86
87 // Gets the start offset of the last match (from 'Find').
88 // Sets status to 'kError' if 'Find'
89 // was not called previously.
90 int Start(int* status) const;
91
92 // Gets the start offset of the specified group of the last match.
93 // (from 'Find').
94 // Sets status to 'kError' if an invalid group was specified or if 'Find'
95 // was not called previously.
96 int Start(int group_idx, int* status) const;
97
98 // Gets the end offset of the last match (from 'Find').
99 // Sets status to 'kError' if 'Find'
100 // was not called previously.
101 int End(int* status) const;
102
103 // Gets the end offset of the specified group of the last match.
104 // (from 'Find').
105 // Sets status to 'kError' if an invalid group was specified or if 'Find'
106 // was not called previously.
107 int End(int group_idx, int* status) const;
108
109 // Gets the text of the last match (from 'Find').
110 // Sets status to 'kError' if 'Find' was not called previously.
111 UnicodeText Group(int* status) const;
112
113 // Gets the text of the specified group of the last match (from 'Find').
114 // Sets status to 'kError' if an invalid group was specified or if 'Find'
115 // was not called previously.
116 UnicodeText Group(int group_idx, int* status) const;
117
118 // Returns the matched text (the 0th capturing group).
Text()119 std::string Text() const {
120 StatusOr<std::string> status_or_result =
121 JStringToUtf8String(jni_cache_->GetEnv(), text_.get());
122 if (!status_or_result.ok()) {
123 TC3_LOG(ERROR) << "JStringToUtf8String failed.";
124 return "";
125 }
126 return status_or_result.ValueOrDie();
127 }
128
129 private:
130 friend class RegexPattern;
131 RegexMatcher(const JniCache* jni_cache, ScopedGlobalRef<jobject> matcher,
132 ScopedGlobalRef<jstring> text);
133 bool UpdateLastFindOffset() const;
134
135 const JniCache* jni_cache_;
136 ScopedGlobalRef<jobject> matcher_;
137 ScopedGlobalRef<jstring> text_;
138 mutable int last_find_offset_ = 0;
139 mutable int last_find_offset_codepoints_ = 0;
140 mutable bool last_find_offset_dirty_ = true;
141 };
142
143 class RegexPattern {
144 public:
145 std::unique_ptr<RegexMatcher> Matcher(const UnicodeText& context) const;
146
147 private:
148 friend class UniLibBase;
149 RegexPattern(const JniCache* jni_cache, const UnicodeText& pattern,
150 bool lazy);
151 Status LockedInitializeIfNotAlready() const;
152
153 const JniCache* jni_cache_;
154
155 // These members need to be mutable because of the lazy initialization.
156 // NOTE: The Matcher method first ensures (using a lock) that the
157 // initialization was attempted (by using LockedInitializeIfNotAlready) and
158 // then can access them without locking.
159 mutable std::mutex mutex_;
160 mutable ScopedGlobalRef<jobject> pattern_;
161 mutable bool initialized_;
162 mutable bool initialization_failure_;
163 mutable UnicodeText pattern_text_;
164 };
165
166 class BreakIterator {
167 public:
168 int Next();
169
170 static constexpr int kDone = -1;
171
172 private:
173 friend class UniLibBase;
174 BreakIterator(const JniCache* jni_cache, const UnicodeText& text);
175
176 const JniCache* jni_cache_;
177 ScopedGlobalRef<jstring> text_;
178 ScopedGlobalRef<jobject> iterator_;
179 int last_break_index_;
180 int last_unicode_index_;
181 };
182
183 std::unique_ptr<RegexPattern> CreateRegexPattern(
184 const UnicodeText& regex) const;
185 std::unique_ptr<RegexPattern> CreateLazyRegexPattern(
186 const UnicodeText& regex) const;
187 std::unique_ptr<BreakIterator> CreateBreakIterator(
188 const UnicodeText& text) const;
189
190 private:
191 template <class T>
192 bool ParseInt(const UnicodeText& text, T* result) const;
193
194 std::shared_ptr<JniCache> jni_cache_;
195 };
196
197 template <class T>
ParseInt(const UnicodeText & text,T * result)198 bool UniLibBase::ParseInt(const UnicodeText& text, T* result) const {
199 if (!jni_cache_) {
200 return false;
201 }
202
203 // Avoid throwing exceptions when the text is unlikely to be a number.
204 int32 result32 = 0;
205 if (!PassesIntPreChesks(text, result32)) {
206 return false;
207 }
208
209 JNIEnv* env = jni_cache_->GetEnv();
210 TC3_ASSIGN_OR_RETURN_FALSE(const ScopedLocalRef<jstring> text_java,
211 jni_cache_->ConvertToJavaString(text));
212 TC3_ASSIGN_OR_RETURN_FALSE(
213 *result,
214 JniHelper::CallStaticIntMethod<T>(
215 env,
216 /*print_exception_on_error=*/false, jni_cache_->integer_class.get(),
217 jni_cache_->integer_parse_int, text_java.get()));
218 return true;
219 }
220
221 } // namespace libtextclassifier3
222
223 #endif // LIBTEXTCLASSIFIER_UTILS_UTF8_UNILIB_JAVAICU_H_
224