1 /*
2 * Copyright (C) 2018 The Android Open Source Project
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17 #include "utils/utf8/unilib-javaicu.h"
18
19 #include <algorithm>
20 #include <cassert>
21 #include <cctype>
22 #include <map>
23
24 #include "utils/java/string_utils.h"
25
26 namespace libtextclassifier3 {
27 namespace {
28
29 // -----------------------------------------------------------------------------
30 // Native implementations.
31 // -----------------------------------------------------------------------------
32
// Number of elements in a C array.
// The expansion is fully parenthesized so that it behaves as a single operand
// when the macro is used inside a larger expression (e.g. `x / ARRAYSIZE(a)`),
// and the argument is parenthesized so that it is dereferenced as a unit.
#define ARRAYSIZE(a) (sizeof(a) / sizeof(*(a)))
34
// Opening bracket codepoints, as selected by the `Ps` grep below.
// The table is looked up with GetMatchIndex(), so it has to stay sorted in
// ascending order.
// Derived from http://www.unicode.org/Public/UNIDATA/UnicodeData.txt
// grep -E "Ps" UnicodeData.txt | \
// sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p"
// IMPORTANT: entries with the same offsets in kOpeningBrackets and
// kClosingBrackets must be counterparts.
constexpr char32 kOpeningBrackets[] = {
    0x0028, 0x005B, 0x007B, 0x0F3C, 0x2045, 0x207D, 0x208D, 0x2329, 0x2768,
    0x276A, 0x276C, 0x2770, 0x2772, 0x2774, 0x27E6, 0x27E8, 0x27EA, 0x27EC,
    0x27EE, 0x2983, 0x2985, 0x2987, 0x2989, 0x298B, 0x298D, 0x298F, 0x2991,
    0x2993, 0x2995, 0x2997, 0x29FC, 0x2E22, 0x2E24, 0x2E26, 0x2E28, 0x3008,
    0x300A, 0x300C, 0x300E, 0x3010, 0x3014, 0x3016, 0x3018, 0x301A, 0xFD3F,
    0xFE17, 0xFE35, 0xFE37, 0xFE39, 0xFE3B, 0xFE3D, 0xFE3F, 0xFE41, 0xFE43,
    0xFE47, 0xFE59, 0xFE5B, 0xFE5D, 0xFF08, 0xFF3B, 0xFF5B, 0xFF5F, 0xFF62};
// Element count of kOpeningBrackets; statically asserted further down to
// equal kNumClosingBrackets.
constexpr int kNumOpeningBrackets = ARRAYSIZE(kOpeningBrackets);
49
// Closing bracket codepoints, as selected by the `Pe` grep below.
// Entry i is the counterpart of kOpeningBrackets[i]; GetPairedBracket() relies
// on that index correspondence. Must stay sorted in ascending order because it
// is searched with GetMatchIndex().
// grep -E "Pe" UnicodeData.txt | \
// sed -rne "s/^([0-9A-Z]{4});.*(PAREN|BRACKET|BRAKCET|BRACE).*/0x\1, /p"
constexpr char32 kClosingBrackets[] = {
    0x0029, 0x005D, 0x007D, 0x0F3D, 0x2046, 0x207E, 0x208E, 0x232A, 0x2769,
    0x276B, 0x276D, 0x2771, 0x2773, 0x2775, 0x27E7, 0x27E9, 0x27EB, 0x27ED,
    0x27EF, 0x2984, 0x2986, 0x2988, 0x298A, 0x298C, 0x298E, 0x2990, 0x2992,
    0x2994, 0x2996, 0x2998, 0x29FD, 0x2E23, 0x2E25, 0x2E27, 0x2E29, 0x3009,
    0x300B, 0x300D, 0x300F, 0x3011, 0x3015, 0x3017, 0x3019, 0x301B, 0xFD3E,
    0xFE18, 0xFE36, 0xFE38, 0xFE3A, 0xFE3C, 0xFE3E, 0xFE40, 0xFE42, 0xFE44,
    0xFE48, 0xFE5A, 0xFE5C, 0xFE5E, 0xFF09, 0xFF3D, 0xFF5D, 0xFF60, 0xFF63};
// Element count of kClosingBrackets.
constexpr int kNumClosingBrackets = ARRAYSIZE(kClosingBrackets);
61
62 // grep -E "WS" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
63 constexpr char32 kWhitespaces[] = {
64 0x000C, 0x0020, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004,
65 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x205F,
66 0x21C7, 0x21C8, 0x21C9, 0x21CA, 0x21F6, 0x2B31, 0x2B84, 0x2B85,
67 0x2B86, 0x2B87, 0x2B94, 0x3000, 0x4DCC, 0x10344, 0x10347, 0x1DA0A,
68 0x1DA0B, 0x1DA0C, 0x1DA0D, 0x1DA0E, 0x1DA0F, 0x1DA10, 0x1F4F0, 0x1F500,
69 0x1F501, 0x1F502, 0x1F503, 0x1F504, 0x1F5D8, 0x1F5DE};
70 constexpr int kNumWhitespaces = ARRAYSIZE(kWhitespaces);
71
72 // grep -E "Nd" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
73 // As the name suggests, these ranges are always 10 codepoints long, so we just
74 // store the end of the range.
75 constexpr char32 kDecimalDigitRangesEnd[] = {
76 0x0039, 0x0669, 0x06f9, 0x07c9, 0x096f, 0x09ef, 0x0a6f, 0x0aef,
77 0x0b6f, 0x0bef, 0x0c6f, 0x0cef, 0x0d6f, 0x0def, 0x0e59, 0x0ed9,
78 0x0f29, 0x1049, 0x1099, 0x17e9, 0x1819, 0x194f, 0x19d9, 0x1a89,
79 0x1a99, 0x1b59, 0x1bb9, 0x1c49, 0x1c59, 0xa629, 0xa8d9, 0xa909,
80 0xa9d9, 0xa9f9, 0xaa59, 0xabf9, 0xff19, 0x104a9, 0x1106f, 0x110f9,
81 0x1113f, 0x111d9, 0x112f9, 0x11459, 0x114d9, 0x11659, 0x116c9, 0x11739,
82 0x118e9, 0x11c59, 0x11d59, 0x16a69, 0x16b59, 0x1d7ff};
83 constexpr int kNumDecimalDigitRangesEnd = ARRAYSIZE(kDecimalDigitRangesEnd);
84
// grep -E "Lu" UnicodeData.txt | sed -re "s/([0-9A-Z]+);.*/0x\1, /"
// There are three common ways in which upper/lower case codepoint ranges
// were introduced: one offs, dense ranges, and ranges that alternate between
// lower and upper case. For the sake of keeping our binary size down, we
// treat each independently.
// All tables below are searched with std::lower_bound (via GetMatchIndex /
// GetOverlappingRangeIndex) and therefore must stay sorted in ascending order.

// Isolated upper case codepoints (exact-match lookup).
constexpr char32 kUpperSingles[] = {
    0x01b8, 0x01bc, 0x01c4, 0x01c7, 0x01ca, 0x01f1, 0x0376, 0x037f,
    0x03cf, 0x03f4, 0x03fa, 0x10c7, 0x10cd, 0x2102, 0x2107, 0x2115,
    0x2145, 0x2183, 0x2c72, 0x2c75, 0x2cf2, 0xa7b6};
constexpr int kNumUpperSingles = ARRAYSIZE(kUpperSingles);
// Dense ranges: every codepoint in [kUpperRanges1Start[i], kUpperRanges1End[i]]
// is treated as upper case (IsUpper() queries these with stride 1).
constexpr char32 kUpperRanges1Start[] = {
    0x0041, 0x00c0, 0x00d8, 0x0181, 0x018a, 0x018e, 0x0193, 0x0196,
    0x019c, 0x019f, 0x01b2, 0x01f7, 0x023a, 0x023d, 0x0244, 0x0389,
    0x0392, 0x03a3, 0x03d2, 0x03fd, 0x0531, 0x10a0, 0x13a0, 0x1f08,
    0x1f18, 0x1f28, 0x1f38, 0x1f48, 0x1f68, 0x1fb8, 0x1fc8, 0x1fd8,
    0x1fe8, 0x1ff8, 0x210b, 0x2110, 0x2119, 0x212b, 0x2130, 0x213e,
    0x2c00, 0x2c63, 0x2c6e, 0x2c7e, 0xa7ab, 0xa7b0};
constexpr int kNumUpperRanges1Start = ARRAYSIZE(kUpperRanges1Start);
// Inclusive ends of the dense ranges; entry i pairs with kUpperRanges1Start[i].
constexpr char32 kUpperRanges1End[] = {
    0x005a, 0x00d6, 0x00de, 0x0182, 0x018b, 0x0191, 0x0194, 0x0198,
    0x019d, 0x01a0, 0x01b3, 0x01f8, 0x023b, 0x023e, 0x0246, 0x038a,
    0x03a1, 0x03ab, 0x03d4, 0x042f, 0x0556, 0x10c5, 0x13f5, 0x1f0f,
    0x1f1d, 0x1f2f, 0x1f3f, 0x1f4d, 0x1f6f, 0x1fbb, 0x1fcb, 0x1fdb,
    0x1fec, 0x1ffb, 0x210d, 0x2112, 0x211d, 0x212d, 0x2133, 0x213f,
    0x2c2e, 0x2c64, 0x2c70, 0x2c80, 0xa7ae, 0xa7b4};
constexpr int kNumUpperRanges1End = ARRAYSIZE(kUpperRanges1End);
// Alternating ranges: only every second codepoint, counted from the range
// start, is treated as upper case (IsUpper() queries these with stride 2).
constexpr char32 kUpperRanges2Start[] = {
    0x0100, 0x0139, 0x014a, 0x0179, 0x0184, 0x0187, 0x01a2, 0x01a7, 0x01ac,
    0x01af, 0x01b5, 0x01cd, 0x01de, 0x01f4, 0x01fa, 0x0241, 0x0248, 0x0370,
    0x0386, 0x038c, 0x038f, 0x03d8, 0x03f7, 0x0460, 0x048a, 0x04c1, 0x04d0,
    0x1e00, 0x1e9e, 0x1f59, 0x2124, 0x2c60, 0x2c67, 0x2c82, 0x2ceb, 0xa640,
    0xa680, 0xa722, 0xa732, 0xa779, 0xa77e, 0xa78b, 0xa790, 0xa796};
constexpr int kNumUpperRanges2Start = ARRAYSIZE(kUpperRanges2Start);
// Inclusive ends of the alternating ranges; entry i pairs with
// kUpperRanges2Start[i].
constexpr char32 kUpperRanges2End[] = {
    0x0136, 0x0147, 0x0178, 0x017d, 0x0186, 0x0189, 0x01a6, 0x01a9, 0x01ae,
    0x01b1, 0x01b7, 0x01db, 0x01ee, 0x01f6, 0x0232, 0x0243, 0x024e, 0x0372,
    0x0388, 0x038e, 0x0391, 0x03ee, 0x03f9, 0x0480, 0x04c0, 0x04cd, 0x052e,
    0x1e94, 0x1efe, 0x1f5f, 0x212a, 0x2c62, 0x2c6d, 0x2ce2, 0x2ced, 0xa66c,
    0xa69a, 0xa72e, 0xa76e, 0xa77d, 0xa786, 0xa78d, 0xa792, 0xa7aa};
constexpr int kNumUpperRanges2End = ARRAYSIZE(kUpperRanges2End);
125
// grep -E "Lu" UnicodeData.txt | \
// sed -rne "s/^([0-9A-Z]+);.*;([0-9A-Z]+);$/(0x\1, 0x\2), /p"
// We have two strategies for mapping from upper to lower case. We have single
// character lookups that do not follow a pattern, and ranges for which there
// is a constant codepoint shift.
// Note that these ranges ignore anything that's not an upper case character,
// so when applied to a non-uppercase character the result is incorrect.
// ToLower() consults the singles table first and only falls back to the
// ranges when there is no exact match, so a single overrides any range that
// also contains it.

// Upper case codepoints with an individual offset (exact-match lookup, sorted).
constexpr int kToLowerSingles[] = {
    0x0130, 0x0178, 0x0181, 0x0186, 0x018b, 0x018e, 0x018f, 0x0190, 0x0191,
    0x0194, 0x0196, 0x0197, 0x0198, 0x019c, 0x019d, 0x019f, 0x01a6, 0x01a9,
    0x01ae, 0x01b7, 0x01f6, 0x01f7, 0x0220, 0x023a, 0x023d, 0x023e, 0x0243,
    0x0244, 0x0245, 0x037f, 0x0386, 0x038c, 0x03cf, 0x03f4, 0x03f9, 0x04c0,
    0x1e9e, 0x1fec, 0x2126, 0x212a, 0x212b, 0x2132, 0x2183, 0x2c60, 0x2c62,
    0x2c63, 0x2c64, 0x2c6d, 0x2c6e, 0x2c6f, 0x2c70, 0xa77d, 0xa78d, 0xa7aa,
    0xa7ab, 0xa7ac, 0xa7ad, 0xa7ae, 0xa7b0, 0xa7b1, 0xa7b2, 0xa7b3};
constexpr int kNumToLowerSingles = ARRAYSIZE(kToLowerSingles);
// Value added to kToLowerSingles[i] to obtain its lower case codepoint.
constexpr int kToLowerSinglesOffsets[] = {
    -199,   -121,   210,    206,    1,      79,     202,    203,    1,
    207,    211,    209,    1,      211,    213,    214,    218,    218,
    218,    219,    -97,    -56,    -130,   10795,  -163,   10792,  -195,
    69,     71,     116,    38,     64,     8,      -60,    -7,     15,
    -7615,  -7,     -7517,  -8383,  -8262,  28,     1,      1,      -10743,
    -3814,  -10727, -10780, -10749, -10783, -10782, -35332, -42280, -42308,
    -42319, -42315, -42305, -42308, -42258, -42282, -42261, 928};
constexpr int kNumToLowerSinglesOffsets = ARRAYSIZE(kToLowerSinglesOffsets);
// Inclusive starts of the constant-shift ranges; entry i pairs with
// kToLowerRangesEnd[i] and kToLowerRangesOffsets[i].
constexpr int kToLowerRangesStart[] = {
    0x0041, 0x0100, 0x0189, 0x01a0, 0x01b1, 0x01b3, 0x0388, 0x038e, 0x0391,
    0x03d8, 0x03fd, 0x0400, 0x0410, 0x0460, 0x0531, 0x10a0, 0x13a0, 0x13f0,
    0x1e00, 0x1f08, 0x1fba, 0x1fc8, 0x1fd8, 0x1fda, 0x1fe8, 0x1fea, 0x1ff8,
    0x1ffa, 0x2c00, 0x2c67, 0x2c7e, 0x2c80, 0xff21, 0x10400, 0x10c80, 0x118a0};
constexpr int kNumToLowerRangesStart = ARRAYSIZE(kToLowerRangesStart);
// Inclusive ends of the constant-shift ranges (sorted; this is the array that
// is binary-searched).
constexpr int kToLowerRangesEnd[] = {
    0x00de, 0x0187, 0x019f, 0x01af, 0x01b2, 0x0386, 0x038c, 0x038f, 0x03cf,
    0x03fa, 0x03ff, 0x040f, 0x042f, 0x052e, 0x0556, 0x10cd, 0x13ef, 0x13f5,
    0x1efe, 0x1fb9, 0x1fbb, 0x1fcb, 0x1fd9, 0x1fdb, 0x1fe9, 0x1fec, 0x1ff9,
    0x2183, 0x2c64, 0x2c75, 0x2c7f, 0xa7b6, 0xff3a, 0x104d3, 0x10cb2, 0x118bf};
constexpr int kNumToLowerRangesEnd = ARRAYSIZE(kToLowerRangesEnd);
// Value added to an upper case codepoint inside range i to obtain its lower
// case codepoint.
constexpr int kToLowerRangesOffsets[] = {
    32, 1,    205,  1,    217, 1, 37,     63, 32, 1,  -130, 80,
    32, 1,    48,   7264, 38864, 8, 1,    -8, -74, -86, -8, -100,
    -8, -112, -128, -126, 48,  1, -10815, 1,  32, 40, 64,   32};
constexpr int kNumToLowerRangesOffsets = ARRAYSIZE(kToLowerRangesOffsets);
168
// ARRAYSIZE is only needed for the table sizes above; drop it so that it does
// not leak into the rest of the translation unit.
#undef ARRAYSIZE

// Compile-time consistency checks: tables that are indexed in parallel must
// have the same number of entries.
static_assert(kNumOpeningBrackets == kNumClosingBrackets,
              "mismatching number of opening and closing brackets");
static_assert(kNumUpperRanges1Start == kNumUpperRanges1End,
              "number of uppercase stride 1 range starts/ends doesn't match");
static_assert(kNumUpperRanges2Start == kNumUpperRanges2End,
              "number of uppercase stride 2 range starts/ends doesn't match");
static_assert(kNumToLowerSingles == kNumToLowerSinglesOffsets,
              "number of to lower singles and offsets doesn't match");
static_assert(kNumToLowerRangesStart == kNumToLowerRangesEnd,
              "mismatching number of range starts/ends for to lower ranges");
static_assert(kNumToLowerRangesStart == kNumToLowerRangesOffsets,
              "number of to lower ranges and offsets doesn't match");
183
// Sentinel returned by the lookup helpers below when the codepoint is not
// found. It is negative, so it can never collide with a valid array index.
constexpr int kNoMatch = -1;
185
186 // Returns the index of the element in the array that matched the given
187 // codepoint, or kNoMatch if the element didn't exist.
188 // The input array must be in sorted order.
GetMatchIndex(const char32 * array,int array_length,char32 c)189 int GetMatchIndex(const char32* array, int array_length, char32 c) {
190 const char32* end = array + array_length;
191 const auto find_it = std::lower_bound(array, end, c);
192 if (find_it != end && *find_it == c) {
193 return find_it - array;
194 } else {
195 return kNoMatch;
196 }
197 }
198
199 // Returns the index of the range in the array that overlapped the given
200 // codepoint, or kNoMatch if no such range existed.
201 // The input array must be in sorted order.
GetOverlappingRangeIndex(const char32 * arr,int arr_length,int range_length,char32 c)202 int GetOverlappingRangeIndex(const char32* arr, int arr_length,
203 int range_length, char32 c) {
204 const char32* end = arr + arr_length;
205 const auto find_it = std::lower_bound(arr, end, c);
206 if (find_it == end) {
207 return kNoMatch;
208 }
209 // The end is inclusive, we so subtract one less than the range length.
210 const char32 range_end = *find_it;
211 const char32 range_start = range_end - (range_length - 1);
212 if (c < range_start || range_end < c) {
213 return kNoMatch;
214 } else {
215 return find_it - arr;
216 }
217 }
218
219 // As above, but with explicit codepoint start and end indices for the range.
220 // The input array must be in sorted order.
GetOverlappingRangeIndex(const char32 * start_arr,const char32 * end_arr,int arr_length,int stride,char32 c)221 int GetOverlappingRangeIndex(const char32* start_arr, const char32* end_arr,
222 int arr_length, int stride, char32 c) {
223 const char32* end_arr_end = end_arr + arr_length;
224 const auto find_it = std::lower_bound(end_arr, end_arr_end, c);
225 if (find_it == end_arr_end) {
226 return kNoMatch;
227 }
228 // Find the corresponding start.
229 const int range_index = find_it - end_arr;
230 const char32 range_start = start_arr[range_index];
231 const char32 range_end = *find_it;
232 if (c < range_start || range_end < c) {
233 return kNoMatch;
234 }
235 if ((c - range_start) % stride == 0) {
236 return range_index;
237 } else {
238 return kNoMatch;
239 }
240 }
241
242 } // anonymous namespace
243
// Default construction is not supported for this implementation: it only logs
// a FATAL-severity message saying that a JniCache is required. Use the
// constructor that takes a JniCache instead.
UniLib::UniLib() {
  TC3_LOG(FATAL) << "Java ICU UniLib must be initialized with a JniCache.";
}
247
// Creates a UniLib that shares ownership of `jni_cache`. The cache is used by
// the JVM-backed operations below (integer parsing, regular expressions, break
// iteration); those operations check it for null before use.
UniLib::UniLib(const std::shared_ptr<JniCache>& jni_cache)
    : jni_cache_(jni_cache) {}
250
IsOpeningBracket(char32 codepoint) const251 bool UniLib::IsOpeningBracket(char32 codepoint) const {
252 return GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint) >= 0;
253 }
254
IsClosingBracket(char32 codepoint) const255 bool UniLib::IsClosingBracket(char32 codepoint) const {
256 return GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint) >= 0;
257 }
258
IsWhitespace(char32 codepoint) const259 bool UniLib::IsWhitespace(char32 codepoint) const {
260 return GetMatchIndex(kWhitespaces, kNumWhitespaces, codepoint) >= 0;
261 }
262
IsDigit(char32 codepoint) const263 bool UniLib::IsDigit(char32 codepoint) const {
264 return GetOverlappingRangeIndex(kDecimalDigitRangesEnd,
265 kNumDecimalDigitRangesEnd,
266 /*range_length=*/10, codepoint) >= 0;
267 }
268
IsUpper(char32 codepoint) const269 bool UniLib::IsUpper(char32 codepoint) const {
270 if (GetMatchIndex(kUpperSingles, kNumUpperSingles, codepoint) >= 0) {
271 return true;
272 } else if (GetOverlappingRangeIndex(kUpperRanges1Start, kUpperRanges1End,
273 kNumUpperRanges1Start, /*stride=*/1,
274 codepoint) >= 0) {
275 return true;
276 } else if (GetOverlappingRangeIndex(kUpperRanges2Start, kUpperRanges2End,
277 kNumUpperRanges2Start, /*stride=*/2,
278 codepoint) >= 0) {
279 return true;
280 } else {
281 return false;
282 }
283 }
284
ToLower(char32 codepoint) const285 char32 UniLib::ToLower(char32 codepoint) const {
286 // Make sure we still produce output even if the method is called for a
287 // codepoint that's not an uppercase character.
288 if (!IsUpper(codepoint)) {
289 return codepoint;
290 }
291 const int singles_idx =
292 GetMatchIndex(kToLowerSingles, kNumToLowerSingles, codepoint);
293 if (singles_idx >= 0) {
294 return codepoint + kToLowerSinglesOffsets[singles_idx];
295 }
296 const int ranges_idx =
297 GetOverlappingRangeIndex(kToLowerRangesStart, kToLowerRangesEnd,
298 kNumToLowerRangesStart, /*stride=*/1, codepoint);
299 if (ranges_idx >= 0) {
300 return codepoint + kToLowerRangesOffsets[ranges_idx];
301 }
302 return codepoint;
303 }
304
GetPairedBracket(char32 codepoint) const305 char32 UniLib::GetPairedBracket(char32 codepoint) const {
306 const int open_offset =
307 GetMatchIndex(kOpeningBrackets, kNumOpeningBrackets, codepoint);
308 if (open_offset >= 0) {
309 return kClosingBrackets[open_offset];
310 }
311 const int close_offset =
312 GetMatchIndex(kClosingBrackets, kNumClosingBrackets, codepoint);
313 if (close_offset >= 0) {
314 return kOpeningBrackets[close_offset];
315 }
316 return codepoint;
317 }
318
319 // -----------------------------------------------------------------------------
320 // Implementations that call out to JVM. Behold the beauty.
321 // -----------------------------------------------------------------------------
322
ParseInt32(const UnicodeText & text,int * result) const323 bool UniLib::ParseInt32(const UnicodeText& text, int* result) const {
324 if (jni_cache_) {
325 JNIEnv* env = jni_cache_->GetEnv();
326 const ScopedLocalRef<jstring> text_java =
327 jni_cache_->ConvertToJavaString(text);
328 jint res = env->CallStaticIntMethod(jni_cache_->integer_class.get(),
329 jni_cache_->integer_parse_int,
330 text_java.get());
331 if (jni_cache_->ExceptionCheckAndClear()) {
332 return false;
333 }
334 *result = res;
335 return true;
336 }
337 return false;
338 }
339
CreateRegexPattern(const UnicodeText & regex) const340 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateRegexPattern(
341 const UnicodeText& regex) const {
342 return std::unique_ptr<UniLib::RegexPattern>(
343 new UniLib::RegexPattern(jni_cache_.get(), regex, /*lazy=*/false));
344 }
345
CreateLazyRegexPattern(const UnicodeText & regex) const346 std::unique_ptr<UniLib::RegexPattern> UniLib::CreateLazyRegexPattern(
347 const UnicodeText& regex) const {
348 return std::unique_ptr<UniLib::RegexPattern>(
349 new UniLib::RegexPattern(jni_cache_.get(), regex, /*lazy=*/true));
350 }
351
// Builds a pattern wrapper around `pattern`.
//   jni_cache: non-owning pointer to the JNI cache; may be null, in which case
//              no Java pattern object is ever created.
//   pattern:   the regular expression text; a copy is kept in pattern_text_
//              until the pattern has been initialized.
//   lazy:      when false the pattern is initialized immediately; when true
//              initialization is postponed to the first Matcher() call.
UniLib::RegexPattern::RegexPattern(const JniCache* jni_cache,
                                   const UnicodeText& pattern, bool lazy)
    : jni_cache_(jni_cache),
      // The global ref starts out empty; it needs the JVM (if any) for cleanup.
      pattern_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
      initialized_(false),
      initialization_failure_(false),
      pattern_text_(pattern) {
  if (!lazy) {
    LockedInitializeIfNotAlready();
  }
}
363
LockedInitializeIfNotAlready() const364 void UniLib::RegexPattern::LockedInitializeIfNotAlready() const {
365 std::lock_guard<std::mutex> guard(mutex_);
366 if (initialized_ || initialization_failure_) {
367 return;
368 }
369
370 if (jni_cache_) {
371 JNIEnv* jenv = jni_cache_->GetEnv();
372 const ScopedLocalRef<jstring> regex_java =
373 jni_cache_->ConvertToJavaString(pattern_text_);
374 pattern_ = MakeGlobalRef(jenv->CallStaticObjectMethod(
375 jni_cache_->pattern_class.get(),
376 jni_cache_->pattern_compile, regex_java.get()),
377 jenv, jni_cache_->jvm);
378
379 if (jni_cache_->ExceptionCheckAndClear() || pattern_ == nullptr) {
380 initialization_failure_ = true;
381 pattern_.reset();
382 return;
383 }
384
385 initialized_ = true;
386 pattern_text_.clear(); // We don't need this anymore.
387 }
388 }
389
// Out-of-line definitions of the static constexpr status codes of
// RegexMatcher (their values are not visible in this file).
constexpr int UniLib::RegexMatcher::kError;
constexpr int UniLib::RegexMatcher::kNoError;
392
Matcher(const UnicodeText & context) const393 std::unique_ptr<UniLib::RegexMatcher> UniLib::RegexPattern::Matcher(
394 const UnicodeText& context) const {
395 LockedInitializeIfNotAlready(); // Possibly lazy initialization.
396 if (initialization_failure_) {
397 return nullptr;
398 }
399
400 if (jni_cache_) {
401 JNIEnv* env = jni_cache_->GetEnv();
402 const jstring context_java =
403 jni_cache_->ConvertToJavaString(context).release();
404 if (!context_java) {
405 return nullptr;
406 }
407 const jobject matcher = env->CallObjectMethod(
408 pattern_.get(), jni_cache_->pattern_matcher, context_java);
409 if (jni_cache_->ExceptionCheckAndClear() || !matcher) {
410 return nullptr;
411 }
412 return std::unique_ptr<UniLib::RegexMatcher>(new RegexMatcher(
413 jni_cache_, MakeGlobalRef(matcher, env, jni_cache_->jvm),
414 MakeGlobalRef(context_java, env, jni_cache_->jvm)));
415 } else {
416 // NOTE: A valid object needs to be created here to pass the interface
417 // tests.
418 return std::unique_ptr<UniLib::RegexMatcher>(
419 new RegexMatcher(jni_cache_, nullptr, nullptr));
420 }
421 }
422
// Takes ownership of the global references to the Java matcher object and to
// the Java string it operates on. `jni_cache` is stored as a non-owning
// pointer and may be null (all operations then report kError).
UniLib::RegexMatcher::RegexMatcher(const JniCache* jni_cache,
                                   ScopedGlobalRef<jobject> matcher,
                                   ScopedGlobalRef<jstring> text)
    : jni_cache_(jni_cache),
      matcher_(std::move(matcher)),
      text_(std::move(text)) {}
429
Matches(int * status) const430 bool UniLib::RegexMatcher::Matches(int* status) const {
431 if (jni_cache_) {
432 *status = kNoError;
433 const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
434 matcher_.get(), jni_cache_->matcher_matches);
435 if (jni_cache_->ExceptionCheckAndClear()) {
436 *status = kError;
437 return false;
438 }
439 return result;
440 } else {
441 *status = kError;
442 return false;
443 }
444 }
445
ApproximatelyMatches(int * status)446 bool UniLib::RegexMatcher::ApproximatelyMatches(int* status) {
447 *status = kNoError;
448
449 jni_cache_->GetEnv()->CallObjectMethod(matcher_.get(),
450 jni_cache_->matcher_reset);
451 if (jni_cache_->ExceptionCheckAndClear()) {
452 *status = kError;
453 return kError;
454 }
455
456 if (!Find(status) || *status != kNoError) {
457 return false;
458 }
459
460 const int found_start = jni_cache_->GetEnv()->CallIntMethod(
461 matcher_.get(), jni_cache_->matcher_start_idx, 0);
462 if (jni_cache_->ExceptionCheckAndClear()) {
463 *status = kError;
464 return kError;
465 }
466
467 const int found_end = jni_cache_->GetEnv()->CallIntMethod(
468 matcher_.get(), jni_cache_->matcher_end_idx, 0);
469 if (jni_cache_->ExceptionCheckAndClear()) {
470 *status = kError;
471 return kError;
472 }
473
474 int context_length_bmp = jni_cache_->GetEnv()->CallIntMethod(
475 text_.get(), jni_cache_->string_length);
476 if (jni_cache_->ExceptionCheckAndClear()) {
477 *status = kError;
478 return false;
479 }
480
481 if (found_start != 0 || found_end != context_length_bmp) {
482 return false;
483 }
484
485 return true;
486 }
487
UpdateLastFindOffset() const488 bool UniLib::RegexMatcher::UpdateLastFindOffset() const {
489 if (!last_find_offset_dirty_) {
490 return true;
491 }
492
493 const int find_offset = jni_cache_->GetEnv()->CallIntMethod(
494 matcher_.get(), jni_cache_->matcher_start_idx, 0);
495 if (jni_cache_->ExceptionCheckAndClear()) {
496 return false;
497 }
498
499 const int codepoint_count = jni_cache_->GetEnv()->CallIntMethod(
500 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
501 find_offset);
502 if (jni_cache_->ExceptionCheckAndClear()) {
503 return false;
504 }
505
506 last_find_offset_codepoints_ += codepoint_count;
507 last_find_offset_ = find_offset;
508 last_find_offset_dirty_ = false;
509
510 return true;
511 }
512
Find(int * status)513 bool UniLib::RegexMatcher::Find(int* status) {
514 if (jni_cache_) {
515 const bool result = jni_cache_->GetEnv()->CallBooleanMethod(
516 matcher_.get(), jni_cache_->matcher_find);
517 if (jni_cache_->ExceptionCheckAndClear()) {
518 *status = kError;
519 return false;
520 }
521
522 last_find_offset_dirty_ = true;
523 *status = kNoError;
524 return result;
525 } else {
526 *status = kError;
527 return false;
528 }
529 }
530
// Start index of the whole match: forwards to the group overload with
// group 0.
int UniLib::RegexMatcher::Start(int* status) const {
  return Start(/*group_idx=*/0, status);
}
534
Start(int group_idx,int * status) const535 int UniLib::RegexMatcher::Start(int group_idx, int* status) const {
536 if (jni_cache_) {
537 *status = kNoError;
538
539 if (!UpdateLastFindOffset()) {
540 *status = kError;
541 return kError;
542 }
543
544 const int java_index = jni_cache_->GetEnv()->CallIntMethod(
545 matcher_.get(), jni_cache_->matcher_start_idx, group_idx);
546 if (jni_cache_->ExceptionCheckAndClear()) {
547 *status = kError;
548 return kError;
549 }
550
551 // If the group didn't participate in the match the index is -1.
552 if (java_index == -1) {
553 return -1;
554 }
555
556 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
557 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
558 java_index);
559 if (jni_cache_->ExceptionCheckAndClear()) {
560 *status = kError;
561 return kError;
562 }
563
564 return unicode_index + last_find_offset_codepoints_;
565 } else {
566 *status = kError;
567 return kError;
568 }
569 }
570
// End index of the whole match: forwards to the group overload with group 0.
int UniLib::RegexMatcher::End(int* status) const {
  return End(/*group_idx=*/0, status);
}
574
End(int group_idx,int * status) const575 int UniLib::RegexMatcher::End(int group_idx, int* status) const {
576 if (jni_cache_) {
577 *status = kNoError;
578
579 if (!UpdateLastFindOffset()) {
580 *status = kError;
581 return kError;
582 }
583
584 const int java_index = jni_cache_->GetEnv()->CallIntMethod(
585 matcher_.get(), jni_cache_->matcher_end_idx, group_idx);
586 if (jni_cache_->ExceptionCheckAndClear()) {
587 *status = kError;
588 return kError;
589 }
590
591 // If the group didn't participate in the match the index is -1.
592 if (java_index == -1) {
593 return -1;
594 }
595
596 const int unicode_index = jni_cache_->GetEnv()->CallIntMethod(
597 text_.get(), jni_cache_->string_code_point_count, last_find_offset_,
598 java_index);
599 if (jni_cache_->ExceptionCheckAndClear()) {
600 *status = kError;
601 return kError;
602 }
603
604 return unicode_index + last_find_offset_codepoints_;
605 } else {
606 *status = kError;
607 return kError;
608 }
609 }
610
Group(int * status) const611 UnicodeText UniLib::RegexMatcher::Group(int* status) const {
612 if (jni_cache_) {
613 JNIEnv* jenv = jni_cache_->GetEnv();
614 const ScopedLocalRef<jstring> java_result(
615 reinterpret_cast<jstring>(
616 jenv->CallObjectMethod(matcher_.get(), jni_cache_->matcher_group)),
617 jenv);
618 if (jni_cache_->ExceptionCheckAndClear() || !java_result) {
619 *status = kError;
620 return UTF8ToUnicodeText("", /*do_copy=*/false);
621 }
622
623 std::string result;
624 if (!JStringToUtf8String(jenv, java_result.get(), &result)) {
625 *status = kError;
626 return UTF8ToUnicodeText("", /*do_copy=*/false);
627 }
628 *status = kNoError;
629 return UTF8ToUnicodeText(result, /*do_copy=*/true);
630 } else {
631 *status = kError;
632 return UTF8ToUnicodeText("", /*do_copy=*/false);
633 }
634 }
635
Group(int group_idx,int * status) const636 UnicodeText UniLib::RegexMatcher::Group(int group_idx, int* status) const {
637 if (jni_cache_) {
638 JNIEnv* jenv = jni_cache_->GetEnv();
639 const ScopedLocalRef<jstring> java_result(
640 reinterpret_cast<jstring>(jenv->CallObjectMethod(
641 matcher_.get(), jni_cache_->matcher_group_idx, group_idx)),
642 jenv);
643 if (jni_cache_->ExceptionCheckAndClear()) {
644 *status = kError;
645 TC3_LOG(ERROR) << "Exception occurred";
646 return UTF8ToUnicodeText("", /*do_copy=*/false);
647 }
648
649 // java_result is nullptr when the group did not participate in the match.
650 // For these cases other UniLib implementations return empty string, and
651 // the participation can be checked by checking if Start() == -1.
652 if (!java_result) {
653 *status = kNoError;
654 return UTF8ToUnicodeText("", /*do_copy=*/false);
655 }
656
657 std::string result;
658 if (!JStringToUtf8String(jenv, java_result.get(), &result)) {
659 *status = kError;
660 return UTF8ToUnicodeText("", /*do_copy=*/false);
661 }
662 *status = kNoError;
663 return UTF8ToUnicodeText(result, /*do_copy=*/true);
664 } else {
665 *status = kError;
666 return UTF8ToUnicodeText("", /*do_copy=*/false);
667 }
668 }
669
// Out-of-line definition of the static constexpr end-of-iteration marker
// returned by BreakIterator::Next() (its value is not visible in this file).
constexpr int UniLib::BreakIterator::kDone;
671
BreakIterator(const JniCache * jni_cache,const UnicodeText & text)672 UniLib::BreakIterator::BreakIterator(const JniCache* jni_cache,
673 const UnicodeText& text)
674 : jni_cache_(jni_cache),
675 text_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
676 iterator_(nullptr, jni_cache ? jni_cache->jvm : nullptr),
677 last_break_index_(0),
678 last_unicode_index_(0) {
679 if (jni_cache_) {
680 JNIEnv* jenv = jni_cache_->GetEnv();
681 text_ = MakeGlobalRef(jni_cache_->ConvertToJavaString(text).release(), jenv,
682 jni_cache->jvm);
683 if (!text_) {
684 return;
685 }
686
687 iterator_ = MakeGlobalRef(
688 jenv->CallStaticObjectMethod(jni_cache->breakiterator_class.get(),
689 jni_cache->breakiterator_getwordinstance,
690 jni_cache->locale_us.get()),
691 jenv, jni_cache->jvm);
692 if (!iterator_) {
693 return;
694 }
695 jenv->CallVoidMethod(iterator_.get(), jni_cache->breakiterator_settext,
696 text_.get());
697 }
698 }
699
Next()700 int UniLib::BreakIterator::Next() {
701 if (jni_cache_) {
702 const int break_index = jni_cache_->GetEnv()->CallIntMethod(
703 iterator_.get(), jni_cache_->breakiterator_next);
704 if (jni_cache_->ExceptionCheckAndClear() ||
705 break_index == BreakIterator::kDone) {
706 return BreakIterator::kDone;
707 }
708
709 const int token_unicode_length = jni_cache_->GetEnv()->CallIntMethod(
710 text_.get(), jni_cache_->string_code_point_count, last_break_index_,
711 break_index);
712 if (jni_cache_->ExceptionCheckAndClear()) {
713 return BreakIterator::kDone;
714 }
715
716 last_break_index_ = break_index;
717 return last_unicode_index_ += token_unicode_length;
718 }
719 return BreakIterator::kDone;
720 }
721
CreateBreakIterator(const UnicodeText & text) const722 std::unique_ptr<UniLib::BreakIterator> UniLib::CreateBreakIterator(
723 const UnicodeText& text) const {
724 return std::unique_ptr<UniLib::BreakIterator>(
725 new UniLib::BreakIterator(jni_cache_.get(), text));
726 }
727
728 } // namespace libtextclassifier3
729