1 /**
2 * Copyright (c) 2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
#include "plugins/ets/stdlib/native/core/IntlSegmenter.h"
#include "plugins/ets/stdlib/native/core/IntlLocaleMatch.h"
#include "plugins/ets/stdlib/native/core/IntlCommon.h"
#include "stdlib_ani_helpers.h"
#include "libpandabase/macros.h"
#include "unicode/unistr.h"
#include "unicode/brkiter.h"
#include "unicode/ubrk.h"

#include <cstring>
#include <string>
#include <vector>
#include <array>
#include <cassert>
#include <iostream>
#include <memory>
#include <optional>
31
32 namespace ark::ets::stdlib::intl {
33
34 using BreakerFactory = icu::BreakIterator *(*)(const icu::Locale &, UErrorCode &);
35
36 /// @brief Helper function that converts an ETS string to ICU Unicode string
AniToUnicode(ani_env * env,ani_string etsStr)37 icu::UnicodeString AniToUnicode(ani_env *env, ani_string etsStr)
38 {
39 auto str = ConvertFromAniString(env, etsStr);
40 icu::UnicodeString unicodeStr = icu::UnicodeString::fromUTF8(str);
41 return unicodeStr;
42 }
43
44 /// @brief Converts an ETS string containing a BCP47 language tag to ICU Locale
EtsToLocale(ani_env * env,const ani_string & bcp47Locale)45 std::optional<icu::Locale> EtsToLocale(ani_env *env, const ani_string &bcp47Locale)
46 {
47 UErrorCode status = U_ZERO_ERROR;
48 std::string stdStr = ConvertFromAniString(env, bcp47Locale);
49
50 icu::Locale locale = icu::Locale::forLanguageTag(stdStr, status);
51 if (UNLIKELY(U_FAILURE(status))) {
52 std::cout << "ICU error: " << u_errorName(status) << std::endl;
53 return std::nullopt;
54 }
55 if (locale.isBogus() != 0) {
56 std::cout << "Created locale is bogus" << std::endl;
57 return std::nullopt;
58 }
59 return std::optional<icu::Locale> {locale};
60 }
61
62 /**
63 * @brief Creates a new IntlCluster object with the specified properties
64 * @param env Pointer to ANI environment
65 * @param klass ANI class reference (unused)
66 * @param cluster String containing the cluster text
67 * @param index Starting index of the cluster in original string
68 * @param isWordLike Boolean indicating if cluster represents a word-like segment
69 * @return Pointer to newly created IntlCluster object, nullptr if creation fails
70 * @throws RuntimeException if class/constructor/field lookups fail
71 */
StdCoreIntlCreateClusterObject(ani_env * env,ani_class klass,ani_string cluster,ani_int index,ani_boolean isWordLike)72 ani_object StdCoreIntlCreateClusterObject(ani_env *env, [[maybe_unused]] ani_class klass, ani_string cluster,
73 ani_int index, ani_boolean isWordLike)
74 {
75 // Find the cluster class
76 ani_class clusterClass;
77 ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/Cluster;", &clusterClass));
78
79 // Find the constructor method
80 ani_method constructorMethod;
81 ANI_FATAL_IF_ERROR(env->Class_FindMethod(clusterClass, "<ctor>", ":V", &constructorMethod));
82
83 // Create a new instance
84 ani_object clusterObj;
85
86 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87 ANI_FATAL_IF_ERROR(env->Object_New(clusterClass, constructorMethod, &clusterObj));
88
89 // Get field IDs
90 ani_field clusterField;
91 ANI_FATAL_IF_ERROR(env->Class_FindField(clusterClass, "cluster", &clusterField));
92
93 ani_field indexField;
94 ANI_FATAL_IF_ERROR(env->Class_FindField(clusterClass, "index", &indexField));
95
96 ani_field isWordLikeField;
97 ANI_FATAL_IF_ERROR(env->Class_FindField(clusterClass, "isWordLike", &isWordLikeField));
98
99 // Set field values
100 ANI_FATAL_IF_ERROR(env->Object_SetField_Ref(clusterObj, clusterField, cluster));
101 ANI_FATAL_IF_ERROR(env->Object_SetField_Int(clusterObj, indexField, index));
102 ANI_FATAL_IF_ERROR(env->Object_SetField_Boolean(clusterObj, isWordLikeField, isWordLike));
103
104 return clusterObj;
105 }
106
107 /**
108 * @brief Determines if current cluster segment represents a word-like unit
109 * @param break_iterator Reference to ICU break iterator
110 * @return true if current cluster is word-like, false otherwise
111 */
IntlCurrentClusterIsWordLike(std::unique_ptr<icu::BreakIterator> & breakIterator)112 ani_boolean IntlCurrentClusterIsWordLike(std::unique_ptr<icu::BreakIterator> &breakIterator)
113 {
114 auto ruleStatus = static_cast<int>(breakIterator->getRuleStatus());
115 // Number-type words (digits, numbers)
116 bool result = (ruleStatus >= UBRK_WORD_NUMBER && ruleStatus < UBRK_WORD_NUMBER_LIMIT);
117 // Letter-based words (alphabetic characters)
118 result = result || (ruleStatus >= UBRK_WORD_LETTER && ruleStatus < UBRK_WORD_LETTER_LIMIT);
119 // Kana words (Japanese Hiragana/Katakana characters)
120 result = result || (ruleStatus >= UBRK_WORD_KANA && ruleStatus < UBRK_WORD_KANA_LIMIT);
121 // Ideographic words (Chinese/Japanese/Korean characters)
122 result = result || (ruleStatus >= UBRK_WORD_IDEO && ruleStatus < UBRK_WORD_IDEO_LIMIT);
123 return static_cast<ani_boolean>(result);
124 }
125
126 /**
127 * @brief Generic function to segment text into clusters using specified ICU break iterator
128 * @param env Pointer to ANI environment
129 * @param klass ANI class reference
130 * @param factory Function pointer to create specific type of ICU break iterator
131 * @param str Input string to segment
132 * @param localeStr BCP47 language tag string for locale-specific segmentation
133 * @return Array of IntlCluster objects representing the segments, nullptr if operation fails
134 * @throws RuntimeException if locale creation or break iterator initialization fails
135 */
IntlClusters(ani_env * env,ani_class klass,BreakerFactory factory,ani_string str,ani_string localeStr)136 ani_array_ref IntlClusters(ani_env *env, [[maybe_unused]] ani_class klass, BreakerFactory factory, ani_string str,
137 ani_string localeStr)
138 {
139 std::optional<icu::Locale> locale = EtsToLocale(env, localeStr);
140 if (!locale) {
141 std::string message = "Unable to create ICU locale for specified tag (bcp47): ";
142 message += ConvertFromAniString(env, localeStr);
143 ThrowNewError(env, "Lstd/core/RuntimeException;", message.c_str(), "Lstd/core/String;:V");
144 return nullptr;
145 }
146 icu::Locale breakLocale = locale.value();
147
148 UErrorCode status = U_ZERO_ERROR;
149 std::unique_ptr<icu::BreakIterator> breaker(factory(breakLocale, status));
150 if (UNLIKELY(U_FAILURE(status))) {
151 std::string message = "Unable to create break iterator";
152 ThrowNewError(env, "Lstd/core/RuntimeException;", message.c_str(), "Lstd/core/String;:V");
153 return nullptr;
154 }
155
156 icu::UnicodeString uniStr = AniToUnicode(env, str);
157 breaker->setText(uniStr);
158
159 std::vector<ani_object> clusters;
160 int32_t current = breaker->first();
161 int32_t next = breaker->next();
162
163 // Process each segment
164 while (next != icu::BreakIterator::DONE) {
165 icu::UnicodeString cluster = uniStr.tempSubStringBetween(current, next);
166 std::string utf8Cluster;
167 cluster.toUTF8String(utf8Cluster);
168
169 ani_string clusterStr = StdStrToAni(env, utf8Cluster);
170 ani_boolean isWordLike = IntlCurrentClusterIsWordLike(breaker);
171 ani_object clusterObject = StdCoreIntlCreateClusterObject(env, klass, clusterStr, current, isWordLike);
172
173 clusters.push_back(clusterObject);
174 current = next;
175 next = breaker->next();
176 }
177
178 // Find cluster class for array creation
179 ani_class clusterClass;
180 ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/Cluster;", &clusterClass));
181
182 // Create array of the correct size
183 ani_array_ref resultArray;
184 ANI_FATAL_IF_ERROR(env->Array_New_Ref(clusterClass, clusters.size(), nullptr, &resultArray));
185
186 // Fill the array with cluster objects
187 for (size_t i = 0; i < clusters.size(); ++i) {
188 ANI_FATAL_IF_ERROR(env->Array_Set_Ref(resultArray, i, clusters[i]));
189 }
190
191 return resultArray;
192 }
193
194 /**
195 * @brief Segments text into grapheme clusters
196 * @param env Pointer to ANI environment
197 * @param klass ANI class reference
198 * @param str Input string to segment
199 * @param localeStr BCP47 language tag string for locale-specific segmentation
200 * @return Array of IntlCluster objects representing grapheme clusters
201 */
StdCoreIntlGraphemeClusters(ani_env * env,ani_class klass,ani_string str,ani_string localeStr)202 ani_array_ref StdCoreIntlGraphemeClusters(ani_env *env, ani_class klass, ani_string str, ani_string localeStr)
203 {
204 return IntlClusters(env, klass, icu::BreakIterator::createCharacterInstance, str, localeStr);
205 }
206
207 /**
208 * @brief Segments text into word clusters
209 * @param env Pointer to ANI environment
210 * @param klass ANI class reference
211 * @param str Input string to segment
212 * @param localeStr BCP47 language tag string for locale-specific segmentation
213 * @return Array of IntlCluster objects representing word clusters
214 */
StdCoreIntlWordClusters(ani_env * env,ani_class klass,ani_string str,ani_string localeStr)215 ani_array_ref StdCoreIntlWordClusters(ani_env *env, ani_class klass, ani_string str, ani_string localeStr)
216 {
217 return IntlClusters(env, klass, icu::BreakIterator::createWordInstance, str, localeStr);
218 }
219
220 /**
221 * @brief Segments text into sentence clusters
222 * @param env Pointer to ANI environment
223 * @param klass ANI class reference
224 * @param str Input string to segment
225 * @param localeStr BCP47 language tag string for locale-specific segmentation
226 * @return Array of IntlCluster objects representing sentence clusters
227 */
StdCoreIntlSentenceClusters(ani_env * env,ani_class klass,ani_string str,ani_string localeStr)228 ani_array_ref StdCoreIntlSentenceClusters(ani_env *env, ani_class klass, ani_string str, ani_string localeStr)
229 {
230 return IntlClusters(env, klass, icu::BreakIterator::createSentenceInstance, str, localeStr);
231 }
232
233 /**
234 * @brief Registers native methods for IntlSegmenter class
235 * @param env Pointer to ETS environment
236 * @return Status code indicating success/failure of registration
237 */
RegisterIntlSegmenter(ani_env * env)238 ani_status RegisterIntlSegmenter(ani_env *env)
239 {
240 const auto methods = std::array {
241 ani_native_function {"graphemeClusters", "Lstd/core/String;Lstd/core/String;:[Lstd/core/Intl/Cluster;",
242 reinterpret_cast<void *>(StdCoreIntlGraphemeClusters)},
243 ani_native_function {"wordClusters", "Lstd/core/String;Lstd/core/String;:[Lstd/core/Intl/Cluster;",
244 reinterpret_cast<void *>(StdCoreIntlWordClusters)},
245 ani_native_function {"sentenceClusters", "Lstd/core/String;Lstd/core/String;:[Lstd/core/Intl/Cluster;",
246 reinterpret_cast<void *>(StdCoreIntlSentenceClusters)},
247 };
248
249 ani_class segmenterClass;
250 ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/Segmenter;", &segmenterClass));
251
252 return env->Class_BindNativeMethods(segmenterClass, methods.data(), methods.size());
253 }
254
255 } // namespace ark::ets::stdlib::intl
256