• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
#include "plugins/ets/stdlib/native/core/IntlSegmenter.h"
#include "plugins/ets/stdlib/native/core/IntlLocaleMatch.h"
#include "plugins/ets/stdlib/native/core/IntlCommon.h"
#include "stdlib_ani_helpers.h"
#include "libpandabase/macros.h"
#include "unicode/unistr.h"
#include "unicode/brkiter.h"
#include "unicode/ubrk.h"

#include <array>
#include <cassert>
#include <cstring>
#include <iostream>
#include <memory>
#include <optional>
#include <string>
#include <vector>
31 
32 namespace ark::ets::stdlib::intl {
33 
/// @brief Pointer to a function that builds an ICU break iterator for a locale, reporting failure
/// through the UErrorCode out-parameter (e.g. icu::BreakIterator::createWordInstance).
/// IntlClusters wraps the returned raw pointer in a std::unique_ptr, i.e. it takes ownership of it.
using BreakerFactory = icu::BreakIterator *(*)(const icu::Locale &, UErrorCode &);
35 
36 /// @brief Helper function that converts an ETS string to ICU Unicode string
AniToUnicode(ani_env * env,ani_string etsStr)37 icu::UnicodeString AniToUnicode(ani_env *env, ani_string etsStr)
38 {
39     auto str = ConvertFromAniString(env, etsStr);
40     icu::UnicodeString unicodeStr = icu::UnicodeString::fromUTF8(str);
41     return unicodeStr;
42 }
43 
44 /// @brief Converts an ETS string containing a BCP47 language tag to ICU Locale
EtsToLocale(ani_env * env,const ani_string & bcp47Locale)45 std::optional<icu::Locale> EtsToLocale(ani_env *env, const ani_string &bcp47Locale)
46 {
47     UErrorCode status = U_ZERO_ERROR;
48     std::string stdStr = ConvertFromAniString(env, bcp47Locale);
49 
50     icu::Locale locale = icu::Locale::forLanguageTag(stdStr, status);
51     if (UNLIKELY(U_FAILURE(status))) {
52         std::cout << "ICU error: " << u_errorName(status) << std::endl;
53         return std::nullopt;
54     }
55     if (locale.isBogus() != 0) {
56         std::cout << "Created locale is bogus" << std::endl;
57         return std::nullopt;
58     }
59     return std::optional<icu::Locale> {locale};
60 }
61 
62 /**
63  * @brief Creates a new IntlCluster object with the specified properties
64  * @param env Pointer to ANI environment
65  * @param klass ANI class reference (unused)
66  * @param cluster String containing the cluster text
67  * @param index Starting index of the cluster in original string
68  * @param isWordLike Boolean indicating if cluster represents a word-like segment
69  * @return Pointer to newly created IntlCluster object, nullptr if creation fails
70  * @throws RuntimeException if class/constructor/field lookups fail
71  */
StdCoreIntlCreateClusterObject(ani_env * env,ani_class klass,ani_string cluster,ani_int index,ani_boolean isWordLike)72 ani_object StdCoreIntlCreateClusterObject(ani_env *env, [[maybe_unused]] ani_class klass, ani_string cluster,
73                                           ani_int index, ani_boolean isWordLike)
74 {
75     // Find the cluster class
76     ani_class clusterClass;
77     ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/Cluster;", &clusterClass));
78 
79     // Find the constructor method
80     ani_method constructorMethod;
81     ANI_FATAL_IF_ERROR(env->Class_FindMethod(clusterClass, "<ctor>", ":V", &constructorMethod));
82 
83     // Create a new instance
84     ani_object clusterObj;
85 
86     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-vararg)
87     ANI_FATAL_IF_ERROR(env->Object_New(clusterClass, constructorMethod, &clusterObj));
88 
89     // Get field IDs
90     ani_field clusterField;
91     ANI_FATAL_IF_ERROR(env->Class_FindField(clusterClass, "cluster", &clusterField));
92 
93     ani_field indexField;
94     ANI_FATAL_IF_ERROR(env->Class_FindField(clusterClass, "index", &indexField));
95 
96     ani_field isWordLikeField;
97     ANI_FATAL_IF_ERROR(env->Class_FindField(clusterClass, "isWordLike", &isWordLikeField));
98 
99     // Set field values
100     ANI_FATAL_IF_ERROR(env->Object_SetField_Ref(clusterObj, clusterField, cluster));
101     ANI_FATAL_IF_ERROR(env->Object_SetField_Int(clusterObj, indexField, index));
102     ANI_FATAL_IF_ERROR(env->Object_SetField_Boolean(clusterObj, isWordLikeField, isWordLike));
103 
104     return clusterObj;
105 }
106 
107 /**
108  * @brief Determines if current cluster segment represents a word-like unit
109  * @param break_iterator Reference to ICU break iterator
110  * @return true if current cluster is word-like, false otherwise
111  */
IntlCurrentClusterIsWordLike(std::unique_ptr<icu::BreakIterator> & breakIterator)112 ani_boolean IntlCurrentClusterIsWordLike(std::unique_ptr<icu::BreakIterator> &breakIterator)
113 {
114     auto ruleStatus = static_cast<int>(breakIterator->getRuleStatus());
115     // Number-type words (digits, numbers)
116     bool result = (ruleStatus >= UBRK_WORD_NUMBER && ruleStatus < UBRK_WORD_NUMBER_LIMIT);
117     // Letter-based words (alphabetic characters)
118     result = result || (ruleStatus >= UBRK_WORD_LETTER && ruleStatus < UBRK_WORD_LETTER_LIMIT);
119     // Kana words (Japanese Hiragana/Katakana characters)
120     result = result || (ruleStatus >= UBRK_WORD_KANA && ruleStatus < UBRK_WORD_KANA_LIMIT);
121     // Ideographic words (Chinese/Japanese/Korean characters)
122     result = result || (ruleStatus >= UBRK_WORD_IDEO && ruleStatus < UBRK_WORD_IDEO_LIMIT);
123     return static_cast<ani_boolean>(result);
124 }
125 
126 /**
127  * @brief Generic function to segment text into clusters using specified ICU break iterator
128  * @param env Pointer to ANI environment
129  * @param klass ANI class reference
130  * @param factory Function pointer to create specific type of ICU break iterator
131  * @param str Input string to segment
132  * @param localeStr BCP47 language tag string for locale-specific segmentation
133  * @return Array of IntlCluster objects representing the segments, nullptr if operation fails
134  * @throws RuntimeException if locale creation or break iterator initialization fails
135  */
IntlClusters(ani_env * env,ani_class klass,BreakerFactory factory,ani_string str,ani_string localeStr)136 ani_array_ref IntlClusters(ani_env *env, [[maybe_unused]] ani_class klass, BreakerFactory factory, ani_string str,
137                            ani_string localeStr)
138 {
139     std::optional<icu::Locale> locale = EtsToLocale(env, localeStr);
140     if (!locale) {
141         std::string message = "Unable to create ICU locale for specified tag (bcp47): ";
142         message += ConvertFromAniString(env, localeStr);
143         ThrowNewError(env, "Lstd/core/RuntimeException;", message.c_str(), "Lstd/core/String;:V");
144         return nullptr;
145     }
146     icu::Locale breakLocale = locale.value();
147 
148     UErrorCode status = U_ZERO_ERROR;
149     std::unique_ptr<icu::BreakIterator> breaker(factory(breakLocale, status));
150     if (UNLIKELY(U_FAILURE(status))) {
151         std::string message = "Unable to create break iterator";
152         ThrowNewError(env, "Lstd/core/RuntimeException;", message.c_str(), "Lstd/core/String;:V");
153         return nullptr;
154     }
155 
156     icu::UnicodeString uniStr = AniToUnicode(env, str);
157     breaker->setText(uniStr);
158 
159     std::vector<ani_object> clusters;
160     int32_t current = breaker->first();
161     int32_t next = breaker->next();
162 
163     // Process each segment
164     while (next != icu::BreakIterator::DONE) {
165         icu::UnicodeString cluster = uniStr.tempSubStringBetween(current, next);
166         std::string utf8Cluster;
167         cluster.toUTF8String(utf8Cluster);
168 
169         ani_string clusterStr = StdStrToAni(env, utf8Cluster);
170         ani_boolean isWordLike = IntlCurrentClusterIsWordLike(breaker);
171         ani_object clusterObject = StdCoreIntlCreateClusterObject(env, klass, clusterStr, current, isWordLike);
172 
173         clusters.push_back(clusterObject);
174         current = next;
175         next = breaker->next();
176     }
177 
178     // Find cluster class for array creation
179     ani_class clusterClass;
180     ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/Cluster;", &clusterClass));
181 
182     // Create array of the correct size
183     ani_array_ref resultArray;
184     ANI_FATAL_IF_ERROR(env->Array_New_Ref(clusterClass, clusters.size(), nullptr, &resultArray));
185 
186     // Fill the array with cluster objects
187     for (size_t i = 0; i < clusters.size(); ++i) {
188         ANI_FATAL_IF_ERROR(env->Array_Set_Ref(resultArray, i, clusters[i]));
189     }
190 
191     return resultArray;
192 }
193 
194 /**
195  * @brief Segments text into grapheme clusters
196  * @param env Pointer to ANI environment
197  * @param klass ANI class reference
198  * @param str Input string to segment
199  * @param localeStr BCP47 language tag string for locale-specific segmentation
200  * @return Array of IntlCluster objects representing grapheme clusters
201  */
StdCoreIntlGraphemeClusters(ani_env * env,ani_class klass,ani_string str,ani_string localeStr)202 ani_array_ref StdCoreIntlGraphemeClusters(ani_env *env, ani_class klass, ani_string str, ani_string localeStr)
203 {
204     return IntlClusters(env, klass, icu::BreakIterator::createCharacterInstance, str, localeStr);
205 }
206 
207 /**
208  * @brief Segments text into word clusters
209  * @param env Pointer to ANI environment
210  * @param klass ANI class reference
211  * @param str Input string to segment
212  * @param localeStr BCP47 language tag string for locale-specific segmentation
213  * @return Array of IntlCluster objects representing word clusters
214  */
StdCoreIntlWordClusters(ani_env * env,ani_class klass,ani_string str,ani_string localeStr)215 ani_array_ref StdCoreIntlWordClusters(ani_env *env, ani_class klass, ani_string str, ani_string localeStr)
216 {
217     return IntlClusters(env, klass, icu::BreakIterator::createWordInstance, str, localeStr);
218 }
219 
220 /**
221  * @brief Segments text into sentence clusters
222  * @param env Pointer to ANI environment
223  * @param klass ANI class reference
224  * @param str Input string to segment
225  * @param localeStr BCP47 language tag string for locale-specific segmentation
226  * @return Array of IntlCluster objects representing sentence clusters
227  */
StdCoreIntlSentenceClusters(ani_env * env,ani_class klass,ani_string str,ani_string localeStr)228 ani_array_ref StdCoreIntlSentenceClusters(ani_env *env, ani_class klass, ani_string str, ani_string localeStr)
229 {
230     return IntlClusters(env, klass, icu::BreakIterator::createSentenceInstance, str, localeStr);
231 }
232 
233 /**
234  * @brief Registers native methods for IntlSegmenter class
235  * @param env Pointer to ETS environment
236  * @return Status code indicating success/failure of registration
237  */
RegisterIntlSegmenter(ani_env * env)238 ani_status RegisterIntlSegmenter(ani_env *env)
239 {
240     const auto methods = std::array {
241         ani_native_function {"graphemeClusters", "Lstd/core/String;Lstd/core/String;:[Lstd/core/Intl/Cluster;",
242                              reinterpret_cast<void *>(StdCoreIntlGraphemeClusters)},
243         ani_native_function {"wordClusters", "Lstd/core/String;Lstd/core/String;:[Lstd/core/Intl/Cluster;",
244                              reinterpret_cast<void *>(StdCoreIntlWordClusters)},
245         ani_native_function {"sentenceClusters", "Lstd/core/String;Lstd/core/String;:[Lstd/core/Intl/Cluster;",
246                              reinterpret_cast<void *>(StdCoreIntlSentenceClusters)},
247     };
248 
249     ani_class segmenterClass;
250     ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/Segmenter;", &segmenterClass));
251 
252     return env->Class_BindNativeMethods(segmenterClass, methods.data(), methods.size());
253 }
254 
255 }  // namespace ark::ets::stdlib::intl
256