1 /**
2 * Copyright (c) 2024-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "plugins/ets/stdlib/native/core/IntlDisplayNames.h"
17 #include "plugins/ets/stdlib/native/core/IntlCommon.h"
18 #include "plugins/ets/stdlib/native/core/IntlState.h"
19 #include "plugins/ets/stdlib/native/core/stdlib_ani_helpers.h"
20 #include "libpandabase/macros.h"
21 #include "unicode/numberformatter.h"
22 #include "unicode/numberrangeformatter.h"
23 #include "unicode/locid.h"
24 #include "unicode/unistr.h"
25 #include "unicode/uloc.h"
26 #include "unicode/ucurr.h"
27 #include "unicode/uldnames.h"
28 #include "unicode/udisplaycontext.h"
29 #include "IntlCommon.h"
30
31 #include <string>
32 #include <string_view>
33 #include <cstdlib>
34 #include <array>
35 #include <optional>
36 #include <algorithm>
37 #include <memory>
38 #include <cctype>
39
40 namespace ark::ets::stdlib::intl {
41
/// @brief Kinds of code that a display name can be requested for.
///
/// Each enumerator corresponds to one of the type strings accepted by
/// StringToCodeType ("language", "script", "region", "currency").
enum class CodeType {
    /// Language code (ISO 639)
    LANGUAGE,
    /// Script code (ISO 15924)
    SCRIPT,
    /// Region/country code (ISO 3166)
    REGION,
    /// Currency code (ISO 4217)
    CURRENCY
};
59
// Exact lengths expected by the code validators in this file
constexpr int32_t ISO_SCRIPT_CODE_LENGTH {4};
constexpr int32_t ISO_REGION_ALPHA_LENGTH {2};
constexpr int32_t ISO_REGION_NUMERIC_LENGTH {3};
constexpr int32_t ISO_CURRENCY_CODE_LENGTH {3};

// Capacities (in code units) of the stack buffers handed to ICU
constexpr int32_t BUFFER_CAPACITY_SMALL {100};
constexpr int32_t BUFFER_CAPACITY_LARGE {200};

// Named character positions used when inspecting a code character by character
constexpr size_t FIRST_CHAR_INDEX {0};
constexpr size_t SECOND_CHAR_INDEX {1};
constexpr size_t THIRD_CHAR_INDEX {2};
constexpr size_t FOURTH_CHAR_INDEX {3};
71
72 /**
73 * @brief Validates a script code according to ISO 15924 standard
74 *
75 * Script codes should be 4 letters with the first letter uppercase and
76 * the remaining letters lowercase (e.g., "Latn", "Cyrl", "Arab").
77 *
78 * @param code The script code to validate
79 * @return true if the code is a valid ISO 15924 script code, false otherwise
80 */
IsValidScriptCode(std::string_view code)81 [[nodiscard]] constexpr bool IsValidScriptCode(std::string_view code) noexcept
82 {
83 // Script codes should be 4 letters according to ISO-15924
84 if (code.length() != ISO_SCRIPT_CODE_LENGTH) {
85 return false;
86 }
87
88 // First letter should be uppercase, the rest lowercase
89 return (std::isupper(code[FIRST_CHAR_INDEX]) != 0) && (std::islower(code[SECOND_CHAR_INDEX]) != 0) &&
90 (std::islower(code[THIRD_CHAR_INDEX]) != 0) && (std::islower(code[FOURTH_CHAR_INDEX]) != 0);
91 }
92
93 /**
94 * @brief Validates a region code according to ISO 3166 or UN M49 standards
95 * Valid region codes are either:
96 * - Two uppercase letters (ISO 3166-1 alpha-2, e.g., "US", "JP", "DE")
97 * - Three digits (UN M49 numeric code, e.g., "840", "392", "276")
98 *
99 * @param code The region code to validate
100 * @return true if the code is a valid region code, false otherwise
101 */
IsValidRegionCode(std::string_view code)102 [[nodiscard]] constexpr bool IsValidRegionCode(std::string_view code) noexcept
103 {
104 // Region codes are either 2 uppercase letters (ISO-3166) or 3 digits (UN M49)
105 if (code.length() == ISO_REGION_ALPHA_LENGTH) {
106 return (std::isupper(code[FIRST_CHAR_INDEX]) != 0) && (std::isupper(code[SECOND_CHAR_INDEX]) != 0);
107 }
108
109 if (code.length() == ISO_REGION_NUMERIC_LENGTH) {
110 return (std::isdigit(code[FIRST_CHAR_INDEX]) != 0) && (std::isdigit(code[SECOND_CHAR_INDEX]) != 0) &&
111 (std::isdigit(code[THIRD_CHAR_INDEX]) != 0);
112 }
113
114 return false;
115 }
116
117 /**
118 * @brief Validates a currency code according to ISO 4217 standard
119 *
120 * Currency codes should be 3 uppercase letters (e.g., "USD", "EUR", "JPY").
121 *
122 * @param code The currency code to validate
123 * @return true if the code is a valid ISO 4217 currency code, false otherwise
124 */
IsValidCurrencyCode(std::string_view code)125 [[nodiscard]] constexpr bool IsValidCurrencyCode(std::string_view code) noexcept
126 {
127 // Currency codes should be 3 uppercase letters according to ISO-4217
128 if (code.length() != ISO_CURRENCY_CODE_LENGTH) {
129 return false;
130 }
131
132 return (std::isupper(code[FIRST_CHAR_INDEX]) != 0) && (std::isupper(code[SECOND_CHAR_INDEX]) != 0) &&
133 (std::isupper(code[THIRD_CHAR_INDEX]) != 0);
134 }
135
136 /**
137 * @brief Validates a language code according to BCP 47 standard
138 *
139 * Performs basic validation for language codes according to BCP 47.
140 * Language codes should be 2-3 letters (ISO 639) or 5-8 characters for language-extlang.
141 *
142 * @param code The language code to validate
143 * @return true if the code appears to be a valid BCP 47 language code, false otherwise
144 */
IsValidLanguageCode(std::string_view code)145 [[nodiscard]] bool IsValidLanguageCode(std::string_view code)
146 {
147 // Basic validation for language codes according to BCP 47
148 // Language codes should be 2-3 letters (ISO 639) or 5-8 characters for language-extlang
149 if (code.empty()) {
150 return false;
151 }
152
153 // Check if the code uses valid characters
154 if (!std::all_of(code.begin(), code.end(), [](char c) { return (std::isalpha(c) != 0) || c == '-' || c == '_'; })) {
155 return false;
156 }
157
158 // Additional validation: try canonicalizing with ICU and check for errors
159 UErrorCode status = U_ZERO_ERROR;
160 std::array<char, BUFFER_CAPACITY_SMALL> canonical {};
161
162 uloc_canonicalize(std::string(code).c_str(), canonical.data(), BUFFER_CAPACITY_SMALL, &status);
163
164 return (U_SUCCESS(status) != 0);
165 }
166
167 /**
168 * @brief Canonicalizes a language code according to BCP 47 standard
169 *
170 * Attempts to convert a language code to its canonical form using ICU.
171 * If canonicalization fails, the original code is returned.
172 *
173 * @param code The language code to canonicalize
174 * @return The canonicalized language code, or the original if canonicalization fails
175 */
CanonicalizeLanguageCode(std::string_view code)176 [[nodiscard]] std::string CanonicalizeLanguageCode(std::string_view code)
177 {
178 UErrorCode status = U_ZERO_ERROR;
179 std::array<char, BUFFER_CAPACITY_SMALL> canonical {};
180
181 uloc_canonicalize(std::string(code).c_str(), canonical.data(), BUFFER_CAPACITY_SMALL, &status);
182
183 if ((U_FAILURE(status) != 0)) {
184 return std::string(code); // Return original if canonicalization fails
185 }
186
187 return std::string(canonical.data());
188 }
189
190 /**
191 * @brief Converts a string type identifier to the corresponding CodeType enum
192 *
193 * @param typeStr The type string to convert ("language", "script", "region", or "currency")
194 * @param isValid Reference to a boolean that will be set to false if the type is invalid
195 * @return The corresponding CodeType enum value
196 */
StringToCodeType(std::string_view typeStr,bool & isValid)197 [[nodiscard]] CodeType StringToCodeType(std::string_view typeStr, bool &isValid) noexcept
198 {
199 isValid = true;
200
201 if (typeStr == "language") {
202 return CodeType::LANGUAGE;
203 }
204 if (typeStr == "script") {
205 return CodeType::SCRIPT;
206 }
207 if (typeStr == "region") {
208 return CodeType::REGION;
209 }
210 if (typeStr == "currency") {
211 return CodeType::CURRENCY;
212 }
213
214 isValid = false;
215 return CodeType::LANGUAGE; // Default value, not used when isValid is false
216 }
217
218 /**
219 * @brief Generic template function to retrieve a display name using ICU
220 *
221 * This function handles the common pattern of calling an ICU function to get a
222 * display name, dealing with buffer allocation, error checking, and result conversion.
223 *
224 * @tparam Func Type of the function to call (usually a function pointer or lambda)
225 * @param displayNames The ICU ULocaleDisplayNames object to use
226 * @param code The code to get the display name for
227 * @param getNameFunc The function to call to get the display name
228 * @return An optional containing the Unicode string result, or std::nullopt if the operation failed
229 */
230 template <typename Func>
GetDisplayName(ULocaleDisplayNames * displayNames,const char * code,Func getNameFunc)231 std::optional<icu::UnicodeString> GetDisplayName(ULocaleDisplayNames *displayNames, const char *code, Func getNameFunc)
232 {
233 UErrorCode status = U_ZERO_ERROR;
234 std::array<UChar, BUFFER_CAPACITY_LARGE> buffer {};
235
236 int32_t length = getNameFunc(displayNames, code, buffer.data(), BUFFER_CAPACITY_LARGE, &status);
237 if ((U_SUCCESS(status) != 0) && length > 0) {
238 return icu::UnicodeString(buffer.data(), length);
239 }
240 return std::nullopt;
241 }
242
243 /**
244 * @brief Validates code and returns canonicalized form if valid
245 *
246 * @param env The environment
247 * @param codeType The type of code to validate
248 * @param codeStr The code string to validate
249 * @param fallbackStr The fallback behavior if validation fails
250 * @param isValid Reference to a boolean that will be set to the validation result
251 * @return The canonicalized code if valid, or the original code otherwise
252 */
ValidateAndCanonicalizeCode(ani_env * env,CodeType codeType,const std::string & codeStr,const std::string & fallbackStr,bool & isValid)253 static std::string ValidateAndCanonicalizeCode(ani_env *env, CodeType codeType, const std::string &codeStr,
254 const std::string &fallbackStr, bool &isValid)
255 {
256 std::string canonicalizedCode = codeStr;
257
258 switch (codeType) {
259 case CodeType::LANGUAGE:
260 isValid = IsValidLanguageCode(codeStr);
261 if (isValid) {
262 canonicalizedCode = CanonicalizeLanguageCode(codeStr);
263 }
264 break;
265 case CodeType::SCRIPT:
266 isValid = IsValidScriptCode(codeStr);
267 break;
268 case CodeType::REGION:
269 isValid = IsValidRegionCode(codeStr);
270 break;
271 case CodeType::CURRENCY:
272 isValid = IsValidCurrencyCode(codeStr);
273 break;
274 }
275
276 if (!isValid) {
277 if (fallbackStr == "code") {
278 return codeStr;
279 }
280 ThrowNewError(env, "Lstd/core/RangeError;", ("Invalid code: " + codeStr).c_str(), "Lstd/core/String;:V");
281 }
282
283 return canonicalizedCode;
284 }
285
286 /**
287 * @brief Gets the display name based on the code type
288 *
289 * @param displayNames The ICU ULocaleDisplayNames object to use
290 * @param codeType The type of code
291 * @param canonicalizedCode The canonicalized code to get the display name for
292 * @return An optional containing the Unicode string result
293 */
GetDisplayNameByType(ULocaleDisplayNames * displayNames,CodeType codeType,const std::string & canonicalizedCode)294 static std::optional<icu::UnicodeString> GetDisplayNameByType(ULocaleDisplayNames *displayNames, CodeType codeType,
295 const std::string &canonicalizedCode)
296 {
297 std::optional<icu::UnicodeString> result;
298
299 switch (codeType) {
300 case CodeType::LANGUAGE:
301 result = GetDisplayName(displayNames, canonicalizedCode.c_str(), uldn_localeDisplayName);
302 break;
303 case CodeType::SCRIPT:
304 result = GetDisplayName(displayNames, canonicalizedCode.c_str(), uldn_scriptDisplayName);
305 break;
306 case CodeType::REGION:
307 result = GetDisplayName(displayNames, canonicalizedCode.c_str(), uldn_regionDisplayName);
308 break;
309 case CodeType::CURRENCY: {
310 result = GetDisplayName(
311 displayNames, canonicalizedCode.c_str(),
312 [](ULocaleDisplayNames *ldn, const char *code, UChar *dest, int32_t destCapacity, UErrorCode *uStatus) {
313 return uldn_keyValueDisplayName(ldn, "currency", code, dest, destCapacity, uStatus);
314 });
315 if (!result) {
316 // Fallback: just use the currency code
317 result = icu::UnicodeString(canonicalizedCode.c_str(), static_cast<int32_t>(canonicalizedCode.length()),
318 "UTF-8");
319 }
320 break;
321 }
322 }
323
324 return result;
325 }
326
327 /// @brief Safely closes a ULocaleDisplayNames object
SafeCloseDisplayNames(ULocaleDisplayNames * displayNames)328 static inline void SafeCloseDisplayNames(ULocaleDisplayNames *displayNames)
329 {
330 if (displayNames != nullptr) {
331 uldn_close(displayNames);
332 }
333 }
334
335 /**
336 * @brief Native implementation of DisplayNames.of() method
337 *
338 * Retrieves a localized display name for a given code based on the specified type and locale.
339 *
340 * @param env The environment
341 * @param klass The class from which this method was called (unused)
342 * @param locale The BCP 47 language tag for the desired locale
343 * @param type The type of code ("language", "script", "region", or "currency")
344 * @param code The code to get the display name for
345 * @param style The style of the display name ("long" or "short")
346 * @param fallback The fallback behavior if the display name cannot be found ("code" or "none")
347 * @param languageDisplay How to display language names with dialects ("dialect" or "standard")
348 * @return The localized display name as an ani_string, or nullptr if not found and fallback is "none"
349 */
StdCoreIntlDisplayNamesOf(ani_env * env,ani_class klass,ani_string locale,ani_string type,ani_string code,ani_string style,ani_string fallback,ani_string languageDisplay)350 static ani_string StdCoreIntlDisplayNamesOf(ani_env *env, [[maybe_unused]] ani_class klass, ani_string locale,
351 ani_string type, ani_string code, ani_string style, ani_string fallback,
352 ani_string languageDisplay)
353 {
354 auto localeStr = ConvertFromAniString(env, locale);
355 auto typeStr = ConvertFromAniString(env, type);
356 auto codeStr = ConvertFromAniString(env, code);
357 auto styleStr = ConvertFromAniString(env, style);
358 auto fallbackStr = ConvertFromAniString(env, fallback);
359 auto languageDisplayStr = (languageDisplay != nullptr) ? ConvertFromAniString(env, languageDisplay) : "dialect";
360
361 bool isValidCodeType = true;
362 auto codeType = StringToCodeType(typeStr, isValidCodeType);
363
364 if (!isValidCodeType) {
365 ThrowNewError(env, "Lstd/core/RangeError;", ("Invalid type: " + typeStr).c_str(), "Lstd/core/String;:V");
366 return nullptr;
367 }
368
369 bool isValidCode = true;
370 std::string canonicalizedCode = ValidateAndCanonicalizeCode(env, codeType, codeStr, fallbackStr, isValidCode);
371 if (!isValidCode) {
372 return (fallbackStr == "code") ? StdStrToAni(env, codeStr) : nullptr;
373 }
374
375 // Set up ICU locale
376 UErrorCode status = U_ZERO_ERROR;
377 auto icuLocale = icu::Locale::forLanguageTag(localeStr, status);
378 if ((U_FAILURE(status) != 0)) {
379 ThrowNewError(env, "Lstd/core/RangeError;", ("Invalid locale tag: " + localeStr).c_str(),
380 "Lstd/core/String;:V");
381 return nullptr;
382 }
383
384 // Determine ICU dialect handling and open display names
385 auto dialectHandling = (styleStr == "short") ? ULDN_DIALECT_NAMES : ULDN_STANDARD_NAMES;
386 ULocaleDisplayNames *displayNames = uldn_open(localeStr.c_str(), dialectHandling, &status);
387
388 // Handle failure to create display names
389 if ((U_FAILURE(status) != 0) || displayNames == nullptr) {
390 SafeCloseDisplayNames(displayNames);
391 return (fallbackStr == "code") ? StdStrToAni(env, codeStr) : nullptr;
392 }
393
394 std::optional<icu::UnicodeString> result = GetDisplayNameByType(displayNames, codeType, canonicalizedCode);
395
396 ani_string retVal = nullptr;
397 bool hasValidResult = result.has_value() && (result->isEmpty() == 0);
398 if (hasValidResult) {
399 retVal = UnicodeToAniStr(env, *result);
400 } else if (fallbackStr == "code") {
401 retVal = StdStrToAni(env, codeStr);
402 } else {
403 ThrowNewError(env, "Lstd/core/RangeError;", ("Invalid code: " + codeStr).c_str(), "Lstd/core/String;:V");
404 }
405
406 SafeCloseDisplayNames(displayNames);
407 return retVal;
408 }
409
410 /**
411 * @brief Registers the native methods for the Intl.DisplayNames class
412 *
413 * This function binds the native C++ implementation of the DisplayNames methods
414 *
415 * @param env The environment
416 * @return ani_status indicating success or failure of the registration
417 */
RegisterIntlDisplayNames(ani_env * env)418 ani_status RegisterIntlDisplayNames(ani_env *env)
419 {
420 const auto methods = std::array {
421 ani_native_function {"ofNative",
422 "Lstd/core/String;Lstd/core/String;Lstd/core/String;Lstd/core/String;Lstd/core/"
423 "String;Lstd/core/String;:Lstd/core/String;",
424 reinterpret_cast<void *>(StdCoreIntlDisplayNamesOf)},
425 };
426
427 ani_class displayNamesClass;
428 ANI_FATAL_IF_ERROR(env->FindClass("Lstd/core/Intl/DisplayNames;", &displayNamesClass));
429
430 return env->Class_BindNativeMethods(displayNamesClass, methods.data(), methods.size());
431 }
432
433 } // namespace ark::ets::stdlib::intl
434