1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include <algorithm>
17 #include <cstdint>
18 #include <regex>
19 #include "include/mem/panda_string.h"
20 #include "intrinsics.h"
21 #include "libpandabase/utils/logger.h"
22 #include "macros.h"
23 #include "napi/ets_napi.h"
24 #include "runtime/handle_scope-inl.h"
25 #include "runtime/entrypoints/string_index_of.h"
26 #include "plugins/ets/runtime/types/ets_string.h"
27 #include "plugins/ets/runtime/ets_exceptions.h"
28 #include "plugins/ets/runtime/ets_language_context.h"
29 #include "plugins/ets/runtime/ets_panda_file_items.h"
30
31 #include "unicode/locid.h"
32 #include "unicode/coll.h"
33 #include "unicode/unistr.h"
34 #include "unicode/normalizer2.h"
35 #include "utils/span.h"
36
37 using icu::Normalizer2;
38
39 namespace ark::ets::intrinsics {
40
// Constants used by StdCoreStringIsWellFormed to classify UTF-16 code units.
// Mask with bits 10..20 set; applied to a 16-bit code unit only bits 10..15 can survive.
constexpr uint32_t CHAR0X1FFC00 = 0x1ffc00;
// Masked value of a leading (high) surrogate code unit.
constexpr uint16_t CHAR0XD800 = 0xd800;
// Masked value of a trailing (low) surrogate code unit.
constexpr uint16_t CHAR0XDC00 = 0xdc00;
44
StdCoreStringGetChars(EtsString * s,ets_int begin,ets_int end)45 EtsCharArray *StdCoreStringGetChars(EtsString *s, ets_int begin, ets_int end)
46 {
47 ASSERT(s != nullptr);
48 ets_int length = s->GetLength();
49 if (UNLIKELY(begin > end)) {
50 ark::ThrowStringIndexOutOfBoundsException(begin, length);
51 return nullptr;
52 }
53 if (UNLIKELY(begin > length || begin < 0)) {
54 ark::ThrowStringIndexOutOfBoundsException(begin, length);
55 return nullptr;
56 }
57 if (UNLIKELY(end > length)) {
58 ark::ThrowStringIndexOutOfBoundsException(end, length);
59 return nullptr;
60 }
61
62 auto thread = ManagedThread::GetCurrent();
63 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
64 VMHandle<coretypes::String> sHandle(thread, s->GetCoreType());
65 ets_int n = end - begin;
66 EtsCharArray *charArray = EtsCharArray::Create(n);
67 if (charArray == nullptr || n == 0) {
68 return charArray;
69 }
70 Span<ets_char> out(charArray->GetData<ets_char>(), charArray->GetLength());
71 sHandle.GetPtr()->CopyDataRegionUtf16(&out[0], begin, charArray->GetLength(), sHandle.GetPtr()->GetLength());
72 return charArray;
73 }
74
StdCoreStringSubstring(EtsString * str,ets_int begin,ets_int end)75 EtsString *StdCoreStringSubstring(EtsString *str, ets_int begin, ets_int end)
76 {
77 ASSERT(str != nullptr);
78 auto indexes = coretypes::String::NormalizeSubStringIndexes(begin, end, str->GetCoreType());
79 if (UNLIKELY(indexes.first == 0 && indexes.second == str->GetLength())) {
80 return str;
81 }
82 ets_int substrLength = indexes.second - indexes.first;
83 return EtsString::FastSubString(str, static_cast<uint32_t>(indexes.first), static_cast<uint32_t>(substrLength));
84 }
85
StdCoreStringCharAt(EtsString * s,int32_t index)86 uint16_t StdCoreStringCharAt(EtsString *s, int32_t index)
87 {
88 ASSERT(s != nullptr);
89
90 int32_t length = s->GetLength();
91 if (UNLIKELY(index >= length || index < 0)) {
92 ark::ThrowStringIndexOutOfBoundsException(index, length);
93 return 0;
94 }
95
96 if (s->IsUtf16()) {
97 Span<uint16_t> sp(s->GetDataUtf16(), length);
98 return sp[index];
99 }
100
101 Span<uint8_t> sp(s->GetDataMUtf8(), length);
102 return sp[index];
103 }
104
StdCoreStringGetLength(EtsString * s)105 int32_t StdCoreStringGetLength(EtsString *s)
106 {
107 ASSERT(s != nullptr);
108 return s->GetLength();
109 }
110
StdCoreStringLength(EtsString * s)111 double StdCoreStringLength(EtsString *s)
112 {
113 ASSERT(s != nullptr);
114 return static_cast<double>(s->GetLength());
115 }
116
StdCoreStringIsEmpty(EtsString * s)117 EtsBoolean StdCoreStringIsEmpty(EtsString *s)
118 {
119 ASSERT(s != nullptr);
120 return ToEtsBoolean(s->IsEmpty());
121 }
122
StdCoreStringEquals(EtsString * owner,EtsObject * s)123 uint8_t StdCoreStringEquals(EtsString *owner, EtsObject *s)
124 {
125 if ((owner->AsObject()) == s) {
126 return UINT8_C(1);
127 }
128 if (s == nullptr || !(s->GetClass()->IsStringClass())) {
129 return UINT8_C(0);
130 }
131 return static_cast<uint8_t>(owner->StringsAreEqual(s));
132 }
133
StdCoreStringMatch(EtsString * thisStr,EtsString * reg)134 EtsString *StdCoreStringMatch(EtsString *thisStr, EtsString *reg)
135 {
136 PandaVector<uint8_t> buf;
137 auto thisS = std::string(thisStr->ConvertToStringView(&buf));
138 auto regex = std::string(reg->ConvertToStringView(&buf));
139
140 std::regex e(regex);
141 return EtsString::CreateFromMUtf8(std::sregex_iterator(thisS.begin(), thisS.end(), e)->str().c_str());
142 }
143
StringNormalize(EtsString * str,const Normalizer2 * normalizer)144 EtsString *StringNormalize(EtsString *str, const Normalizer2 *normalizer)
145 {
146 auto coroutine = EtsCoroutine::GetCurrent();
147 [[maybe_unused]] HandleScope<ObjectHeader *> scope(coroutine);
148
149 icu::UnicodeString utf16Str;
150 if (str->IsUtf16()) {
151 utf16Str = icu::UnicodeString {str->GetDataUtf16(), static_cast<int32_t>(str->GetUtf16Length())};
152 } else {
153 utf16Str =
154 icu::UnicodeString {utf::Mutf8AsCString(str->GetDataMUtf8()), static_cast<int32_t>(str->GetLength())};
155 }
156
157 UErrorCode errorCode = U_ZERO_ERROR;
158 utf16Str = normalizer->normalize(utf16Str, errorCode);
159
160 if (UNLIKELY(U_FAILURE(errorCode))) {
161 std::string message = "Got error in process of normalization: '" + std::string(u_errorName(errorCode)) + "'";
162 ThrowEtsException(coroutine, panda_file_items::class_descriptors::RANGE_ERROR, message);
163 return nullptr;
164 }
165
166 return EtsString::CreateFromUtf16(reinterpret_cast<const uint16_t *>(utf16Str.getTerminatedBuffer()),
167 utf16Str.length());
168 }
169
StdCoreStringNormalizeNFC(EtsString * thisStr)170 EtsString *StdCoreStringNormalizeNFC(EtsString *thisStr)
171 {
172 UErrorCode errorCode = U_ZERO_ERROR;
173 auto normalizer = Normalizer2::getNFCInstance(errorCode);
174 if (UNLIKELY(U_FAILURE(errorCode))) {
175 std::string message = "Cannot get NFC normalizer: '" + std::string(u_errorName(errorCode)) + "'";
176 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
177 return nullptr;
178 }
179 return StringNormalize(thisStr, normalizer);
180 }
181
StdCoreStringNormalizeNFD(EtsString * thisStr)182 EtsString *StdCoreStringNormalizeNFD(EtsString *thisStr)
183 {
184 UErrorCode errorCode = U_ZERO_ERROR;
185 auto normalizer = Normalizer2::getNFDInstance(errorCode);
186 if (UNLIKELY(U_FAILURE(errorCode))) {
187 std::string message = "Cannot get NFD normalizer: '" + std::string(u_errorName(errorCode)) + "'";
188 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
189 return nullptr;
190 }
191 return StringNormalize(thisStr, normalizer);
192 }
193
StdCoreStringNormalizeNFKC(EtsString * thisStr)194 EtsString *StdCoreStringNormalizeNFKC(EtsString *thisStr)
195 {
196 UErrorCode errorCode = U_ZERO_ERROR;
197 auto normalizer = Normalizer2::getNFKCInstance(errorCode);
198 if (UNLIKELY(U_FAILURE(errorCode))) {
199 std::string message = "Cannot get NFKC normalizer: '" + std::string(u_errorName(errorCode)) + "'";
200 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
201 return nullptr;
202 }
203 return StringNormalize(thisStr, normalizer);
204 }
205
StdCoreStringNormalizeNFKD(EtsString * thisStr)206 EtsString *StdCoreStringNormalizeNFKD(EtsString *thisStr)
207 {
208 UErrorCode errorCode = U_ZERO_ERROR;
209 auto normalizer = Normalizer2::getNFKDInstance(errorCode);
210 if (UNLIKELY(U_FAILURE(errorCode))) {
211 std::string message = "Cannot get NFKD normalizer: '" + std::string(u_errorName(errorCode)) + "'";
212 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
213 return nullptr;
214 }
215 return StringNormalize(thisStr, normalizer);
216 }
217
StdCoreStringIsWellFormed(EtsString * thisStr)218 uint8_t StdCoreStringIsWellFormed(EtsString *thisStr)
219 {
220 if (!thisStr->IsUtf16()) {
221 return UINT8_C(1);
222 }
223 auto length = thisStr->GetUtf16Length();
224 auto codeUnits = Span<uint16_t>(thisStr->GetDataUtf16(), length);
225 for (size_t i = 0; i < length; ++i) {
226 uint16_t codeUnit = codeUnits[i];
227 if ((codeUnit & CHAR0X1FFC00) == CHAR0XD800) {
228 // Code unit is a leading surrogate
229 if (i == length - 1) {
230 return UINT8_C(0);
231 }
232 // Is not trail surrogate
233 if ((codeUnits[i + 1] & CHAR0X1FFC00) != CHAR0XDC00) {
234 return UINT8_C(0);
235 }
236 // Skip the paired trailing surrogate
237 ++i;
238 // Is trail surrogate
239 } else if ((codeUnit & CHAR0X1FFC00) == CHAR0XDC00) {
240 return UINT8_C(0);
241 }
242 }
243 return UINT8_C(1);
244 }
245
ToLowerCase(EtsString * thisStr,const icu::Locale & locale)246 EtsString *ToLowerCase(EtsString *thisStr, const icu::Locale &locale)
247 {
248 auto coroutine = EtsCoroutine::GetCurrent();
249 [[maybe_unused]] HandleScope<ObjectHeader *> scope(coroutine);
250
251 icu::UnicodeString utf16Str;
252 if (thisStr->IsUtf16()) {
253 utf16Str = icu::UnicodeString {thisStr->GetDataUtf16(), static_cast<int32_t>(thisStr->GetUtf16Length())};
254 } else {
255 utf16Str = icu::UnicodeString {utf::Mutf8AsCString(thisStr->GetDataMUtf8()),
256 static_cast<int32_t>(thisStr->GetLength())};
257 }
258 auto res = utf16Str.toLower(locale);
259 return EtsString::CreateFromUtf16(reinterpret_cast<const uint16_t *>(res.getTerminatedBuffer()), res.length());
260 }
261
ToUpperCase(EtsString * thisStr,const icu::Locale & locale)262 EtsString *ToUpperCase(EtsString *thisStr, const icu::Locale &locale)
263 {
264 auto coroutine = EtsCoroutine::GetCurrent();
265 [[maybe_unused]] HandleScope<ObjectHeader *> scope(coroutine);
266
267 icu::UnicodeString utf16Str;
268 if (thisStr->IsUtf16()) {
269 utf16Str = icu::UnicodeString {thisStr->GetDataUtf16(), static_cast<int32_t>(thisStr->GetUtf16Length())};
270 } else {
271 utf16Str = icu::UnicodeString {utf::Mutf8AsCString(thisStr->GetDataMUtf8()),
272 static_cast<int32_t>(thisStr->GetLength())};
273 }
274 auto res = utf16Str.toUpper(locale);
275 return EtsString::CreateFromUtf16(reinterpret_cast<const uint16_t *>(res.getTerminatedBuffer()), res.length());
276 }
277
ParseSingleBCP47LanguageTag(EtsString * langTag,icu::Locale & locale)278 UErrorCode ParseSingleBCP47LanguageTag(EtsString *langTag, icu::Locale &locale)
279 {
280 if (langTag == nullptr) {
281 locale = icu::Locale::getDefault();
282 return U_ZERO_ERROR;
283 }
284
285 PandaVector<uint8_t> buf;
286 std::string_view locTag = langTag->ConvertToStringView(&buf);
287 icu::StringPiece sp {locTag.data(), static_cast<int32_t>(locTag.size())};
288 UErrorCode status = U_ZERO_ERROR;
289 locale = icu::Locale::forLanguageTag(sp, status);
290 return status;
291 }
292
StdCoreStringToUpperCase(EtsString * thisStr)293 EtsString *StdCoreStringToUpperCase(EtsString *thisStr)
294 {
295 return ToUpperCase(thisStr, icu::Locale::getDefault());
296 }
297
StdCoreStringToLowerCase(EtsString * thisStr)298 EtsString *StdCoreStringToLowerCase(EtsString *thisStr)
299 {
300 return ToLowerCase(thisStr, icu::Locale::getDefault());
301 }
302
StdCoreStringToLocaleUpperCase(EtsString * thisStr,EtsString * langTag)303 EtsString *StdCoreStringToLocaleUpperCase(EtsString *thisStr, EtsString *langTag)
304 {
305 ASSERT(langTag != nullptr);
306
307 icu::Locale locale;
308 auto localeParseStatus = ParseSingleBCP47LanguageTag(langTag, locale);
309 if (UNLIKELY(U_FAILURE(localeParseStatus))) {
310 auto message = "Language tag '" + ConvertToString(langTag->GetCoreType()) + "' is invalid or not supported";
311 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
312 return nullptr;
313 }
314 return ToUpperCase(thisStr, locale);
315 }
316
StdCoreStringToLocaleLowerCase(EtsString * thisStr,EtsString * langTag)317 EtsString *StdCoreStringToLocaleLowerCase(EtsString *thisStr, EtsString *langTag)
318 {
319 ASSERT(langTag != nullptr);
320
321 icu::Locale locale;
322 auto localeParseStatus = ParseSingleBCP47LanguageTag(langTag, locale);
323 if (UNLIKELY(U_FAILURE(localeParseStatus))) {
324 auto message = "Language tag '" + ConvertToString(langTag->GetCoreType()) + "' is invalid or not supported";
325 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
326 return nullptr;
327 }
328 return ToLowerCase(thisStr, locale);
329 }
330
StdCoreStringLocaleCmp(EtsString * thisStr,EtsString * cmpStr,EtsString * langTag)331 ets_double StdCoreStringLocaleCmp(EtsString *thisStr, EtsString *cmpStr, EtsString *langTag)
332 {
333 ASSERT(thisStr != nullptr && cmpStr != nullptr);
334
335 icu::Locale locale;
336 auto status = ParseSingleBCP47LanguageTag(langTag, locale);
337 if (UNLIKELY(U_FAILURE(status))) {
338 auto message = "Language tag '" + ConvertToString(langTag->GetCoreType()) + "' is invalid or not supported";
339 ThrowEtsException(EtsCoroutine::GetCurrent(), panda_file_items::class_descriptors::RANGE_ERROR, message);
340 return 0;
341 }
342
343 icu::UnicodeString source;
344 if (thisStr->IsUtf16()) {
345 source = icu::UnicodeString {thisStr->GetDataUtf16(), static_cast<int32_t>(thisStr->GetUtf16Length())};
346 } else {
347 source = icu::UnicodeString {utf::Mutf8AsCString(thisStr->GetDataMUtf8()),
348 static_cast<int32_t>(thisStr->GetLength())};
349 }
350 icu::UnicodeString target;
351 if (cmpStr->IsUtf16()) {
352 target = icu::UnicodeString {cmpStr->GetDataUtf16(), static_cast<int32_t>(cmpStr->GetUtf16Length())};
353 } else {
354 target =
355 icu::UnicodeString {utf::Mutf8AsCString(cmpStr->GetDataMUtf8()), static_cast<int32_t>(cmpStr->GetLength())};
356 }
357 status = U_ZERO_ERROR;
358 std::unique_ptr<icu::Collator> myCollator(icu::Collator::createInstance(locale, status));
359 if (UNLIKELY(U_FAILURE(status))) {
360 icu::UnicodeString dispName;
361 locale.getDisplayName(dispName);
362 std::string localeName;
363 dispName.toUTF8String(localeName);
364 LOG(FATAL, ETS) << "Failed to create the collator for " << localeName;
365 }
366 return myCollator->compare(source, target);
367 }
368
StdCoreStringIndexOfAfter(EtsString * s,uint16_t ch,ets_int fromIndex)369 ets_int StdCoreStringIndexOfAfter(EtsString *s, uint16_t ch, ets_int fromIndex)
370 {
371 return ark::intrinsics::StringIndexOfU16(s, ch, fromIndex);
372 }
373
StdCoreStringIndexOf(EtsString * s,uint16_t ch)374 ets_int StdCoreStringIndexOf(EtsString *s, uint16_t ch)
375 {
376 return StdCoreStringIndexOfAfter(s, ch, 0);
377 }
378
StdCoreStringIndexOfString(EtsString * thisStr,EtsString * patternStr,ets_int fromIndex)379 ets_int StdCoreStringIndexOfString(EtsString *thisStr, EtsString *patternStr, ets_int fromIndex)
380 {
381 ASSERT(thisStr != nullptr && patternStr != nullptr);
382 return thisStr->GetCoreType()->IndexOf(patternStr->GetCoreType(), fromIndex);
383 }
384
StdCoreStringLastIndexOfString(EtsString * thisStr,EtsString * patternStr,ets_int fromIndex)385 ets_int StdCoreStringLastIndexOfString(EtsString *thisStr, EtsString *patternStr, ets_int fromIndex)
386 {
387 ASSERT(thisStr != nullptr && patternStr != nullptr);
388 // "abc".lastIndexOf("ab", -10) will return 0
389 return thisStr->GetCoreType()->LastIndexOf(patternStr->GetCoreType(), std::max(fromIndex, 0));
390 }
391
StdCoreStringCodePointToChar(ets_int codePoint)392 ets_int StdCoreStringCodePointToChar(ets_int codePoint)
393 {
394 icu::UnicodeString uniStr((UChar32)codePoint);
395 uint32_t ret = bit_cast<uint16_t>(uniStr.charAt(0));
396 // if codepoint contains a surrogate pair
397 // encode it into int with higher bits being second char
398 if (uniStr.length() > 1) {
399 constexpr uint32_t BITS_IN_CHAR = 16;
400 ret |= static_cast<uint32_t>(bit_cast<uint16_t>(uniStr.charAt(1))) << BITS_IN_CHAR;
401 }
402 return bit_cast<ets_int>(ret);
403 }
404
StdCoreStringHashCode(EtsString * thisStr)405 int32_t StdCoreStringHashCode(EtsString *thisStr)
406 {
407 ASSERT(thisStr != nullptr);
408 return thisStr->GetCoreType()->GetHashcode();
409 }
410
StdCoreStringIsCompressed(EtsString * thisStr)411 EtsBoolean StdCoreStringIsCompressed(EtsString *thisStr)
412 {
413 ASSERT(thisStr != nullptr);
414 return ToEtsBoolean(thisStr->GetCoreType()->IsMUtf8());
415 }
416
StdCoreStringConcat2(EtsString * str1,EtsString * str2)417 EtsString *StdCoreStringConcat2(EtsString *str1, EtsString *str2)
418 {
419 auto s1 = reinterpret_cast<coretypes::String *>(str1);
420 auto s2 = reinterpret_cast<coretypes::String *>(str2);
421 return reinterpret_cast<EtsString *>(CoreStringConcat2(s1, s2));
422 }
423
StdCoreStringConcat3(EtsString * str1,EtsString * str2,EtsString * str3)424 EtsString *StdCoreStringConcat3(EtsString *str1, EtsString *str2, EtsString *str3)
425 {
426 auto s1 = reinterpret_cast<coretypes::String *>(str1);
427 auto s2 = reinterpret_cast<coretypes::String *>(str2);
428 auto s3 = reinterpret_cast<coretypes::String *>(str3);
429 return reinterpret_cast<EtsString *>(CoreStringConcat3(s1, s2, s3));
430 }
431
StdCoreStringConcat4(EtsString * str1,EtsString * str2,EtsString * str3,EtsString * str4)432 EtsString *StdCoreStringConcat4(EtsString *str1, EtsString *str2, EtsString *str3, EtsString *str4)
433 {
434 auto s1 = reinterpret_cast<coretypes::String *>(str1);
435 auto s2 = reinterpret_cast<coretypes::String *>(str2);
436 auto s3 = reinterpret_cast<coretypes::String *>(str3);
437 auto s4 = reinterpret_cast<coretypes::String *>(str4);
438 return reinterpret_cast<EtsString *>(CoreStringConcat4(s1, s2, s3, s4));
439 }
440
StdCoreStringCompareTo(EtsString * str1,EtsString * str2)441 ets_int StdCoreStringCompareTo(EtsString *str1, EtsString *str2)
442 {
443 /* corner cases */
444 if (str1->GetLength() == 0) {
445 return -str2->GetLength();
446 }
447 if (str2->GetLength() == 0) {
448 return str1->GetLength();
449 }
450
451 /* use the default implementation otherwise */
452 return str1->GetCoreType()->Compare(str2->GetCoreType());
453 }
454
StdCoreStringTrimLeft(EtsString * thisStr)455 EtsString *StdCoreStringTrimLeft(EtsString *thisStr)
456 {
457 return thisStr->TrimLeft();
458 }
459
StdCoreStringTrimRight(EtsString * thisStr)460 EtsString *StdCoreStringTrimRight(EtsString *thisStr)
461 {
462 return thisStr->TrimRight();
463 }
464
StdCoreStringTrim(EtsString * thisStr)465 EtsString *StdCoreStringTrim(EtsString *thisStr)
466 {
467 return thisStr->Trim();
468 }
469
470 } // namespace ark::ets::intrinsics
471