1 /** 2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_ 16 #define PANDA_RUNTIME_CORETYPES_STRING_H_ 17 18 #include <securec.h> 19 #include <cstddef> 20 #include <cstdint> 21 #include <cstring> 22 23 #include "libpandabase/utils/utf.h" 24 #include "runtime/include/language_context.h" 25 #include "runtime/include/object_header.h" 26 27 namespace ark::coretypes { 28 29 class Array; 30 class String : public ObjectHeader { 31 public: Cast(ObjectHeader * object)32 static String *Cast(ObjectHeader *object) 33 { 34 // NOTE(linxiang) to do assert 35 return static_cast<String *>(object); 36 } 37 38 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length, 39 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 40 bool movable = true, bool pinned = false); 41 42 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 43 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 44 bool movable = true, bool pinned = false); 45 46 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 47 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 48 bool pinned = false); 49 50 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm, 51 bool movable = true, bool pinned = false); 52 53 static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length, 54 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned); 55 56 PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length, 57 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 58 bool pinned = false); 59 60 PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, 61 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 62 bool movable = true, bool pinned = false); 63 64 PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, 65 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 66 bool pinned = false); 67 68 PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm); 69 70 PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm); 71 72 PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm); 73 74 PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, 75 const LanguageContext &ctx, PandaVM *vm); 76 77 static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray, 78 const LanguageContext &ctx, PandaVM *vm); 79 80 template <bool VERIFY = true> 81 uint16_t At(int32_t index); 82 83 PANDA_PUBLIC_API int32_t Compare(String *rstr); 84 85 PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx); 86 87 PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length, 88 const LanguageContext &ctx); 89 IsUtf16()90 bool IsUtf16() const 91 { 92 return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 93 } 94 IsMUtf8()95 bool IsMUtf8() const 96 { 97 return !IsUtf16(); 98 } 99 ComputeDataSizeUtf16(uint32_t length)100 static size_t ComputeDataSizeUtf16(uint32_t length) 101 { 102 return length * sizeof(dataUtf16_[0]); 103 } 104 105 /// Methods for uncompressed strings (UTF16) ComputeSizeUtf16(uint32_t utf16Length)106 static size_t ComputeSizeUtf16(uint32_t utf16Length) 107 { 108 return sizeof(String) + ComputeDataSizeUtf16(utf16Length); 109 } 110 GetDataUtf16()111 uint16_t *GetDataUtf16() 112 { 113 ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string"); 114 return dataUtf16_; 115 } 116 117 /// Methods for compresses strings (MUTF8 or LATIN1) ComputeSizeMUtf8(uint32_t mutf8Length)118 static size_t ComputeSizeMUtf8(uint32_t mutf8Length) 119 { 120 return sizeof(String) + mutf8Length; 121 } 122 GetDataUtf8()123 uint8_t *GetDataUtf8() 124 { 125 ASSERT_PRINT(!IsUtf16(), "String: Read data as utf8 for utf16 string"); 126 return reinterpret_cast<uint8_t *>(dataUtf16_); 127 } 128 129 /// It's MUtf8 format, but without 0 in the end. GetDataMUtf8()130 uint8_t *GetDataMUtf8() 131 { 132 ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string"); 133 return reinterpret_cast<uint8_t *>(dataUtf16_); 134 } 135 GetMUtf8Length()136 size_t GetMUtf8Length() 137 { 138 if (!IsUtf16()) { 139 return GetLength() + 1; // add place for zero at the end 140 } 141 return ark::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength()); 142 } 143 GetUtf16Length()144 size_t GetUtf16Length() 145 { 146 return GetLength(); 147 } 148 GetUtf8Length()149 size_t GetUtf8Length() 150 { 151 if (!IsUtf16()) { 152 return GetLength(); 153 } 154 return ark::utf::Utf16ToUtf8Size(dataUtf16_, GetLength(), false) - 1; 155 } 156 CopyDataRegionUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)157 size_t CopyDataRegionUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) 158 { 159 if (length > maxLength) { 160 return 0; 161 } 162 uint32_t len = GetUtf8Length(); 163 if (start + length > len) { 164 return 0; 165 } 166 if (!IsUtf16()) { 167 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 168 if (length > MAX_LEN) { 169 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 170 } 171 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 172 if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataUtf8() + start, length) != EOK) { 173 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 174 } 175 return length; 176 } 177 length = GetUtf16Length(); 178 return ark::utf::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, maxLength, start, false); 179 } 180 CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)181 inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString) 182 { 183 if (isCString) { 184 ASSERT(maxLength != 0); 185 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 186 buf[maxLength - 1] = '\0'; 187 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1; // add place for zero at the end 188 } 189 190 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength); 191 } 192 CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)193 size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) 194 { 195 if (length > maxLength) { 196 return 0; 197 } 198 uint32_t len = GetLength(); 199 if (start + length > len) { 200 return 0; 201 } 202 if (!IsUtf16()) { 203 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 204 if (length > MAX_LEN) { 205 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 206 } 207 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 208 if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) { 209 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 210 } 211 return length; 212 } 213 return ark::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); 214 } 215 CopyDataUtf16(uint16_t * buf,uint32_t maxLength)216 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) 217 { 218 return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength); 219 } 220 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)221 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) 222 { 223 if (length > maxLength) { 224 return 0; 225 } 226 uint32_t len = GetLength(); 227 if (start + length > len) { 228 return 0; 229 } 230 if (IsUtf16()) { 231 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 232 if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != 233 EOK) { 234 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 235 } 236 } else { 237 uint8_t *src8 = GetDataMUtf8() + start; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 238 for (uint32_t i = 0; i < length; ++i) { 239 buf[i] = src8[i]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 240 } 241 } 242 return length; 243 } 244 GetLength()245 uint32_t GetLength() const 246 { 247 uint32_t length; 248 if (compressedStringsEnabled_) { 249 length = length_ >> 1U; 250 } else { 251 length = length_; 252 } 253 return length; 254 } 255 IsEmpty()256 bool IsEmpty() const 257 { 258 // do not shift right length because it is always zero for empty string 259 return length_ == 0; 260 } 261 ObjectSize()262 size_t ObjectSize() const 263 { 264 uint32_t length = GetLength(); 265 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length); 266 } 267 GetHashcode()268 uint32_t GetHashcode() 269 { 270 if (hashcode_ == 0) { 271 hashcode_ = ComputeHashcode(); 272 } 273 return hashcode_; 274 } 275 276 int32_t IndexOf(String *rhs, int pos = 0); 277 int32_t LastIndexOf(String *rhs, int pos = INT32_MAX); 278 GetLengthOffset()279 static constexpr uint32_t GetLengthOffset() 280 { 281 return MEMBER_OFFSET(String, length_); 282 } 283 GetDataOffset()284 static constexpr uint32_t GetDataOffset() 285 { 286 return MEMBER_OFFSET(String, dataUtf16_); 287 } 288 GetHashcodeOffset()289 static constexpr uint32_t GetHashcodeOffset() 290 { 291 return MEMBER_OFFSET(String, hashcode_); 292 } 293 GetStringCompressionMask()294 static constexpr uint32_t GetStringCompressionMask() 295 { 296 return STRING_COMPRESSED_BIT; 297 } 298 299 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 300 PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2); 301 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 302 PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length); 303 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length, 304 bool canBeCompressed); 305 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 306 PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data, 307 uint32_t utf16DataLength); 308 static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm); 309 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length); 310 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed); 311 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); 312 SetCompressedStringsEnabled(bool val)313 static void SetCompressedStringsEnabled(bool val) 314 { 315 compressedStringsEnabled_ = val; 316 } 317 GetCompressedStringsEnabled()318 static bool GetCompressedStringsEnabled() 319 { 320 return compressedStringsEnabled_; 321 } 322 NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)323 static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex, 324 const coretypes::String *str) 325 { 326 auto strLen = str->GetLength(); 327 std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex}; 328 329 // If begin_index < 0, then it is assumed to be equal to zero. 330 if (normIndexes.first < 0) { 331 normIndexes.first = 0; 332 } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) { 333 // If begin_index > str_len, then it is assumed to be equal to str_len. 334 normIndexes.first = static_cast<int32_t>(strLen); 335 } 336 // If end_index < 0, then it is assumed to be equal to zero. 337 if (normIndexes.second < 0) { 338 normIndexes.second = 0; 339 } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) { 340 // If end_index > str_len, then it is assumed to be equal to str_len. 341 normIndexes.second = static_cast<int32_t>(strLen); 342 } 343 // If begin_index > end_index, then these are swapped. 344 if (normIndexes.first > normIndexes.second) { 345 std::swap(normIndexes.first, normIndexes.second); 346 } 347 ASSERT((normIndexes.second - normIndexes.first) >= 0); 348 return normIndexes; 349 } 350 351 static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx, 352 PandaVM *vm = nullptr); 353 354 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data); 355 IsASCIICharacter(uint16_t data)356 static bool IsASCIICharacter(uint16_t data) 357 { 358 // \0 is not considered ASCII in Modified-UTF8 359 return data - 1U < utf::UTF8_1B_MAX; 360 } 361 362 protected: 363 void SetLength(uint32_t length, bool compressed = false) 364 { 365 if (compressedStringsEnabled_) { 366 ASSERT(length < 0x80000000U); 367 // Use 0u for compressed/utf8 expression 368 length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED); 369 } else { 370 length_ = length; 371 } 372 } 373 SetHashcode(uint32_t hashcode)374 void SetHashcode(uint32_t hashcode) 375 { 376 hashcode_ = hashcode; 377 } 378 379 uint32_t ComputeHashcode(); 380 static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length); 381 static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length); 382 static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr, 383 bool movable = true, bool pinned = false); 384 385 private: 386 PANDA_PUBLIC_API static bool compressedStringsEnabled_; 387 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 388 enum CompressedStatus { 389 STRING_COMPRESSED, 390 STRING_UNCOMPRESSED, 391 }; 392 393 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length); 394 static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non); 395 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non); 396 397 /** 398 * str1 should have the same length as mutf16_data. 399 * Converts mutf8_data to mutf16 and compare it with given mutf16_data. 400 */ 401 // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer 402 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data, 403 uint32_t utf16DataLength); 404 405 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength); 406 407 template <typename T> 408 /// Check that two spans are equal. Should have the same length. 409 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 410 411 // In last bit of length_ we store if this string is compressed or not. 412 uint32_t length_; 413 uint32_t hashcode_; 414 // A pointer to the string data stored after the string header. 415 // Data can be stored in mutf8 or utf16 form according to compressed bit. 416 __extension__ uint16_t dataUtf16_[0]; // NOLINT(modernize-avoid-c-arrays) 417 }; 418 419 constexpr uint32_t STRING_LENGTH_OFFSET = sizeof(ObjectHeader); 420 static_assert(STRING_LENGTH_OFFSET == ark::coretypes::String::GetLengthOffset()); 421 constexpr uint32_t STRING_HASHCODE_OFFSET = STRING_LENGTH_OFFSET + sizeof(uint32_t); 422 static_assert(STRING_HASHCODE_OFFSET == ark::coretypes::String::GetHashcodeOffset()); 423 constexpr uint32_t STRING_DATA_OFFSET = STRING_HASHCODE_OFFSET + sizeof(uint32_t); 424 static_assert(STRING_DATA_OFFSET == ark::coretypes::String::GetDataOffset()); 425 426 } // namespace ark::coretypes 427 428 #endif // PANDA_RUNTIME_CORETYPES_STRING_H_ 429