1 /** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_ 16 #define PANDA_RUNTIME_CORETYPES_STRING_H_ 17 18 #include <securec.h> 19 #include <cstddef> 20 #include <cstdint> 21 #include <cstring> 22 23 #include "libpandabase/utils/utf.h" 24 #include "runtime/include/language_context.h" 25 #include "runtime/include/object_header.h" 26 #include "runtime/mem/vm_handle.h" 27 28 namespace ark::coretypes { 29 30 class Array; 31 class String : public ObjectHeader { 32 public: Cast(ObjectHeader * object)33 static String *Cast(ObjectHeader *object) 34 { 35 // NOTE(linxiang) to do assert 36 return static_cast<String *>(object); 37 } 38 39 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length, 40 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 41 bool movable = true, bool pinned = false); 42 43 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 44 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 45 bool movable = true, bool pinned = false); 46 47 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 48 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 49 bool pinned = false); 50 51 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm, 52 bool movable = true, bool pinned = false); 53 54 static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length, 55 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned); 56 57 PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length, 58 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 59 bool pinned = false); 60 61 PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, 62 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 63 bool pinned = false); 64 65 PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm); 66 67 PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm); 68 69 PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm); 70 71 PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, 72 const LanguageContext &ctx, PandaVM *vm); 73 74 static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray, 75 const LanguageContext &ctx, PandaVM *vm); 76 77 template <bool VERIFY = true> 78 uint16_t At(int32_t index); 79 80 PANDA_PUBLIC_API int32_t Compare(String *rstr); 81 82 PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx); 83 84 PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length, 85 const LanguageContext &ctx); 86 IsUtf16()87 bool IsUtf16() const 88 { 89 return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 90 } 91 IsMUtf8()92 bool IsMUtf8() const 93 { 94 return !IsUtf16(); 95 } 96 ComputeDataSizeUtf16(uint32_t length)97 static size_t ComputeDataSizeUtf16(uint32_t length) 98 { 99 return length * sizeof(dataUtf16_[0]); 100 } 101 102 /// Methods for uncompressed strings (UTF16) ComputeSizeUtf16(uint32_t utf16Length)103 static size_t ComputeSizeUtf16(uint32_t utf16Length) 104 { 105 return sizeof(String) + ComputeDataSizeUtf16(utf16Length); 106 } 107 GetDataUtf16()108 uint16_t *GetDataUtf16() 109 { 110 ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string"); 111 return dataUtf16_; 112 } 113 114 /// Methods for compresses strings (MUTF8 or LATIN1) ComputeSizeMUtf8(uint32_t mutf8Length)115 static size_t ComputeSizeMUtf8(uint32_t mutf8Length) 116 { 117 return sizeof(String) + mutf8Length; 118 } 119 120 /// It's MUtf8 format, but without 0 in the end. GetDataMUtf8()121 uint8_t *GetDataMUtf8() 122 { 123 ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string"); 124 return reinterpret_cast<uint8_t *>(dataUtf16_); 125 } 126 GetMUtf8Length()127 size_t GetMUtf8Length() 128 { 129 if (!IsUtf16()) { 130 return GetLength() + 1; // add place for zero at the end 131 } 132 return ark::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength()); 133 } 134 GetUtf16Length()135 size_t GetUtf16Length() 136 { 137 return GetLength(); 138 } 139 CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)140 inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString) 141 { 142 if (isCString) { 143 ASSERT(maxLength != 0); 144 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 145 buf[maxLength - 1] = '\0'; 146 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1; // add place for zero at the end 147 } 148 149 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength); 150 } 151 CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)152 size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) 153 { 154 if (length > maxLength) { 155 return 0; 156 } 157 uint32_t len = GetLength(); 158 if (start + length > len) { 159 return 0; 160 } 161 if (!IsUtf16()) { 162 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 163 if (length > MAX_LEN) { 164 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 165 } 166 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 167 if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) { 168 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 169 } 170 return length; 171 } 172 return ark::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); 173 } 174 CopyDataUtf16(uint16_t * buf,uint32_t maxLength)175 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) 176 { 177 return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength); 178 } 179 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)180 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) 181 { 182 if (length > maxLength) { 183 return 0; 184 } 185 uint32_t len = GetLength(); 186 if (start + length > len) { 187 return 0; 188 } 189 if (IsUtf16()) { 190 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 191 if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != 192 EOK) { 193 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 194 } 195 } else { 196 uint8_t *src8 = GetDataMUtf8() + start; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 197 for (uint32_t i = 0; i < length; ++i) { 198 buf[i] = src8[i]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 199 } 200 } 201 return length; 202 } 203 GetLength()204 uint32_t GetLength() const 205 { 206 uint32_t length; 207 if (compressedStringsEnabled_) { 208 length = length_ >> 1U; 209 } else { 210 length = length_; 211 } 212 return length; 213 } 214 IsEmpty()215 bool IsEmpty() const 216 { 217 // do not shift right length because it is always zero for empty string 218 return length_ == 0; 219 } 220 ObjectSize()221 size_t ObjectSize() const 222 { 223 uint32_t length = GetLength(); 224 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length); 225 } 226 GetHashcode()227 uint32_t GetHashcode() 228 { 229 if (hashcode_ == 0) { 230 hashcode_ = ComputeHashcode(); 231 } 232 return hashcode_; 233 } 234 235 int32_t IndexOf(String *rhs, int pos = 0); 236 int32_t LastIndexOf(String *rhs, int pos = INT32_MAX); 237 GetLengthOffset()238 static constexpr uint32_t GetLengthOffset() 239 { 240 return MEMBER_OFFSET(String, length_); 241 } 242 GetDataOffset()243 static constexpr uint32_t GetDataOffset() 244 { 245 return MEMBER_OFFSET(String, dataUtf16_); 246 } 247 GetHashcodeOffset()248 static constexpr uint32_t GetHashcodeOffset() 249 { 250 return MEMBER_OFFSET(String, hashcode_); 251 } 252 GetStringCompressionMask()253 static constexpr uint32_t GetStringCompressionMask() 254 { 255 return STRING_COMPRESSED_BIT; 256 } 257 258 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 259 PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2); 260 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 261 PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length); 262 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length, 263 bool canBeCompressed); 264 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 265 PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data, 266 uint32_t utf16DataLength); 267 static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm); 268 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length); 269 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed); 270 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); 271 SetCompressedStringsEnabled(bool val)272 static void SetCompressedStringsEnabled(bool val) 273 { 274 compressedStringsEnabled_ = val; 275 } 276 GetCompressedStringsEnabled()277 static bool GetCompressedStringsEnabled() 278 { 279 return compressedStringsEnabled_; 280 } 281 NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)282 static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex, 283 const coretypes::String *str) 284 { 285 auto strLen = str->GetLength(); 286 std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex}; 287 288 // If begin_index < 0, then it is assumed to be equal to zero. 289 if (normIndexes.first < 0) { 290 normIndexes.first = 0; 291 } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) { 292 // If begin_index > str_len, then it is assumed to be equal to str_len. 293 normIndexes.first = static_cast<int32_t>(strLen); 294 } 295 // If end_index < 0, then it is assumed to be equal to zero. 296 if (normIndexes.second < 0) { 297 normIndexes.second = 0; 298 } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) { 299 // If end_index > str_len, then it is assumed to be equal to str_len. 300 normIndexes.second = static_cast<int32_t>(strLen); 301 } 302 // If begin_index > end_index, then these are swapped. 303 if (normIndexes.first > normIndexes.second) { 304 std::swap(normIndexes.first, normIndexes.second); 305 } 306 ASSERT((normIndexes.second - normIndexes.first) >= 0); 307 return normIndexes; 308 } 309 310 static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx, 311 PandaVM *vm = nullptr); 312 313 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data); 314 IsASCIICharacter(uint16_t data)315 static bool IsASCIICharacter(uint16_t data) 316 { 317 // \0 is not considered ASCII in Modified-UTF8 318 return data - 1U < utf::UTF8_1B_MAX; 319 } 320 321 protected: 322 void SetLength(uint32_t length, bool compressed = false) 323 { 324 if (compressedStringsEnabled_) { 325 ASSERT(length < 0x80000000U); 326 // Use 0u for compressed/utf8 expression 327 length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED); 328 } else { 329 length_ = length; 330 } 331 } 332 SetHashcode(uint32_t hashcode)333 void SetHashcode(uint32_t hashcode) 334 { 335 hashcode_ = hashcode; 336 } 337 338 uint32_t ComputeHashcode(); 339 static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length); 340 static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length); 341 static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr, 342 bool movable = true, bool pinned = false); 343 344 private: 345 PANDA_PUBLIC_API static bool compressedStringsEnabled_; 346 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 347 enum CompressedStatus { 348 STRING_COMPRESSED, 349 STRING_UNCOMPRESSED, 350 }; 351 352 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length); 353 static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non); 354 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non); 355 356 /** 357 * str1 should have the same length as mutf16_data. 358 * Converts mutf8_data to mutf16 and compare it with given mutf16_data. 359 */ 360 // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer 361 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data, 362 uint32_t utf16DataLength); 363 364 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength); 365 366 template <typename T> 367 /// Check that two spans are equal. Should have the same length. 368 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 369 370 // In last bit of length_ we store if this string is compressed or not. 371 uint32_t length_; 372 uint32_t hashcode_; 373 // A pointer to the string data stored after the string header. 374 // Data can be stored in mutf8 or utf16 form according to compressed bit. 375 __extension__ uint16_t dataUtf16_[0]; // NOLINT(modernize-avoid-c-arrays) 376 }; 377 378 constexpr uint32_t STRING_LENGTH_OFFSET = 8U; 379 static_assert(STRING_LENGTH_OFFSET == ark::coretypes::String::GetLengthOffset()); 380 constexpr uint32_t STRING_HASHCODE_OFFSET = 12U; 381 static_assert(STRING_HASHCODE_OFFSET == ark::coretypes::String::GetHashcodeOffset()); 382 constexpr uint32_t STRING_DATA_OFFSET = 16U; 383 static_assert(STRING_DATA_OFFSET == ark::coretypes::String::GetDataOffset()); 384 385 } // namespace ark::coretypes 386 387 #endif // PANDA_RUNTIME_CORETYPES_STRING_H_ 388