1 /** 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_ 16 #define PANDA_RUNTIME_CORETYPES_STRING_H_ 17 18 #include <securec.h> 19 #include <cstddef> 20 #include <cstdint> 21 #include <cstring> 22 23 #include "libpandabase/utils/utf.h" 24 #include "runtime/include/language_context.h" 25 #include "runtime/include/object_header.h" 26 27 namespace ark::coretypes { 28 29 class Array; 30 class String : public ObjectHeader { 31 public: Cast(ObjectHeader * object)32 static String *Cast(ObjectHeader *object) 33 { 34 // NOTE(linxiang) to do assert 35 return static_cast<String *>(object); 36 } 37 38 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length, 39 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 40 bool movable = true, bool pinned = false); 41 42 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 43 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 44 bool movable = true, bool pinned = false); 45 46 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 47 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 48 bool pinned = false); 49 50 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm, 51 bool movable = true, bool pinned = false); 52 53 static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length, 54 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned); 55 56 PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length, 57 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 58 bool pinned = false); 59 60 PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, 61 const LanguageContext &ctx, PandaVM *vm, bool movable = true, 62 bool pinned = false); 63 64 PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm); 65 66 PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm); 67 68 PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm); 69 70 PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, 71 const LanguageContext &ctx, PandaVM *vm); 72 73 static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray, 74 const LanguageContext &ctx, PandaVM *vm); 75 76 template <bool VERIFY = true> 77 uint16_t At(int32_t index); 78 79 PANDA_PUBLIC_API int32_t Compare(String *rstr); 80 81 PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx); 82 83 PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length, 84 const LanguageContext &ctx); 85 IsUtf16()86 bool IsUtf16() const 87 { 88 return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 89 } 90 IsMUtf8()91 bool IsMUtf8() const 92 { 93 return !IsUtf16(); 94 } 95 ComputeDataSizeUtf16(uint32_t length)96 static size_t ComputeDataSizeUtf16(uint32_t length) 97 { 98 return length * sizeof(dataUtf16_[0]); 99 } 100 101 /// Methods for uncompressed strings (UTF16) ComputeSizeUtf16(uint32_t utf16Length)102 static size_t ComputeSizeUtf16(uint32_t utf16Length) 103 { 104 return sizeof(String) + ComputeDataSizeUtf16(utf16Length); 105 } 106 GetDataUtf16()107 uint16_t *GetDataUtf16() 108 { 109 ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string"); 110 return dataUtf16_; 111 } 112 113 /// Methods for compresses strings (MUTF8 or LATIN1) ComputeSizeMUtf8(uint32_t mutf8Length)114 static size_t ComputeSizeMUtf8(uint32_t mutf8Length) 115 { 116 return sizeof(String) + mutf8Length; 117 } 118 119 /// It's MUtf8 format, but without 0 in the end. GetDataMUtf8()120 uint8_t *GetDataMUtf8() 121 { 122 ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string"); 123 return reinterpret_cast<uint8_t *>(dataUtf16_); 124 } 125 GetMUtf8Length()126 size_t GetMUtf8Length() 127 { 128 if (!IsUtf16()) { 129 return GetLength() + 1; // add place for zero at the end 130 } 131 return ark::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength()); 132 } 133 GetUtf16Length()134 size_t GetUtf16Length() 135 { 136 return GetLength(); 137 } 138 CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)139 inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString) 140 { 141 if (isCString) { 142 ASSERT(maxLength != 0); 143 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 144 buf[maxLength - 1] = '\0'; 145 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1; // add place for zero at the end 146 } 147 148 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength); 149 } 150 CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)151 size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) 152 { 153 if (length > maxLength) { 154 return 0; 155 } 156 uint32_t len = GetLength(); 157 if (start + length > len) { 158 return 0; 159 } 160 if (!IsUtf16()) { 161 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 162 if (length > MAX_LEN) { 163 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 164 } 165 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 166 if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) { 167 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 168 } 169 return length; 170 } 171 return ark::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); 172 } 173 CopyDataUtf16(uint16_t * buf,uint32_t maxLength)174 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) 175 { 176 return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength); 177 } 178 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)179 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) 180 { 181 if (length > maxLength) { 182 return 0; 183 } 184 uint32_t len = GetLength(); 185 if (start + length > len) { 186 return 0; 187 } 188 if (IsUtf16()) { 189 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 190 if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != 191 EOK) { 192 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 193 } 194 } else { 195 uint8_t *src8 = GetDataMUtf8() + start; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 196 for (uint32_t i = 0; i < length; ++i) { 197 buf[i] = src8[i]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 198 } 199 } 200 return length; 201 } 202 GetLength()203 uint32_t GetLength() const 204 { 205 uint32_t length; 206 if (compressedStringsEnabled_) { 207 length = length_ >> 1U; 208 } else { 209 length = length_; 210 } 211 return length; 212 } 213 IsEmpty()214 bool IsEmpty() const 215 { 216 // do not shift right length because it is always zero for empty string 217 return length_ == 0; 218 } 219 ObjectSize()220 size_t ObjectSize() const 221 { 222 uint32_t length = GetLength(); 223 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length); 224 } 225 GetHashcode()226 uint32_t GetHashcode() 227 { 228 if (hashcode_ == 0) { 229 hashcode_ = ComputeHashcode(); 230 } 231 return hashcode_; 232 } 233 234 int32_t IndexOf(String *rhs, int pos = 0); 235 int32_t LastIndexOf(String *rhs, int pos = INT32_MAX); 236 GetLengthOffset()237 static constexpr uint32_t GetLengthOffset() 238 { 239 return MEMBER_OFFSET(String, length_); 240 } 241 GetDataOffset()242 static constexpr uint32_t GetDataOffset() 243 { 244 return MEMBER_OFFSET(String, dataUtf16_); 245 } 246 GetHashcodeOffset()247 static constexpr uint32_t GetHashcodeOffset() 248 { 249 return MEMBER_OFFSET(String, hashcode_); 250 } 251 GetStringCompressionMask()252 static constexpr uint32_t GetStringCompressionMask() 253 { 254 return STRING_COMPRESSED_BIT; 255 } 256 257 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 258 PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2); 259 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 260 PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length); 261 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length, 262 bool canBeCompressed); 263 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 264 PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data, 265 uint32_t utf16DataLength); 266 static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm); 267 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length); 268 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed); 269 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); 270 SetCompressedStringsEnabled(bool val)271 static void SetCompressedStringsEnabled(bool val) 272 { 273 compressedStringsEnabled_ = val; 274 } 275 GetCompressedStringsEnabled()276 static bool GetCompressedStringsEnabled() 277 { 278 return compressedStringsEnabled_; 279 } 280 NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)281 static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex, 282 const coretypes::String *str) 283 { 284 auto strLen = str->GetLength(); 285 std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex}; 286 287 // If begin_index < 0, then it is assumed to be equal to zero. 288 if (normIndexes.first < 0) { 289 normIndexes.first = 0; 290 } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) { 291 // If begin_index > str_len, then it is assumed to be equal to str_len. 292 normIndexes.first = static_cast<int32_t>(strLen); 293 } 294 // If end_index < 0, then it is assumed to be equal to zero. 295 if (normIndexes.second < 0) { 296 normIndexes.second = 0; 297 } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) { 298 // If end_index > str_len, then it is assumed to be equal to str_len. 299 normIndexes.second = static_cast<int32_t>(strLen); 300 } 301 // If begin_index > end_index, then these are swapped. 302 if (normIndexes.first > normIndexes.second) { 303 std::swap(normIndexes.first, normIndexes.second); 304 } 305 ASSERT((normIndexes.second - normIndexes.first) >= 0); 306 return normIndexes; 307 } 308 309 static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx, 310 PandaVM *vm = nullptr); 311 312 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data); 313 IsASCIICharacter(uint16_t data)314 static bool IsASCIICharacter(uint16_t data) 315 { 316 // \0 is not considered ASCII in Modified-UTF8 317 return data - 1U < utf::UTF8_1B_MAX; 318 } 319 320 protected: 321 void SetLength(uint32_t length, bool compressed = false) 322 { 323 if (compressedStringsEnabled_) { 324 ASSERT(length < 0x80000000U); 325 // Use 0u for compressed/utf8 expression 326 length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED); 327 } else { 328 length_ = length; 329 } 330 } 331 SetHashcode(uint32_t hashcode)332 void SetHashcode(uint32_t hashcode) 333 { 334 hashcode_ = hashcode; 335 } 336 337 uint32_t ComputeHashcode(); 338 static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length); 339 static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length); 340 static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr, 341 bool movable = true, bool pinned = false); 342 343 private: 344 PANDA_PUBLIC_API static bool compressedStringsEnabled_; 345 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 346 enum CompressedStatus { 347 STRING_COMPRESSED, 348 STRING_UNCOMPRESSED, 349 }; 350 351 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length); 352 static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non); 353 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non); 354 355 /** 356 * str1 should have the same length as mutf16_data. 357 * Converts mutf8_data to mutf16 and compare it with given mutf16_data. 358 */ 359 // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer 360 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data, 361 uint32_t utf16DataLength); 362 363 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength); 364 365 template <typename T> 366 /// Check that two spans are equal. Should have the same length. 367 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 368 369 // In last bit of length_ we store if this string is compressed or not. 370 uint32_t length_; 371 uint32_t hashcode_; 372 // A pointer to the string data stored after the string header. 373 // Data can be stored in mutf8 or utf16 form according to compressed bit. 374 __extension__ uint16_t dataUtf16_[0]; // NOLINT(modernize-avoid-c-arrays) 375 }; 376 377 constexpr uint32_t STRING_LENGTH_OFFSET = 8U; 378 static_assert(STRING_LENGTH_OFFSET == ark::coretypes::String::GetLengthOffset()); 379 constexpr uint32_t STRING_HASHCODE_OFFSET = 12U; 380 static_assert(STRING_HASHCODE_OFFSET == ark::coretypes::String::GetHashcodeOffset()); 381 constexpr uint32_t STRING_DATA_OFFSET = 16U; 382 static_assert(STRING_DATA_OFFSET == ark::coretypes::String::GetDataOffset()); 383 384 } // namespace ark::coretypes 385 386 #endif // PANDA_RUNTIME_CORETYPES_STRING_H_ 387