1 /** 2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H 16 #define PANDA_RUNTIME_CORETYPES_STRING_H 17 18 #include <securec.h> 19 #include <cstddef> 20 #include <cstdint> 21 #include <cstring> 22 23 #include "libpandabase/utils/utf.h" 24 #include "runtime/include/language_context.h" 25 #include "runtime/include/object_header.h" 26 #include "runtime/mem/vm_handle.h" 27 28 namespace panda::coretypes { 29 30 class Array; 31 class String : public ObjectHeader { 32 public: Cast(ObjectHeader * object)33 static String *Cast(ObjectHeader *object) 34 { 35 // NOTE(linxiang) to do assert 36 return static_cast<String *>(object); 37 } 38 39 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length, 40 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 41 bool movable = true); 42 43 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 44 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, 45 bool movable = true); 46 47 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, 48 const LanguageContext &ctx, PandaVM *vm, bool movable = true); 49 50 PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm, 51 bool movable = true); 52 53 PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length, 54 const LanguageContext &ctx, PandaVM *vm, bool movable = true); 55 56 PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, 57 const LanguageContext &ctx, PandaVM *vm, bool movable = true); 58 59 PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm); 60 61 PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm); 62 63 PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm); 64 65 PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, 66 const LanguageContext &ctx, PandaVM *vm); 67 68 static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray, 69 const LanguageContext &ctx, PandaVM *vm); 70 71 template <bool VERIFY = true> 72 uint16_t At(int32_t index); 73 74 PANDA_PUBLIC_API int32_t Compare(String *rstr); 75 76 PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx); 77 78 PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length, 79 const LanguageContext &ctx); 80 IsUtf16()81 bool IsUtf16() const 82 { 83 return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 84 } 85 IsMUtf8()86 bool IsMUtf8() const 87 { 88 return !IsUtf16(); 89 } 90 ComputeDataSizeUtf16(uint32_t length)91 static size_t ComputeDataSizeUtf16(uint32_t length) 92 { 93 return length * sizeof(dataUtf16_[0]); 94 } 95 96 /// Methods for uncompressed strings (UTF16) ComputeSizeUtf16(uint32_t utf16Length)97 static size_t ComputeSizeUtf16(uint32_t utf16Length) 98 { 99 return sizeof(String) + ComputeDataSizeUtf16(utf16Length); 100 } 101 GetDataUtf16()102 uint16_t *GetDataUtf16() 103 { 104 ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string"); 105 return dataUtf16_; 106 } 107 108 /// Methods for compresses strings (MUTF8 or LATIN1) ComputeSizeMUtf8(uint32_t mutf8Length)109 static size_t ComputeSizeMUtf8(uint32_t mutf8Length) 110 { 111 return sizeof(String) + mutf8Length; 112 } 113 114 /// It's MUtf8 format, but without 0 in the end. GetDataMUtf8()115 uint8_t *GetDataMUtf8() 116 { 117 ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string"); 118 return reinterpret_cast<uint8_t *>(dataUtf16_); 119 } 120 GetMUtf8Length()121 size_t GetMUtf8Length() 122 { 123 if (!IsUtf16()) { 124 return GetLength() + 1; // add place for zero at the end 125 } 126 return panda::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength()); 127 } 128 GetUtf16Length()129 size_t GetUtf16Length() 130 { 131 return GetLength(); 132 } 133 CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)134 inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString) 135 { 136 if (isCString) { 137 ASSERT(maxLength != 0); 138 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 139 buf[maxLength - 1] = '\0'; 140 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1; // add place for zero at the end 141 } 142 143 return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength); 144 } 145 CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)146 size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) 147 { 148 if (length > maxLength) { 149 return 0; 150 } 151 uint32_t len = GetLength(); 152 if (start + length > len) { 153 return 0; 154 } 155 if (!IsUtf16()) { 156 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 157 if (length > MAX_LEN) { 158 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 159 } 160 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 161 if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) { 162 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 163 } 164 return length; 165 } 166 return panda::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); 167 } 168 CopyDataUtf16(uint16_t * buf,uint32_t maxLength)169 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) 170 { 171 return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength); 172 } 173 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)174 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) 175 { 176 if (length > maxLength) { 177 return 0; 178 } 179 uint32_t len = GetLength(); 180 if (start + length > len) { 181 return 0; 182 } 183 if (IsUtf16()) { 184 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 185 if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != 186 EOK) { 187 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 188 } 189 } else { 190 uint8_t *src8 = GetDataMUtf8() + start; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 191 for (uint32_t i = 0; i < length; ++i) { 192 buf[i] = src8[i]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 193 } 194 } 195 return length; 196 } 197 GetLength()198 uint32_t GetLength() const 199 { 200 uint32_t length; 201 if (compressedStringsEnabled_) { 202 length = length_ >> 1U; 203 } else { 204 length = length_; 205 } 206 return length; 207 } 208 IsEmpty()209 bool IsEmpty() const 210 { 211 // do not shift right length because it is always zero for empty string 212 return length_ == 0; 213 } 214 ObjectSize()215 size_t ObjectSize() const 216 { 217 uint32_t length = GetLength(); 218 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length); 219 } 220 GetHashcode()221 uint32_t GetHashcode() 222 { 223 if (hashcode_ == 0) { 224 hashcode_ = ComputeHashcode(); 225 } 226 return hashcode_; 227 } 228 229 int32_t IndexOf(String *rhs, int pos = 0); 230 int32_t LastIndexOf(String *rhs, int pos = INT32_MAX); 231 GetLengthOffset()232 static constexpr uint32_t GetLengthOffset() 233 { 234 return MEMBER_OFFSET(String, length_); 235 } 236 GetDataOffset()237 static constexpr uint32_t GetDataOffset() 238 { 239 return MEMBER_OFFSET(String, dataUtf16_); 240 } 241 GetHashcodeOffset()242 static constexpr uint32_t GetHashcodeOffset() 243 { 244 return MEMBER_OFFSET(String, hashcode_); 245 } 246 GetStringCompressionMask()247 static constexpr uint32_t GetStringCompressionMask() 248 { 249 return STRING_COMPRESSED_BIT; 250 } 251 252 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 253 PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2); 254 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 255 PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length); 256 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length, 257 bool canBeCompressed); 258 /// Compares strings by bytes, It doesn't check canonical unicode equivalence. 259 PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data, 260 uint32_t utf16DataLength); 261 static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm); 262 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length); 263 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed); 264 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); 265 SetCompressedStringsEnabled(bool val)266 static void SetCompressedStringsEnabled(bool val) 267 { 268 compressedStringsEnabled_ = val; 269 } 270 GetCompressedStringsEnabled()271 static bool GetCompressedStringsEnabled() 272 { 273 return compressedStringsEnabled_; 274 } 275 NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)276 static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex, 277 const coretypes::String *str) 278 { 279 auto strLen = str->GetLength(); 280 std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex}; 281 282 // If begin_index < 0, then it is assumed to be equal to zero. 283 if (normIndexes.first < 0) { 284 normIndexes.first = 0; 285 } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) { 286 // If begin_index > str_len, then it is assumed to be equal to str_len. 287 normIndexes.first = static_cast<int32_t>(strLen); 288 } 289 // If end_index < 0, then it is assumed to be equal to zero. 290 if (normIndexes.second < 0) { 291 normIndexes.second = 0; 292 } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) { 293 // If end_index > str_len, then it is assumed to be equal to str_len. 294 normIndexes.second = static_cast<int32_t>(strLen); 295 } 296 // If begin_index > end_index, then these are swapped. 297 if (normIndexes.first > normIndexes.second) { 298 std::swap(normIndexes.first, normIndexes.second); 299 } 300 ASSERT((normIndexes.second - normIndexes.first) >= 0); 301 return normIndexes; 302 } 303 304 static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx, 305 PandaVM *vm = nullptr); 306 307 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data); 308 IsASCIICharacter(uint16_t data)309 static bool IsASCIICharacter(uint16_t data) 310 { 311 // \0 is not considered ASCII in Modified-UTF8 312 return data - 1U < utf::UTF8_1B_MAX; 313 } 314 315 protected: 316 void SetLength(uint32_t length, bool compressed = false) 317 { 318 if (compressedStringsEnabled_) { 319 ASSERT(length < 0x80000000U); 320 // Use 0u for compressed/utf8 expression 321 length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED); 322 } else { 323 length_ = length; 324 } 325 } 326 SetHashcode(uint32_t hashcode)327 void SetHashcode(uint32_t hashcode) 328 { 329 hashcode_ = hashcode; 330 } 331 332 uint32_t ComputeHashcode(); 333 static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length); 334 static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length); 335 static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr, 336 bool movable = true); 337 338 private: 339 PANDA_PUBLIC_API static bool compressedStringsEnabled_; 340 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 341 enum CompressedStatus { 342 STRING_COMPRESSED, 343 STRING_UNCOMPRESSED, 344 }; 345 346 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length); 347 static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non); 348 static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non); 349 350 /** 351 * str1 should have the same length as mutf16_data. 352 * Converts mutf8_data to mutf16 and compare it with given mutf16_data. 353 */ 354 // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer 355 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data, 356 uint32_t utf16DataLength); 357 358 static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength); 359 360 template <typename T> 361 /// Check that two spans are equal. Should have the same length. 362 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 363 364 // In last bit of length_ we store if this string is compressed or not. 365 uint32_t length_; 366 uint32_t hashcode_; 367 // A pointer to the string data stored after the string header. 368 // Data can be stored in mutf8 or utf16 form according to compressed bit. 369 __extension__ uint16_t dataUtf16_[0]; // NOLINT(modernize-avoid-c-arrays) 370 }; 371 372 constexpr uint32_t STRING_LENGTH_OFFSET = 8U; 373 static_assert(STRING_LENGTH_OFFSET == panda::coretypes::String::GetLengthOffset()); 374 constexpr uint32_t STRING_HASHCODE_OFFSET = 12U; 375 static_assert(STRING_HASHCODE_OFFSET == panda::coretypes::String::GetHashcodeOffset()); 376 constexpr uint32_t STRING_DATA_OFFSET = 16U; 377 static_assert(STRING_DATA_OFFSET == panda::coretypes::String::GetDataOffset()); 378 379 } // namespace panda::coretypes 380 381 #endif // PANDA_RUNTIME_CORETYPES_STRING_H 382