1 /** 2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_ 16 #define PANDA_RUNTIME_CORETYPES_STRING_H_ 17 18 #include <securec.h> 19 #include <cstddef> 20 #include <cstdint> 21 #include <cstring> 22 23 #include "libpandabase/utils/utf.h" 24 #include "runtime/include/language_context.h" 25 #include "runtime/include/object_header.h" 26 #include "runtime/mem/vm_handle.h" 27 28 namespace panda::coretypes { 29 30 class Array; 31 class String : public ObjectHeader { 32 public: Cast(ObjectHeader * object)33 static String *Cast(ObjectHeader *object) 34 { 35 // TODO(linxiang) to do assert 36 return static_cast<String *>(object); 37 } 38 39 static String *CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length, 40 bool can_be_compressed, const LanguageContext &ctx, PandaVM *vm, 41 bool movable = true); 42 43 static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed, 44 const LanguageContext &ctx, PandaVM *vm, bool movable = true); 45 46 static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, const LanguageContext &ctx, 47 PandaVM *vm, bool movable = true); 48 49 static String *CreateFromMUtf8(const uint8_t *mutf8_data, const LanguageContext &ctx, PandaVM *vm, 50 bool movable = true); 51 52 static String *CreateFromUtf16(const uint16_t *utf16_data, uint32_t utf16_length, const LanguageContext &ctx, 53 PandaVM *vm, bool movable = true); 54 55 static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm); 56 57 static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm); 58 59 static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm); 60 61 static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, 62 const LanguageContext &ctx, PandaVM *vm); 63 64 static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t high_byte, Array *bytearray, 65 const LanguageContext &ctx, PandaVM *vm); 66 67 template <bool verify = true> 68 uint16_t At(int32_t index); 69 70 int32_t Compare(String *rstr); 71 72 Array *ToCharArray(const LanguageContext &ctx); 73 IsUtf16()74 bool IsUtf16() const 75 { 76 return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 77 } 78 IsMUtf8()79 bool IsMUtf8() const 80 { 81 return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_COMPRESSED) : true; 82 } 83 ComputeDataSizeUtf16(uint32_t length)84 static size_t ComputeDataSizeUtf16(uint32_t length) 85 { 86 return length * sizeof(data_utf16_[0]); 87 } 88 89 /** 90 * Methods for uncompressed strings (UTF16) 91 */ ComputeSizeUtf16(uint32_t utf16_length)92 static size_t ComputeSizeUtf16(uint32_t utf16_length) 93 { 94 return sizeof(String) + ComputeDataSizeUtf16(utf16_length); 95 } 96 GetDataUtf16()97 uint16_t *GetDataUtf16() 98 { 99 LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "String: Read data as utf16 for mutf8 string"; 100 return data_utf16_; 101 } 102 103 /** 104 * Methods for compresses strings (MUTF8 or LATIN1) 105 */ ComputeSizeMUtf8(uint32_t mutf8_length)106 static size_t ComputeSizeMUtf8(uint32_t mutf8_length) 107 { 108 return sizeof(String) + mutf8_length; 109 } 110 111 /** 112 * It's MUtf8 format, but without 0 in the end. 113 */ GetDataMUtf8()114 uint8_t *GetDataMUtf8() 115 { 116 LOG_IF(IsUtf16(), FATAL, RUNTIME) << "String: Read data as mutf8 for utf16 string"; 117 return reinterpret_cast<uint8_t *>(data_utf16_); 118 } 119 GetMUtf8Length()120 size_t GetMUtf8Length() 121 { 122 if (!IsUtf16()) { 123 return GetLength() + 1; // add place for zero at the end 124 } 125 return panda::utf::Utf16ToMUtf8Size(data_utf16_, GetLength()); 126 } 127 GetUtf16Length()128 size_t GetUtf16Length() 129 { 130 return GetLength(); 131 } 132 CopyDataMUtf8(uint8_t * buf,size_t max_length,bool is_c_string)133 inline size_t CopyDataMUtf8(uint8_t *buf, size_t max_length, bool is_c_string) 134 { 135 if (is_c_string) { 136 ASSERT(max_length != 0); 137 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 138 buf[max_length - 1] = '\0'; 139 return CopyDataRegionMUtf8(buf, 0, GetLength(), max_length) + 1; // add place for zero at the end 140 } 141 142 return CopyDataRegionMUtf8(buf, 0, GetLength(), max_length); 143 } 144 CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t max_length)145 size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t max_length) 146 { 147 if (length > max_length) { 148 return 0; 149 } 150 uint32_t len = GetLength(); 151 if (start + length > len) { 152 return 0; 153 } 154 if (!IsUtf16()) { 155 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 156 if (length > MAX_LEN) { 157 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 158 } 159 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 160 if (memcpy_s(buf, sizeof(uint8_t) * (max_length + 1), GetDataMUtf8() + start, length) != EOK) { 161 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 162 } 163 return length; 164 } 165 return panda::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, max_length - 1, start); 166 } 167 CopyDataUtf16(uint16_t * buf,uint32_t max_length)168 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t max_length) 169 { 170 return CopyDataRegionUtf16(buf, 0, GetLength(), max_length); 171 } 172 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t max_length)173 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t max_length) 174 { 175 if (length > max_length) { 176 return 0; 177 } 178 uint32_t len = GetLength(); 179 if (start + length > len) { 180 return 0; 181 } 182 if (IsUtf16()) { 183 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 184 if (memcpy_s(buf, sizeof(uint16_t) * max_length, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != 185 EOK) { 186 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 187 } 188 } else { 189 uint8_t *src_8 = GetDataMUtf8() + start; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 190 for (uint32_t i = 0; i < length; ++i) { 191 buf[i] = src_8[i]; // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 192 } 193 } 194 return length; 195 } 196 GetLength()197 uint32_t GetLength() const 198 { 199 uint32_t length; 200 if (compressed_strings_enabled) { 201 length = length_ >> 1U; 202 } else { 203 length = length_; 204 } 205 return length; 206 } 207 IsEmpty()208 bool IsEmpty() const 209 { 210 // do not shift right length because it is always zero for empty string 211 return length_ == 0; 212 } 213 ObjectSize()214 size_t ObjectSize() const 215 { 216 uint32_t length = GetLength(); 217 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length); 218 } 219 GetHashcode()220 uint32_t GetHashcode() 221 { 222 if (hashcode_ == 0) { 223 hashcode_ = ComputeHashcode(); 224 } 225 return hashcode_; 226 } 227 228 int32_t IndexOf(String *rhs, int pos = 0); 229 GetLengthOffset()230 static constexpr uint32_t GetLengthOffset() 231 { 232 return MEMBER_OFFSET(String, length_); 233 } 234 GetDataOffset()235 static constexpr uint32_t GetDataOffset() 236 { 237 return MEMBER_OFFSET(String, data_utf16_); 238 } 239 GetStringCompressionMask()240 static constexpr uint32_t GetStringCompressionMask() 241 { 242 return STRING_COMPRESSED_BIT; 243 } 244 245 /** 246 * Compares strings by bytes, It doesn't check canonical unicode equivalence. 247 */ 248 static bool StringsAreEqual(String *str1, String *str2); 249 /** 250 * Compares strings by bytes, It doesn't check canonical unicode equivalence. 251 */ 252 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length); 253 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length, 254 bool can_be_compressed); 255 /** 256 * Compares strings by bytes, It doesn't check canonical unicode equivalence. 257 */ 258 static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length); 259 static String *DoReplace(String *src, uint16_t old_c, uint16_t new_c, const LanguageContext &ctx, PandaVM *vm); 260 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length); 261 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed); 262 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16_data, uint32_t length); 263 SetCompressedStringsEnabled(bool val)264 static void SetCompressedStringsEnabled(bool val) 265 { 266 compressed_strings_enabled = val; 267 } 268 GetCompressedStringsEnabled()269 static bool GetCompressedStringsEnabled() 270 { 271 return compressed_strings_enabled; 272 } 273 274 static String *FastSubString(String *src, uint32_t start, uint32_t utf16_length, const LanguageContext &ctx, 275 PandaVM *vm = nullptr); 276 277 static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data); 278 279 protected: 280 void SetLength(uint32_t length, bool compressed = false) 281 { 282 if (compressed_strings_enabled) { 283 ASSERT(length < 0x80000000U); 284 // Use 0u for compressed/utf8 expression 285 length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED); 286 } else { 287 length_ = length; 288 } 289 } 290 SetHashcode(uint32_t hashcode)291 void SetHashcode(uint32_t hashcode) 292 { 293 hashcode_ = hashcode; 294 } 295 296 uint32_t ComputeHashcode(); 297 static bool CanBeCompressed(const uint16_t *utf16_data, uint32_t utf16_length); 298 static void CopyUtf16AsMUtf8(const uint16_t *utf16_from, uint8_t *mutf8_to, uint32_t utf16_length); 299 300 private: 301 static bool compressed_strings_enabled; 302 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 303 enum CompressedStatus { 304 STRING_COMPRESSED, 305 STRING_UNCOMPRESSED, 306 }; 307 IsASCIICharacter(uint16_t data)308 static bool IsASCIICharacter(uint16_t data) 309 { 310 // \0 is not considered ASCII in Modified-UTF8 311 return data - 1U < utf::MUTF8_1B_MAX; 312 } 313 314 static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length); 315 static bool CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non); 316 static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length, uint16_t non); 317 318 /** 319 * str1 should have the same length as mutf16_data. 320 * Converts mutf8_data to mutf16 and compare it with given mutf16_data. 321 */ 322 // TODO(alovkov): move to utils/utf.h without allocation a temporary buffer 323 static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, uint32_t utf8_data_length, const uint16_t *utf16_data, 324 uint32_t utf16_data_length); 325 326 static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, const uint16_t *utf16_data, uint32_t utf16_data_length); 327 328 template <typename T> 329 /** 330 * Check that two spans are equal. Should have the same length. 331 */ 332 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 333 334 template <typename T1, typename T2> 335 static int32_t IndexOf(Span<const T1> &lhs_sp, Span<const T2> &rhs_sp, int32_t pos, int32_t max); 336 337 static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr, 338 bool movable = true); 339 340 // In last bit of length_ we store if this string is compressed or not. 341 uint32_t length_; 342 uint32_t hashcode_; 343 // A pointer to the string data stored after the string header. 344 // Data can be stored in mutf8 or utf16 form according to compressed bit. 345 __extension__ uint16_t data_utf16_[0]; // NOLINT(modernize-avoid-c-arrays) 346 }; 347 348 constexpr uint32_t STRING_LENGTH_OFFSET = 8U; 349 static_assert(STRING_LENGTH_OFFSET == panda::coretypes::String::GetLengthOffset()); 350 constexpr uint32_t STRING_DATA_OFFSET = 16U; 351 static_assert(STRING_DATA_OFFSET == panda::coretypes::String::GetDataOffset()); 352 353 } // namespace panda::coretypes 354 355 #endif // PANDA_RUNTIME_CORETYPES_STRING_H_ 356