1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef PANDA_RUNTIME_INCLUDE_CORETYPES_STRING_H_ 17 #define PANDA_RUNTIME_INCLUDE_CORETYPES_STRING_H_ 18 19 #include <securec.h> 20 #include <cstddef> 21 #include <cstdint> 22 #include <cstring> 23 24 #include "libpandabase/utils/utf.h" 25 #include "runtime/include/language_context.h" 26 #include "runtime/include/object_header.h" 27 #include "runtime/mem/vm_handle.h" 28 29 namespace panda::coretypes { 30 31 class Array; 32 class String : public ObjectHeader { 33 public: Cast(ObjectHeader * object)34 static String *Cast(ObjectHeader *object) 35 { 36 return static_cast<String *>(object); 37 } 38 39 static String *CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length, 40 bool can_be_compressed, LanguageContext ctx, PandaVM *vm, bool movable = true); 41 42 static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed, 43 LanguageContext ctx, PandaVM *vm, bool movable = true); 44 45 static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm, 46 bool movable = true); 47 48 static String *CreateFromMUtf8(const uint8_t *mutf8_data, LanguageContext ctx, PandaVM *vm, bool movable = true); 49 50 static String *CreateFromUtf16(const uint16_t *utf16_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm, 51 bool movable = true); 52 53 static String *CreateEmptyString(LanguageContext ctx, PandaVM *vm); 54 55 static String *CreateFromString(String *str, LanguageContext ctx, PandaVM *vm); 56 57 static String *Concat(String *jstring1, String *jstring2, LanguageContext ctx, PandaVM *vm); 58 59 static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, LanguageContext ctx, 60 PandaVM *vm); 61 62 static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t high_byte, Array *bytearray, 63 LanguageContext ctx, PandaVM *vm); 64 65 template <bool verify = true> 66 uint16_t At(int32_t index); 67 68 int32_t Compare(String *rstr); 69 70 Array *ToCharArray(LanguageContext ctx); 71 IsUtf16()72 bool IsUtf16() const 73 { 74 return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 75 } 76 IsMUtf8()77 bool IsMUtf8() const 78 { 79 return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_COMPRESSED) : true; 80 } 81 ComputeDataSizeUtf16(uint32_t length)82 static size_t ComputeDataSizeUtf16(uint32_t length) 83 { 84 return length * sizeof(data_utf16_[0]); 85 } 86 87 /** 88 * Methods for uncompressed strings (UTF16) 89 */ ComputeSizeUtf16(uint32_t utf16_length)90 static size_t ComputeSizeUtf16(uint32_t utf16_length) 91 { 92 return sizeof(String) + ComputeDataSizeUtf16(utf16_length); 93 } 94 GetDataUtf16()95 uint16_t *GetDataUtf16() 96 { 97 LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "String: Read data as utf16 for mutf8 string"; 98 return data_utf16_; 99 } 100 101 /** 102 * Methods for compresses strings (MUTF8 or LATIN1) 103 */ ComputeSizeMUtf8(uint32_t mutf8_length)104 static size_t ComputeSizeMUtf8(uint32_t mutf8_length) 105 { 106 return sizeof(String) + mutf8_length; 107 } 108 109 /** 110 * It's MUtf8 format, but without 0 in the end. 111 */ GetDataMUtf8()112 uint8_t *GetDataMUtf8() 113 { 114 LOG_IF(IsUtf16(), FATAL, RUNTIME) << "String: Read data as mutf8 for utf16 string"; 115 return reinterpret_cast<uint8_t *>(data_utf16_); 116 } 117 GetMUtf8Length()118 size_t GetMUtf8Length() 119 { 120 if (!IsUtf16()) { 121 return GetLength() + 1; // add place for zero at the end 122 } 123 return panda::utf::Utf16ToMUtf8Size(data_utf16_, GetLength()); 124 } 125 GetUtf16Length()126 size_t GetUtf16Length() 127 { 128 return GetLength(); 129 } 130 CopyDataMUtf8(uint8_t * buf,size_t max_length)131 inline size_t CopyDataMUtf8(uint8_t *buf, size_t max_length) 132 { 133 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 134 buf[max_length - 1] = '\0'; 135 return CopyDataRegionMUtf8(buf, 0, GetLength(), max_length) + 1; // add place for zero at the end 136 } 137 CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t max_length)138 size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t max_length) 139 { 140 if (length > max_length) { 141 return 0; 142 } 143 uint32_t len = GetLength(); 144 if (start + length > len) { 145 return 0; 146 } 147 if (!IsUtf16()) { 148 constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1; 149 if (length > MAX_LEN) { 150 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max"; 151 } 152 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 153 if (memcpy_s(buf, sizeof(buf) * (max_length + 1), GetDataMUtf8() + start, length) != EOK) { 154 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 155 } 156 return length; 157 } 158 return panda::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, max_length - 1, start); 159 } 160 CopyDataUtf16(uint16_t * buf,uint32_t max_length)161 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t max_length) 162 { 163 return CopyDataRegionUtf16(buf, 0, GetLength(), max_length); 164 } 165 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t max_length)166 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t max_length) 167 { 168 if (length > max_length) { 169 return 0; 170 } 171 uint32_t len = GetLength(); 172 if (start + length > len) { 173 return 0; 174 } 175 if (IsUtf16()) { 176 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 177 if (memcpy_s(buf, sizeof(buf) * max_length, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != EOK) { 178 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size"; 179 } 180 return length; 181 } 182 return panda::utf::ConvertRegionMUtf8ToUtf16(GetDataMUtf8(), buf, len, length, start); 183 } 184 GetLength()185 uint32_t GetLength() const 186 { 187 uint32_t length; 188 if (compressed_strings_enabled) { 189 length = length_ >> 1U; 190 } else { 191 length = length_; 192 } 193 return length; 194 } 195 IsEmpty()196 bool IsEmpty() const 197 { 198 // do not shift right length because it is always zero for empty string 199 return length_ == 0; 200 } 201 ObjectSize()202 size_t ObjectSize() const 203 { 204 uint32_t length = GetLength(); 205 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length); 206 } 207 GetHashcode()208 uint32_t GetHashcode() 209 { 210 if (hashcode_ == 0) { 211 hashcode_ = ComputeHashcode(); 212 } 213 return hashcode_; 214 } 215 216 int32_t IndexOf(String *rhs, int pos = 0); 217 GetLengthOffset()218 static constexpr uint32_t GetLengthOffset() 219 { 220 return MEMBER_OFFSET(String, length_); 221 } 222 GetDataOffset()223 static constexpr uint32_t GetDataOffset() 224 { 225 return MEMBER_OFFSET(String, data_utf16_); 226 } 227 GetStringCompressionMask()228 static constexpr uint32_t GetStringCompressionMask() 229 { 230 return STRING_COMPRESSED_BIT; 231 } 232 233 /** 234 * Compares strings by bytes. It doesn't check canonical unicode equivalence. 235 */ 236 static bool StringsAreEqual(String *str1, String *str2); 237 /** 238 * Compares strings by bytes. It doesn't check canonical unicode equivalence. 239 */ 240 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length); 241 static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length, 242 bool can_be_compressed); 243 /** 244 * Compares strings by bytes. It doesn't check canonical unicode equivalence. 245 */ 246 static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length); 247 static String *DoReplace(String *src, uint16_t old_c, uint16_t new_c, LanguageContext ctx, PandaVM *vm); 248 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length, bool can_be_compressed); 249 static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length); 250 static uint32_t ComputeHashcodeUtf16(uint16_t *utf16_data, uint32_t length); 251 SetCompressedStringsEnabled(bool val)252 static void SetCompressedStringsEnabled(bool val) 253 { 254 compressed_strings_enabled = val; 255 } 256 GetCompressedStringsEnabled()257 static bool GetCompressedStringsEnabled() 258 { 259 return compressed_strings_enabled; 260 } 261 262 static String *FastSubString(String *src, uint32_t start, uint32_t utf16_length, LanguageContext ctx, 263 PandaVM *vm = nullptr); 264 265 static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data); 266 267 protected: 268 void SetLength(uint32_t length, bool compressed = false) 269 { 270 if (compressed_strings_enabled) { 271 ASSERT(length < 0x80000000U); 272 // Use 0u for compressed/utf8 expression 273 length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED); 274 } else { 275 length_ = length; 276 } 277 } 278 SetHashcode(uint32_t hashcode)279 void SetHashcode(uint32_t hashcode) 280 { 281 hashcode_ = hashcode; 282 } 283 284 uint32_t ComputeHashcode(); 285 static bool CanBeCompressed(const uint16_t *utf16_data, uint32_t utf16_length); 286 static void CopyUtf16AsMUtf8(const uint16_t *utf16_from, uint8_t *mutf8_to, uint32_t utf16_length); 287 288 private: 289 static bool compressed_strings_enabled; 290 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 291 enum CompressedStatus { 292 STRING_COMPRESSED, 293 STRING_UNCOMPRESSED, 294 }; 295 IsASCIICharacter(uint16_t data)296 static bool IsASCIICharacter(uint16_t data) 297 { 298 // \0 is not considered ASCII in Modified-UTF8 299 return data - 1U < utf::MUTF8_1B_MAX; 300 } 301 302 static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length); 303 static bool CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non); 304 static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length, uint16_t non); 305 306 /** 307 * str1 should have the same length as mutf16_data. 308 * Converts mutf8_data to mutf16 and compare it with given mutf16_data. 309 */ 310 static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, uint32_t utf8_data_length, const uint16_t *utf16_data, 311 uint32_t utf16_data_length); 312 313 static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, const uint16_t *utf16_data, uint32_t utf16_data_length); 314 315 template <typename T> 316 /** 317 * Check that two spans are equal. Should have the same length. 318 */ 319 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 320 321 template <typename T1, typename T2> 322 static int32_t IndexOf(Span<const T1> &lhs_sp, Span<const T2> &rhs_sp, int32_t pos, int32_t max); 323 324 static String *AllocStringObject(size_t length, bool compressed, LanguageContext ctx, PandaVM *vm = nullptr, 325 bool movable = true); 326 327 // In last bit of length_ we store if this string is compressed or not. 328 uint32_t length_; 329 uint32_t hashcode_; 330 // A pointer to the string data stored after the string header. 331 // Data can be stored in mutf8 or utf16 form according to compressed bit. 332 __extension__ uint16_t data_utf16_[0]; // NOLINT(modernize-avoid-c-arrays) 333 }; 334 335 constexpr uint32_t STRING_LENGTH_OFFSET = 8U; 336 static_assert(STRING_LENGTH_OFFSET == panda::coretypes::String::GetLengthOffset()); 337 constexpr uint32_t STRING_DATA_OFFSET = 16U; 338 static_assert(STRING_DATA_OFFSET == panda::coretypes::String::GetDataOffset()); 339 340 } // namespace panda::coretypes 341 342 #endif // PANDA_RUNTIME_INCLUDE_CORETYPES_STRING_H_ 343