1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_STRING_H 17 #define ECMASCRIPT_STRING_H 18 19 #include <cstddef> 20 #include <cstdint> 21 #include <cstring> 22 23 #include "ecmascript/base/utf_helper.h" 24 #include "ecmascript/ecma_macros.h" 25 #include "ecmascript/js_tagged_value.h" 26 #include "ecmascript/mem/tagged_object.h" 27 28 namespace panda { 29 namespace ecmascript { 30 template<typename T> 31 class JSHandle; 32 class EcmaVM; 33 34 class EcmaString : public TaggedObject { 35 public: 36 static EcmaString *Cast(ObjectHeader *object); 37 static const EcmaString *ConstCast(const TaggedObject *object); 38 39 static EcmaString *CreateEmptyString(const EcmaVM *vm); 40 static EcmaString *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Len, const EcmaVM *vm, bool canBeCompress); 41 static EcmaString *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Len, const EcmaVM *vm, 42 bool canBeCompress); 43 static EcmaString *Concat(const JSHandle<EcmaString> &str1Handle, const JSHandle<EcmaString> &str2Handle, 44 const EcmaVM *vm); 45 static EcmaString *FastSubString(const JSHandle<EcmaString> &src, uint32_t start, uint32_t utf16Len, 46 const EcmaVM *vm); 47 48 static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1; 49 static constexpr uint32_t STRING_INTERN_BIT = 0x2; 50 enum CompressedStatus { 51 STRING_COMPRESSED, 52 STRING_UNCOMPRESSED, 53 }; 54 55 template<bool verify = true> 56 uint16_t At(int32_t index) const; 57 58 int32_t Compare(const EcmaString *rhs) const; 59 IsUtf16()60 bool IsUtf16() const 61 { 62 return compressedStringsEnabled ? ((GetMixLength() & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true; 63 } 64 IsUtf8()65 bool IsUtf8() const 66 { 67 return compressedStringsEnabled ? ((GetMixLength() & STRING_COMPRESSED_BIT) == STRING_COMPRESSED) : false; 68 } 69 ComputeDataSizeUtf16(uint32_t length)70 static size_t ComputeDataSizeUtf16(uint32_t length) 71 { 72 return length * sizeof(uint16_t); 73 } 74 75 /** 76 * Methods for uncompressed strings (UTF16): 77 */ ComputeSizeUtf16(uint32_t utf16Len)78 static size_t ComputeSizeUtf16(uint32_t utf16Len) 79 { 80 return DATA_OFFSET + ComputeDataSizeUtf16(utf16Len); 81 } 82 GetData()83 inline uint16_t *GetData() const 84 { 85 return reinterpret_cast<uint16_t *>(ToUintPtr(this) + DATA_OFFSET); 86 } 87 GetDataUtf16()88 const uint16_t *GetDataUtf16() const 89 { 90 LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf16 for utf8 string"; 91 return GetData(); 92 } 93 94 /** 95 * Methods for compresses strings (UTF8 or LATIN1): 96 */ ComputeSizeUtf8(uint32_t utf8Len)97 static size_t ComputeSizeUtf8(uint32_t utf8Len) 98 { 99 return DATA_OFFSET + utf8Len; 100 } 101 102 /** 103 * It's Utf8 format, but without 0 in the end. 104 */ GetDataUtf8()105 const uint8_t *GetDataUtf8() const 106 { 107 LOG_IF(IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf8 for utf16 string"; 108 return reinterpret_cast<uint8_t *>(GetData()); 109 } 110 GetUtf8Length()111 size_t GetUtf8Length() const 112 { 113 if (!IsUtf16()) { 114 return GetLength() + 1; // add place for zero in the end 115 } 116 return base::utf_helper::Utf16ToUtf8Size(GetData(), GetLength()); 117 } 118 GetUtf16Length()119 size_t GetUtf16Length() const 120 { 121 return GetLength(); 122 } 123 CopyDataUtf8(uint8_t * buf,size_t maxLength)124 inline size_t CopyDataUtf8(uint8_t *buf, size_t maxLength) const 125 { 126 ASSERT(maxLength > 0); 127 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 128 buf[maxLength - 1] = '\0'; 129 return CopyDataRegionUtf8(buf, 0, GetLength(), maxLength) + 1; // add place for zero in the end 130 } 131 CopyDataRegionUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)132 size_t CopyDataRegionUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) const 133 { 134 if (length > maxLength) { 135 return 0; 136 } 137 uint32_t len = GetLength(); 138 if (start + length > len) { 139 return 0; 140 } 141 if (!IsUtf16()) { 142 if (length > std::numeric_limits<size_t>::max() / 2 - 1) { // 2: half 143 LOG(FATAL, RUNTIME) << " length is higher than half of size_t::max"; 144 UNREACHABLE(); 145 } 146 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 147 if (memcpy_s(buf, maxLength, GetDataUtf8() + start, length) != EOK) { 148 LOG(FATAL, RUNTIME) << "memcpy_s failed"; 149 UNREACHABLE(); 150 } 151 return length; 152 } 153 return base::utf_helper::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, maxLength - 1, start); 154 } 155 CopyDataUtf16(uint16_t * buf,uint32_t maxLength)156 inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) const 157 { 158 return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength); 159 } 160 CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)161 uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) const 162 { 163 if (length > maxLength) { 164 return 0; 165 } 166 uint32_t len = GetLength(); 167 if (start + length > len) { 168 return 0; 169 } 170 if (IsUtf16()) { 171 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 172 if (memcpy_s(buf, ComputeDataSizeUtf16(maxLength), GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != 173 EOK) { 174 LOG(FATAL, RUNTIME) << "memcpy_s failed"; 175 UNREACHABLE(); 176 } 177 return length; 178 } 179 return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, maxLength, start); 180 } 181 182 // NOLINTNEXTLINE(modernize-avoid-c-arrays) GetCString()183 inline std::unique_ptr<char[]> GetCString() 184 { 185 auto length = GetUtf8Length(); 186 char *buf = new char[length](); 187 CopyDataUtf8(reinterpret_cast<uint8_t *>(buf), length); 188 // NOLINTNEXTLINE(modernize-avoid-c-arrays) 189 return std::unique_ptr<char[]>(buf); 190 } 191 192 inline void WriteData(EcmaString *src, uint32_t start, uint32_t destSize, uint32_t length); 193 inline void WriteData(char src, uint32_t start); GetLength()194 uint32_t GetLength() const 195 { 196 return GetMixLength() >> 2U; 197 } 198 SetIsInternString()199 void SetIsInternString() 200 { 201 SetMixLength(GetMixLength() | STRING_INTERN_BIT); 202 } 203 IsInternString()204 bool IsInternString() const 205 { 206 return (GetMixLength() & STRING_INTERN_BIT) != 0; 207 } 208 ObjectSize()209 size_t ObjectSize() const 210 { 211 uint32_t length = GetLength(); 212 return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeUtf8(length); 213 } 214 GetHashcode()215 uint32_t GetHashcode() 216 { 217 uint32_t hashcode = GetRawHashcode(); 218 if (hashcode == 0) { 219 hashcode = ComputeHashcode(0); 220 SetRawHashcode(hashcode); 221 } 222 return hashcode; 223 } 224 225 uint32_t ComputeHashcode(uint32_t hashSeed) const; 226 227 int32_t IndexOf(const EcmaString *rhs, int pos = 0) const; 228 GetStringCompressionMask()229 static constexpr uint32_t GetStringCompressionMask() 230 { 231 return STRING_COMPRESSED_BIT; 232 } 233 234 /** 235 * Compares string1 + string2 by bytes, It doesn't check canonical unicode equivalence. 236 */ 237 bool EqualToSplicedString(const EcmaString *str1, const EcmaString *str2); 238 /** 239 * Compares strings by bytes, It doesn't check canonical unicode equivalence. 240 */ 241 static bool StringsAreEqual(EcmaString *str1, EcmaString *str2); 242 /** 243 * Compares strings by bytes, It doesn't check canonical unicode equivalence. 244 */ 245 static bool StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8Data, uint32_t utf8Len, 246 bool canBeCompress); 247 /** 248 * Compares strings by bytes, It doesn't check canonical unicode equivalence. 249 */ 250 static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len); 251 static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress); 252 static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length); 253 SetCompressedStringsEnabled(bool val)254 static void SetCompressedStringsEnabled(bool val) 255 { 256 compressedStringsEnabled = val; 257 } 258 GetCompressedStringsEnabled()259 static bool GetCompressedStringsEnabled() 260 { 261 return compressedStringsEnabled; 262 } 263 264 static EcmaString *AllocStringObject(size_t length, bool compressed, const EcmaVM *vm); 265 266 static bool CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len); 267 static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len); 268 static bool CanBeCompressed(const EcmaString *string); 269 270 static constexpr size_t MIX_LENGTH_OFFSET = TaggedObjectSize(); 271 // In last bit of mix_length we store if this string is compressed or not. 272 ACCESSORS_PRIMITIVE_FIELD(MixLength, uint32_t, MIX_LENGTH_OFFSET, HASHCODE_OFFSET) 273 ACCESSORS_PRIMITIVE_FIELD(RawHashcode, uint32_t, HASHCODE_OFFSET, SIZE) 274 // DATA_OFFSET: the string data stored after the string header. 275 // Data can be stored in utf8 or utf16 form according to compressed bit. 276 static constexpr size_t DATA_OFFSET = SIZE; // DATA_OFFSET equal to Empty String size 277 278 static inline EcmaString *FastSubUtf8String(const EcmaVM *vm, const JSHandle<EcmaString> &src, uint32_t start, 279 uint32_t length); 280 static inline EcmaString *FastSubUtf16String(const EcmaVM *vm, const JSHandle<EcmaString> &src, uint32_t start, 281 uint32_t length); 282 283 private: 284 void SetLength(uint32_t length, bool compressed = false) 285 { 286 ASSERT(length < 0x40000000U); 287 // Use 0u for compressed/utf8 expression 288 SetMixLength((length << 2U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED)); 289 } 290 GetDataUtf16Writable()291 uint16_t *GetDataUtf16Writable() 292 { 293 LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf16 for utf8 string"; 294 return GetData(); 295 } 296 GetDataUtf8Writable()297 uint8_t *GetDataUtf8Writable() 298 { 299 LOG_IF(IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf8 for utf16 string"; 300 return reinterpret_cast<uint8_t *>(GetData()); 301 } 302 303 static void CopyUtf16AsUtf8(const uint16_t *utf16From, uint8_t *utf8To, uint32_t utf16Len); 304 305 static bool compressedStringsEnabled; 306 IsASCIICharacter(uint16_t data)307 static bool IsASCIICharacter(uint16_t data) 308 { 309 // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000'] 310 return data - 1U < base::utf_helper::UTF8_1B_MAX; 311 } 312 313 /** 314 * str1 should have the same length as utf16_data. 315 * Converts utf8Data to utf16 and compare it with given utf16_data. 316 */ 317 static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data, 318 uint32_t utf16Len); 319 320 template<typename T> 321 /** 322 * Check that two spans are equal. Should have the same length. 323 */ 324 static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2); 325 326 template<typename T> 327 /** 328 * Copy String from src to dst 329 * */ 330 static bool StringCopy(Span<T> &dst, size_t dstMax, Span<const T> &src, size_t count); 331 332 template<typename T1, typename T2> 333 static int32_t IndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos, int32_t max); 334 }; 335 336 static_assert((EcmaString::DATA_OFFSET % static_cast<uint8_t>(MemAlignment::MEM_ALIGN_OBJECT)) == 0); 337 } // namespace ecmascript 338 } // namespace panda 339 #endif // ECMASCRIPT_STRING_H 340