• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ECMASCRIPT_STRING_H
17 #define ECMASCRIPT_STRING_H
18 
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <memory>

#include "ecmascript/base/utf_helper.h"
#include "ecmascript/ecma_macros.h"
#include "ecmascript/js_tagged_value.h"
#include "ecmascript/mem/tagged_object.h"
27 
28 namespace panda {
29 namespace ecmascript {
30 template<typename T>
31 class JSHandle;
32 class EcmaVM;
33 
34 class EcmaString : public TaggedObject {
35 public:
36     static EcmaString *Cast(ObjectHeader *object);
37     static const EcmaString *ConstCast(const TaggedObject *object);
38 
39     static EcmaString *CreateEmptyString(const EcmaVM *vm);
40     static EcmaString *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Len, const EcmaVM *vm, bool canBeCompress);
41     static EcmaString *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Len, const EcmaVM *vm,
42                                        bool canBeCompress);
43     static EcmaString *Concat(const JSHandle<EcmaString> &str1Handle, const JSHandle<EcmaString> &str2Handle,
44                               const EcmaVM *vm);
45     static EcmaString *FastSubString(const JSHandle<EcmaString> &src, uint32_t start, uint32_t utf16Len,
46                                      const EcmaVM *vm);
47 
48     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
49     static constexpr uint32_t STRING_INTERN_BIT = 0x2;
50     enum CompressedStatus {
51         STRING_COMPRESSED,
52         STRING_UNCOMPRESSED,
53     };
54 
55     template<bool verify = true>
56     uint16_t At(int32_t index) const;
57 
58     int32_t Compare(const EcmaString *rhs) const;
59 
IsUtf16()60     bool IsUtf16() const
61     {
62         return compressedStringsEnabled ? ((GetMixLength() & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
63     }
64 
IsUtf8()65     bool IsUtf8() const
66     {
67         return compressedStringsEnabled ? ((GetMixLength() & STRING_COMPRESSED_BIT) == STRING_COMPRESSED) : false;
68     }
69 
ComputeDataSizeUtf16(uint32_t length)70     static size_t ComputeDataSizeUtf16(uint32_t length)
71     {
72         return length * sizeof(uint16_t);
73     }
74 
75     /**
76      * Methods for uncompressed strings (UTF16):
77      */
ComputeSizeUtf16(uint32_t utf16Len)78     static size_t ComputeSizeUtf16(uint32_t utf16Len)
79     {
80         return DATA_OFFSET + ComputeDataSizeUtf16(utf16Len);
81     }
82 
GetData()83     inline uint16_t *GetData() const
84     {
85         return reinterpret_cast<uint16_t *>(ToUintPtr(this) + DATA_OFFSET);
86     }
87 
GetDataUtf16()88     const uint16_t *GetDataUtf16() const
89     {
90         LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf16 for utf8 string";
91         return GetData();
92     }
93 
94     /**
95      * Methods for compresses strings (UTF8 or LATIN1):
96      */
ComputeSizeUtf8(uint32_t utf8Len)97     static size_t ComputeSizeUtf8(uint32_t utf8Len)
98     {
99         return DATA_OFFSET + utf8Len;
100     }
101 
102     /**
103      * It's Utf8 format, but without 0 in the end.
104      */
GetDataUtf8()105     const uint8_t *GetDataUtf8() const
106     {
107         LOG_IF(IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf8 for utf16 string";
108         return reinterpret_cast<uint8_t *>(GetData());
109     }
110 
GetUtf8Length()111     size_t GetUtf8Length() const
112     {
113         if (!IsUtf16()) {
114             return GetLength() + 1;  // add place for zero in the end
115         }
116         return base::utf_helper::Utf16ToUtf8Size(GetData(), GetLength());
117     }
118 
GetUtf16Length()119     size_t GetUtf16Length() const
120     {
121         return GetLength();
122     }
123 
CopyDataUtf8(uint8_t * buf,size_t maxLength)124     inline size_t CopyDataUtf8(uint8_t *buf, size_t maxLength) const
125     {
126         ASSERT(maxLength > 0);
127         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
128         buf[maxLength - 1] = '\0';
129         return CopyDataRegionUtf8(buf, 0, GetLength(), maxLength) + 1;  // add place for zero in the end
130     }
131 
CopyDataRegionUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)132     size_t CopyDataRegionUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength) const
133     {
134         if (length > maxLength) {
135             return 0;
136         }
137         uint32_t len = GetLength();
138         if (start + length > len) {
139             return 0;
140         }
141         if (!IsUtf16()) {
142             if (length > std::numeric_limits<size_t>::max() / 2 - 1) {  // 2: half
143                 LOG(FATAL, RUNTIME) << " length is higher than half of size_t::max";
144                 UNREACHABLE();
145             }
146             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
147             if (memcpy_s(buf, maxLength, GetDataUtf8() + start, length) != EOK) {
148                 LOG(FATAL, RUNTIME) << "memcpy_s failed";
149                 UNREACHABLE();
150             }
151             return length;
152         }
153         return base::utf_helper::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, maxLength - 1, start);
154     }
155 
CopyDataUtf16(uint16_t * buf,uint32_t maxLength)156     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength) const
157     {
158         return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
159     }
160 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)161     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength) const
162     {
163         if (length > maxLength) {
164             return 0;
165         }
166         uint32_t len = GetLength();
167         if (start + length > len) {
168             return 0;
169         }
170         if (IsUtf16()) {
171             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
172             if (memcpy_s(buf, ComputeDataSizeUtf16(maxLength), GetDataUtf16() + start, ComputeDataSizeUtf16(length)) !=
173                 EOK) {
174                 LOG(FATAL, RUNTIME) << "memcpy_s failed";
175                 UNREACHABLE();
176             }
177             return length;
178         }
179         return base::utf_helper::ConvertRegionUtf8ToUtf16(GetDataUtf8(), buf, len, maxLength, start);
180     }
181 
182     // NOLINTNEXTLINE(modernize-avoid-c-arrays)
GetCString()183     inline std::unique_ptr<char[]> GetCString()
184     {
185         auto length = GetUtf8Length();
186         char *buf = new char[length]();
187         CopyDataUtf8(reinterpret_cast<uint8_t *>(buf), length);
188         // NOLINTNEXTLINE(modernize-avoid-c-arrays)
189         return std::unique_ptr<char[]>(buf);
190     }
191 
192     inline void WriteData(EcmaString *src, uint32_t start, uint32_t destSize, uint32_t length);
193     inline void WriteData(char src, uint32_t start);
GetLength()194     uint32_t GetLength() const
195     {
196         return GetMixLength() >> 2U;
197     }
198 
SetIsInternString()199     void SetIsInternString()
200     {
201         SetMixLength(GetMixLength() | STRING_INTERN_BIT);
202     }
203 
IsInternString()204     bool IsInternString() const
205     {
206         return (GetMixLength() & STRING_INTERN_BIT) != 0;
207     }
208 
ObjectSize()209     size_t ObjectSize() const
210     {
211         uint32_t length = GetLength();
212         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeUtf8(length);
213     }
214 
GetHashcode()215     uint32_t GetHashcode()
216     {
217         uint32_t hashcode = GetRawHashcode();
218         if (hashcode == 0) {
219             hashcode = ComputeHashcode(0);
220             SetRawHashcode(hashcode);
221         }
222         return hashcode;
223     }
224 
225     uint32_t ComputeHashcode(uint32_t hashSeed) const;
226 
227     int32_t IndexOf(const EcmaString *rhs, int pos = 0) const;
228 
GetStringCompressionMask()229     static constexpr uint32_t GetStringCompressionMask()
230     {
231         return STRING_COMPRESSED_BIT;
232     }
233 
234     /**
235      * Compares string1 + string2 by bytes, It doesn't check canonical unicode equivalence.
236      */
237     bool EqualToSplicedString(const EcmaString *str1, const EcmaString *str2);
238     /**
239      * Compares strings by bytes, It doesn't check canonical unicode equivalence.
240      */
241     static bool StringsAreEqual(EcmaString *str1, EcmaString *str2);
242     /**
243      * Compares strings by bytes, It doesn't check canonical unicode equivalence.
244      */
245     static bool StringsAreEqualUtf8(const EcmaString *str1, const uint8_t *utf8Data, uint32_t utf8Len,
246                                     bool canBeCompress);
247     /**
248      * Compares strings by bytes, It doesn't check canonical unicode equivalence.
249      */
250     static bool StringsAreEqualUtf16(const EcmaString *str1, const uint16_t *utf16Data, uint32_t utf16Len);
251     static uint32_t ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress);
252     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
253 
SetCompressedStringsEnabled(bool val)254     static void SetCompressedStringsEnabled(bool val)
255     {
256         compressedStringsEnabled = val;
257     }
258 
GetCompressedStringsEnabled()259     static bool GetCompressedStringsEnabled()
260     {
261         return compressedStringsEnabled;
262     }
263 
264     static EcmaString *AllocStringObject(size_t length, bool compressed, const EcmaVM *vm);
265 
266     static bool CanBeCompressed(const uint8_t *utf8Data, uint32_t utf8Len);
267     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Len);
268     static bool CanBeCompressed(const EcmaString *string);
269 
270     static constexpr size_t MIX_LENGTH_OFFSET = TaggedObjectSize();
271     // In last bit of mix_length we store if this string is compressed or not.
272     ACCESSORS_PRIMITIVE_FIELD(MixLength, uint32_t, MIX_LENGTH_OFFSET, HASHCODE_OFFSET)
273     ACCESSORS_PRIMITIVE_FIELD(RawHashcode, uint32_t, HASHCODE_OFFSET, SIZE)
274     // DATA_OFFSET: the string data stored after the string header.
275     // Data can be stored in utf8 or utf16 form according to compressed bit.
276     static constexpr size_t DATA_OFFSET = SIZE;  // DATA_OFFSET equal to Empty String size
277 
278     static inline EcmaString *FastSubUtf8String(const EcmaVM *vm, const JSHandle<EcmaString> &src, uint32_t start,
279                                                 uint32_t length);
280     static inline EcmaString *FastSubUtf16String(const EcmaVM *vm, const JSHandle<EcmaString> &src, uint32_t start,
281                                                  uint32_t length);
282 
283 private:
284     void SetLength(uint32_t length, bool compressed = false)
285     {
286         ASSERT(length < 0x40000000U);
287         // Use 0u for compressed/utf8 expression
288         SetMixLength((length << 2U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED));
289     }
290 
GetDataUtf16Writable()291     uint16_t *GetDataUtf16Writable()
292     {
293         LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf16 for utf8 string";
294         return GetData();
295     }
296 
GetDataUtf8Writable()297     uint8_t *GetDataUtf8Writable()
298     {
299         LOG_IF(IsUtf16(), FATAL, RUNTIME) << "EcmaString: Read data as utf8 for utf16 string";
300         return reinterpret_cast<uint8_t *>(GetData());
301     }
302 
303     static void CopyUtf16AsUtf8(const uint16_t *utf16From, uint8_t *utf8To, uint32_t utf16Len);
304 
305     static bool compressedStringsEnabled;
306 
IsASCIICharacter(uint16_t data)307     static bool IsASCIICharacter(uint16_t data)
308     {
309         // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000']
310         return data - 1U < base::utf_helper::UTF8_1B_MAX;
311     }
312 
313     /**
314      * str1 should have the same length as utf16_data.
315      * Converts utf8Data to utf16 and compare it with given utf16_data.
316      */
317     static bool IsUtf8EqualsUtf16(const uint8_t *utf8Data, size_t utf8Len, const uint16_t *utf16Data,
318                                   uint32_t utf16Len);
319 
320     template<typename T>
321     /**
322      * Check that two spans are equal. Should have the same length.
323      */
324     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
325 
326     template<typename T>
327     /**
328      * Copy String from src to dst
329      * */
330     static bool StringCopy(Span<T> &dst, size_t dstMax, Span<const T> &src, size_t count);
331 
332     template<typename T1, typename T2>
333     static int32_t IndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos, int32_t max);
334 };
335 
336 static_assert((EcmaString::DATA_OFFSET % static_cast<uint8_t>(MemAlignment::MEM_ALIGN_OBJECT)) == 0);
337 }  // namespace ecmascript
338 }  // namespace panda
339 #endif  // ECMASCRIPT_STRING_H
340