• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef PANDA_RUNTIME_INCLUDE_CORETYPES_STRING_H_
17 #define PANDA_RUNTIME_INCLUDE_CORETYPES_STRING_H_
18 
#include <securec.h>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>

#include "libpandabase/utils/utf.h"
#include "runtime/include/language_context.h"
#include "runtime/include/object_header.h"
#include "runtime/mem/vm_handle.h"
28 
29 namespace panda::coretypes {
30 
31 class Array;
32 class String : public ObjectHeader {
33 public:
Cast(ObjectHeader * object)34     static String *Cast(ObjectHeader *object)
35     {
36         return static_cast<String *>(object);
37     }
38 
39     static String *CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length,
40                                    bool can_be_compressed, LanguageContext ctx, PandaVM *vm, bool movable = true);
41 
42     static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed,
43                                    LanguageContext ctx, PandaVM *vm, bool movable = true);
44 
45     static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm,
46                                    bool movable = true);
47 
48     static String *CreateFromMUtf8(const uint8_t *mutf8_data, LanguageContext ctx, PandaVM *vm, bool movable = true);
49 
50     static String *CreateFromUtf16(const uint16_t *utf16_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm,
51                                    bool movable = true);
52 
53     static String *CreateEmptyString(LanguageContext ctx, PandaVM *vm);
54 
55     static String *CreateFromString(String *str, LanguageContext ctx, PandaVM *vm);
56 
57     static String *Concat(String *jstring1, String *jstring2, LanguageContext ctx, PandaVM *vm);
58 
59     static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, LanguageContext ctx,
60                                             PandaVM *vm);
61 
62     static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t high_byte, Array *bytearray,
63                                             LanguageContext ctx, PandaVM *vm);
64 
65     template <bool verify = true>
66     uint16_t At(int32_t index);
67 
68     int32_t Compare(String *rstr);
69 
70     Array *ToCharArray(LanguageContext ctx);
71 
IsUtf16()72     bool IsUtf16() const
73     {
74         return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
75     }
76 
IsMUtf8()77     bool IsMUtf8() const
78     {
79         return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_COMPRESSED) : true;
80     }
81 
ComputeDataSizeUtf16(uint32_t length)82     static size_t ComputeDataSizeUtf16(uint32_t length)
83     {
84         return length * sizeof(data_utf16_[0]);
85     }
86 
87     /**
88      * Methods for uncompressed strings (UTF16)
89      */
ComputeSizeUtf16(uint32_t utf16_length)90     static size_t ComputeSizeUtf16(uint32_t utf16_length)
91     {
92         return sizeof(String) + ComputeDataSizeUtf16(utf16_length);
93     }
94 
GetDataUtf16()95     uint16_t *GetDataUtf16()
96     {
97         LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "String: Read data as utf16 for mutf8 string";
98         return data_utf16_;
99     }
100 
101     /**
102      * Methods for compresses strings (MUTF8 or LATIN1)
103      */
ComputeSizeMUtf8(uint32_t mutf8_length)104     static size_t ComputeSizeMUtf8(uint32_t mutf8_length)
105     {
106         return sizeof(String) + mutf8_length;
107     }
108 
109     /**
110      * It's MUtf8 format, but without 0 in the end.
111      */
GetDataMUtf8()112     uint8_t *GetDataMUtf8()
113     {
114         LOG_IF(IsUtf16(), FATAL, RUNTIME) << "String: Read data as mutf8 for utf16 string";
115         return reinterpret_cast<uint8_t *>(data_utf16_);
116     }
117 
GetMUtf8Length()118     size_t GetMUtf8Length()
119     {
120         if (!IsUtf16()) {
121             return GetLength() + 1;  // add place for zero at the end
122         }
123         return panda::utf::Utf16ToMUtf8Size(data_utf16_, GetLength());
124     }
125 
GetUtf16Length()126     size_t GetUtf16Length()
127     {
128         return GetLength();
129     }
130 
CopyDataMUtf8(uint8_t * buf,size_t max_length)131     inline size_t CopyDataMUtf8(uint8_t *buf, size_t max_length)
132     {
133         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
134         buf[max_length - 1] = '\0';
135         return CopyDataRegionMUtf8(buf, 0, GetLength(), max_length) + 1;  // add place for zero at the end
136     }
137 
CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t max_length)138     size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t max_length)
139     {
140         if (length > max_length) {
141             return 0;
142         }
143         uint32_t len = GetLength();
144         if (start + length > len) {
145             return 0;
146         }
147         if (!IsUtf16()) {
148             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
149             if (length > MAX_LEN) {
150                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
151             }
152             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
153             if (memcpy_s(buf, sizeof(buf) * (max_length + 1), GetDataMUtf8() + start, length) != EOK) {
154                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
155             }
156             return length;
157         }
158         return panda::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, max_length - 1, start);
159     }
160 
CopyDataUtf16(uint16_t * buf,uint32_t max_length)161     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t max_length)
162     {
163         return CopyDataRegionUtf16(buf, 0, GetLength(), max_length);
164     }
165 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t max_length)166     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t max_length)
167     {
168         if (length > max_length) {
169             return 0;
170         }
171         uint32_t len = GetLength();
172         if (start + length > len) {
173             return 0;
174         }
175         if (IsUtf16()) {
176             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
177             if (memcpy_s(buf, sizeof(buf) * max_length, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) != EOK) {
178                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
179             }
180             return length;
181         }
182         return panda::utf::ConvertRegionMUtf8ToUtf16(GetDataMUtf8(), buf, len, length, start);
183     }
184 
GetLength()185     uint32_t GetLength() const
186     {
187         uint32_t length;
188         if (compressed_strings_enabled) {
189             length = length_ >> 1U;
190         } else {
191             length = length_;
192         }
193         return length;
194     }
195 
IsEmpty()196     bool IsEmpty() const
197     {
198         // do not shift right length because it is always zero for empty string
199         return length_ == 0;
200     }
201 
ObjectSize()202     size_t ObjectSize() const
203     {
204         uint32_t length = GetLength();
205         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length);
206     }
207 
GetHashcode()208     uint32_t GetHashcode()
209     {
210         if (hashcode_ == 0) {
211             hashcode_ = ComputeHashcode();
212         }
213         return hashcode_;
214     }
215 
216     int32_t IndexOf(String *rhs, int pos = 0);
217 
GetLengthOffset()218     static constexpr uint32_t GetLengthOffset()
219     {
220         return MEMBER_OFFSET(String, length_);
221     }
222 
GetDataOffset()223     static constexpr uint32_t GetDataOffset()
224     {
225         return MEMBER_OFFSET(String, data_utf16_);
226     }
227 
GetStringCompressionMask()228     static constexpr uint32_t GetStringCompressionMask()
229     {
230         return STRING_COMPRESSED_BIT;
231     }
232 
233     /**
234      * Compares strings by bytes. It doesn't check canonical unicode equivalence.
235      */
236     static bool StringsAreEqual(String *str1, String *str2);
237     /**
238      * Compares strings by bytes. It doesn't check canonical unicode equivalence.
239      */
240     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length);
241     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length,
242                                      bool can_be_compressed);
243     /**
244      * Compares strings by bytes. It doesn't check canonical unicode equivalence.
245      */
246     static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length);
247     static String *DoReplace(String *src, uint16_t old_c, uint16_t new_c, LanguageContext ctx, PandaVM *vm);
248     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length, bool can_be_compressed);
249     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length);
250     static uint32_t ComputeHashcodeUtf16(uint16_t *utf16_data, uint32_t length);
251 
SetCompressedStringsEnabled(bool val)252     static void SetCompressedStringsEnabled(bool val)
253     {
254         compressed_strings_enabled = val;
255     }
256 
GetCompressedStringsEnabled()257     static bool GetCompressedStringsEnabled()
258     {
259         return compressed_strings_enabled;
260     }
261 
262     static String *FastSubString(String *src, uint32_t start, uint32_t utf16_length, LanguageContext ctx,
263                                  PandaVM *vm = nullptr);
264 
265     static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data);
266 
267 protected:
268     void SetLength(uint32_t length, bool compressed = false)
269     {
270         if (compressed_strings_enabled) {
271             ASSERT(length < 0x80000000U);
272             // Use 0u for compressed/utf8 expression
273             length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED);
274         } else {
275             length_ = length;
276         }
277     }
278 
SetHashcode(uint32_t hashcode)279     void SetHashcode(uint32_t hashcode)
280     {
281         hashcode_ = hashcode;
282     }
283 
284     uint32_t ComputeHashcode();
285     static bool CanBeCompressed(const uint16_t *utf16_data, uint32_t utf16_length);
286     static void CopyUtf16AsMUtf8(const uint16_t *utf16_from, uint8_t *mutf8_to, uint32_t utf16_length);
287 
288 private:
289     static bool compressed_strings_enabled;
290     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
291     enum CompressedStatus {
292         STRING_COMPRESSED,
293         STRING_UNCOMPRESSED,
294     };
295 
IsASCIICharacter(uint16_t data)296     static bool IsASCIICharacter(uint16_t data)
297     {
298         // \0 is not considered ASCII in Modified-UTF8
299         return data - 1U < utf::MUTF8_1B_MAX;
300     }
301 
302     static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length);
303     static bool CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non);
304     static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length, uint16_t non);
305 
306     /**
307      * str1 should have the same length as mutf16_data.
308      * Converts mutf8_data to mutf16 and compare it with given mutf16_data.
309      */
310     static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, uint32_t utf8_data_length, const uint16_t *utf16_data,
311                                    uint32_t utf16_data_length);
312 
313     static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, const uint16_t *utf16_data, uint32_t utf16_data_length);
314 
315     template <typename T>
316     /**
317      * Check that two spans are equal. Should have the same length.
318      */
319     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
320 
321     template <typename T1, typename T2>
322     static int32_t IndexOf(Span<const T1> &lhs_sp, Span<const T2> &rhs_sp, int32_t pos, int32_t max);
323 
324     static String *AllocStringObject(size_t length, bool compressed, LanguageContext ctx, PandaVM *vm = nullptr,
325                                      bool movable = true);
326 
327     // In last bit of length_ we store if this string is compressed or not.
328     uint32_t length_;
329     uint32_t hashcode_;
330     // A pointer to the string data stored after the string header.
331     // Data can be stored in mutf8 or utf16 form according to compressed bit.
332     __extension__ uint16_t data_utf16_[0];  // NOLINT(modernize-avoid-c-arrays)
333 };
334 
335 constexpr uint32_t STRING_LENGTH_OFFSET = 8U;
336 static_assert(STRING_LENGTH_OFFSET == panda::coretypes::String::GetLengthOffset());
337 constexpr uint32_t STRING_DATA_OFFSET = 16U;
338 static_assert(STRING_DATA_OFFSET == panda::coretypes::String::GetDataOffset());
339 
340 }  // namespace panda::coretypes
341 
342 #endif  // PANDA_RUNTIME_INCLUDE_CORETYPES_STRING_H_
343