• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H
16 #define PANDA_RUNTIME_CORETYPES_STRING_H
17 
#include <securec.h>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>

#include "libpandabase/utils/utf.h"
#include "runtime/include/language_context.h"
#include "runtime/include/object_header.h"
#include "runtime/mem/vm_handle.h"
27 
28 namespace panda::coretypes {
29 
30 class Array;
31 class String : public ObjectHeader {
32 public:
Cast(ObjectHeader * object)33     static String *Cast(ObjectHeader *object)
34     {
35         // NOTE(linxiang) to do assert
36         return static_cast<String *>(object);
37     }
38 
39     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length,
40                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
41                                                     bool movable = true);
42 
43     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
44                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
45                                                     bool movable = true);
46 
47     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
48                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true);
49 
50     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm,
51                                                     bool movable = true);
52 
53     PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length,
54                                                    const LanguageContext &ctx, PandaVM *vm, bool movable = true);
55 
56     PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length,
57                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true);
58 
59     PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm);
60 
61     PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm);
62 
63     PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm);
64 
65     PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray,
66                                                              const LanguageContext &ctx, PandaVM *vm);
67 
68     static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray,
69                                             const LanguageContext &ctx, PandaVM *vm);
70 
71     template <bool VERIFY = true>
72     uint16_t At(int32_t index);
73 
74     PANDA_PUBLIC_API int32_t Compare(String *rstr);
75 
76     PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx);
77 
78     PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length,
79                                             const LanguageContext &ctx);
80 
IsUtf16()81     bool IsUtf16() const
82     {
83         return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
84     }
85 
IsMUtf8()86     bool IsMUtf8() const
87     {
88         return !IsUtf16();
89     }
90 
ComputeDataSizeUtf16(uint32_t length)91     static size_t ComputeDataSizeUtf16(uint32_t length)
92     {
93         return length * sizeof(dataUtf16_[0]);
94     }
95 
96     /// Methods for uncompressed strings (UTF16)
ComputeSizeUtf16(uint32_t utf16Length)97     static size_t ComputeSizeUtf16(uint32_t utf16Length)
98     {
99         return sizeof(String) + ComputeDataSizeUtf16(utf16Length);
100     }
101 
GetDataUtf16()102     uint16_t *GetDataUtf16()
103     {
104         ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string");
105         return dataUtf16_;
106     }
107 
108     /// Methods for compresses strings (MUTF8 or LATIN1)
ComputeSizeMUtf8(uint32_t mutf8Length)109     static size_t ComputeSizeMUtf8(uint32_t mutf8Length)
110     {
111         return sizeof(String) + mutf8Length;
112     }
113 
114     /// It's MUtf8 format, but without 0 in the end.
GetDataMUtf8()115     uint8_t *GetDataMUtf8()
116     {
117         ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string");
118         return reinterpret_cast<uint8_t *>(dataUtf16_);
119     }
120 
GetMUtf8Length()121     size_t GetMUtf8Length()
122     {
123         if (!IsUtf16()) {
124             return GetLength() + 1;  // add place for zero at the end
125         }
126         return panda::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength());
127     }
128 
GetUtf16Length()129     size_t GetUtf16Length()
130     {
131         return GetLength();
132     }
133 
CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)134     inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString)
135     {
136         if (isCString) {
137             ASSERT(maxLength != 0);
138             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
139             buf[maxLength - 1] = '\0';
140             return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1;  // add place for zero at the end
141         }
142 
143         return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength);
144     }
145 
CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)146     size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength)
147     {
148         if (length > maxLength) {
149             return 0;
150         }
151         uint32_t len = GetLength();
152         if (start + length > len) {
153             return 0;
154         }
155         if (!IsUtf16()) {
156             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
157             if (length > MAX_LEN) {
158                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
159             }
160             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
161             if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) {
162                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
163             }
164             return length;
165         }
166         return panda::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start);
167     }
168 
CopyDataUtf16(uint16_t * buf,uint32_t maxLength)169     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength)
170     {
171         return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
172     }
173 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)174     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength)
175     {
176         if (length > maxLength) {
177             return 0;
178         }
179         uint32_t len = GetLength();
180         if (start + length > len) {
181             return 0;
182         }
183         if (IsUtf16()) {
184             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
185             if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) !=
186                 EOK) {
187                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
188             }
189         } else {
190             uint8_t *src8 = GetDataMUtf8() + start;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191             for (uint32_t i = 0; i < length; ++i) {
192                 buf[i] = src8[i];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
193             }
194         }
195         return length;
196     }
197 
GetLength()198     uint32_t GetLength() const
199     {
200         uint32_t length;
201         if (compressedStringsEnabled_) {
202             length = length_ >> 1U;
203         } else {
204             length = length_;
205         }
206         return length;
207     }
208 
IsEmpty()209     bool IsEmpty() const
210     {
211         // do not shift right length because it is always zero for empty string
212         return length_ == 0;
213     }
214 
ObjectSize()215     size_t ObjectSize() const
216     {
217         uint32_t length = GetLength();
218         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length);
219     }
220 
GetHashcode()221     uint32_t GetHashcode()
222     {
223         if (hashcode_ == 0) {
224             hashcode_ = ComputeHashcode();
225         }
226         return hashcode_;
227     }
228 
229     int32_t IndexOf(String *rhs, int pos = 0);
230     int32_t LastIndexOf(String *rhs, int pos = INT32_MAX);
231 
GetLengthOffset()232     static constexpr uint32_t GetLengthOffset()
233     {
234         return MEMBER_OFFSET(String, length_);
235     }
236 
GetDataOffset()237     static constexpr uint32_t GetDataOffset()
238     {
239         return MEMBER_OFFSET(String, dataUtf16_);
240     }
241 
GetHashcodeOffset()242     static constexpr uint32_t GetHashcodeOffset()
243     {
244         return MEMBER_OFFSET(String, hashcode_);
245     }
246 
GetStringCompressionMask()247     static constexpr uint32_t GetStringCompressionMask()
248     {
249         return STRING_COMPRESSED_BIT;
250     }
251 
252     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
253     PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2);
254     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
255     PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length);
256     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length,
257                                      bool canBeCompressed);
258     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
259     PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data,
260                                                       uint32_t utf16DataLength);
261     static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm);
262     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length);
263     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed);
264     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
265 
SetCompressedStringsEnabled(bool val)266     static void SetCompressedStringsEnabled(bool val)
267     {
268         compressedStringsEnabled_ = val;
269     }
270 
GetCompressedStringsEnabled()271     static bool GetCompressedStringsEnabled()
272     {
273         return compressedStringsEnabled_;
274     }
275 
NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)276     static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex,
277                                                                  const coretypes::String *str)
278     {
279         auto strLen = str->GetLength();
280         std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex};
281 
282         // If begin_index < 0, then it is assumed to be equal to zero.
283         if (normIndexes.first < 0) {
284             normIndexes.first = 0;
285         } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) {
286             // If begin_index > str_len, then it is assumed to be equal to str_len.
287             normIndexes.first = static_cast<int32_t>(strLen);
288         }
289         // If end_index < 0, then it is assumed to be equal to zero.
290         if (normIndexes.second < 0) {
291             normIndexes.second = 0;
292         } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) {
293             // If end_index > str_len, then it is assumed to be equal to str_len.
294             normIndexes.second = static_cast<int32_t>(strLen);
295         }
296         // If begin_index > end_index, then these are swapped.
297         if (normIndexes.first > normIndexes.second) {
298             std::swap(normIndexes.first, normIndexes.second);
299         }
300         ASSERT((normIndexes.second - normIndexes.first) >= 0);
301         return normIndexes;
302     }
303 
304     static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx,
305                                  PandaVM *vm = nullptr);
306 
307     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data);
308 
IsASCIICharacter(uint16_t data)309     static bool IsASCIICharacter(uint16_t data)
310     {
311         // \0 is not considered ASCII in Modified-UTF8
312         return data - 1U < utf::UTF8_1B_MAX;
313     }
314 
315 protected:
316     void SetLength(uint32_t length, bool compressed = false)
317     {
318         if (compressedStringsEnabled_) {
319             ASSERT(length < 0x80000000U);
320             // Use 0u for compressed/utf8 expression
321             length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED);
322         } else {
323             length_ = length;
324         }
325     }
326 
SetHashcode(uint32_t hashcode)327     void SetHashcode(uint32_t hashcode)
328     {
329         hashcode_ = hashcode;
330     }
331 
332     uint32_t ComputeHashcode();
333     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length);
334     static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length);
335     static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr,
336                                      bool movable = true);
337 
338 private:
339     PANDA_PUBLIC_API static bool compressedStringsEnabled_;
340     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
341     enum CompressedStatus {
342         STRING_COMPRESSED,
343         STRING_UNCOMPRESSED,
344     };
345 
346     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length);
347     static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non);
348     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non);
349 
350     /**
351      * str1 should have the same length as mutf16_data.
352      * Converts mutf8_data to mutf16 and compare it with given mutf16_data.
353      */
354     // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer
355     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data,
356                                    uint32_t utf16DataLength);
357 
358     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength);
359 
360     template <typename T>
361     /// Check that two spans are equal. Should have the same length.
362     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
363 
364     // In last bit of length_ we store if this string is compressed or not.
365     uint32_t length_;
366     uint32_t hashcode_;
367     // A pointer to the string data stored after the string header.
368     // Data can be stored in mutf8 or utf16 form according to compressed bit.
369     __extension__ uint16_t dataUtf16_[0];  // NOLINT(modernize-avoid-c-arrays)
370 };
371 
372 constexpr uint32_t STRING_LENGTH_OFFSET = 8U;
373 static_assert(STRING_LENGTH_OFFSET == panda::coretypes::String::GetLengthOffset());
374 constexpr uint32_t STRING_HASHCODE_OFFSET = 12U;
375 static_assert(STRING_HASHCODE_OFFSET == panda::coretypes::String::GetHashcodeOffset());
376 constexpr uint32_t STRING_DATA_OFFSET = 16U;
377 static_assert(STRING_DATA_OFFSET == panda::coretypes::String::GetDataOffset());
378 
379 }  // namespace panda::coretypes
380 
381 #endif  // PANDA_RUNTIME_CORETYPES_STRING_H
382