• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_
16 #define PANDA_RUNTIME_CORETYPES_STRING_H_
17 
#include <securec.h>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <limits>
#include <utility>

#include "libpandabase/utils/utf.h"
#include "runtime/include/language_context.h"
#include "runtime/include/object_header.h"
#include "runtime/mem/vm_handle.h"
27 
28 namespace ark::coretypes {
29 
30 class Array;
31 class String : public ObjectHeader {
32 public:
Cast(ObjectHeader * object)33     static String *Cast(ObjectHeader *object)
34     {
35         // NOTE(linxiang) to do assert
36         return static_cast<String *>(object);
37     }
38 
39     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length,
40                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
41                                                     bool movable = true, bool pinned = false);
42 
43     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
44                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
45                                                     bool movable = true, bool pinned = false);
46 
47     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
48                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true,
49                                                     bool pinned = false);
50 
51     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm,
52                                                     bool movable = true, bool pinned = false);
53 
54     static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length,
55                                    const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned);
56 
57     PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length,
58                                                    const LanguageContext &ctx, PandaVM *vm, bool movable = true,
59                                                    bool pinned = false);
60 
61     PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length,
62                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true,
63                                                     bool pinned = false);
64 
65     PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm);
66 
67     PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm);
68 
69     PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm);
70 
71     PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray,
72                                                              const LanguageContext &ctx, PandaVM *vm);
73 
74     static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray,
75                                             const LanguageContext &ctx, PandaVM *vm);
76 
77     template <bool VERIFY = true>
78     uint16_t At(int32_t index);
79 
80     PANDA_PUBLIC_API int32_t Compare(String *rstr);
81 
82     PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx);
83 
84     PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length,
85                                             const LanguageContext &ctx);
86 
IsUtf16()87     bool IsUtf16() const
88     {
89         return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
90     }
91 
IsMUtf8()92     bool IsMUtf8() const
93     {
94         return !IsUtf16();
95     }
96 
ComputeDataSizeUtf16(uint32_t length)97     static size_t ComputeDataSizeUtf16(uint32_t length)
98     {
99         return length * sizeof(dataUtf16_[0]);
100     }
101 
102     /// Methods for uncompressed strings (UTF16)
ComputeSizeUtf16(uint32_t utf16Length)103     static size_t ComputeSizeUtf16(uint32_t utf16Length)
104     {
105         return sizeof(String) + ComputeDataSizeUtf16(utf16Length);
106     }
107 
GetDataUtf16()108     uint16_t *GetDataUtf16()
109     {
110         ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string");
111         return dataUtf16_;
112     }
113 
114     /// Methods for compresses strings (MUTF8 or LATIN1)
ComputeSizeMUtf8(uint32_t mutf8Length)115     static size_t ComputeSizeMUtf8(uint32_t mutf8Length)
116     {
117         return sizeof(String) + mutf8Length;
118     }
119 
120     /// It's MUtf8 format, but without 0 in the end.
GetDataMUtf8()121     uint8_t *GetDataMUtf8()
122     {
123         ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string");
124         return reinterpret_cast<uint8_t *>(dataUtf16_);
125     }
126 
GetMUtf8Length()127     size_t GetMUtf8Length()
128     {
129         if (!IsUtf16()) {
130             return GetLength() + 1;  // add place for zero at the end
131         }
132         return ark::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength());
133     }
134 
GetUtf16Length()135     size_t GetUtf16Length()
136     {
137         return GetLength();
138     }
139 
CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)140     inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString)
141     {
142         if (isCString) {
143             ASSERT(maxLength != 0);
144             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145             buf[maxLength - 1] = '\0';
146             return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1;  // add place for zero at the end
147         }
148 
149         return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength);
150     }
151 
CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)152     size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength)
153     {
154         if (length > maxLength) {
155             return 0;
156         }
157         uint32_t len = GetLength();
158         if (start + length > len) {
159             return 0;
160         }
161         if (!IsUtf16()) {
162             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
163             if (length > MAX_LEN) {
164                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
165             }
166             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
167             if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) {
168                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
169             }
170             return length;
171         }
172         return ark::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start);
173     }
174 
CopyDataUtf16(uint16_t * buf,uint32_t maxLength)175     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength)
176     {
177         return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
178     }
179 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)180     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength)
181     {
182         if (length > maxLength) {
183             return 0;
184         }
185         uint32_t len = GetLength();
186         if (start + length > len) {
187             return 0;
188         }
189         if (IsUtf16()) {
190             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191             if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) !=
192                 EOK) {
193                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
194             }
195         } else {
196             uint8_t *src8 = GetDataMUtf8() + start;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
197             for (uint32_t i = 0; i < length; ++i) {
198                 buf[i] = src8[i];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
199             }
200         }
201         return length;
202     }
203 
GetLength()204     uint32_t GetLength() const
205     {
206         uint32_t length;
207         if (compressedStringsEnabled_) {
208             length = length_ >> 1U;
209         } else {
210             length = length_;
211         }
212         return length;
213     }
214 
IsEmpty()215     bool IsEmpty() const
216     {
217         // do not shift right length because it is always zero for empty string
218         return length_ == 0;
219     }
220 
ObjectSize()221     size_t ObjectSize() const
222     {
223         uint32_t length = GetLength();
224         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length);
225     }
226 
GetHashcode()227     uint32_t GetHashcode()
228     {
229         if (hashcode_ == 0) {
230             hashcode_ = ComputeHashcode();
231         }
232         return hashcode_;
233     }
234 
235     int32_t IndexOf(String *rhs, int pos = 0);
236     int32_t LastIndexOf(String *rhs, int pos = INT32_MAX);
237 
GetLengthOffset()238     static constexpr uint32_t GetLengthOffset()
239     {
240         return MEMBER_OFFSET(String, length_);
241     }
242 
GetDataOffset()243     static constexpr uint32_t GetDataOffset()
244     {
245         return MEMBER_OFFSET(String, dataUtf16_);
246     }
247 
GetHashcodeOffset()248     static constexpr uint32_t GetHashcodeOffset()
249     {
250         return MEMBER_OFFSET(String, hashcode_);
251     }
252 
GetStringCompressionMask()253     static constexpr uint32_t GetStringCompressionMask()
254     {
255         return STRING_COMPRESSED_BIT;
256     }
257 
258     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
259     PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2);
260     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
261     PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length);
262     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length,
263                                      bool canBeCompressed);
264     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
265     PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data,
266                                                       uint32_t utf16DataLength);
267     static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm);
268     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length);
269     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed);
270     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
271 
SetCompressedStringsEnabled(bool val)272     static void SetCompressedStringsEnabled(bool val)
273     {
274         compressedStringsEnabled_ = val;
275     }
276 
GetCompressedStringsEnabled()277     static bool GetCompressedStringsEnabled()
278     {
279         return compressedStringsEnabled_;
280     }
281 
NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)282     static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex,
283                                                                  const coretypes::String *str)
284     {
285         auto strLen = str->GetLength();
286         std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex};
287 
288         // If begin_index < 0, then it is assumed to be equal to zero.
289         if (normIndexes.first < 0) {
290             normIndexes.first = 0;
291         } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) {
292             // If begin_index > str_len, then it is assumed to be equal to str_len.
293             normIndexes.first = static_cast<int32_t>(strLen);
294         }
295         // If end_index < 0, then it is assumed to be equal to zero.
296         if (normIndexes.second < 0) {
297             normIndexes.second = 0;
298         } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) {
299             // If end_index > str_len, then it is assumed to be equal to str_len.
300             normIndexes.second = static_cast<int32_t>(strLen);
301         }
302         // If begin_index > end_index, then these are swapped.
303         if (normIndexes.first > normIndexes.second) {
304             std::swap(normIndexes.first, normIndexes.second);
305         }
306         ASSERT((normIndexes.second - normIndexes.first) >= 0);
307         return normIndexes;
308     }
309 
310     static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx,
311                                  PandaVM *vm = nullptr);
312 
313     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data);
314 
IsASCIICharacter(uint16_t data)315     static bool IsASCIICharacter(uint16_t data)
316     {
317         // \0 is not considered ASCII in Modified-UTF8
318         return data - 1U < utf::UTF8_1B_MAX;
319     }
320 
321 protected:
322     void SetLength(uint32_t length, bool compressed = false)
323     {
324         if (compressedStringsEnabled_) {
325             ASSERT(length < 0x80000000U);
326             // Use 0u for compressed/utf8 expression
327             length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED);
328         } else {
329             length_ = length;
330         }
331     }
332 
SetHashcode(uint32_t hashcode)333     void SetHashcode(uint32_t hashcode)
334     {
335         hashcode_ = hashcode;
336     }
337 
338     uint32_t ComputeHashcode();
339     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length);
340     static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length);
341     static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr,
342                                      bool movable = true, bool pinned = false);
343 
344 private:
345     PANDA_PUBLIC_API static bool compressedStringsEnabled_;
346     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
347     enum CompressedStatus {
348         STRING_COMPRESSED,
349         STRING_UNCOMPRESSED,
350     };
351 
352     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length);
353     static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non);
354     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non);
355 
356     /**
357      * str1 should have the same length as mutf16_data.
358      * Converts mutf8_data to mutf16 and compare it with given mutf16_data.
359      */
360     // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer
361     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data,
362                                    uint32_t utf16DataLength);
363 
364     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength);
365 
366     template <typename T>
367     /// Check that two spans are equal. Should have the same length.
368     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
369 
370     // In last bit of length_ we store if this string is compressed or not.
371     uint32_t length_;
372     uint32_t hashcode_;
373     // A pointer to the string data stored after the string header.
374     // Data can be stored in mutf8 or utf16 form according to compressed bit.
375     __extension__ uint16_t dataUtf16_[0];  // NOLINT(modernize-avoid-c-arrays)
376 };
377 
378 constexpr uint32_t STRING_LENGTH_OFFSET = 8U;
379 static_assert(STRING_LENGTH_OFFSET == ark::coretypes::String::GetLengthOffset());
380 constexpr uint32_t STRING_HASHCODE_OFFSET = 12U;
381 static_assert(STRING_HASHCODE_OFFSET == ark::coretypes::String::GetHashcodeOffset());
382 constexpr uint32_t STRING_DATA_OFFSET = 16U;
383 static_assert(STRING_DATA_OFFSET == ark::coretypes::String::GetDataOffset());
384 
385 }  // namespace ark::coretypes
386 
387 #endif  // PANDA_RUNTIME_CORETYPES_STRING_H_
388