• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_
16 #define PANDA_RUNTIME_CORETYPES_STRING_H_
17 
18 #include <securec.h>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 
23 #include "libpandabase/utils/utf.h"
24 #include "runtime/include/language_context.h"
25 #include "runtime/include/object_header.h"
26 
27 namespace ark::coretypes {
28 
29 class Array;
30 class String : public ObjectHeader {
31 public:
Cast(ObjectHeader * object)32     static String *Cast(ObjectHeader *object)
33     {
34         // NOTE(linxiang) to do assert
35         return static_cast<String *>(object);
36     }
37 
/// Factory functions. Only the declarations are visible in this header, so the notes below
/// describe the signatures; the exact behaviour must be checked against the definitions.

/// MUTF-8 source with explicit byte length, UTF-16 length and compression hint.
/// `movable` defaults to true and `pinned` to false.
38     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length,
39                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
40                                                     bool movable = true, bool pinned = false);
41 
/// MUTF-8 source without an explicit byte length; takes the UTF-16 length and compression hint.
42     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
43                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
44                                                     bool movable = true, bool pinned = false);
45 
/// MUTF-8 source with the UTF-16 length only (no compression hint parameter).
46     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
47                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true,
48                                                     bool pinned = false);
49 
/// MUTF-8 source with no length arguments at all.
/// NOTE(review): presumably expects zero-terminated data - confirm against the definition.
50     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm,
51                                                     bool movable = true, bool pinned = false);
52 
/// Non-exported overload: both lengths are uint32_t and `movable`/`pinned` have no defaults.
53     static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length,
54                                    const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned);
55 
/// UTF-8 source given as pointer plus byte length.
56     PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length,
57                                                    const LanguageContext &ctx, PandaVM *vm, bool movable = true,
58                                                    bool pinned = false);
59 
/// UTF-16 source given as pointer plus length.
60     PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length,
61                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true,
62                                                     bool pinned = false);
63 
/// Takes no source data, only the language context and VM.
64     PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm);
65 
/// Takes an existing String as the source.
66     PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm);
67 
/// Takes two strings and returns a String.
68     PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm);
69 
/// Takes an offset/length window over `chararray`.
70     PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray,
71                                                              const LanguageContext &ctx, PandaVM *vm);
72 
/// Takes an offset/length window over `bytearray` plus a `highByte` value.
/// NOTE(review): how `highByte` is combined with each byte is not visible here - confirm.
73     static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray,
74                                             const LanguageContext &ctx, PandaVM *vm);
75 
/// Returns a 16-bit value for the given index; VERIFY defaults to true.
/// NOTE(review): presumably VERIFY toggles index validation - confirm against the definition.
76     template <bool VERIFY = true>
77     uint16_t At(int32_t index);
78 
/// Compares this string with `rstr` and returns a signed 32-bit result.
79     PANDA_PUBLIC_API int32_t Compare(String *rstr);
80 
/// Returns an Array for this string (definition not visible in this header).
81     PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx);
82 
/// Returns an Array for the window [start, start + utf16Length) of `src`
/// (window semantics inferred from the parameter names - confirm).
83     PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length,
84                                             const LanguageContext &ctx);
85 
IsUtf16()86     bool IsUtf16() const
87     {
88         return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
89     }
90 
IsMUtf8()91     bool IsMUtf8() const
92     {
93         return !IsUtf16();
94     }
95 
ComputeDataSizeUtf16(uint32_t length)96     static size_t ComputeDataSizeUtf16(uint32_t length)
97     {
98         return length * sizeof(dataUtf16_[0]);
99     }
100 
101     /// Methods for uncompressed strings (UTF16)
ComputeSizeUtf16(uint32_t utf16Length)102     static size_t ComputeSizeUtf16(uint32_t utf16Length)
103     {
104         return sizeof(String) + ComputeDataSizeUtf16(utf16Length);
105     }
106 
GetDataUtf16()107     uint16_t *GetDataUtf16()
108     {
109         ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string");
110         return dataUtf16_;
111     }
112 
113     /// Methods for compressed strings (MUTF8 or LATIN1)
ComputeSizeMUtf8(uint32_t mutf8Length)114     static size_t ComputeSizeMUtf8(uint32_t mutf8Length)
115     {
116         return sizeof(String) + mutf8Length;
117     }
118 
119     /// It's MUtf8 format, but without 0 in the end.
GetDataMUtf8()120     uint8_t *GetDataMUtf8()
121     {
122         ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string");
123         return reinterpret_cast<uint8_t *>(dataUtf16_);
124     }
125 
GetMUtf8Length()126     size_t GetMUtf8Length()
127     {
128         if (!IsUtf16()) {
129             return GetLength() + 1;  // add place for zero at the end
130         }
131         return ark::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength());
132     }
133 
GetUtf16Length()134     size_t GetUtf16Length()
135     {
136         return GetLength();
137     }
138 
CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)139     inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString)
140     {
141         if (isCString) {
142             ASSERT(maxLength != 0);
143             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
144             buf[maxLength - 1] = '\0';
145             return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1;  // add place for zero at the end
146         }
147 
148         return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength);
149     }
150 
CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)151     size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength)
152     {
153         if (length > maxLength) {
154             return 0;
155         }
156         uint32_t len = GetLength();
157         if (start + length > len) {
158             return 0;
159         }
160         if (!IsUtf16()) {
161             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
162             if (length > MAX_LEN) {
163                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
164             }
165             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
166             if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) {
167                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
168             }
169             return length;
170         }
171         return ark::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start);
172     }
173 
CopyDataUtf16(uint16_t * buf,uint32_t maxLength)174     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength)
175     {
176         return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
177     }
178 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)179     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength)
180     {
181         if (length > maxLength) {
182             return 0;
183         }
184         uint32_t len = GetLength();
185         if (start + length > len) {
186             return 0;
187         }
188         if (IsUtf16()) {
189             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
190             if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) !=
191                 EOK) {
192                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
193             }
194         } else {
195             uint8_t *src8 = GetDataMUtf8() + start;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196             for (uint32_t i = 0; i < length; ++i) {
197                 buf[i] = src8[i];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198             }
199         }
200         return length;
201     }
202 
GetLength()203     uint32_t GetLength() const
204     {
205         uint32_t length;
206         if (compressedStringsEnabled_) {
207             length = length_ >> 1U;
208         } else {
209             length = length_;
210         }
211         return length;
212     }
213 
IsEmpty()214     bool IsEmpty() const
215     {
216         // do not shift right length because it is always zero for empty string
217         return length_ == 0;
218     }
219 
ObjectSize()220     size_t ObjectSize() const
221     {
222         uint32_t length = GetLength();
223         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length);
224     }
225 
GetHashcode()226     uint32_t GetHashcode()
227     {
228         if (hashcode_ == 0) {
229             hashcode_ = ComputeHashcode();
230         }
231         return hashcode_;
232     }
233 
/// Search declarations; definitions are not visible in this header.
/// IndexOf defaults `pos` to 0, LastIndexOf defaults it to INT32_MAX.
/// NOTE(review): presumably `pos` is the index the search starts from - confirm.
234     int32_t IndexOf(String *rhs, int pos = 0);
235     int32_t LastIndexOf(String *rhs, int pos = INT32_MAX);
236 
GetLengthOffset()237     static constexpr uint32_t GetLengthOffset()
238     {
239         return MEMBER_OFFSET(String, length_);
240     }
241 
GetDataOffset()242     static constexpr uint32_t GetDataOffset()
243     {
244         return MEMBER_OFFSET(String, dataUtf16_);
245     }
246 
GetHashcodeOffset()247     static constexpr uint32_t GetHashcodeOffset()
248     {
249         return MEMBER_OFFSET(String, hashcode_);
250     }
251 
GetStringCompressionMask()252     static constexpr uint32_t GetStringCompressionMask()
253     {
254         return STRING_COMPRESSED_BIT;
255     }
256 
257     /// Compares strings by bytes; it doesn't check canonical Unicode equivalence.
258     PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2);
259     /// Compares strings by bytes; it doesn't check canonical Unicode equivalence.
260     PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length);
/// Non-exported overload that additionally takes a `canBeCompressed` flag.
261     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length,
262                                      bool canBeCompressed);
263     /// Compares strings by bytes; it doesn't check canonical Unicode equivalence.
264     PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data,
265                                                       uint32_t utf16DataLength);
/// Takes a source string plus an old and a new 16-bit character and returns a String
/// (replacement semantics inferred from the name - confirm against the definition).
266     static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm);
/// Hash helpers over raw MUTF-8 / UTF-16 data; definitions are not visible in this header.
267     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length);
268     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed);
269     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
270 
SetCompressedStringsEnabled(bool val)271     static void SetCompressedStringsEnabled(bool val)
272     {
273         compressedStringsEnabled_ = val;
274     }
275 
GetCompressedStringsEnabled()276     static bool GetCompressedStringsEnabled()
277     {
278         return compressedStringsEnabled_;
279     }
280 
NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)281     static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex,
282                                                                  const coretypes::String *str)
283     {
284         auto strLen = str->GetLength();
285         std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex};
286 
287         // If begin_index < 0, then it is assumed to be equal to zero.
288         if (normIndexes.first < 0) {
289             normIndexes.first = 0;
290         } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) {
291             // If begin_index > str_len, then it is assumed to be equal to str_len.
292             normIndexes.first = static_cast<int32_t>(strLen);
293         }
294         // If end_index < 0, then it is assumed to be equal to zero.
295         if (normIndexes.second < 0) {
296             normIndexes.second = 0;
297         } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) {
298             // If end_index > str_len, then it is assumed to be equal to str_len.
299             normIndexes.second = static_cast<int32_t>(strLen);
300         }
301         // If begin_index > end_index, then these are swapped.
302         if (normIndexes.first > normIndexes.second) {
303             std::swap(normIndexes.first, normIndexes.second);
304         }
305         ASSERT((normIndexes.second - normIndexes.first) >= 0);
306         return normIndexes;
307     }
308 
/// Takes a source string and a start/utf16Length window and returns a String; `vm` defaults to
/// nullptr. Definition not visible in this header.
309     static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx,
310                                  PandaVM *vm = nullptr);
311 
/// Public overload without a length argument.
/// NOTE(review): presumably expects zero-terminated data - confirm against the definition.
312     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data);
313 
IsASCIICharacter(uint16_t data)314     static bool IsASCIICharacter(uint16_t data)
315     {
316         // \0 is not considered ASCII in Modified-UTF8
317         return data - 1U < utf::UTF8_1B_MAX;
318     }
319 
320 protected:
321     void SetLength(uint32_t length, bool compressed = false)
322     {
323         if (compressedStringsEnabled_) {
324             ASSERT(length < 0x80000000U);
325             // Use 0u for compressed/utf8 expression
326             length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED);
327         } else {
328             length_ = length;
329         }
330     }
331 
SetHashcode(uint32_t hashcode)332     void SetHashcode(uint32_t hashcode)
333     {
334         hashcode_ = hashcode;
335     }
336 
/// Protected helpers; definitions are not visible in this header.
/// ComputeHashcode is what GetHashcode() calls to fill the hash cache.
337     uint32_t ComputeHashcode();
338     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length);
339     static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length);
/// `vm` defaults to nullptr, `movable` to true and `pinned` to false.
340     static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr,
341                                      bool movable = true, bool pinned = false);
342 
343 private:
/// Class-wide switch read by GetLength(), IsUtf16() and SetLength().
344     PANDA_PUBLIC_API static bool compressedStringsEnabled_;
/// Mask of the flag bit kept in the lowest bit of length_.
345     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
/// Values of the flag bit: the enumerators take the implicit values 0 and 1.
346     enum CompressedStatus {
347         STRING_COMPRESSED,
348         STRING_UNCOMPRESSED,
349     };
350 
/// Private compression checks; the meaning of the `non` parameter is not visible in this header.
351     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length);
352     static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non);
353     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non);
354 
355     /**
356      * Converts utf8Data to UTF-16 and compares it with the given utf16Data.
357      * Both inputs are expected to have the same length.
358      */
359     // NOTE(alovkov): move to utils/utf.h without allocating a temporary buffer
360     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data,
361                                    uint32_t utf16DataLength);
362 
/// Overload without an explicit length for utf8Data.
363     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength);
364 
365     template <typename T>
366     /// Checks that two spans are equal. They should have the same length.
367     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
368 
369     // The lowest bit of length_ stores whether this string is compressed or not.
370     uint32_t length_;
/// Cached hash; GetHashcode() treats 0 as "not computed yet".
371     uint32_t hashcode_;
372     // Zero-length array marking where the string data starts, right after the string header.
373     // Data can be stored in mutf8 or utf16 form according to the compressed bit.
374     __extension__ uint16_t dataUtf16_[0];  // NOLINT(modernize-avoid-c-arrays)
375 };
376 
// Hard-coded byte offsets of the String fields. Each static_assert compares the constant with
// the offset reported by the class, so the build fails if the layout of String changes.
377 constexpr uint32_t STRING_LENGTH_OFFSET = 8U;
378 static_assert(STRING_LENGTH_OFFSET == ark::coretypes::String::GetLengthOffset());
379 constexpr uint32_t STRING_HASHCODE_OFFSET = 12U;
380 static_assert(STRING_HASHCODE_OFFSET == ark::coretypes::String::GetHashcodeOffset());
381 constexpr uint32_t STRING_DATA_OFFSET = 16U;
382 static_assert(STRING_DATA_OFFSET == ark::coretypes::String::GetDataOffset());
383 
384 }  // namespace ark::coretypes
385 
386 #endif  // PANDA_RUNTIME_CORETYPES_STRING_H_
387