• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_
16 #define PANDA_RUNTIME_CORETYPES_STRING_H_
17 
18 #include <securec.h>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 
23 #include "libpandabase/utils/utf.h"
24 #include "runtime/include/language_context.h"
25 #include "runtime/include/object_header.h"
26 
27 namespace ark::coretypes {
28 
29 class Array;
30 class String : public ObjectHeader {
31 public:
Cast(ObjectHeader * object)32     static String *Cast(ObjectHeader *object)
33     {
34         // NOTE(linxiang) to do assert
35         return static_cast<String *>(object);
36     }
37 
38     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length,
39                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
40                                                     bool movable = true, bool pinned = false);
41 
42     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
43                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
44                                                     bool movable = true, bool pinned = false);
45 
46     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length,
47                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true,
48                                                     bool pinned = false);
49 
50     PANDA_PUBLIC_API static String *CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm,
51                                                     bool movable = true, bool pinned = false);
52 
53     static String *CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length,
54                                    const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned);
55 
56     PANDA_PUBLIC_API static String *CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length,
57                                                    const LanguageContext &ctx, PandaVM *vm, bool movable = true,
58                                                    bool pinned = false);
59 
60     PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length,
61                                                     bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm,
62                                                     bool movable = true, bool pinned = false);
63 
64     PANDA_PUBLIC_API static String *CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length,
65                                                     const LanguageContext &ctx, PandaVM *vm, bool movable = true,
66                                                     bool pinned = false);
67 
68     PANDA_PUBLIC_API static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm);
69 
70     PANDA_PUBLIC_API static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm);
71 
72     PANDA_PUBLIC_API static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm);
73 
74     PANDA_PUBLIC_API static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray,
75                                                              const LanguageContext &ctx, PandaVM *vm);
76 
77     static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray,
78                                             const LanguageContext &ctx, PandaVM *vm);
79 
80     template <bool VERIFY = true>
81     uint16_t At(int32_t index);
82 
83     PANDA_PUBLIC_API int32_t Compare(String *rstr);
84 
85     PANDA_PUBLIC_API Array *ToCharArray(const LanguageContext &ctx);
86 
87     PANDA_PUBLIC_API static Array *GetChars(String *src, uint32_t start, uint32_t utf16Length,
88                                             const LanguageContext &ctx);
89 
IsUtf16()90     bool IsUtf16() const
91     {
92         return compressedStringsEnabled_ ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
93     }
94 
IsMUtf8()95     bool IsMUtf8() const
96     {
97         return !IsUtf16();
98     }
99 
ComputeDataSizeUtf16(uint32_t length)100     static size_t ComputeDataSizeUtf16(uint32_t length)
101     {
102         return length * sizeof(dataUtf16_[0]);
103     }
104 
105     /// Methods for uncompressed strings (UTF16)
ComputeSizeUtf16(uint32_t utf16Length)106     static size_t ComputeSizeUtf16(uint32_t utf16Length)
107     {
108         return sizeof(String) + ComputeDataSizeUtf16(utf16Length);
109     }
110 
GetDataUtf16()111     uint16_t *GetDataUtf16()
112     {
113         ASSERT_PRINT(IsUtf16(), "String: Read data as utf16 for mutf8 string");
114         return dataUtf16_;
115     }
116 
117     /// Methods for compresses strings (MUTF8 or LATIN1)
ComputeSizeMUtf8(uint32_t mutf8Length)118     static size_t ComputeSizeMUtf8(uint32_t mutf8Length)
119     {
120         return sizeof(String) + mutf8Length;
121     }
122 
GetDataUtf8()123     uint8_t *GetDataUtf8()
124     {
125         ASSERT_PRINT(!IsUtf16(), "String: Read data as utf8 for utf16 string");
126         return reinterpret_cast<uint8_t *>(dataUtf16_);
127     }
128 
129     /// It's MUtf8 format, but without 0 in the end.
GetDataMUtf8()130     uint8_t *GetDataMUtf8()
131     {
132         ASSERT_PRINT(!IsUtf16(), "String: Read data as mutf8 for utf16 string");
133         return reinterpret_cast<uint8_t *>(dataUtf16_);
134     }
135 
GetMUtf8Length()136     size_t GetMUtf8Length()
137     {
138         if (!IsUtf16()) {
139             return GetLength() + 1;  // add place for zero at the end
140         }
141         return ark::utf::Utf16ToMUtf8Size(dataUtf16_, GetLength());
142     }
143 
GetUtf16Length()144     size_t GetUtf16Length()
145     {
146         return GetLength();
147     }
148 
GetUtf8Length()149     size_t GetUtf8Length()
150     {
151         if (!IsUtf16()) {
152             return GetLength();
153         }
154         return ark::utf::Utf16ToUtf8Size(dataUtf16_, GetLength(), false) - 1;
155     }
156 
CopyDataRegionUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)157     size_t CopyDataRegionUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength)
158     {
159         if (length > maxLength) {
160             return 0;
161         }
162         uint32_t len = GetUtf8Length();
163         if (start + length > len) {
164             return 0;
165         }
166         if (!IsUtf16()) {
167             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
168             if (length > MAX_LEN) {
169                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
170             }
171             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
172             if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataUtf8() + start, length) != EOK) {
173                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
174             }
175             return length;
176         }
177         length = GetUtf16Length();
178         return ark::utf::ConvertRegionUtf16ToUtf8(GetDataUtf16(), buf, length, maxLength, start, false);
179     }
180 
CopyDataMUtf8(uint8_t * buf,size_t maxLength,bool isCString)181     inline size_t CopyDataMUtf8(uint8_t *buf, size_t maxLength, bool isCString)
182     {
183         if (isCString) {
184             ASSERT(maxLength != 0);
185             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
186             buf[maxLength - 1] = '\0';
187             return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength) + 1;  // add place for zero at the end
188         }
189 
190         return CopyDataRegionMUtf8(buf, 0, GetLength(), maxLength);
191     }
192 
CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t maxLength)193     size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t maxLength)
194     {
195         if (length > maxLength) {
196             return 0;
197         }
198         uint32_t len = GetLength();
199         if (start + length > len) {
200             return 0;
201         }
202         if (!IsUtf16()) {
203             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
204             if (length > MAX_LEN) {
205                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
206             }
207             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
208             if (memcpy_s(buf, sizeof(uint8_t) * (maxLength + 1), GetDataMUtf8() + start, length) != EOK) {
209                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
210             }
211             return length;
212         }
213         return ark::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, maxLength - 1, start);
214     }
215 
CopyDataUtf16(uint16_t * buf,uint32_t maxLength)216     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t maxLength)
217     {
218         return CopyDataRegionUtf16(buf, 0, GetLength(), maxLength);
219     }
220 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t maxLength)221     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t maxLength)
222     {
223         if (length > maxLength) {
224             return 0;
225         }
226         uint32_t len = GetLength();
227         if (start + length > len) {
228             return 0;
229         }
230         if (IsUtf16()) {
231             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
232             if (memcpy_s(buf, sizeof(uint16_t) * maxLength, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) !=
233                 EOK) {
234                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
235             }
236         } else {
237             uint8_t *src8 = GetDataMUtf8() + start;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
238             for (uint32_t i = 0; i < length; ++i) {
239                 buf[i] = src8[i];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240             }
241         }
242         return length;
243     }
244 
GetLength()245     uint32_t GetLength() const
246     {
247         uint32_t length;
248         if (compressedStringsEnabled_) {
249             length = length_ >> 1U;
250         } else {
251             length = length_;
252         }
253         return length;
254     }
255 
IsEmpty()256     bool IsEmpty() const
257     {
258         // do not shift right length because it is always zero for empty string
259         return length_ == 0;
260     }
261 
ObjectSize()262     size_t ObjectSize() const
263     {
264         uint32_t length = GetLength();
265         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length);
266     }
267 
GetHashcode()268     uint32_t GetHashcode()
269     {
270         if (hashcode_ == 0) {
271             hashcode_ = ComputeHashcode();
272         }
273         return hashcode_;
274     }
275 
276     int32_t IndexOf(String *rhs, int pos = 0);
277     int32_t LastIndexOf(String *rhs, int pos = INT32_MAX);
278 
GetLengthOffset()279     static constexpr uint32_t GetLengthOffset()
280     {
281         return MEMBER_OFFSET(String, length_);
282     }
283 
GetDataOffset()284     static constexpr uint32_t GetDataOffset()
285     {
286         return MEMBER_OFFSET(String, dataUtf16_);
287     }
288 
GetHashcodeOffset()289     static constexpr uint32_t GetHashcodeOffset()
290     {
291         return MEMBER_OFFSET(String, hashcode_);
292     }
293 
GetStringCompressionMask()294     static constexpr uint32_t GetStringCompressionMask()
295     {
296         return STRING_COMPRESSED_BIT;
297     }
298 
299     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
300     PANDA_PUBLIC_API static bool StringsAreEqual(String *str1, String *str2);
301     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
302     PANDA_PUBLIC_API static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length);
303     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length,
304                                      bool canBeCompressed);
305     /// Compares strings by bytes, It doesn't check canonical unicode equivalence.
306     PANDA_PUBLIC_API static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data,
307                                                       uint32_t utf16DataLength);
308     static String *DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm);
309     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t length);
310     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed);
311     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length);
312 
SetCompressedStringsEnabled(bool val)313     static void SetCompressedStringsEnabled(bool val)
314     {
315         compressedStringsEnabled_ = val;
316     }
317 
GetCompressedStringsEnabled()318     static bool GetCompressedStringsEnabled()
319     {
320         return compressedStringsEnabled_;
321     }
322 
NormalizeSubStringIndexes(int32_t beginIndex,int32_t endIndex,const coretypes::String * str)323     static std::pair<int32_t, int32_t> NormalizeSubStringIndexes(int32_t beginIndex, int32_t endIndex,
324                                                                  const coretypes::String *str)
325     {
326         auto strLen = str->GetLength();
327         std::pair<int32_t, int32_t> normIndexes = {beginIndex, endIndex};
328 
329         // If begin_index < 0, then it is assumed to be equal to zero.
330         if (normIndexes.first < 0) {
331             normIndexes.first = 0;
332         } else if (static_cast<decltype(strLen)>(normIndexes.first) > strLen) {
333             // If begin_index > str_len, then it is assumed to be equal to str_len.
334             normIndexes.first = static_cast<int32_t>(strLen);
335         }
336         // If end_index < 0, then it is assumed to be equal to zero.
337         if (normIndexes.second < 0) {
338             normIndexes.second = 0;
339         } else if (static_cast<decltype(strLen)>(normIndexes.second) > strLen) {
340             // If end_index > str_len, then it is assumed to be equal to str_len.
341             normIndexes.second = static_cast<int32_t>(strLen);
342         }
343         // If begin_index > end_index, then these are swapped.
344         if (normIndexes.first > normIndexes.second) {
345             std::swap(normIndexes.first, normIndexes.second);
346         }
347         ASSERT((normIndexes.second - normIndexes.first) >= 0);
348         return normIndexes;
349     }
350 
351     static String *FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx,
352                                  PandaVM *vm = nullptr);
353 
354     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data);
355 
IsASCIICharacter(uint16_t data)356     static bool IsASCIICharacter(uint16_t data)
357     {
358         // \0 is not considered ASCII in Modified-UTF8
359         return data - 1U < utf::UTF8_1B_MAX;
360     }
361 
362 protected:
363     void SetLength(uint32_t length, bool compressed = false)
364     {
365         if (compressedStringsEnabled_) {
366             ASSERT(length < 0x80000000U);
367             // Use 0u for compressed/utf8 expression
368             length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED);
369         } else {
370             length_ = length;
371         }
372     }
373 
SetHashcode(uint32_t hashcode)374     void SetHashcode(uint32_t hashcode)
375     {
376         hashcode_ = hashcode;
377     }
378 
379     uint32_t ComputeHashcode();
380     static bool CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length);
381     static void CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length);
382     static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr,
383                                      bool movable = true, bool pinned = false);
384 
385 private:
386     PANDA_PUBLIC_API static bool compressedStringsEnabled_;
387     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
388     enum CompressedStatus {
389         STRING_COMPRESSED,
390         STRING_UNCOMPRESSED,
391     };
392 
393     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length);
394     static bool CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non);
395     static bool CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non);
396 
397     /**
398      * str1 should have the same length as mutf16_data.
399      * Converts mutf8_data to mutf16 and compare it with given mutf16_data.
400      */
401     // NOTE(alovkov): move to utils/utf.h without allocation a temporary buffer
402     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data,
403                                    uint32_t utf16DataLength);
404 
405     static bool IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength);
406 
407     template <typename T>
408     /// Check that two spans are equal. Should have the same length.
409     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
410 
411     // In last bit of length_ we store if this string is compressed or not.
412     uint32_t length_;
413     uint32_t hashcode_;
414     // A pointer to the string data stored after the string header.
415     // Data can be stored in mutf8 or utf16 form according to compressed bit.
416     __extension__ uint16_t dataUtf16_[0];  // NOLINT(modernize-avoid-c-arrays)
417 };
418 
419 constexpr uint32_t STRING_LENGTH_OFFSET = sizeof(ObjectHeader);
420 static_assert(STRING_LENGTH_OFFSET == ark::coretypes::String::GetLengthOffset());
421 constexpr uint32_t STRING_HASHCODE_OFFSET = STRING_LENGTH_OFFSET + sizeof(uint32_t);
422 static_assert(STRING_HASHCODE_OFFSET == ark::coretypes::String::GetHashcodeOffset());
423 constexpr uint32_t STRING_DATA_OFFSET = STRING_HASHCODE_OFFSET + sizeof(uint32_t);
424 static_assert(STRING_DATA_OFFSET == ark::coretypes::String::GetDataOffset());
425 
426 }  // namespace ark::coretypes
427 
428 #endif  // PANDA_RUNTIME_CORETYPES_STRING_H_
429