• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 #ifndef PANDA_RUNTIME_CORETYPES_STRING_H_
16 #define PANDA_RUNTIME_CORETYPES_STRING_H_
17 
18 #include <securec.h>
19 #include <cstddef>
20 #include <cstdint>
21 #include <cstring>
22 
23 #include "libpandabase/utils/utf.h"
24 #include "runtime/include/language_context.h"
25 #include "runtime/include/object_header.h"
26 #include "runtime/mem/vm_handle.h"
27 
28 namespace panda::coretypes {
29 
30 class Array;
31 class String : public ObjectHeader {
32 public:
Cast(ObjectHeader * object)33     static String *Cast(ObjectHeader *object)
34     {
35         // TODO(linxiang) to do assert
36         return static_cast<String *>(object);
37     }
38 
39     static String *CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length,
40                                    bool can_be_compressed, const LanguageContext &ctx, PandaVM *vm,
41                                    bool movable = true);
42 
43     static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed,
44                                    const LanguageContext &ctx, PandaVM *vm, bool movable = true);
45 
46     static String *CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, const LanguageContext &ctx,
47                                    PandaVM *vm, bool movable = true);
48 
49     static String *CreateFromMUtf8(const uint8_t *mutf8_data, const LanguageContext &ctx, PandaVM *vm,
50                                    bool movable = true);
51 
52     static String *CreateFromUtf16(const uint16_t *utf16_data, uint32_t utf16_length, const LanguageContext &ctx,
53                                    PandaVM *vm, bool movable = true);
54 
55     static String *CreateEmptyString(const LanguageContext &ctx, PandaVM *vm);
56 
57     static String *CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm);
58 
59     static String *Concat(String *jstring1, String *jstring2, const LanguageContext &ctx, PandaVM *vm);
60 
61     static String *CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray,
62                                             const LanguageContext &ctx, PandaVM *vm);
63 
64     static String *CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t high_byte, Array *bytearray,
65                                             const LanguageContext &ctx, PandaVM *vm);
66 
67     template <bool verify = true>
68     uint16_t At(int32_t index);
69 
70     int32_t Compare(String *rstr);
71 
72     Array *ToCharArray(const LanguageContext &ctx);
73 
IsUtf16()74     bool IsUtf16() const
75     {
76         return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_UNCOMPRESSED) : true;
77     }
78 
IsMUtf8()79     bool IsMUtf8() const
80     {
81         return compressed_strings_enabled ? ((length_ & STRING_COMPRESSED_BIT) == STRING_COMPRESSED) : true;
82     }
83 
ComputeDataSizeUtf16(uint32_t length)84     static size_t ComputeDataSizeUtf16(uint32_t length)
85     {
86         return length * sizeof(data_utf16_[0]);
87     }
88 
89     /**
90      * Methods for uncompressed strings (UTF16)
91      */
ComputeSizeUtf16(uint32_t utf16_length)92     static size_t ComputeSizeUtf16(uint32_t utf16_length)
93     {
94         return sizeof(String) + ComputeDataSizeUtf16(utf16_length);
95     }
96 
GetDataUtf16()97     uint16_t *GetDataUtf16()
98     {
99         LOG_IF(!IsUtf16(), FATAL, RUNTIME) << "String: Read data as utf16 for mutf8 string";
100         return data_utf16_;
101     }
102 
103     /**
104      * Methods for compresses strings (MUTF8 or LATIN1)
105      */
ComputeSizeMUtf8(uint32_t mutf8_length)106     static size_t ComputeSizeMUtf8(uint32_t mutf8_length)
107     {
108         return sizeof(String) + mutf8_length;
109     }
110 
111     /**
112      * It's MUtf8 format, but without 0 in the end.
113      */
GetDataMUtf8()114     uint8_t *GetDataMUtf8()
115     {
116         LOG_IF(IsUtf16(), FATAL, RUNTIME) << "String: Read data as mutf8 for utf16 string";
117         return reinterpret_cast<uint8_t *>(data_utf16_);
118     }
119 
GetMUtf8Length()120     size_t GetMUtf8Length()
121     {
122         if (!IsUtf16()) {
123             return GetLength() + 1;  // add place for zero at the end
124         }
125         return panda::utf::Utf16ToMUtf8Size(data_utf16_, GetLength());
126     }
127 
GetUtf16Length()128     size_t GetUtf16Length()
129     {
130         return GetLength();
131     }
132 
CopyDataMUtf8(uint8_t * buf,size_t max_length,bool is_c_string)133     inline size_t CopyDataMUtf8(uint8_t *buf, size_t max_length, bool is_c_string)
134     {
135         if (is_c_string) {
136             ASSERT(max_length != 0);
137             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
138             buf[max_length - 1] = '\0';
139             return CopyDataRegionMUtf8(buf, 0, GetLength(), max_length) + 1;  // add place for zero at the end
140         }
141 
142         return CopyDataRegionMUtf8(buf, 0, GetLength(), max_length);
143     }
144 
CopyDataRegionMUtf8(uint8_t * buf,size_t start,size_t length,size_t max_length)145     size_t CopyDataRegionMUtf8(uint8_t *buf, size_t start, size_t length, size_t max_length)
146     {
147         if (length > max_length) {
148             return 0;
149         }
150         uint32_t len = GetLength();
151         if (start + length > len) {
152             return 0;
153         }
154         if (!IsUtf16()) {
155             constexpr size_t MAX_LEN = std::numeric_limits<size_t>::max() / 2 - 1;
156             if (length > MAX_LEN) {
157                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than half of size_t::max";
158             }
159             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
160             if (memcpy_s(buf, sizeof(uint8_t) * (max_length + 1), GetDataMUtf8() + start, length) != EOK) {
161                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
162             }
163             return length;
164         }
165         return panda::utf::ConvertRegionUtf16ToMUtf8(GetDataUtf16(), buf, length, max_length - 1, start);
166     }
167 
CopyDataUtf16(uint16_t * buf,uint32_t max_length)168     inline uint32_t CopyDataUtf16(uint16_t *buf, uint32_t max_length)
169     {
170         return CopyDataRegionUtf16(buf, 0, GetLength(), max_length);
171     }
172 
CopyDataRegionUtf16(uint16_t * buf,uint32_t start,uint32_t length,uint32_t max_length)173     uint32_t CopyDataRegionUtf16(uint16_t *buf, uint32_t start, uint32_t length, uint32_t max_length)
174     {
175         if (length > max_length) {
176             return 0;
177         }
178         uint32_t len = GetLength();
179         if (start + length > len) {
180             return 0;
181         }
182         if (IsUtf16()) {
183             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
184             if (memcpy_s(buf, sizeof(uint16_t) * max_length, GetDataUtf16() + start, ComputeDataSizeUtf16(length)) !=
185                 EOK) {
186                 LOG(FATAL, RUNTIME) << __func__ << " length is higher than buf size";
187             }
188         } else {
189             uint8_t *src_8 = GetDataMUtf8() + start;  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
190             for (uint32_t i = 0; i < length; ++i) {
191                 buf[i] = src_8[i];  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192             }
193         }
194         return length;
195     }
196 
GetLength()197     uint32_t GetLength() const
198     {
199         uint32_t length;
200         if (compressed_strings_enabled) {
201             length = length_ >> 1U;
202         } else {
203             length = length_;
204         }
205         return length;
206     }
207 
IsEmpty()208     bool IsEmpty() const
209     {
210         // do not shift right length because it is always zero for empty string
211         return length_ == 0;
212     }
213 
ObjectSize()214     size_t ObjectSize() const
215     {
216         uint32_t length = GetLength();
217         return IsUtf16() ? ComputeSizeUtf16(length) : ComputeSizeMUtf8(length);
218     }
219 
GetHashcode()220     uint32_t GetHashcode()
221     {
222         if (hashcode_ == 0) {
223             hashcode_ = ComputeHashcode();
224         }
225         return hashcode_;
226     }
227 
228     int32_t IndexOf(String *rhs, int pos = 0);
229 
GetLengthOffset()230     static constexpr uint32_t GetLengthOffset()
231     {
232         return MEMBER_OFFSET(String, length_);
233     }
234 
GetDataOffset()235     static constexpr uint32_t GetDataOffset()
236     {
237         return MEMBER_OFFSET(String, data_utf16_);
238     }
239 
GetStringCompressionMask()240     static constexpr uint32_t GetStringCompressionMask()
241     {
242         return STRING_COMPRESSED_BIT;
243     }
244 
245     /**
246      * Compares strings by bytes, It doesn't check canonical unicode equivalence.
247      */
248     static bool StringsAreEqual(String *str1, String *str2);
249     /**
250      * Compares strings by bytes, It doesn't check canonical unicode equivalence.
251      */
252     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length);
253     static bool StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length,
254                                      bool can_be_compressed);
255     /**
256      * Compares strings by bytes, It doesn't check canonical unicode equivalence.
257      */
258     static bool StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length);
259     static String *DoReplace(String *src, uint16_t old_c, uint16_t new_c, const LanguageContext &ctx, PandaVM *vm);
260     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t length);
261     static uint32_t ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed);
262     static uint32_t ComputeHashcodeUtf16(const uint16_t *utf16_data, uint32_t length);
263 
SetCompressedStringsEnabled(bool val)264     static void SetCompressedStringsEnabled(bool val)
265     {
266         compressed_strings_enabled = val;
267     }
268 
GetCompressedStringsEnabled()269     static bool GetCompressedStringsEnabled()
270     {
271         return compressed_strings_enabled;
272     }
273 
274     static String *FastSubString(String *src, uint32_t start, uint32_t utf16_length, const LanguageContext &ctx,
275                                  PandaVM *vm = nullptr);
276 
277     static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data);
278 
279 protected:
280     void SetLength(uint32_t length, bool compressed = false)
281     {
282         if (compressed_strings_enabled) {
283             ASSERT(length < 0x80000000U);
284             // Use 0u for compressed/utf8 expression
285             length_ = (length << 1U) | (compressed ? STRING_COMPRESSED : STRING_UNCOMPRESSED);
286         } else {
287             length_ = length;
288         }
289     }
290 
SetHashcode(uint32_t hashcode)291     void SetHashcode(uint32_t hashcode)
292     {
293         hashcode_ = hashcode;
294     }
295 
296     uint32_t ComputeHashcode();
297     static bool CanBeCompressed(const uint16_t *utf16_data, uint32_t utf16_length);
298     static void CopyUtf16AsMUtf8(const uint16_t *utf16_from, uint8_t *mutf8_to, uint32_t utf16_length);
299 
300 private:
301     static bool compressed_strings_enabled;
302     static constexpr uint32_t STRING_COMPRESSED_BIT = 0x1;
303     enum CompressedStatus {
304         STRING_COMPRESSED,
305         STRING_UNCOMPRESSED,
306     };
307 
IsASCIICharacter(uint16_t data)308     static bool IsASCIICharacter(uint16_t data)
309     {
310         // \0 is not considered ASCII in Modified-UTF8
311         return data - 1U < utf::MUTF8_1B_MAX;
312     }
313 
314     static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length);
315     static bool CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non);
316     static bool CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length, uint16_t non);
317 
318     /**
319      * str1 should have the same length as mutf16_data.
320      * Converts mutf8_data to mutf16 and compare it with given mutf16_data.
321      */
322     // TODO(alovkov): move to utils/utf.h without allocation a temporary buffer
323     static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, uint32_t utf8_data_length, const uint16_t *utf16_data,
324                                    uint32_t utf16_data_length);
325 
326     static bool IsMutf8EqualsUtf16(const uint8_t *utf8_data, const uint16_t *utf16_data, uint32_t utf16_data_length);
327 
328     template <typename T>
329     /**
330      * Check that two spans are equal. Should have the same length.
331      */
332     static bool StringsAreEquals(Span<const T> &str1, Span<const T> &str2);
333 
334     template <typename T1, typename T2>
335     static int32_t IndexOf(Span<const T1> &lhs_sp, Span<const T2> &rhs_sp, int32_t pos, int32_t max);
336 
337     static String *AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm = nullptr,
338                                      bool movable = true);
339 
340     // In last bit of length_ we store if this string is compressed or not.
341     uint32_t length_;
342     uint32_t hashcode_;
343     // A pointer to the string data stored after the string header.
344     // Data can be stored in mutf8 or utf16 form according to compressed bit.
345     __extension__ uint16_t data_utf16_[0];  // NOLINT(modernize-avoid-c-arrays)
346 };
347 
348 constexpr uint32_t STRING_LENGTH_OFFSET = 8U;
349 static_assert(STRING_LENGTH_OFFSET == panda::coretypes::String::GetLengthOffset());
350 constexpr uint32_t STRING_DATA_OFFSET = 16U;
351 static_assert(STRING_DATA_OFFSET == panda::coretypes::String::GetDataOffset());
352 
353 }  // namespace panda::coretypes
354 
355 #endif  // PANDA_RUNTIME_CORETYPES_STRING_H_
356