• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include <cstddef>
17 #include <cstring>
18 #include <limits>
19 
20 #include "libpandabase/utils/utf.h"
21 #include "libpandabase/utils/hash.h"
22 #include "libpandabase/utils/span.h"
23 #include "runtime/arch/memory_helpers.h"
24 #include "runtime/include/coretypes/array.h"
25 #include "runtime/include/coretypes/string-inl.h"
26 #include "runtime/include/runtime.h"
27 #include "runtime/handle_base-inl.h"
28 #include "runtime/include/panda_vm.h"
29 
30 namespace ark::coretypes {
31 
// Static flag checked first by the CanBeCompressed* helpers in this file: when it is false they
// return false without looking at the data. It is initialised to true here.
bool String::compressedStringsEnabled_ = true;
33 
34 /* static */
CreateFromString(String * str,const LanguageContext & ctx,PandaVM * vm)35 String *String::CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm)
36 {
37     ASSERT(str != nullptr);
38     // allocator may trig gc and move str, need to hold it
39     auto thread = ManagedThread::GetCurrent();
40     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
41     VMHandle<String> strHandle(thread, str);
42     auto string = AllocStringObject(strHandle->GetLength(), !strHandle->IsUtf16(), ctx, vm);
43     if (string == nullptr) {
44         return nullptr;
45     }
46 
47     // retrive str after gc
48     str = strHandle.GetPtr();
49     string->hashcode_ = str->hashcode_;
50 
51     uint32_t length = str->GetLength();
52     // After memcpy we should have a full barrier, so this writes should happen-before barrier
53     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
54     if (str->IsUtf16()) {
55         memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), str->GetDataUtf16(),
56                  ComputeDataSizeUtf16(length));
57     } else {
58         memcpy_s(string->GetDataMUtf8(), string->GetLength(), str->GetDataMUtf8(), length);
59     }
60     TSAN_ANNOTATE_IGNORE_WRITES_END();
61     // String is supposed to be a constant object, so all its data should be visible by all threads
62     arch::FullMemoryBarrier();
63 
64     return string;
65 }
66 
67 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,size_t mutf8Length,uint32_t utf16Length,bool canBeCompressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)68 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length,
69                                 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, bool movable,
70                                 bool pinned)
71 {
72     auto string = AllocStringObject(utf16Length, canBeCompressed, ctx, vm, movable, pinned);
73     if (string == nullptr) {
74         return nullptr;
75     }
76 
77     ASSERT(string->hashcode_ == 0);
78     // After copying we should have a full barrier, so this writes should happen-before barrier
79     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
80     if (canBeCompressed) {
81         memcpy_s(string->GetDataMUtf8(), string->GetLength(), mutf8Data, utf16Length);
82     } else {
83         utf::ConvertMUtf8ToUtf16(mutf8Data, mutf8Length, string->GetDataUtf16());
84     }
85     TSAN_ANNOTATE_IGNORE_WRITES_END();
86     // String is supposed to be a constant object, so all its data should be visible by all threads
87     arch::FullMemoryBarrier();
88     return string;
89 }
90 
91 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)92 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, const LanguageContext &ctx, PandaVM *vm,
93                                 bool movable, bool pinned)
94 {
95     bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
96     auto mutf8Length = utf::Mutf8Size(mutf8Data);
97     ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length));
98     return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
99 }
100 
101 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,uint32_t utf16Length,bool canBeCompressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)102 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed,
103                                 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned)
104 {
105     auto mutf8Length = utf::Mutf8Size(mutf8Data);
106     ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length));
107     ASSERT(canBeCompressed == CanBeCompressedMUtf8(mutf8Data));
108     return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
109 }
110 
111 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)112 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm, bool movable,
113                                 bool pinned)
114 {
115     size_t mutf8Length = utf::Mutf8Size(mutf8Data);
116     size_t utf16Length = utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length);
117     bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
118     return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
119 }
120 
121 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,uint32_t mutf8Length,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)122 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length,
123                                 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned)
124 {
125     ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length));
126     auto canBeCompressed = CanBeCompressedMUtf8(mutf8Data, mutf8Length);
127     return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
128 }
129 
130 /* static */
CreateFromUtf8(const uint8_t * utf8Data,uint32_t utf8Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)131 String *String::CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length, const LanguageContext &ctx, PandaVM *vm,
132                                bool movable, bool pinned)
133 {
134     coretypes::String *s = nullptr;
135     auto utf16Length = utf::Utf8ToUtf16Size(utf8Data, utf8Length);
136     if (CanBeCompressedMUtf8(utf8Data, utf8Length)) {
137         // ascii string have equal representation in utf8 and mutf8 formats
138         s = coretypes::String::CreateFromMUtf8(utf8Data, utf8Length, utf16Length, true, ctx, vm, movable, pinned);
139     } else {
140         PandaVector<uint16_t> tmpBuffer(utf16Length);
141         [[maybe_unused]] auto len =
142             utf::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Length, utf16Length, 0);
143         ASSERT(len == utf16Length);
144         s = coretypes::String::CreateFromUtf16(tmpBuffer.data(), utf16Length, ctx, vm, movable, pinned);
145     }
146     return s;
147 }
148 
149 /* static */
CreateFromUtf16(const uint16_t * utf16Data,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)150 String *String::CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, const LanguageContext &ctx,
151                                 PandaVM *vm, bool movable, bool pinned)
152 {
153     bool canBeCompressed = CanBeCompressed(utf16Data, utf16Length);
154     auto string = AllocStringObject(utf16Length, canBeCompressed, ctx, vm, movable, pinned);
155     if (string == nullptr) {
156         return nullptr;
157     }
158 
159     ASSERT(string->hashcode_ == 0);
160     // After copying we should have a full barrier, so this writes should happen-before barrier
161     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
162     if (canBeCompressed) {
163         CopyUtf16AsMUtf8(utf16Data, string->GetDataMUtf8(), utf16Length);
164     } else {
165         memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), utf16Data, utf16Length << 1UL);
166     }
167     TSAN_ANNOTATE_IGNORE_WRITES_END();
168     // String is supposed to be a constant object, so all its data should be visible by all threads
169     arch::FullMemoryBarrier();
170     return string;
171 }
172 
173 /* static */
CreateEmptyString(const LanguageContext & ctx,PandaVM * vm)174 String *String::CreateEmptyString(const LanguageContext &ctx, PandaVM *vm)
175 {
176     uint16_t data = 0;
177     return CreateFromUtf16(&data, 0, ctx, vm);
178 }
179 
180 /* static */
CopyUtf16AsMUtf8(const uint16_t * utf16From,uint8_t * mutf8To,uint32_t utf16Length)181 void String::CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length)
182 {
183     Span<const uint16_t> from(utf16From, utf16Length);
184     Span<uint8_t> to(mutf8To, utf16Length);
185     for (uint32_t i = 0; i < utf16Length; i++) {
186         to[i] = from[i];
187     }
188 }
189 
190 // static
/**
 * Creates a new string from `length` 16-bit elements of `chararray`, starting at element `offset`
 * (the byte offset into the array data is `offset << 1`).
 * The result is compressed when CanBeCompressed accepts the source range.
 * @return the new string, or nullptr when AllocStringObject fails.
 */
String *String::CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, const LanguageContext &ctx,
                                         PandaVM *vm)
{
    ASSERT(chararray != nullptr);
    // The allocator may trigger a GC and move the array, so hold it in a handle
    auto thread = ManagedThread::GetCurrent();
    [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
    VMHandle<Array> arrayHandle(thread, chararray);

    // There is a potential data race between the read of src in CanBeCompressed and the write of the
    // destination buffer in CopyDataRegionUtf16. The src is a cast from chararray coming from a managed object.
    // Hence the race is reported on the managed object, which is synchronized at a higher level.
    // TSAN does not see such synchronization, thus we ignore such races here.
    TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
    // NOLINTNEXTLINE(readability-identifier-naming)
    const uint16_t *src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(chararray->GetData()) + (offset << 1UL));
    bool canBeCompressed = CanBeCompressed(src, length);
    TSAN_ANNOTATE_IGNORE_WRITES_END();
    auto string = AllocStringObject(length, canBeCompressed, ctx, vm);
    if (string == nullptr) {
        return nullptr;
    }

    // Recompute src through the handle, since the GC may have moved the array during allocation
    src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(arrayHandle->GetData()) + (offset << 1UL));
    ASSERT(string->hashcode_ == 0);
    // After copying we should have a full barrier, so these writes should happen-before the barrier
    TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
    if (canBeCompressed) {
        // Narrow every code unit into the one-byte payload
        CopyUtf16AsMUtf8(src, string->GetDataMUtf8(), length);
    } else {
        // Copy `length` code units, i.e. `length << 1` bytes, unchanged
        memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), src, length << 1UL);
    }
    TSAN_ANNOTATE_IGNORE_WRITES_END();
    // String is supposed to be a constant object, so all its data should be visible to all threads
    arch::FullMemoryBarrier();
    return string;
}
229 
230 // static
CreateNewStringFromBytes(uint32_t offset,uint32_t length,uint32_t highByte,Array * bytearray,const LanguageContext & ctx,PandaVM * vm)231 String *String::CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray,
232                                          const LanguageContext &ctx, PandaVM *vm)
233 {
234     ASSERT(length != 0);
235     ASSERT(bytearray != nullptr);
236     // allocator may trig gc and move array, need to hold it
237     auto thread = ManagedThread::GetCurrent();
238     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
239     VMHandle<Array> arrayHandle(thread, bytearray);
240 
241     constexpr size_t BYTE_MASK = 0xFF;
242 
243     // NOLINTNEXTLINE(readability-identifier-naming)
244     const uint8_t *src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(bytearray->GetData()) + offset);
245     highByte &= BYTE_MASK;
246     bool canBeCompressed = CanBeCompressedMUtf8(src, length) && (highByte == 0);
247     auto string = AllocStringObject(length, canBeCompressed, ctx, vm);
248     if (string == nullptr) {
249         return nullptr;
250     }
251 
252     // retrieve src since gc may move it
253     src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(arrayHandle->GetData()) + offset);
254     ASSERT(string->hashcode_ == 0);
255     // After copying we should have a full barrier, so this writes should happen-before barrier
256     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
257     if (canBeCompressed) {
258         Span<const uint8_t> from(src, length);
259         Span<uint8_t> to(string->GetDataMUtf8(), length);
260         for (uint32_t i = 0; i < length; ++i) {
261             to[i] = (from[i] & BYTE_MASK);
262         }
263     } else {
264         Span<const uint8_t> from(src, length);
265         Span<uint16_t> to(string->GetDataUtf16(), length);
266         for (uint32_t i = 0; i < length; ++i) {
267             to[i] = (highByte << 8U) + (from[i] & BYTE_MASK);
268         }
269     }
270     TSAN_ANNOTATE_IGNORE_WRITES_END();
271 
272     // String is supposed to be a constant object, so all its data should be visible by all threads
273     arch::FullMemoryBarrier();
274     return string;
275 }
276 
277 template <typename T1, typename T2>
CompareStringSpan(Span<T1> & lhsSp,Span<T2> & rhsSp,int32_t count)278 int32_t CompareStringSpan(Span<T1> &lhsSp, Span<T2> &rhsSp, int32_t count)
279 {
280     for (int32_t i = 0; i < count; ++i) {
281         int32_t charDiff = static_cast<int32_t>(lhsSp[i]) - static_cast<int32_t>(rhsSp[i]);
282         if (charDiff != 0) {
283             return charDiff;
284         }
285     }
286     return 0;
287 }
288 
/**
 * Compares the first `minCount` elements of two arrays of the same element type.
 * The common prefix is skipped one machine word (sizeof(size_t) bytes) at a time; the first word
 * that differs, and any tail shorter than a word, is then compared element by element.
 *
 * The words are loaded with std::memcpy instead of dereferencing the element pointers as `size_t *`,
 * so the function does not depend on the alignment of the inputs and does not break aliasing rules.
 * Progress is counted in elements, so no byte count has to fit into int32_t.
 *
 * @param lstrPt   left-hand data, at least `minCount` elements
 * @param rstrPt   right-hand data, at least `minCount` elements
 * @param minCount number of elements to compare; values <= 0 compare nothing
 * @return the difference (lhs - rhs) of the first pair of elements that differ, or 0 if none does.
 */
template <typename T>
int32_t CompareBytesBlock(T *lstrPt, T *rstrPt, int32_t minCount)
{
    constexpr size_t WORD_BYTES = sizeof(size_t);
    static_assert(WORD_BYTES >= sizeof(T));
    static_assert(WORD_BYTES % sizeof(T) == 0);
    constexpr auto ELEMENTS_PER_WORD = static_cast<int32_t>(WORD_BYTES / sizeof(T));

    int32_t pos = 0;
    // pos never exceeds minCount, so the subtraction cannot overflow
    while (minCount - pos >= ELEMENTS_PER_WORD) {
        size_t lhsWord = 0;
        size_t rhsWord = 0;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        std::memcpy(&lhsWord, lstrPt + pos, WORD_BYTES);
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        std::memcpy(&rhsWord, rstrPt + pos, WORD_BYTES);
        if (lhsWord != rhsWord) {
            // The mismatch is somewhere inside this word; locate it element-wise below
            break;
        }
        pos += ELEMENTS_PER_WORD;
    }
    for (; pos < minCount; ++pos) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        int32_t charDiff = static_cast<int32_t>(lstrPt[pos]) - static_cast<int32_t>(rstrPt[pos]);
        if (charDiff != 0) {
            return charDiff;
        }
    }

    return 0;
}
319 
Compare(String * rstr)320 int32_t String::Compare(String *rstr)
321 {
322     String *lstr = this;
323     if (lstr == rstr) {
324         return 0;
325     }
326     ASSERT(lstr->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
327     ASSERT(rstr->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
328     auto lstrLeng = static_cast<int32_t>(lstr->GetLength());
329     auto rstrLeng = static_cast<int32_t>(rstr->GetLength());
330     int32_t lengRet = lstrLeng - rstrLeng;
331     int32_t minCount = (lengRet < 0) ? lstrLeng : rstrLeng;
332     bool lstrIsUtf16 = lstr->IsUtf16();
333     bool rstrIsUtf16 = rstr->IsUtf16();
334     if (!lstrIsUtf16 && !rstrIsUtf16) {
335         int32_t charDiff = CompareBytesBlock(lstr->GetDataMUtf8(), rstr->GetDataMUtf8(), minCount);
336         if (charDiff != 0) {
337             return charDiff;
338         }
339     } else if (!lstrIsUtf16) {
340         Span<uint8_t> lhsSp(lstr->GetDataMUtf8(), lstrLeng);
341         Span<uint16_t> rhsSp(rstr->GetDataUtf16(), rstrLeng);
342         int32_t charDiff = CompareStringSpan(lhsSp, rhsSp, minCount);
343         if (charDiff != 0) {
344             return charDiff;
345         }
346     } else if (!rstrIsUtf16) {
347         Span<uint16_t> lhsSp(lstr->GetDataUtf16(), lstrLeng);
348         Span<uint8_t> rhsSp(rstr->GetDataMUtf8(), rstrLeng);
349         int32_t charDiff = CompareStringSpan(lhsSp, rhsSp, minCount);
350         if (charDiff != 0) {
351             return charDiff;
352         }
353     } else {
354         int32_t charDiff = CompareBytesBlock(lstr->GetDataUtf16(), rstr->GetDataUtf16(), minCount);
355         if (charDiff != 0) {
356             return charDiff;
357         }
358     }
359     return lengRet;
360 }
361 
362 template <typename T1, typename T2>
SubstringEquals(Span<const T1> & string,Span<const T2> & pattern,int32_t pos)363 static inline ALWAYS_INLINE int32_t SubstringEquals(Span<const T1> &string, Span<const T2> &pattern, int32_t pos)
364 {
365     ASSERT(pos + pattern.size() <= string.size());
366     if constexpr (std::is_same_v<T1, T2>) {
367         return std::memcmp(string.begin() + pos, pattern.begin(), pattern.size()) == 0;
368     }
369     return std::equal(pattern.begin(), pattern.end(), string.begin() + pos);
370 }
371 
/*
 * Tailed Substring method (based on D. Cantone and S. Faro: Searching for a substring with constant extra-space
 * complexity). O(nm) worst-case but reported to have good performance both on random and natural language data.
 * Substring s of t is called tailed-substring, if the last character of s does not repeat elsewhere in s.
 *
 * Searches for the pattern rhsSp in lhsSp, trying start positions from pos upwards while pos <= max.
 * rhsSp must not be empty: its element at index size() - 1 is read unconditionally.
 * Returns the start index of the match that is found, or -1 if the loops finish without a match.
 */
/* static */
template <typename T1, typename T2>
static int32_t IndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos, int32_t max)
{
    // Largest shift found so far and the pattern index it was computed for
    int32_t maxTailedLen = 1;
    auto tailedEnd = static_cast<int32_t>(rhsSp.size() - 1);
    int32_t maxTailedEnd = tailedEnd;
    // Phase 1: search in the beginning of string while computing maximal tailed-substring length
    auto searchChar = rhsSp[tailedEnd];
    // shiftedLhs[pos] is the text element aligned with rhsSp[tailedEnd] when the pattern starts at pos
    auto *shiftedLhs = lhsSp.begin() + tailedEnd;
    while (pos <= max) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos++;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // Find the nearest earlier occurrence of searchChar in the pattern (-1 if there is none)
        auto tailedStart = tailedEnd - 1;
        while (tailedStart >= 0 && rhsSp[tailedStart] != searchChar) {
            tailedStart--;
        }
        if (maxTailedLen < tailedEnd - tailedStart) {
            maxTailedLen = tailedEnd - tailedStart;
            maxTailedEnd = tailedEnd;
        }
        // Leave phase 1 once the recorded shift is at least tailedEnd
        if (maxTailedLen >= tailedEnd) {
            break;
        }
        // Skip ahead by the distance just computed and continue with the previous pattern element
        pos += tailedEnd - tailedStart;
        tailedEnd--;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        shiftedLhs--;
        searchChar = rhsSp[tailedEnd];
    }
    // Phase 2: search in the remainder of string using computed maximal tailed-substring length
    searchChar = rhsSp[maxTailedEnd];
    shiftedLhs = lhsSp.begin() + maxTailedEnd;
    while (pos <= max) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos++;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // The probe element matched but the pattern did not: shift by the fixed maximal length
        pos += maxTailedLen;
    }
    return -1;
}
429 
// Search of the last occurrence is equivalent to search of the first occurrence of
// reversed pattern in reversed string.
//
// Searches for the pattern rhsSp in lhsSp, trying start positions from pos downwards while pos >= 0.
// rhsSp must not be empty: its element at index 0 is read unconditionally.
// Returns the start index of the match that is found, or -1 if the loops finish without a match.
template <typename T1, typename T2>
static int32_t LastIndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos)
{
    // Largest shift found so far and the pattern index it was computed for
    int32_t maxTailedLen = 1;
    int32_t tailedStart = 0;
    int32_t maxTailedStart = tailedStart;
    auto patternSize = static_cast<int32_t>(rhsSp.size());
    // Phase 1: search in the end of string while computing maximal tailed-substring length
    auto searchChar = rhsSp[tailedStart];
    // shiftedLhs[pos] is the text element aligned with rhsSp[tailedStart] when the pattern starts at pos
    auto *shiftedLhs = lhsSp.begin() + tailedStart;
    while (pos >= 0) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos--;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // Find the nearest later occurrence of searchChar in the pattern (patternSize if there is none)
        auto tailedEnd = tailedStart + 1;
        while (tailedEnd < patternSize && rhsSp[tailedEnd] != searchChar) {
            tailedEnd++;
        }
        if (maxTailedLen < tailedEnd - tailedStart) {
            maxTailedLen = tailedEnd - tailedStart;
            maxTailedStart = tailedStart;
        }
        // Leave phase 1 once the recorded shift covers the rest of the pattern
        if (maxTailedLen >= patternSize - tailedStart) {
            break;
        }
        // Step back by the distance just computed and continue with the next pattern element
        pos -= tailedEnd - tailedStart;
        tailedStart++;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        shiftedLhs++;
        searchChar = rhsSp[tailedStart];
    }
    // Phase 2: search in the remainder of string using computed maximal tailed-substring length
    searchChar = rhsSp[maxTailedStart];
    shiftedLhs = lhsSp.begin() + maxTailedStart;
    while (pos >= 0) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos--;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // The probe element matched but the pattern did not: step back by the fixed maximal length
        pos -= maxTailedLen;
    }
    return -1;
}
484 
GetCompressionAndLength(ark::coretypes::String * string)485 static inline ALWAYS_INLINE std::pair<bool, int32_t> GetCompressionAndLength(ark::coretypes::String *string)
486 {
487     ASSERT(string->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
488     ASSERT(string != nullptr);
489     return {string->IsMUtf8(), static_cast<int32_t>(string->GetLength())};
490 }
491 
IndexOf(String * rhs,int32_t pos)492 int32_t String::IndexOf(String *rhs, int32_t pos)
493 {
494     String *lhs = this;
495     auto [lhs_utf8, lhs_count] = GetCompressionAndLength(lhs);
496     auto [rhs_utf8, rhs_count] = GetCompressionAndLength(rhs);
497 
498     if (pos < 0) {
499         pos = 0;
500     }
501 
502     if (rhs_count == 0) {
503         return std::min(lhs_count, pos);
504     }
505 
506     int32_t max = lhs_count - rhs_count;
507     // for pos > max IndexOf impl will return -1
508     if (lhs_utf8 && rhs_utf8) {
509         Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
510         Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
511         return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
512     } else if (!lhs_utf8 && !rhs_utf8) {  // NOLINT(readability-else-after-return)
513         Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
514         Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
515         return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
516     } else if (rhs_utf8) {
517         Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
518         Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
519         return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
520     } else {  // NOLINT(readability-else-after-return)
521         Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
522         Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
523         return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
524     }
525 }
526 
LastIndexOf(String * rhs,int32_t pos)527 int32_t String::LastIndexOf(String *rhs, int32_t pos)
528 {
529     String *lhs = this;
530     auto [lhs_utf8, lhs_count] = GetCompressionAndLength(lhs);
531     auto [rhs_utf8, rhs_count] = GetCompressionAndLength(rhs);
532 
533     int32_t max = lhs_count - rhs_count;
534 
535     if (pos > max) {
536         pos = max;
537     }
538 
539     if (pos < 0) {
540         return -1;
541     }
542 
543     if (rhs_count == 0) {
544         return pos;
545     }
546 
547     if (lhs_utf8 && rhs_utf8) {
548         Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
549         Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
550         return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
551     } else if (!lhs_utf8 && !rhs_utf8) {  // NOLINT(readability-else-after-return)
552         Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
553         Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
554         return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
555     } else if (rhs_utf8) {
556         Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
557         Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
558         return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
559     } else {  // NOLINT(readability-else-after-return)
560         Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
561         Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
562         return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
563     }
564 }
565 
566 /* static */
CanBeCompressed(const uint16_t * utf16Data,uint32_t utf16Length)567 bool String::CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length)
568 {
569     if (!compressedStringsEnabled_) {
570         return false;
571     }
572     bool isCompressed = true;
573     Span<const uint16_t> data(utf16Data, utf16Length);
574     for (uint32_t i = 0; i < utf16Length; i++) {
575         if (!IsASCIICharacter(data[i])) {
576             isCompressed = false;
577             break;
578         }
579     }
580     return isCompressed;
581 }
582 
583 // static
CanBeCompressedMUtf8(const uint8_t * mutf8Data,uint32_t mutf8Length)584 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length)
585 {
586     if (!compressedStringsEnabled_) {
587         return false;
588     }
589     bool isCompressed = true;
590     Span<const uint8_t> data(mutf8Data, mutf8Length);
591     for (uint32_t i = 0; i < mutf8Length; i++) {
592         if (!IsASCIICharacter(data[i])) {
593             isCompressed = false;
594             break;
595         }
596     }
597     return isCompressed;
598 }
599 
600 // static
CanBeCompressedMUtf8(const uint8_t * mutf8Data)601 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8Data)
602 {
603     return compressedStringsEnabled_ ? utf::IsMUtf8OnlySingleBytes(mutf8Data) : false;
604 }
605 
606 /* static */
CanBeCompressedUtf16(const uint16_t * utf16Data,uint32_t utf16Length,uint16_t non)607 bool String::CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non)
608 {
609     if (!compressedStringsEnabled_) {
610         return false;
611     }
612     bool isCompressed = true;
613     Span<const uint16_t> data(utf16Data, utf16Length);
614     for (uint32_t i = 0; i < utf16Length; i++) {
615         if (!IsASCIICharacter(data[i]) && data[i] != non) {
616             isCompressed = false;
617             break;
618         }
619     }
620     return isCompressed;
621 }
622 
623 /* static */
CanBeCompressedMUtf8(const uint8_t * mutf8Data,uint32_t mutf8Length,uint16_t non)624 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non)
625 {
626     if (!compressedStringsEnabled_) {
627         return false;
628     }
629     bool isCompressed = true;
630     Span<const uint8_t> data(mutf8Data, mutf8Length);
631     for (uint32_t i = 0; i < mutf8Length; i++) {
632         if (!IsASCIICharacter(data[i]) && data[i] != non) {
633             isCompressed = false;
634             break;
635         }
636     }
637     return isCompressed;
638 }
639 
640 /* static */
StringsAreEqual(String * str1,String * str2)641 bool String::StringsAreEqual(String *str1, String *str2)
642 {
643     ASSERT(str1 != nullptr);
644     ASSERT(str2 != nullptr);
645 
646     if ((str1->IsUtf16() != str2->IsUtf16()) || (str1->GetLength() != str2->GetLength())) {
647         return false;
648     }
649 
650     if (str1->IsUtf16()) {
651         Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
652         Span<const uint16_t> data2(str2->GetDataUtf16(), str1->GetLength());
653         return String::StringsAreEquals(data1, data2);
654     } else {  // NOLINT(readability-else-after-return)
655         Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
656         Span<const uint8_t> data2(str2->GetDataMUtf8(), str1->GetLength());
657         return String::StringsAreEquals(data1, data2);
658     }
659 }
660 
661 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8Data,uint32_t utf16Length)662 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length)
663 {
664     ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data));
665     if (str1->GetLength() != utf16Length) {
666         return false;
667     }
668     bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
669     return StringsAreEqualMUtf8(str1, mutf8Data, utf16Length, canBeCompressed);
670 }
671 
672 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8Data,uint32_t utf16Length,bool canBeCompressed)673 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed)
674 {
675     bool result = true;
676     if (str1->GetLength() != utf16Length) {
677         result = false;
678     } else {
679         bool str1CanBeCompressed = !str1->IsUtf16();
680         bool data2CanBeCompressed = canBeCompressed;
681         if (str1CanBeCompressed != data2CanBeCompressed) {
682             return false;
683         }
684 
685         ASSERT(str1CanBeCompressed == data2CanBeCompressed);
686         if (str1CanBeCompressed) {
687             Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
688             Span<const uint8_t> data2(mutf8Data, utf16Length);
689             result = String::StringsAreEquals(data1, data2);
690         } else {
691             result = IsMutf8EqualsUtf16(mutf8Data, str1->GetDataUtf16(), str1->GetLength());
692         }
693     }
694     return result;
695 }
696 
697 /* static */
StringsAreEqualUtf16(String * str1,const uint16_t * utf16Data,uint32_t utf16DataLength)698 bool String::StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data, uint32_t utf16DataLength)
699 {
700     bool result = true;
701     if (str1->GetLength() != utf16DataLength) {
702         result = false;
703     } else if (!str1->IsUtf16()) {
704         result = IsMutf8EqualsUtf16(str1->GetDataMUtf8(), str1->GetLength(), utf16Data, utf16DataLength);
705     } else {
706         Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
707         Span<const uint16_t> data2(utf16Data, utf16DataLength);
708         result = String::StringsAreEquals(data1, data2);
709     }
710     return result;
711 }
712 
713 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8Data,uint32_t utf8DataLength,const uint16_t * utf16Data,uint32_t utf16DataLength)714 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data,
715                                 uint32_t utf16DataLength)
716 {
717     auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
718     auto tmpBuffer = allocator->AllocArray<uint16_t>(utf16DataLength);
719     [[maybe_unused]] auto convertedStringSize =
720         utf::ConvertRegionMUtf8ToUtf16(utf8Data, tmpBuffer, utf8DataLength, utf16DataLength, 0);
721     ASSERT(convertedStringSize == utf16DataLength);
722 
723     Span<const uint16_t> data1(tmpBuffer, utf16DataLength);
724     Span<const uint16_t> data2(utf16Data, utf16DataLength);
725     bool result = String::StringsAreEquals(data1, data2);
726     allocator->Delete(tmpBuffer);
727     return result;
728 }
729 
730 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8Data,const uint16_t * utf16Data,uint32_t utf16DataLength)731 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength)
732 {
733     auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
734     auto tmpBuffer = allocator->AllocArray<uint16_t>(utf16DataLength);
735     utf::ConvertMUtf8ToUtf16(utf8Data, utf::Mutf8Size(utf8Data), tmpBuffer);
736 
737     Span<const uint16_t> data1(tmpBuffer, utf16DataLength);
738     Span<const uint16_t> data2(utf16Data, utf16DataLength);
739     bool result = String::StringsAreEquals(data1, data2);
740     allocator->Delete(tmpBuffer);
741     return result;
742 }
743 
744 /* static */
745 template <typename T>
StringsAreEquals(Span<const T> & str1,Span<const T> & str2)746 bool String::StringsAreEquals(Span<const T> &str1, Span<const T> &str2)
747 {
748     return 0 == std::memcmp(str1.Data(), str2.Data(), str1.SizeBytes());
749 }
750 
ToCharArray(const LanguageContext & ctx)751 Array *String::ToCharArray(const LanguageContext &ctx)
752 {
753     // allocator may trig gc and move 'this', need to hold it
754     auto thread = ManagedThread::GetCurrent();
755     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
756     VMHandle<String> str(thread, this);
757     auto *klass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::ARRAY_U16);
758     Array *array = Array::Create(klass, GetLength());
759     if (array == nullptr) {
760         return nullptr;
761     }
762 
763     if (str->IsUtf16()) {
764         Span<uint16_t> sp(str->GetDataUtf16(), str->GetLength());
765         for (size_t i = 0; i < sp.size(); i++) {
766             array->Set<uint16_t>(i, sp[i]);
767         }
768     } else {
769         Span<uint8_t> sp(str->GetDataMUtf8(), str->GetLength());
770         for (size_t i = 0; i < sp.size(); i++) {
771             array->Set<uint16_t>(i, sp[i]);
772         }
773     }
774 
775     return array;
776 }
777 
778 /* static */
GetChars(String * src,uint32_t start,uint32_t utf16Length,const LanguageContext & ctx)779 Array *String::GetChars(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx)
780 {
781     // allocator may trig gc and move 'src', need to hold it
782     auto thread = ManagedThread::GetCurrent();
783     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
784     VMHandle<String> str(thread, src);
785     auto *klass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::ARRAY_U16);
786     Array *array = Array::Create(klass, utf16Length);
787     if (array == nullptr) {
788         return nullptr;
789     }
790 
791     if (str->IsUtf16()) {
792         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
793         Span<uint16_t> sp(str->GetDataUtf16() + start, utf16Length);
794         for (size_t i = 0; i < sp.size(); i++) {
795             array->Set<uint16_t>(i, sp[i]);
796         }
797     } else {
798         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
799         Span<uint8_t> sp(str->GetDataMUtf8() + start, utf16Length);
800         for (size_t i = 0; i < sp.size(); i++) {
801             array->Set<uint16_t>(i, sp[i]);
802         }
803     }
804 
805     return array;
806 }
807 
/**
 * Computes the polynomial hash `hash = hash * 31 + c` over `size` elements of `data`.
 * The multiplication is written as `(hash << 5) - hash`; arithmetic is done in uint32_t,
 * so it wraps around on overflow.
 *
 * The elements are read straight through the pointer, so the function does not depend on
 * any compiler-specific preprocessor branch to compile.
 *
 * @param data pointer to the first element; not dereferenced when size is 0
 * @param size number of elements to hash
 * @return the hash reinterpreted as int32_t; 0 for an empty range
 */
template <class T>
static int32_t ComputeHashForData(const T *data, size_t size)
{
    constexpr size_t SHIFT = 5;
    uint32_t hash = 0;
    for (size_t i = 0; i < size; ++i) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        hash = (hash << SHIFT) - hash + data[i];
    }
    return static_cast<int32_t>(hash);
}
824 
/**
 * Computes the polynomial hash `hash = hash * 31 + byte` over the bytes of a
 * null-terminated buffer; the terminating zero byte is not hashed.
 * Arithmetic is done in uint32_t, so it wraps around on overflow.
 *
 * @param mutf8Data null-terminated byte sequence
 * @return the hash reinterpreted as int32_t; 0 for an empty sequence
 */
static int32_t ComputeHashForMutf8(const uint8_t *mutf8Data)
{
    constexpr size_t SHIFT = 5;
    uint32_t hash = 0;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    for (const uint8_t *cursor = mutf8Data; *cursor != '\0'; ++cursor) {
        hash = (hash << SHIFT) - hash + *cursor;
    }
    return static_cast<int32_t>(hash);
}
834 
ComputeHashcode()835 uint32_t String::ComputeHashcode()
836 {
837     uint32_t hash;
838     if (compressedStringsEnabled_) {
839         if (!IsUtf16()) {
840             hash = static_cast<uint32_t>(ComputeHashForData(GetDataMUtf8(), GetLength()));
841         } else {
842             hash = static_cast<uint32_t>(ComputeHashForData(GetDataUtf16(), GetLength()));
843         }
844     } else {
845         ASSERT(static_cast<size_t>(GetLength()) < (std::numeric_limits<size_t>::max() >> 1U));
846         hash = static_cast<uint32_t>(ComputeHashForData(GetDataUtf16(), GetLength()));
847     }
848     return hash;
849 }
850 
851 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8Data,uint32_t utf16Length)852 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length)
853 {
854     bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
855     return ComputeHashcodeMutf8(mutf8Data, utf16Length, canBeCompressed);
856 }
857 
858 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8Data,uint32_t utf16Length,bool canBeCompressed)859 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed)
860 {
861     uint32_t hash;
862     if (canBeCompressed) {
863         hash = static_cast<uint32_t>(ComputeHashForMutf8(mutf8Data));
864     } else {
865         // NOTE(alovkov): optimize it without allocation a temporary buffer
866         auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
867         auto tmpBuffer = allocator->AllocArray<uint16_t>(utf16Length);
868         utf::ConvertMUtf8ToUtf16(mutf8Data, utf::Mutf8Size(mutf8Data), tmpBuffer);
869         hash = static_cast<uint32_t>(ComputeHashForData(tmpBuffer, utf16Length));
870         allocator->Delete(tmpBuffer);
871     }
872     return hash;
873 }
874 
875 /* static */
ComputeHashcodeUtf16(const uint16_t * utf16Data,uint32_t length)876 uint32_t String::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length)
877 {
878     return ComputeHashForData(utf16Data, length);
879 }
880 
881 /* static */
DoReplace(String * src,uint16_t oldC,uint16_t newC,const LanguageContext & ctx,PandaVM * vm)882 String *String::DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm)
883 {
884     ASSERT(src != nullptr);
885     auto length = static_cast<int32_t>(src->GetLength());
886     bool canBeCompressed = IsASCIICharacter(newC);
887     if (src->IsUtf16()) {
888         canBeCompressed = canBeCompressed && CanBeCompressedUtf16(src->GetDataUtf16(), length, oldC);
889     } else {
890         canBeCompressed = canBeCompressed && CanBeCompressedMUtf8(src->GetDataMUtf8(), length, oldC);
891     }
892 
893     // allocator may trig gc and move src, need to hold it
894     auto thread = ManagedThread::GetCurrent();
895     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
896     VMHandle<String> srcHandle(thread, src);
897     auto string = AllocStringObject(length, canBeCompressed, ctx, vm);
898     if (string == nullptr) {
899         return nullptr;
900     }
901 
902     // retrieve src after gc
903     src = srcHandle.GetPtr();
904     ASSERT(string->hashcode_ == 0);
905 
906     // After replacing we should have a full barrier, so this writes should happen-before barrier
907     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
908     if (src->IsUtf16()) {
909         if (canBeCompressed) {
910             auto replace = [oldC, newC](uint16_t c) { return static_cast<uint8_t>((oldC != c) ? c : newC); };
911             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
912             std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataMUtf8(), replace);
913         } else {
914             auto replace = [oldC, newC](uint16_t c) { return (oldC != c) ? c : newC; };
915             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
916             std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataUtf16(), replace);
917         }
918     } else {
919         if (canBeCompressed) {
920             auto replace = [oldC, newC](uint16_t c) { return static_cast<uint8_t>((oldC != c) ? c : newC); };
921             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
922             std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataMUtf8(), replace);
923         } else {
924             auto replace = [oldC, newC](uint16_t c) { return (oldC != c) ? c : newC; };
925             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
926             std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataUtf16(), replace);
927         }
928     }
929     TSAN_ANNOTATE_IGNORE_WRITES_END();
930     // String is supposed to be a constant object, so all its data should be visible by all threads
931     arch::FullMemoryBarrier();
932     return string;
933 }
934 
935 /* static */
FastSubString(String * src,uint32_t start,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm)936 String *String::FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx,
937                               PandaVM *vm)
938 {
939     ASSERT(src != nullptr);
940     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
941     bool canBeCompressed = !src->IsUtf16() || CanBeCompressed(src->GetDataUtf16() + start, utf16Length);
942 
943     // allocator may trig gc and move src, need to hold it
944     auto thread = ManagedThread::GetCurrent();
945     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
946     VMHandle<String> srcHandle(thread, src);
947     auto string = AllocStringObject(utf16Length, canBeCompressed, ctx, vm);
948     if (string == nullptr) {
949         return nullptr;
950     }
951 
952     // retrieve src after gc
953     src = srcHandle.GetPtr();
954     ASSERT(string->hashcode_ == 0);
955 
956     // After copying we should have a full barrier, so this writes should happen-before barrier
957     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
958     if (src->IsUtf16()) {
959         if (canBeCompressed) {
960             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
961             CopyUtf16AsMUtf8(src->GetDataUtf16() + start, string->GetDataMUtf8(), utf16Length);
962         } else {
963             memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()),
964                      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
965                      src->GetDataUtf16() + start, utf16Length << 1UL);
966         }
967     } else {
968         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
969         memcpy_s(string->GetDataMUtf8(), string->GetLength(), src->GetDataMUtf8() + start, utf16Length);
970     }
971     TSAN_ANNOTATE_IGNORE_WRITES_END();
972     // String is supposed to be a constant object, so all its data should be visible by all threads
973     arch::FullMemoryBarrier();
974     return string;
975 }
976 
977 /* static */
Concat(String * string1,String * string2,const LanguageContext & ctx,PandaVM * vm)978 String *String::Concat(String *string1, String *string2, const LanguageContext &ctx, PandaVM *vm)
979 {
980     ASSERT(string1 != nullptr);
981     ASSERT(string2 != nullptr);
982     // allocator may trig gc and move src, need to hold it
983     auto thread = ManagedThread::GetCurrent();
984     [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
985     VMHandle<String> str1Handle(thread, string1);
986     VMHandle<String> str2Handle(thread, string2);
987 
988     uint32_t length1 = string1->GetLength();
989     uint32_t length2 = string2->GetLength();
990     uint32_t newLength = length1 + length2;
991     bool compressed = compressedStringsEnabled_ && (!string1->IsUtf16() && !string2->IsUtf16());
992     auto newString = AllocStringObject(newLength, compressed, ctx, vm);
993     if (UNLIKELY(newString == nullptr)) {
994         return nullptr;
995     }
996 
997     ASSERT(newString->hashcode_ == 0);
998 
999     // retrieve strings after gc
1000     string1 = str1Handle.GetPtr();
1001     string2 = str2Handle.GetPtr();
1002 
1003     // After copying we should have a full barrier, so this writes should happen-before barrier
1004     TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
1005     if (compressed) {
1006         Span<uint8_t> sp(newString->GetDataMUtf8(), newLength);
1007         memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataMUtf8(), length1);
1008         sp = sp.SubSpan(length1);
1009         memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataMUtf8(), length2);
1010     } else {
1011         Span<uint16_t> sp(newString->GetDataUtf16(), newLength);
1012         if (!string1->IsUtf16()) {
1013             for (uint32_t i = 0; i < length1; ++i) {
1014                 sp[i] = string1->At<false>(i);
1015             }
1016         } else {
1017             memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataUtf16(), length1 << 1U);
1018         }
1019         sp = sp.SubSpan(length1);
1020         if (!string2->IsUtf16()) {
1021             for (uint32_t i = 0; i < length2; ++i) {
1022                 sp[i] = string2->At<false>(i);
1023             }
1024         } else {
1025             memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataUtf16(), length2 << 1U);
1026         }
1027     }
1028     TSAN_ANNOTATE_IGNORE_WRITES_END();
1029     // String is supposed to be a constant object, so all its data should be visible by all threads
1030     arch::FullMemoryBarrier();
1031 
1032     return newString;
1033 }
1034 
1035 /* static */
AllocStringObject(size_t length,bool compressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)1036 String *String::AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm, bool movable,
1037                                   bool pinned)
1038 {
1039     ASSERT(vm != nullptr);
1040     auto *thread = ManagedThread::GetCurrent();
1041     auto *stringClass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::STRING);
1042     size_t size = compressed ? String::ComputeSizeMUtf8(length) : String::ComputeSizeUtf16(length);
1043     auto string =
1044         movable
1045             ? reinterpret_cast<String *>(
1046                   vm->GetHeapManager()->AllocateObject(stringClass, size, DEFAULT_ALIGNMENT, thread,
1047                                                        mem::ObjectAllocatorBase::ObjMemInitPolicy::NO_INIT, pinned))
1048             : reinterpret_cast<String *>(vm->GetHeapManager()->AllocateNonMovableObject(
1049                   // CC-OFFNXT(G.FMT.06) project code style
1050                   stringClass, size, DEFAULT_ALIGNMENT, thread, mem::ObjectAllocatorBase::ObjMemInitPolicy::NO_INIT));
1051     if (string != nullptr) {
1052         // After setting length we should have a full barrier, so this write should happens-before barrier
1053         TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
1054         string->SetLength(length, compressed);
1055         string->SetHashcode(0);
1056         TSAN_ANNOTATE_IGNORE_WRITES_END();
1057         // Witout full memory barrier it is possible that architectures with weak memory order can try fetching string
1058         // legth before it's set
1059         arch::FullMemoryBarrier();
1060     }
1061     return string;
1062 }
1063 
1064 }  // namespace ark::coretypes
1065