1 /**
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include <cstddef>
17 #include <cstring>
18 #include <limits>
19
20 #include "libpandabase/utils/utf.h"
21 #include "libpandabase/utils/hash.h"
22 #include "libpandabase/utils/span.h"
23 #include "runtime/arch/memory_helpers.h"
24 #include "runtime/include/coretypes/array.h"
25 #include "runtime/include/coretypes/string-inl.h"
26 #include "runtime/include/runtime.h"
27 #include "runtime/handle_base-inl.h"
28 #include "runtime/include/panda_vm.h"
29
30 namespace ark::coretypes {
31
// Switch for the compressed (one byte per character) string representation. The CanBeCompressed* helpers
// in this file return false whenever it is false.
bool String::compressedStringsEnabled_ = true;
33
34 /* static */
/**
 * Creates a new string object that duplicates the character data and the stored hash code of another string.
 *
 * @param str source string; must not be null
 * @param ctx language context passed through to AllocStringObject
 * @param vm VM passed through to AllocStringObject
 * @return the new string, or nullptr when AllocStringObject returns nullptr
 */
String *String::CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm)
{
    ASSERT(str != nullptr);
    // allocator may trigger gc and move str, need to hold it
    auto thread = ManagedThread::GetCurrent();
    [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
    VMHandle<String> strHandle(thread, str);
    ASSERT(strHandle.GetPtr() != nullptr);
    // Request an object of the same length; the second argument is true when the source is not UTF-16
    auto string = AllocStringObject(strHandle->GetLength(), !strHandle->IsUtf16(), ctx, vm);
    if (string == nullptr) {
        return nullptr;
    }

    // retrieve str after gc
    str = strHandle.GetPtr();
    // Copy the stored hash code of the source instead of recomputing it
    string->hashcode_ = str->hashcode_;

    uint32_t length = str->GetLength();
    // After memcpy we should have a full barrier, so this writes should happen-before barrier
    TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
    if (str->IsUtf16()) {
        // UTF-16 payload: sizes are given in bytes via ComputeDataSizeUtf16
        memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), str->GetDataUtf16(),
                 ComputeDataSizeUtf16(length));
    } else {
        // Single-byte payload: the length is used directly as the byte count
        memcpy_s(string->GetDataMUtf8(), string->GetLength(), str->GetDataMUtf8(), length);
    }
    TSAN_ANNOTATE_IGNORE_WRITES_END();
    // String is supposed to be a constant object, so all its data should be visible by all threads
    arch::FullMemoryBarrier();

    return string;
}
67
68 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,size_t mutf8Length,uint32_t utf16Length,bool canBeCompressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)69 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, size_t mutf8Length, uint32_t utf16Length,
70 bool canBeCompressed, const LanguageContext &ctx, PandaVM *vm, bool movable,
71 bool pinned)
72 {
73 auto string = AllocStringObject(utf16Length, canBeCompressed, ctx, vm, movable, pinned);
74 if (string == nullptr) {
75 return nullptr;
76 }
77
78 ASSERT(string->hashcode_ == 0);
79 // After copying we should have a full barrier, so this writes should happen-before barrier
80 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
81 if (canBeCompressed) {
82 memcpy_s(string->GetDataMUtf8(), string->GetLength(), mutf8Data, utf16Length);
83 } else {
84 utf::ConvertMUtf8ToUtf16(mutf8Data, mutf8Length, string->GetDataUtf16());
85 }
86 TSAN_ANNOTATE_IGNORE_WRITES_END();
87 // String is supposed to be a constant object, so all its data should be visible by all threads
88 arch::FullMemoryBarrier();
89 return string;
90 }
91
92 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)93 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, const LanguageContext &ctx, PandaVM *vm,
94 bool movable, bool pinned)
95 {
96 bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
97 auto mutf8Length = utf::Mutf8Size(mutf8Data);
98 ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length));
99 return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
100 }
101
102 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,uint32_t utf16Length,bool canBeCompressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)103 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed,
104 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned)
105 {
106 auto mutf8Length = utf::Mutf8Size(mutf8Data);
107 ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length));
108 ASSERT(canBeCompressed == CanBeCompressedMUtf8(mutf8Data));
109 return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
110 }
111
112 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)113 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, const LanguageContext &ctx, PandaVM *vm, bool movable,
114 bool pinned)
115 {
116 size_t mutf8Length = utf::Mutf8Size(mutf8Data);
117 size_t utf16Length = utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length);
118 bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
119 return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
120 }
121
122 /* static */
CreateFromMUtf8(const uint8_t * mutf8Data,uint32_t mutf8Length,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)123 String *String::CreateFromMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint32_t utf16Length,
124 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned)
125 {
126 ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data, mutf8Length));
127 auto canBeCompressed = CanBeCompressedMUtf8(mutf8Data, mutf8Length);
128 return CreateFromMUtf8(mutf8Data, mutf8Length, utf16Length, canBeCompressed, ctx, vm, movable, pinned);
129 }
130
131 /* static */
CreateFromUtf8(const uint8_t * utf8Data,uint32_t utf8Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)132 String *String::CreateFromUtf8(const uint8_t *utf8Data, uint32_t utf8Length, const LanguageContext &ctx, PandaVM *vm,
133 bool movable, bool pinned)
134 {
135 coretypes::String *s = nullptr;
136 auto utf16Length = utf::Utf8ToUtf16Size(utf8Data, utf8Length);
137 if (CanBeCompressedMUtf8(utf8Data, utf8Length)) {
138 // ascii string have equal representation in utf8 and mutf8 formats
139 s = coretypes::String::CreateFromMUtf8(utf8Data, utf8Length, utf16Length, true, ctx, vm, movable, pinned);
140 } else {
141 PandaVector<uint16_t> tmpBuffer(utf16Length);
142 [[maybe_unused]] auto len =
143 utf::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Length, utf16Length, 0);
144 ASSERT(len == utf16Length);
145 s = coretypes::String::CreateFromUtf16(tmpBuffer.data(), utf16Length, ctx, vm, movable, pinned);
146 }
147 return s;
148 }
149
150 /* static */
CreateFromUtf16(const uint16_t * utf16Data,uint32_t utf16Length,bool canBeCompressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)151 String *String::CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, bool canBeCompressed,
152 const LanguageContext &ctx, PandaVM *vm, bool movable, bool pinned)
153 {
154 ASSERT(canBeCompressed == CanBeCompressed(utf16Data, utf16Length));
155 auto string = AllocStringObject(utf16Length, canBeCompressed, ctx, vm, movable, pinned);
156 if (string == nullptr) {
157 return nullptr;
158 }
159
160 ASSERT(string->hashcode_ == 0);
161 // After copying we should have a full barrier, so this writes should happen-before barrier
162 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
163 if (canBeCompressed) {
164 CopyUtf16AsMUtf8(utf16Data, string->GetDataMUtf8(), utf16Length);
165 } else {
166 memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), utf16Data, utf16Length << 1UL);
167 }
168 TSAN_ANNOTATE_IGNORE_WRITES_END();
169 // String is supposed to be a constant object, so all its data should be visible by all threads
170 arch::FullMemoryBarrier();
171 return string;
172 }
173
174 /* static */
CreateFromUtf16(const uint16_t * utf16Data,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)175 String *String::CreateFromUtf16(const uint16_t *utf16Data, uint32_t utf16Length, const LanguageContext &ctx,
176 PandaVM *vm, bool movable, bool pinned)
177 {
178 bool compressable = CanBeCompressed(utf16Data, utf16Length);
179 return CreateFromUtf16(utf16Data, utf16Length, compressable, ctx, vm, movable, pinned);
180 }
181
182 /* static */
CreateEmptyString(const LanguageContext & ctx,PandaVM * vm)183 String *String::CreateEmptyString(const LanguageContext &ctx, PandaVM *vm)
184 {
185 uint16_t data = 0;
186 return CreateFromUtf16(&data, 0, ctx, vm);
187 }
188
189 /* static */
CopyUtf16AsMUtf8(const uint16_t * utf16From,uint8_t * mutf8To,uint32_t utf16Length)190 void String::CopyUtf16AsMUtf8(const uint16_t *utf16From, uint8_t *mutf8To, uint32_t utf16Length)
191 {
192 Span<const uint16_t> from(utf16From, utf16Length);
193 Span<uint8_t> to(mutf8To, utf16Length);
194 for (uint32_t i = 0; i < utf16Length; i++) {
195 to[i] = from[i];
196 }
197 }
198
199 // static
/**
 * Creates a string from a range of 16-bit elements stored in a managed array.
 *
 * @param offset index of the first element to copy (converted to a byte offset with << 1)
 * @param length number of 16-bit elements to copy
 * @param chararray source array; must not be null
 * @param ctx language context passed through to AllocStringObject
 * @param vm VM passed through to AllocStringObject
 * @return the new string, or nullptr when AllocStringObject returns nullptr
 */
String *String::CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, const LanguageContext &ctx,
                                         PandaVM *vm)
{
    ASSERT(chararray != nullptr);
    // allocator may trigger gc and move array, need to hold it
    auto thread = ManagedThread::GetCurrent();
    [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
    VMHandle<Array> arrayHandle(thread, chararray);
    ASSERT(arrayHandle.GetPtr() != nullptr);

    // There is a potential data race between read of src in CanBeCompressed and write of destination buf
    // in CopyDataRegionUtf16. The src is a cast from chararray coming from managed object.
    // Hence the race is reported on managed object, which has a synchronization on a high level.
    // TSAN does not see such synchronization, thus we ignore such races here.
    TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
    // NOLINTNEXTLINE(readability-identifier-naming)
    const uint16_t *src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(chararray->GetData()) + (offset << 1UL));
    // The representation is chosen before allocating, so the raw pointer is still valid for this read
    bool canBeCompressed = CanBeCompressed(src, length);
    TSAN_ANNOTATE_IGNORE_WRITES_END();
    auto string = AllocStringObject(length, canBeCompressed, ctx, vm);
    if (string == nullptr) {
        return nullptr;
    }

    // retrieve src since gc may move it
    src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(arrayHandle->GetData()) + (offset << 1UL));
    ASSERT(string->hashcode_ == 0);
    // After copying we should have a full barrier, so this writes should happen-before barrier
    TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
    if (canBeCompressed) {
        // Store each 16-bit element as one byte
        CopyUtf16AsMUtf8(src, string->GetDataMUtf8(), length);
    } else {
        // Copy the elements unchanged; the source size is given in bytes (length << 1)
        memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), src, length << 1UL);
    }
    TSAN_ANNOTATE_IGNORE_WRITES_END();
    // String is supposed to be a constant object, so all its data should be visible by all threads
    arch::FullMemoryBarrier();
    return string;
}
239
240 // static
/**
 * Creates a string from a range of bytes stored in a managed array. Every resulting character is
 * built from one source byte (low 8 bits) and the low 8 bits of highByte (high 8 bits).
 *
 * @param offset byte offset of the first source byte inside the array data
 * @param length number of bytes to convert; must not be 0
 * @param highByte value whose low 8 bits become the high byte of every character
 * @param bytearray source array; must not be null
 * @param ctx language context passed through to AllocStringObject
 * @param vm VM passed through to AllocStringObject
 * @return the new string, or nullptr when AllocStringObject returns nullptr
 */
String *String::CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t highByte, Array *bytearray,
                                         const LanguageContext &ctx, PandaVM *vm)
{
    ASSERT(length != 0);
    ASSERT(bytearray != nullptr);
    // allocator may trigger gc and move array, need to hold it
    auto thread = ManagedThread::GetCurrent();
    [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
    VMHandle<Array> arrayHandle(thread, bytearray);
    ASSERT(arrayHandle.GetPtr() != nullptr);

    constexpr size_t BYTE_MASK = 0xFF;

    // NOLINTNEXTLINE(readability-identifier-naming)
    const uint8_t *src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(bytearray->GetData()) + offset);
    // Only the low 8 bits of highByte are used
    highByte &= BYTE_MASK;
    // The compressed form is used only when the bytes pass CanBeCompressedMUtf8 and the high byte is zero
    bool canBeCompressed = CanBeCompressedMUtf8(src, length) && (highByte == 0);
    auto string = AllocStringObject(length, canBeCompressed, ctx, vm);
    if (string == nullptr) {
        return nullptr;
    }

    // retrieve src since gc may move it
    src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(arrayHandle->GetData()) + offset);
    ASSERT(string->hashcode_ == 0);
    // After copying we should have a full barrier, so this writes should happen-before barrier
    TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
    if (canBeCompressed) {
        // Byte-for-byte copy into the single-byte payload
        Span<const uint8_t> from(src, length);
        Span<uint8_t> to(string->GetDataMUtf8(), length);
        for (uint32_t i = 0; i < length; ++i) {
            to[i] = (from[i] & BYTE_MASK);
        }
    } else {
        // Widen every byte to 16 bits and place highByte into the upper 8 bits
        Span<const uint8_t> from(src, length);
        Span<uint16_t> to(string->GetDataUtf16(), length);
        for (uint32_t i = 0; i < length; ++i) {
            to[i] = (highByte << 8U) + (from[i] & BYTE_MASK);
        }
    }
    TSAN_ANNOTATE_IGNORE_WRITES_END();

    // String is supposed to be a constant object, so all its data should be visible by all threads
    arch::FullMemoryBarrier();
    return string;
}
287
288 template <typename T1, typename T2>
CompareStringSpan(Span<T1> & lhsSp,Span<T2> & rhsSp,int32_t count)289 int32_t CompareStringSpan(Span<T1> &lhsSp, Span<T2> &rhsSp, int32_t count)
290 {
291 for (int32_t i = 0; i < count; ++i) {
292 int32_t charDiff = static_cast<int32_t>(lhsSp[i]) - static_cast<int32_t>(rhsSp[i]);
293 if (charDiff != 0) {
294 return charDiff;
295 }
296 }
297 return 0;
298 }
299
/**
 * Compares the first minCount elements of two buffers with the same element type.
 *
 * Equal prefixes are skipped one size_t-sized word at a time; the first word that differs (and any
 * tail shorter than a word) is then compared element by element.
 *
 * @param lstrPt left buffer
 * @param rstrPt right buffer
 * @param minCount number of elements to compare
 * @return 0 if all compared elements are equal, otherwise left element minus right element at the
 *         first mismatch
 */
template <typename T>
int32_t CompareBytesBlock(T *lstrPt, T *rstrPt, int32_t minCount)
{
    constexpr int32_t WORD_BYTES = sizeof(size_t);
    static_assert(WORD_BYTES >= sizeof(T));
    static_assert(WORD_BYTES % sizeof(T) == 0);

    int32_t byteLimit = minCount * sizeof(T);
    auto lhsWord = reinterpret_cast<size_t *>(lstrPt);
    auto rhsWord = reinterpret_cast<size_t *>(rstrPt);

    // Fast path: advance over whole words while they are identical
    int32_t matchedBytes = 0;
    for (; matchedBytes + WORD_BYTES <= byteLimit; matchedBytes += WORD_BYTES) {
        if (*lhsWord != *rhsWord) {
            break;
        }
        lhsWord++;
        rhsWord++;
    }

    // Slow path: locate the exact mismatching element, if any
    int32_t index = matchedBytes / sizeof(T);
    while (index < minCount) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        int32_t diff = static_cast<int32_t>(lstrPt[index]) - static_cast<int32_t>(rstrPt[index]);
        if (diff != 0) {
            return diff;
        }
        ++index;
    }

    return 0;
}
330
Compare(String * rstr)331 int32_t String::Compare(String *rstr)
332 {
333 String *lstr = this;
334 if (lstr == rstr) {
335 return 0;
336 }
337 ASSERT(lstr->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
338 ASSERT(rstr->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
339 auto lstrLeng = static_cast<int32_t>(lstr->GetLength());
340 auto rstrLeng = static_cast<int32_t>(rstr->GetLength());
341 int32_t lengRet = lstrLeng - rstrLeng;
342 int32_t minCount = (lengRet < 0) ? lstrLeng : rstrLeng;
343 bool lstrIsUtf16 = lstr->IsUtf16();
344 bool rstrIsUtf16 = rstr->IsUtf16();
345 if (!lstrIsUtf16 && !rstrIsUtf16) {
346 int32_t charDiff = CompareBytesBlock(lstr->GetDataMUtf8(), rstr->GetDataMUtf8(), minCount);
347 if (charDiff != 0) {
348 return charDiff;
349 }
350 } else if (!lstrIsUtf16) {
351 Span<uint8_t> lhsSp(lstr->GetDataMUtf8(), lstrLeng);
352 Span<uint16_t> rhsSp(rstr->GetDataUtf16(), rstrLeng);
353 int32_t charDiff = CompareStringSpan(lhsSp, rhsSp, minCount);
354 if (charDiff != 0) {
355 return charDiff;
356 }
357 } else if (!rstrIsUtf16) {
358 Span<uint16_t> lhsSp(lstr->GetDataUtf16(), lstrLeng);
359 Span<uint8_t> rhsSp(rstr->GetDataMUtf8(), rstrLeng);
360 int32_t charDiff = CompareStringSpan(lhsSp, rhsSp, minCount);
361 if (charDiff != 0) {
362 return charDiff;
363 }
364 } else {
365 int32_t charDiff = CompareBytesBlock(lstr->GetDataUtf16(), rstr->GetDataUtf16(), minCount);
366 if (charDiff != 0) {
367 return charDiff;
368 }
369 }
370 return lengRet;
371 }
372
373 template <typename T1, typename T2>
SubstringEquals(Span<const T1> & string,Span<const T2> & pattern,int32_t pos)374 static inline ALWAYS_INLINE int32_t SubstringEquals(Span<const T1> &string, Span<const T2> &pattern, int32_t pos)
375 {
376 ASSERT(pos + pattern.size() <= string.size());
377 if constexpr (std::is_same_v<T1, T2>) {
378 return std::memcmp(string.begin() + pos, pattern.begin(), pattern.size()) == 0;
379 }
380 return std::equal(pattern.begin(), pattern.end(), string.begin() + pos);
381 }
382
383 /*
384 * Tailed Substring method (based on D. Cantone and S. Faro: Searching for a substring with constant extra-space
385 * complexity). O(nm) worst-case but reported to have good performance both on random and natural language data
386 * Substring s of t is called tailed-substring, if the last character of s does not repeat elsewhere in s
387 */
388 /* static */
/**
 * Finds the first occurrence of rhsSp in lhsSp at an index in [pos, max].
 *
 * @param lhsSp text to search in
 * @param rhsSp pattern; must be non-empty because its last element is read unconditionally
 * @param pos first candidate start index
 * @param max last candidate start index (String::IndexOf passes text length minus pattern length)
 * @return index of the match, or -1 if there is none (including when pos > max)
 */
template <typename T1, typename T2>
static int32_t IndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos, int32_t max)
{
    int32_t maxTailedLen = 1;
    auto tailedEnd = static_cast<int32_t>(rhsSp.size() - 1);
    int32_t maxTailedEnd = tailedEnd;
    // Phase 1: search in the beginning of string while computing maximal tailed-substring length
    auto searchChar = rhsSp[tailedEnd];
    // shiftedLhs[pos] is the text element aligned with pattern index tailedEnd for a match starting at pos
    auto *shiftedLhs = lhsSp.begin() + tailedEnd;
    while (pos <= max) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos++;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // Walk back to the previous occurrence of searchChar in the pattern (or to -1 if there is none)
        auto tailedStart = tailedEnd - 1;
        while (tailedStart >= 0 && rhsSp[tailedStart] != searchChar) {
            tailedStart--;
        }
        // Remember the longest tailed substring seen so far and where it ends
        if (maxTailedLen < tailedEnd - tailedStart) {
            maxTailedLen = tailedEnd - tailedStart;
            maxTailedEnd = tailedEnd;
        }
        // No shorter prefix can yield a longer tailed substring, so phase 1 is finished
        if (maxTailedLen >= tailedEnd) {
            break;
        }
        pos += tailedEnd - tailedStart;
        // Continue with the pattern element one position to the left
        tailedEnd--;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        shiftedLhs--;
        searchChar = rhsSp[tailedEnd];
    }
    // Phase 2: search in the remainder of string using computed maximal tailed-substring length
    searchChar = rhsSp[maxTailedEnd];
    shiftedLhs = lhsSp.begin() + maxTailedEnd;
    while (pos <= max) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos++;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // After a failed full comparison skip by the tailed-substring length
        pos += maxTailedLen;
    }
    return -1;
}
440
// Search of the last occurrence is equivalent to search of the first occurrence of
// reversed pattern in reversed string
/**
 * Finds the last occurrence of rhsSp in lhsSp that starts at an index not greater than pos.
 *
 * @param lhsSp text to search in
 * @param rhsSp pattern; must be non-empty because its first element is read unconditionally
 * @param pos highest candidate start index; candidates are tried downwards while pos >= 0
 * @return index of the match, or -1 if there is none
 */
template <typename T1, typename T2>
static int32_t LastIndexOf(Span<const T1> &lhsSp, Span<const T2> &rhsSp, int32_t pos)
{
    int32_t maxTailedLen = 1;
    int32_t tailedStart = 0;
    int32_t maxTailedStart = tailedStart;
    auto patternSize = static_cast<int32_t>(rhsSp.size());
    // Phase 1: search in the end of string while computing maximal tailed-substring length
    auto searchChar = rhsSp[tailedStart];
    // shiftedLhs[pos] is the text element aligned with pattern index tailedStart for a match starting at pos
    auto *shiftedLhs = lhsSp.begin() + tailedStart;
    while (pos >= 0) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos--;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // Walk forward to the next occurrence of searchChar in the pattern (or to patternSize if none)
        auto tailedEnd = tailedStart + 1;
        while (tailedEnd < patternSize && rhsSp[tailedEnd] != searchChar) {
            tailedEnd++;
        }
        // Remember the longest tailed substring seen so far and where it starts
        if (maxTailedLen < tailedEnd - tailedStart) {
            maxTailedLen = tailedEnd - tailedStart;
            maxTailedStart = tailedStart;
        }
        // The remaining suffix cannot yield a longer tailed substring, so phase 1 is finished
        if (maxTailedLen >= patternSize - tailedStart) {
            break;
        }
        pos -= tailedEnd - tailedStart;
        // Continue with the pattern element one position to the right
        tailedStart++;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        shiftedLhs++;
        searchChar = rhsSp[tailedStart];
    }
    // Phase 2: search in the remainder of string using computed maximal tailed-substring length
    searchChar = rhsSp[maxTailedStart];
    shiftedLhs = lhsSp.begin() + maxTailedStart;
    while (pos >= 0) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        if (searchChar != shiftedLhs[pos]) {
            pos--;
            continue;
        }
        if (SubstringEquals(lhsSp, rhsSp, pos)) {
            return pos;
        }
        // After a failed full comparison skip by the tailed-substring length
        pos -= maxTailedLen;
    }
    return -1;
}
495
GetCompressionAndLength(ark::coretypes::String * string)496 static inline ALWAYS_INLINE std::pair<bool, int32_t> GetCompressionAndLength(ark::coretypes::String *string)
497 {
498 ASSERT(string->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
499 ASSERT(string != nullptr);
500 return {string->IsMUtf8(), static_cast<int32_t>(string->GetLength())};
501 }
502
IndexOf(String * rhs,int32_t pos)503 int32_t String::IndexOf(String *rhs, int32_t pos)
504 {
505 String *lhs = this;
506 auto [lhs_utf8, lhs_count] = GetCompressionAndLength(lhs);
507 auto [rhs_utf8, rhs_count] = GetCompressionAndLength(rhs);
508
509 if (pos < 0) {
510 pos = 0;
511 }
512
513 if (rhs_count == 0) {
514 return std::min(lhs_count, pos);
515 }
516
517 int32_t max = lhs_count - rhs_count;
518 // for pos > max IndexOf impl will return -1
519 if (lhs_utf8 && rhs_utf8) {
520 Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
521 Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
522 return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
523 } else if (!lhs_utf8 && !rhs_utf8) { // NOLINT(readability-else-after-return)
524 Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
525 Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
526 return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
527 } else if (rhs_utf8) {
528 Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
529 Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
530 return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
531 } else { // NOLINT(readability-else-after-return)
532 Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
533 Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
534 return ark::coretypes::IndexOf(lhsSp, rhsSp, pos, max);
535 }
536 }
537
LastIndexOf(String * rhs,int32_t pos)538 int32_t String::LastIndexOf(String *rhs, int32_t pos)
539 {
540 String *lhs = this;
541 auto [lhs_utf8, lhs_count] = GetCompressionAndLength(lhs);
542 auto [rhs_utf8, rhs_count] = GetCompressionAndLength(rhs);
543
544 int32_t max = lhs_count - rhs_count;
545
546 if (pos > max) {
547 pos = max;
548 }
549
550 if (pos < 0) {
551 return -1;
552 }
553
554 if (rhs_count == 0) {
555 return pos;
556 }
557
558 if (lhs_utf8 && rhs_utf8) {
559 Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
560 Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
561 return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
562 } else if (!lhs_utf8 && !rhs_utf8) { // NOLINT(readability-else-after-return)
563 Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
564 Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
565 return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
566 } else if (rhs_utf8) {
567 Span<const uint16_t> lhsSp(lhs->GetDataUtf16(), lhs_count);
568 Span<const uint8_t> rhsSp(rhs->GetDataMUtf8(), rhs_count);
569 return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
570 } else { // NOLINT(readability-else-after-return)
571 Span<const uint8_t> lhsSp(lhs->GetDataMUtf8(), lhs_count);
572 Span<const uint16_t> rhsSp(rhs->GetDataUtf16(), rhs_count);
573 return ark::coretypes::LastIndexOf(lhsSp, rhsSp, pos);
574 }
575 }
576
577 /* static */
CanBeCompressed(const uint16_t * utf16Data,uint32_t utf16Length)578 bool String::CanBeCompressed(const uint16_t *utf16Data, uint32_t utf16Length)
579 {
580 if (!compressedStringsEnabled_) {
581 return false;
582 }
583 bool isCompressed = true;
584 Span<const uint16_t> data(utf16Data, utf16Length);
585 for (uint32_t i = 0; i < utf16Length; i++) {
586 if (!IsASCIICharacter(data[i])) {
587 isCompressed = false;
588 break;
589 }
590 }
591 return isCompressed;
592 }
593
594 // static
CanBeCompressedMUtf8(const uint8_t * mutf8Data,uint32_t mutf8Length)595 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length)
596 {
597 if (!compressedStringsEnabled_) {
598 return false;
599 }
600 bool isCompressed = true;
601 Span<const uint8_t> data(mutf8Data, mutf8Length);
602 for (uint32_t i = 0; i < mutf8Length; i++) {
603 if (!IsASCIICharacter(data[i])) {
604 isCompressed = false;
605 break;
606 }
607 }
608 return isCompressed;
609 }
610
611 // static
CanBeCompressedMUtf8(const uint8_t * mutf8Data)612 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8Data)
613 {
614 return compressedStringsEnabled_ ? utf::IsMUtf8OnlySingleBytes(mutf8Data) : false;
615 }
616
617 /* static */
CanBeCompressedUtf16(const uint16_t * utf16Data,uint32_t utf16Length,uint16_t non)618 bool String::CanBeCompressedUtf16(const uint16_t *utf16Data, uint32_t utf16Length, uint16_t non)
619 {
620 if (!compressedStringsEnabled_) {
621 return false;
622 }
623 bool isCompressed = true;
624 Span<const uint16_t> data(utf16Data, utf16Length);
625 for (uint32_t i = 0; i < utf16Length; i++) {
626 if (!IsASCIICharacter(data[i]) && data[i] != non) {
627 isCompressed = false;
628 break;
629 }
630 }
631 return isCompressed;
632 }
633
634 /* static */
CanBeCompressedMUtf8(const uint8_t * mutf8Data,uint32_t mutf8Length,uint16_t non)635 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8Data, uint32_t mutf8Length, uint16_t non)
636 {
637 if (!compressedStringsEnabled_) {
638 return false;
639 }
640 bool isCompressed = true;
641 Span<const uint8_t> data(mutf8Data, mutf8Length);
642 for (uint32_t i = 0; i < mutf8Length; i++) {
643 if (!IsASCIICharacter(data[i]) && data[i] != non) {
644 isCompressed = false;
645 break;
646 }
647 }
648 return isCompressed;
649 }
650
651 /* static */
StringsAreEqual(String * str1,String * str2)652 bool String::StringsAreEqual(String *str1, String *str2)
653 {
654 ASSERT(str1 != nullptr);
655 ASSERT(str2 != nullptr);
656
657 if ((str1->IsUtf16() != str2->IsUtf16()) || (str1->GetLength() != str2->GetLength())) {
658 return false;
659 }
660
661 if (str1->IsUtf16()) {
662 Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
663 Span<const uint16_t> data2(str2->GetDataUtf16(), str1->GetLength());
664 return String::StringsAreEquals(data1, data2);
665 } else { // NOLINT(readability-else-after-return)
666 Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
667 Span<const uint8_t> data2(str2->GetDataMUtf8(), str1->GetLength());
668 return String::StringsAreEquals(data1, data2);
669 }
670 }
671
672 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8Data,uint32_t utf16Length)673 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length)
674 {
675 ASSERT(utf16Length == utf::MUtf8ToUtf16Size(mutf8Data));
676 if (str1->GetLength() != utf16Length) {
677 return false;
678 }
679 bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
680 return StringsAreEqualMUtf8(str1, mutf8Data, utf16Length, canBeCompressed);
681 }
682
683 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8Data,uint32_t utf16Length,bool canBeCompressed)684 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed)
685 {
686 bool result = true;
687 if (str1->GetLength() != utf16Length) {
688 result = false;
689 } else {
690 bool str1CanBeCompressed = !str1->IsUtf16();
691 bool data2CanBeCompressed = canBeCompressed;
692 if (str1CanBeCompressed != data2CanBeCompressed) {
693 return false;
694 }
695
696 ASSERT(str1CanBeCompressed == data2CanBeCompressed);
697 if (str1CanBeCompressed) {
698 Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
699 Span<const uint8_t> data2(mutf8Data, utf16Length);
700 result = String::StringsAreEquals(data1, data2);
701 } else {
702 result = IsMutf8EqualsUtf16(mutf8Data, str1->GetDataUtf16(), str1->GetLength());
703 }
704 }
705 return result;
706 }
707
708 /* static */
StringsAreEqualUtf16(String * str1,const uint16_t * utf16Data,uint32_t utf16DataLength)709 bool String::StringsAreEqualUtf16(String *str1, const uint16_t *utf16Data, uint32_t utf16DataLength)
710 {
711 bool result = true;
712 if (str1->GetLength() != utf16DataLength) {
713 result = false;
714 } else if (!str1->IsUtf16()) {
715 result = IsMutf8EqualsUtf16(str1->GetDataMUtf8(), str1->GetLength(), utf16Data, utf16DataLength);
716 } else {
717 Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
718 Span<const uint16_t> data2(utf16Data, utf16DataLength);
719 result = String::StringsAreEquals(data1, data2);
720 }
721 return result;
722 }
723
724 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8Data,uint32_t utf8DataLength,const uint16_t * utf16Data,uint32_t utf16DataLength)725 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8Data, uint32_t utf8DataLength, const uint16_t *utf16Data,
726 uint32_t utf16DataLength)
727 {
728 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
729 auto tmpBuffer = allocator->AllocArray<uint16_t>(utf16DataLength);
730 [[maybe_unused]] auto convertedStringSize =
731 utf::ConvertRegionMUtf8ToUtf16(utf8Data, tmpBuffer, utf8DataLength, utf16DataLength, 0);
732 ASSERT(convertedStringSize == utf16DataLength);
733
734 Span<const uint16_t> data1(tmpBuffer, utf16DataLength);
735 Span<const uint16_t> data2(utf16Data, utf16DataLength);
736 bool result = String::StringsAreEquals(data1, data2);
737 allocator->Delete(tmpBuffer);
738 return result;
739 }
740
741 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8Data,const uint16_t * utf16Data,uint32_t utf16DataLength)742 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8Data, const uint16_t *utf16Data, uint32_t utf16DataLength)
743 {
744 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
745 auto tmpBuffer = allocator->AllocArray<uint16_t>(utf16DataLength);
746 utf::ConvertMUtf8ToUtf16(utf8Data, utf::Mutf8Size(utf8Data), tmpBuffer);
747
748 Span<const uint16_t> data1(tmpBuffer, utf16DataLength);
749 Span<const uint16_t> data2(utf16Data, utf16DataLength);
750 bool result = String::StringsAreEquals(data1, data2);
751 allocator->Delete(tmpBuffer);
752 return result;
753 }
754
755 /* static */
756 template <typename T>
StringsAreEquals(Span<const T> & str1,Span<const T> & str2)757 bool String::StringsAreEquals(Span<const T> &str1, Span<const T> &str2)
758 {
759 return 0 == std::memcmp(str1.Data(), str2.Data(), str1.SizeBytes());
760 }
761
/**
 * Creates a new ARRAY_U16 array holding all characters of this string as 16-bit elements.
 *
 * @param ctx language context used to look up the ARRAY_U16 class root
 * @return the new array, or nullptr when Array::Create returns nullptr
 */
Array *String::ToCharArray(const LanguageContext &ctx)
{
    // allocator may trigger gc and move 'this', need to hold it
    auto thread = ManagedThread::GetCurrent();
    [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
    VMHandle<String> str(thread, this);
    auto *klass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::ARRAY_U16);
    Array *array = Array::Create(klass, GetLength());
    if (array == nullptr) {
        return nullptr;
    }

    // From here on the string is accessed only through the handle
    if (str->IsUtf16()) {
        // 16-bit payload: copy element by element
        Span<uint16_t> sp(str->GetDataUtf16(), str->GetLength());
        for (size_t i = 0; i < sp.size(); i++) {
            array->Set<uint16_t>(i, sp[i]);
        }
    } else {
        // Single-byte payload: every byte is widened to a 16-bit element
        Span<uint8_t> sp(str->GetDataMUtf8(), str->GetLength());
        for (size_t i = 0; i < sp.size(); i++) {
            array->Set<uint16_t>(i, sp[i]);
        }
    }

    return array;
}
788
789 /* static */
GetChars(String * src,uint32_t start,uint32_t utf16Length,const LanguageContext & ctx)790 Array *String::GetChars(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx)
791 {
792 // allocator may trig gc and move 'src', need to hold it
793 auto thread = ManagedThread::GetCurrent();
794 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
795 VMHandle<String> str(thread, src);
796 auto *klass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::ARRAY_U16);
797 Array *array = Array::Create(klass, utf16Length);
798 if (array == nullptr) {
799 return nullptr;
800 }
801
802 if (str->IsUtf16()) {
803 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
804 Span<uint16_t> sp(str->GetDataUtf16() + start, utf16Length);
805 for (size_t i = 0; i < sp.size(); i++) {
806 array->Set<uint16_t>(i, sp[i]);
807 }
808 } else {
809 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
810 Span<uint8_t> sp(str->GetDataMUtf8() + start, utf16Length);
811 for (size_t i = 0; i < sp.size(); i++) {
812 array->Set<uint16_t>(i, sp[i]);
813 }
814 }
815
816 return array;
817 }
818
// Computes the polynomial hash h = h * 31 + c over the first 'size' elements of 'data'.
// The accumulation is done in uint32_t, so it wraps modulo 2^32; the final value is
// reinterpreted as int32_t. An empty range yields 0.
//
// The elements are read directly through the pointer: no helper object has to be declared under a
// compiler-specific preprocessor guard, so the function compiles the same way on every compiler.
template <class T>
static int32_t ComputeHashForData(const T *data, size_t size)
{
    constexpr size_t SHIFT = 5;
    uint32_t hash = 0;
    for (size_t i = 0; i < size; ++i) {
        // (hash << SHIFT) - hash is hash * 31
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        hash = (hash << SHIFT) - hash + data[i];
    }
    return static_cast<int32_t>(hash);
}
835
// Computes the polynomial hash h = h * 31 + byte over the bytes of 'mutf8Data' up to (not including)
// the first zero byte. The accumulation wraps modulo 2^32 and is returned as int32_t.
static int32_t ComputeHashForMutf8(const uint8_t *mutf8Data)
{
    constexpr uint32_t MULTIPLIER = 31;
    uint32_t accumulated = 0;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    for (const uint8_t *cursor = mutf8Data; *cursor != '\0'; ++cursor) {
        accumulated = accumulated * MULTIPLIER + *cursor;
    }
    return static_cast<int32_t>(accumulated);
}
845
ComputeHashcode()846 uint32_t String::ComputeHashcode()
847 {
848 uint32_t hash;
849 if (compressedStringsEnabled_) {
850 if (!IsUtf16()) {
851 hash = static_cast<uint32_t>(ComputeHashForData(GetDataMUtf8(), GetLength()));
852 } else {
853 hash = static_cast<uint32_t>(ComputeHashForData(GetDataUtf16(), GetLength()));
854 }
855 } else {
856 ASSERT(static_cast<size_t>(GetLength()) < (std::numeric_limits<size_t>::max() >> 1U));
857 hash = static_cast<uint32_t>(ComputeHashForData(GetDataUtf16(), GetLength()));
858 }
859 return hash;
860 }
861
862 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8Data,uint32_t utf16Length)863 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length)
864 {
865 bool canBeCompressed = CanBeCompressedMUtf8(mutf8Data);
866 return ComputeHashcodeMutf8(mutf8Data, utf16Length, canBeCompressed);
867 }
868
869 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8Data,uint32_t utf16Length,bool canBeCompressed)870 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8Data, uint32_t utf16Length, bool canBeCompressed)
871 {
872 uint32_t hash;
873 if (canBeCompressed) {
874 hash = static_cast<uint32_t>(ComputeHashForMutf8(mutf8Data));
875 } else {
876 // NOTE(alovkov): optimize it without allocation a temporary buffer
877 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
878 auto tmpBuffer = allocator->AllocArray<uint16_t>(utf16Length);
879 utf::ConvertMUtf8ToUtf16(mutf8Data, utf::Mutf8Size(mutf8Data), tmpBuffer);
880 hash = static_cast<uint32_t>(ComputeHashForData(tmpBuffer, utf16Length));
881 allocator->Delete(tmpBuffer);
882 }
883 return hash;
884 }
885
886 /* static */
ComputeHashcodeUtf16(const uint16_t * utf16Data,uint32_t length)887 uint32_t String::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length)
888 {
889 return ComputeHashForData(utf16Data, length);
890 }
891
892 /* static */
DoReplace(String * src,uint16_t oldC,uint16_t newC,const LanguageContext & ctx,PandaVM * vm)893 String *String::DoReplace(String *src, uint16_t oldC, uint16_t newC, const LanguageContext &ctx, PandaVM *vm)
894 {
895 ASSERT(src != nullptr);
896 auto length = static_cast<int32_t>(src->GetLength());
897 bool canBeCompressed = IsASCIICharacter(newC);
898 if (src->IsUtf16()) {
899 canBeCompressed = canBeCompressed && CanBeCompressedUtf16(src->GetDataUtf16(), length, oldC);
900 } else {
901 canBeCompressed = canBeCompressed && CanBeCompressedMUtf8(src->GetDataMUtf8(), length, oldC);
902 }
903
904 // allocator may trig gc and move src, need to hold it
905 auto thread = ManagedThread::GetCurrent();
906 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
907 VMHandle<String> srcHandle(thread, src);
908 ASSERT(srcHandle.GetPtr() != nullptr);
909 auto string = AllocStringObject(length, canBeCompressed, ctx, vm);
910 if (string == nullptr) {
911 return nullptr;
912 }
913
914 // retrieve src after gc
915 src = srcHandle.GetPtr();
916 ASSERT(string->hashcode_ == 0);
917
918 // After replacing we should have a full barrier, so this writes should happen-before barrier
919 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
920 if (src->IsUtf16()) {
921 if (canBeCompressed) {
922 auto replace = [oldC, newC](uint16_t c) { return static_cast<uint8_t>((oldC != c) ? c : newC); };
923 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
924 std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataMUtf8(), replace);
925 } else {
926 auto replace = [oldC, newC](uint16_t c) { return (oldC != c) ? c : newC; };
927 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
928 std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataUtf16(), replace);
929 }
930 } else {
931 if (canBeCompressed) {
932 auto replace = [oldC, newC](uint16_t c) { return static_cast<uint8_t>((oldC != c) ? c : newC); };
933 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
934 std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataMUtf8(), replace);
935 } else {
936 auto replace = [oldC, newC](uint16_t c) { return (oldC != c) ? c : newC; };
937 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
938 std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataUtf16(), replace);
939 }
940 }
941 TSAN_ANNOTATE_IGNORE_WRITES_END();
942 // String is supposed to be a constant object, so all its data should be visible by all threads
943 arch::FullMemoryBarrier();
944 return string;
945 }
946
947 /* static */
FastSubString(String * src,uint32_t start,uint32_t utf16Length,const LanguageContext & ctx,PandaVM * vm)948 String *String::FastSubString(String *src, uint32_t start, uint32_t utf16Length, const LanguageContext &ctx,
949 PandaVM *vm)
950 {
951 ASSERT(src != nullptr);
952 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
953 bool canBeCompressed = !src->IsUtf16() || CanBeCompressed(src->GetDataUtf16() + start, utf16Length);
954
955 // allocator may trig gc and move src, need to hold it
956 auto thread = ManagedThread::GetCurrent();
957 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
958 VMHandle<String> srcHandle(thread, src);
959 ASSERT(srcHandle.GetPtr() != nullptr);
960 auto string = AllocStringObject(utf16Length, canBeCompressed, ctx, vm);
961 if (string == nullptr) {
962 return nullptr;
963 }
964
965 // retrieve src after gc
966 src = srcHandle.GetPtr();
967 ASSERT(string->hashcode_ == 0);
968
969 // After copying we should have a full barrier, so this writes should happen-before barrier
970 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
971 if (src->IsUtf16()) {
972 if (canBeCompressed) {
973 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
974 CopyUtf16AsMUtf8(src->GetDataUtf16() + start, string->GetDataMUtf8(), utf16Length);
975 } else {
976 memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()),
977 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
978 src->GetDataUtf16() + start, utf16Length << 1UL);
979 }
980 } else {
981 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
982 memcpy_s(string->GetDataMUtf8(), string->GetLength(), src->GetDataMUtf8() + start, utf16Length);
983 }
984 TSAN_ANNOTATE_IGNORE_WRITES_END();
985 // String is supposed to be a constant object, so all its data should be visible by all threads
986 arch::FullMemoryBarrier();
987 return string;
988 }
989
990 /* static */
Concat(String * string1,String * string2,const LanguageContext & ctx,PandaVM * vm)991 String *String::Concat(String *string1, String *string2, const LanguageContext &ctx, PandaVM *vm)
992 {
993 ASSERT(string1 != nullptr);
994 ASSERT(string2 != nullptr);
995 // allocator may trig gc and move src, need to hold it
996 auto thread = ManagedThread::GetCurrent();
997 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
998 VMHandle<String> str1Handle(thread, string1);
999 VMHandle<String> str2Handle(thread, string2);
1000 ASSERT(str1Handle.GetPtr() != nullptr);
1001 ASSERT(str2Handle.GetPtr() != nullptr);
1002 uint32_t length1 = string1->GetLength();
1003 uint32_t length2 = string2->GetLength();
1004 uint32_t newLength = length1 + length2;
1005 bool compressed = compressedStringsEnabled_ && (!string1->IsUtf16() && !string2->IsUtf16());
1006 auto newString = AllocStringObject(newLength, compressed, ctx, vm);
1007 if (UNLIKELY(newString == nullptr)) {
1008 return nullptr;
1009 }
1010
1011 ASSERT(newString->hashcode_ == 0);
1012
1013 // retrieve strings after gc
1014 string1 = str1Handle.GetPtr();
1015 string2 = str2Handle.GetPtr();
1016
1017 // After copying we should have a full barrier, so this writes should happen-before barrier
1018 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
1019 if (compressed) {
1020 Span<uint8_t> sp(newString->GetDataMUtf8(), newLength);
1021 memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataMUtf8(), length1);
1022 sp = sp.SubSpan(length1);
1023 memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataMUtf8(), length2);
1024 } else {
1025 Span<uint16_t> sp(newString->GetDataUtf16(), newLength);
1026 if (!string1->IsUtf16()) {
1027 for (uint32_t i = 0; i < length1; ++i) {
1028 sp[i] = string1->At<false>(i);
1029 }
1030 } else {
1031 memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataUtf16(), length1 << 1U);
1032 }
1033 sp = sp.SubSpan(length1);
1034 if (!string2->IsUtf16()) {
1035 for (uint32_t i = 0; i < length2; ++i) {
1036 sp[i] = string2->At<false>(i);
1037 }
1038 } else {
1039 memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataUtf16(), length2 << 1U);
1040 }
1041 }
1042 TSAN_ANNOTATE_IGNORE_WRITES_END();
1043 // String is supposed to be a constant object, so all its data should be visible by all threads
1044 arch::FullMemoryBarrier();
1045
1046 return newString;
1047 }
1048
1049 /* static */
AllocStringObject(size_t length,bool compressed,const LanguageContext & ctx,PandaVM * vm,bool movable,bool pinned)1050 String *String::AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm, bool movable,
1051 bool pinned)
1052 {
1053 ASSERT(vm != nullptr);
1054 auto *thread = ManagedThread::GetCurrent();
1055 auto *stringClass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::STRING);
1056 size_t size = compressed ? String::ComputeSizeMUtf8(length) : String::ComputeSizeUtf16(length);
1057 auto string =
1058 movable
1059 ? reinterpret_cast<String *>(
1060 vm->GetHeapManager()->AllocateObject(stringClass, size, DEFAULT_ALIGNMENT, thread,
1061 mem::ObjectAllocatorBase::ObjMemInitPolicy::NO_INIT, pinned))
1062 : reinterpret_cast<String *>(vm->GetHeapManager()->AllocateNonMovableObject(
1063 // CC-OFFNXT(G.FMT.06) project code style
1064 stringClass, size, DEFAULT_ALIGNMENT, thread, mem::ObjectAllocatorBase::ObjMemInitPolicy::NO_INIT));
1065 if (string != nullptr) {
1066 // After setting length we should have a full barrier, so this write should happens-before barrier
1067 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
1068 string->SetLength(length, compressed);
1069 string->SetHashcode(0);
1070 TSAN_ANNOTATE_IGNORE_WRITES_END();
1071 // Witout full memory barrier it is possible that architectures with weak memory order can try fetching string
1072 // legth before it's set
1073 arch::FullMemoryBarrier();
1074 }
1075 return string;
1076 }
1077
1078 } // namespace ark::coretypes
1079