1 /**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include <cstddef>
17 #include <cstring>
18 #include <limits>
19
20 #include "libpandabase/utils/hash.h"
21 #include "libpandabase/utils/span.h"
22 #include "runtime/arch/memory_helpers.h"
23 #include "runtime/include/coretypes/array.h"
24 #include "runtime/include/coretypes/string-inl.h"
25 #include "runtime/include/runtime.h"
26 #include "runtime/handle_base-inl.h"
27 #include "runtime/include/panda_vm.h"
28
29 namespace panda::coretypes {
30
// Switch consulted by the CanBeCompressed* helpers, Concat and ComputeHashcode in this file:
// when it is false the CanBeCompressed* helpers report that data cannot be compressed.
bool String::compressed_strings_enabled = true;
32
33 /* static */
CreateFromString(String * str,const LanguageContext & ctx,PandaVM * vm)34 String *String::CreateFromString(String *str, const LanguageContext &ctx, PandaVM *vm)
35 {
36 ASSERT(str != nullptr);
37 // allocator may trig gc and move str, need to hold it
38 auto thread = ManagedThread::GetCurrent();
39 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
40 VMHandle<String> str_handle(thread, str);
41 auto string = AllocStringObject(str_handle->GetLength(), !str_handle->IsUtf16(), ctx, vm);
42 if (string == nullptr) {
43 return nullptr;
44 }
45
46 // retrive str after gc
47 str = str_handle.GetPtr();
48 string->length_ = str->length_;
49 string->hashcode_ = str->hashcode_;
50
51 uint32_t length = str->GetLength();
52 // After memcpy we should have a full barrier, so this writes should happen-before barrier
53 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
54 if (str->IsUtf16()) {
55 memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), str->GetDataUtf16(),
56 ComputeDataSizeUtf16(length));
57 } else {
58 memcpy_s(string->GetDataMUtf8(), string->GetLength(), str->GetDataMUtf8(), length);
59 }
60 TSAN_ANNOTATE_IGNORE_WRITES_END();
61 // String is supposed to be a constant object, so all its data should be visible by all threads
62 arch::FullMemoryBarrier();
63
64 return string;
65 }
66
67 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,size_t mutf8_length,uint32_t utf16_length,bool can_be_compressed,const LanguageContext & ctx,PandaVM * vm,bool movable)68 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length,
69 bool can_be_compressed, const LanguageContext &ctx, PandaVM *vm, bool movable)
70 {
71 auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm, movable);
72 if (string == nullptr) {
73 return nullptr;
74 }
75
76 ASSERT(string->hashcode_ == 0);
77 // After copying we should have a full barrier, so this writes should happen-before barrier
78 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
79 if (can_be_compressed) {
80 memcpy_s(string->GetDataMUtf8(), string->GetLength(), mutf8_data, utf16_length);
81 } else {
82 utf::ConvertMUtf8ToUtf16(mutf8_data, mutf8_length, string->GetDataUtf16());
83 }
84 TSAN_ANNOTATE_IGNORE_WRITES_END();
85 // String is supposed to be a constant object, so all its data should be visible by all threads
86 arch::FullMemoryBarrier();
87 return string;
88 }
89
90 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,uint32_t utf16_length,const LanguageContext & ctx,PandaVM * vm,bool movable)91 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, const LanguageContext &ctx,
92 PandaVM *vm, bool movable)
93 {
94 bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data);
95 return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, can_be_compressed, ctx, vm, movable);
96 }
97
98 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,uint32_t utf16_length,bool can_be_compressed,const LanguageContext & ctx,PandaVM * vm,bool movable)99 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed,
100 const LanguageContext &ctx, PandaVM *vm, bool movable)
101 {
102 return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, can_be_compressed, ctx, vm, movable);
103 }
104
105 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,const LanguageContext & ctx,PandaVM * vm,bool movable)106 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, const LanguageContext &ctx, PandaVM *vm, bool movable)
107 {
108 size_t mutf8_length = utf::Mutf8Size(mutf8_data);
109 size_t utf16_length = utf::MUtf8ToUtf16Size(mutf8_data, mutf8_length);
110 bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data);
111 return CreateFromMUtf8(mutf8_data, mutf8_length, utf16_length, can_be_compressed, ctx, vm, movable);
112 }
113
114 /* static */
CreateFromUtf16(const uint16_t * utf16_data,uint32_t utf16_length,const LanguageContext & ctx,PandaVM * vm,bool movable)115 String *String::CreateFromUtf16(const uint16_t *utf16_data, uint32_t utf16_length, const LanguageContext &ctx,
116 PandaVM *vm, bool movable)
117 {
118 bool can_be_compressed = CanBeCompressed(utf16_data, utf16_length);
119 auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm, movable);
120 if (string == nullptr) {
121 return nullptr;
122 }
123
124 ASSERT(string->hashcode_ == 0);
125 // After copying we should have a full barrier, so this writes should happen-before barrier
126 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
127 if (can_be_compressed) {
128 CopyUtf16AsMUtf8(utf16_data, string->GetDataMUtf8(), utf16_length);
129 } else {
130 memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), utf16_data, utf16_length << 1UL);
131 }
132 TSAN_ANNOTATE_IGNORE_WRITES_END();
133 // String is supposed to be a constant object, so all its data should be visible by all threads
134 arch::FullMemoryBarrier();
135 return string;
136 }
137
138 /* static */
CreateEmptyString(const LanguageContext & ctx,PandaVM * vm)139 String *String::CreateEmptyString(const LanguageContext &ctx, PandaVM *vm)
140 {
141 uint16_t data = 0;
142 return CreateFromUtf16(&data, 0, ctx, vm);
143 }
144
145 /* static */
CopyUtf16AsMUtf8(const uint16_t * utf16_from,uint8_t * mutf8_to,uint32_t utf16_length)146 void String::CopyUtf16AsMUtf8(const uint16_t *utf16_from, uint8_t *mutf8_to, uint32_t utf16_length)
147 {
148 Span<const uint16_t> from(utf16_from, utf16_length);
149 Span<uint8_t> to(mutf8_to, utf16_length);
150 for (uint32_t i = 0; i < utf16_length; i++) {
151 to[i] = from[i];
152 }
153 }
154
155 // static
CreateNewStringFromChars(uint32_t offset,uint32_t length,Array * chararray,const LanguageContext & ctx,PandaVM * vm)156 String *String::CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, const LanguageContext &ctx,
157 PandaVM *vm)
158 {
159 ASSERT(chararray != nullptr);
160 // allocator may trig gc and move array, need to hold it
161 auto thread = ManagedThread::GetCurrent();
162 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
163 VMHandle<Array> array_handle(thread, chararray);
164
165 // NOLINTNEXTLINE(readability-identifier-naming)
166 const uint16_t *src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(chararray->GetData()) + (offset << 1UL));
167 bool can_be_compressed = CanBeCompressed(src, length);
168 auto string = AllocStringObject(length, can_be_compressed, ctx, vm);
169 if (string == nullptr) {
170 return nullptr;
171 }
172
173 // retrieve src since gc may move it
174 src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(array_handle->GetData()) + (offset << 1UL));
175 ASSERT(string->hashcode_ == 0);
176 // After copying we should have a full barrier, so this writes should happen-before barrier
177 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
178 if (can_be_compressed) {
179 CopyUtf16AsMUtf8(src, string->GetDataMUtf8(), length);
180 } else {
181 memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), src, length << 1UL);
182 }
183 TSAN_ANNOTATE_IGNORE_WRITES_END();
184 // String is supposed to be a constant object, so all its data should be visible by all threads
185 arch::FullMemoryBarrier();
186 return string;
187 }
188
189 // static
CreateNewStringFromBytes(uint32_t offset,uint32_t length,uint32_t high_byte,Array * bytearray,const LanguageContext & ctx,PandaVM * vm)190 String *String::CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t high_byte, Array *bytearray,
191 const LanguageContext &ctx, PandaVM *vm)
192 {
193 ASSERT(length != 0);
194 ASSERT(bytearray != nullptr);
195 // allocator may trig gc and move array, need to hold it
196 auto thread = ManagedThread::GetCurrent();
197 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
198 VMHandle<Array> array_handle(thread, bytearray);
199
200 constexpr size_t BYTE_MASK = 0xFF;
201
202 // NOLINTNEXTLINE(readability-identifier-naming)
203 const uint8_t *src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(bytearray->GetData()) + offset);
204 high_byte &= BYTE_MASK;
205 bool can_be_compressed = CanBeCompressedMUtf8(src, length) && (high_byte == 0);
206 auto string = AllocStringObject(length, can_be_compressed, ctx, vm);
207 if (string == nullptr) {
208 return nullptr;
209 }
210
211 // retrieve src since gc may move it
212 src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(array_handle->GetData()) + offset);
213 ASSERT(string->hashcode_ == 0);
214 // After copying we should have a full barrier, so this writes should happen-before barrier
215 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
216 if (can_be_compressed) {
217 Span<const uint8_t> from(src, length);
218 Span<uint8_t> to(string->GetDataMUtf8(), length);
219 for (uint32_t i = 0; i < length; ++i) {
220 to[i] = (from[i] & BYTE_MASK);
221 }
222 } else {
223 Span<const uint8_t> from(src, length);
224 Span<uint16_t> to(string->GetDataUtf16(), length);
225 for (uint32_t i = 0; i < length; ++i) {
226 to[i] = (high_byte << 8U) + (from[i] & BYTE_MASK);
227 }
228 }
229 TSAN_ANNOTATE_IGNORE_WRITES_END();
230
231 // String is supposed to be a constant object, so all its data should be visible by all threads
232 arch::FullMemoryBarrier();
233 return string;
234 }
235
236 template <typename T1, typename T2>
CompareStringSpan(Span<T1> & lhs_sp,Span<T2> & rhs_sp,int32_t count)237 int32_t CompareStringSpan(Span<T1> &lhs_sp, Span<T2> &rhs_sp, int32_t count)
238 {
239 for (int32_t i = 0; i < count; ++i) {
240 int32_t char_diff = static_cast<int32_t>(lhs_sp[i]) - static_cast<int32_t>(rhs_sp[i]);
241 if (char_diff != 0) {
242 return char_diff;
243 }
244 }
245 return 0;
246 }
247
/**
 * Compares the first @p min_count elements of two arrays.
 *
 * Equal prefixes are skipped one machine word (sizeof(size_t) bytes) at a time; the remaining elements,
 * starting with the first word that differs, are then compared one by one.
 * The words are loaded with std::memcpy instead of dereferencing a casted size_t pointer, so the function
 * does not depend on the alignment of the inputs and does not violate strict aliasing.
 *
 * @param lstr_pt   left array
 * @param rstr_pt   right array
 * @param min_count number of elements to compare; values <= 0 compare nothing
 * @return the difference (left - right) of the first pair of elements that differ, or 0 if none differ
 */
template <typename T>
int32_t CompareBytesBlock(T *lstr_pt, T *rstr_pt, int32_t min_count)
{
    constexpr size_t WORD_SIZE = sizeof(size_t);
    static_assert(WORD_SIZE >= sizeof(T));
    static_assert(WORD_SIZE % sizeof(T) == 0);
    if (min_count <= 0) {
        return 0;
    }

    const auto count = static_cast<size_t>(min_count);
    // Byte positions are kept in size_t so that count * sizeof(T) cannot be truncated to a negative int32_t
    const size_t total_bytes = count * sizeof(T);
    const auto *lhs_bytes = reinterpret_cast<const unsigned char *>(lstr_pt);
    const auto *rhs_bytes = reinterpret_cast<const unsigned char *>(rstr_pt);

    size_t byte_pos = 0;
    while (total_bytes - byte_pos >= WORD_SIZE) {
        size_t lhs_word;
        size_t rhs_word;
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        std::memcpy(&lhs_word, lhs_bytes + byte_pos, WORD_SIZE);
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        std::memcpy(&rhs_word, rhs_bytes + byte_pos, WORD_SIZE);
        if (lhs_word != rhs_word) {
            break;
        }
        byte_pos += WORD_SIZE;
    }

    // byte_pos is always a multiple of WORD_SIZE and therefore of sizeof(T)
    for (size_t idx = byte_pos / sizeof(T); idx < count; ++idx) {
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        int32_t char_diff = static_cast<int32_t>(lstr_pt[idx]) - static_cast<int32_t>(rstr_pt[idx]);
        if (char_diff != 0) {
            return char_diff;
        }
    }

    return 0;
}
278
Compare(String * rstr)279 int32_t String::Compare(String *rstr)
280 {
281 String *lstr = this;
282 if (lstr == rstr) {
283 return 0;
284 }
285 ASSERT(lstr->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
286 ASSERT(rstr->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
287 auto lstr_leng = static_cast<int32_t>(lstr->GetLength());
288 auto rstr_leng = static_cast<int32_t>(rstr->GetLength());
289 int32_t leng_ret = lstr_leng - rstr_leng;
290 int32_t min_count = (leng_ret < 0) ? lstr_leng : rstr_leng;
291 bool lstr_isUtf16 = lstr->IsUtf16();
292 bool rstr_isUtf16 = rstr->IsUtf16();
293 if (!lstr_isUtf16 && !rstr_isUtf16) {
294 int32_t char_diff = CompareBytesBlock(lstr->GetDataMUtf8(), rstr->GetDataMUtf8(), min_count);
295 if (char_diff != 0) {
296 return char_diff;
297 }
298 } else if (!lstr_isUtf16) {
299 Span<uint8_t> lhs_sp(lstr->GetDataMUtf8(), lstr_leng);
300 Span<uint16_t> rhs_sp(rstr->GetDataUtf16(), rstr_leng);
301 int32_t char_diff = CompareStringSpan(lhs_sp, rhs_sp, min_count);
302 if (char_diff != 0) {
303 return char_diff;
304 }
305 } else if (!rstr_isUtf16) {
306 Span<uint16_t> lhs_sp(lstr->GetDataUtf16(), lstr_leng);
307 Span<uint8_t> rhs_sp(rstr->GetDataMUtf8(), rstr_leng);
308 int32_t char_diff = CompareStringSpan(lhs_sp, rhs_sp, min_count);
309 if (char_diff != 0) {
310 return char_diff;
311 }
312 } else {
313 int32_t char_diff = CompareBytesBlock(lstr->GetDataUtf16(), rstr->GetDataUtf16(), min_count);
314 if (char_diff != 0) {
315 return char_diff;
316 }
317 }
318 return leng_ret;
319 }
320
321 /* static */
322 template <typename T1, typename T2>
IndexOf(Span<const T1> & lhs_sp,Span<const T2> & rhs_sp,int32_t pos,int32_t max)323 int32_t String::IndexOf(Span<const T1> &lhs_sp, Span<const T2> &rhs_sp, int32_t pos, int32_t max)
324 {
325 auto first = static_cast<int32_t>(rhs_sp[0]);
326 for (int32_t i = pos; i <= max; i++) {
327 if (static_cast<int32_t>(lhs_sp[i]) != first) {
328 i++;
329 while (i <= max && static_cast<int32_t>(lhs_sp[i]) != first) {
330 i++;
331 }
332 }
333 /* Found first character, now look at the rest of rhs_sp */
334 if (i <= max) {
335 int j = i + 1;
336 int end = j + rhs_sp.size() - 1;
337
338 for (int k = 1; j < end && static_cast<int32_t>(lhs_sp[j]) == static_cast<int32_t>(rhs_sp[k]); j++, k++) {
339 }
340 if (j == end) {
341 /* Found whole string. */
342 return i;
343 }
344 }
345 }
346 return -1;
347 }
348
IndexOf(String * rhs,int32_t pos)349 int32_t String::IndexOf(String *rhs, int32_t pos)
350 {
351 if (rhs == nullptr) {
352 return -1;
353 }
354 String *lhs = this;
355 ASSERT(lhs->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
356 ASSERT(rhs->GetLength() <= static_cast<uint32_t>(std::numeric_limits<int32_t>::max()));
357 auto lhs_count = static_cast<int32_t>(lhs->GetLength());
358 auto rhs_count = static_cast<int32_t>(rhs->GetLength());
359
360 if (rhs_count == 0) {
361 return pos;
362 }
363
364 if (pos >= lhs_count) {
365 return -1;
366 }
367
368 if (pos < 0) {
369 pos = 0;
370 }
371
372 int32_t max = lhs_count - rhs_count;
373 if (rhs->IsMUtf8() && lhs->IsMUtf8()) {
374 Span<const uint8_t> lhs_sp(lhs->GetDataMUtf8(), lhs_count);
375 Span<const uint8_t> rhs_sp(rhs->GetDataMUtf8(), rhs_count);
376 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
377 } else if (rhs->IsUtf16() && lhs->IsUtf16()) { // NOLINT(readability-else-after-return)
378 Span<const uint16_t> lhs_sp(lhs->GetDataUtf16(), lhs_count);
379 Span<const uint16_t> rhs_sp(rhs->GetDataUtf16(), rhs_count);
380 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
381 } else if (rhs->IsUtf16()) {
382 Span<const uint8_t> lhs_sp(lhs->GetDataMUtf8(), lhs_count);
383 Span<const uint16_t> rhs_sp(rhs->GetDataUtf16(), rhs_count);
384 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
385 } else { // NOLINT(readability-else-after-return)
386 Span<const uint16_t> lhs_sp(lhs->GetDataUtf16(), lhs_count);
387 Span<const uint8_t> rhs_sp(rhs->GetDataMUtf8(), rhs_count);
388 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
389 }
390
391 return -1;
392 }
393
394 /* static */
CanBeCompressed(const uint16_t * utf16_data,uint32_t utf16_length)395 bool String::CanBeCompressed(const uint16_t *utf16_data, uint32_t utf16_length)
396 {
397 if (!compressed_strings_enabled) {
398 return false;
399 }
400 bool is_compressed = true;
401 Span<const uint16_t> data(utf16_data, utf16_length);
402 for (uint32_t i = 0; i < utf16_length; i++) {
403 if (!IsASCIICharacter(data[i])) {
404 is_compressed = false;
405 break;
406 }
407 }
408 return is_compressed;
409 }
410
411 // static
CanBeCompressedMUtf8(const uint8_t * mutf8_data,uint32_t mutf8_length)412 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length)
413 {
414 if (!compressed_strings_enabled) {
415 return false;
416 }
417 bool is_compressed = true;
418 Span<const uint8_t> data(mutf8_data, mutf8_length);
419 for (uint32_t i = 0; i < mutf8_length; i++) {
420 if (!IsASCIICharacter(data[i])) {
421 is_compressed = false;
422 break;
423 }
424 }
425 return is_compressed;
426 }
427
428 // static
CanBeCompressedMUtf8(const uint8_t * mutf8_data)429 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data)
430 {
431 return compressed_strings_enabled ? utf::IsMUtf8OnlySingleBytes(mutf8_data) : false;
432 }
433
434 /* static */
CanBeCompressedUtf16(const uint16_t * utf16_data,uint32_t utf16_length,uint16_t non)435 bool String::CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non)
436 {
437 if (!compressed_strings_enabled) {
438 return false;
439 }
440 bool is_compressed = true;
441 Span<const uint16_t> data(utf16_data, utf16_length);
442 for (uint32_t i = 0; i < utf16_length; i++) {
443 if (!IsASCIICharacter(data[i]) && data[i] != non) {
444 is_compressed = false;
445 break;
446 }
447 }
448 return is_compressed;
449 }
450
451 /* static */
CanBeCompressedMUtf8(const uint8_t * mutf8_data,uint32_t mutf8_length,uint16_t non)452 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length, uint16_t non)
453 {
454 if (!compressed_strings_enabled) {
455 return false;
456 }
457 bool is_compressed = true;
458 Span<const uint8_t> data(mutf8_data, mutf8_length);
459 for (uint32_t i = 0; i < mutf8_length; i++) {
460 if (!IsASCIICharacter(data[i]) && data[i] != non) {
461 is_compressed = false;
462 break;
463 }
464 }
465 return is_compressed;
466 }
467
468 /* static */
StringsAreEqual(String * str1,String * str2)469 bool String::StringsAreEqual(String *str1, String *str2)
470 {
471 ASSERT(str1 != nullptr);
472 ASSERT(str2 != nullptr);
473
474 if ((str1->IsUtf16() != str2->IsUtf16()) || (str1->GetLength() != str2->GetLength())) {
475 return false;
476 }
477
478 if (str1->IsUtf16()) {
479 Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
480 Span<const uint16_t> data2(str2->GetDataUtf16(), str1->GetLength());
481 return String::StringsAreEquals(data1, data2);
482 } else { // NOLINT(readability-else-after-return)
483 Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
484 Span<const uint8_t> data2(str2->GetDataMUtf8(), str1->GetLength());
485 return String::StringsAreEquals(data1, data2);
486 }
487 }
488
489 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8_data,uint32_t utf16_length)490 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length)
491 {
492 if (str1->GetLength() != utf16_length) {
493 return false;
494 }
495 bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data);
496 return StringsAreEqualMUtf8(str1, mutf8_data, utf16_length, can_be_compressed);
497 }
498
499 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8_data,uint32_t utf16_length,bool can_be_compressed)500 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length,
501 bool can_be_compressed)
502 {
503 bool result = true;
504 if (str1->GetLength() != utf16_length) {
505 result = false;
506 } else {
507 bool str1_can_be_compressed = !str1->IsUtf16();
508 bool data2_can_be_compressed = can_be_compressed;
509 if (str1_can_be_compressed != data2_can_be_compressed) {
510 return false;
511 }
512
513 ASSERT(str1_can_be_compressed == data2_can_be_compressed);
514 if (str1_can_be_compressed) {
515 Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
516 Span<const uint8_t> data2(mutf8_data, utf16_length);
517 result = String::StringsAreEquals(data1, data2);
518 } else {
519 result = IsMutf8EqualsUtf16(mutf8_data, str1->GetDataUtf16(), str1->GetLength());
520 }
521 }
522 return result;
523 }
524
525 /* static */
StringsAreEqualUtf16(String * str1,const uint16_t * utf16_data,uint32_t utf16_data_length)526 bool String::StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length)
527 {
528 bool result = true;
529 if (str1->GetLength() != utf16_data_length) {
530 result = false;
531 } else if (!str1->IsUtf16()) {
532 result = IsMutf8EqualsUtf16(str1->GetDataMUtf8(), str1->GetLength(), utf16_data, utf16_data_length);
533 } else {
534 Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
535 Span<const uint16_t> data2(utf16_data, utf16_data_length);
536 result = String::StringsAreEquals(data1, data2);
537 }
538 return result;
539 }
540
541 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8_data,uint32_t utf8_data_length,const uint16_t * utf16_data,uint32_t utf16_data_length)542 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8_data, uint32_t utf8_data_length, const uint16_t *utf16_data,
543 uint32_t utf16_data_length)
544 {
545 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
546 auto tmp_buffer = allocator->AllocArray<uint16_t>(utf16_data_length);
547 [[maybe_unused]] auto converted_string_size =
548 utf::ConvertRegionMUtf8ToUtf16(utf8_data, tmp_buffer, utf8_data_length, utf16_data_length, 0);
549 ASSERT(converted_string_size == utf16_data_length);
550
551 Span<const uint16_t> data1(tmp_buffer, utf16_data_length);
552 Span<const uint16_t> data2(utf16_data, utf16_data_length);
553 bool result = String::StringsAreEquals(data1, data2);
554 allocator->Delete(tmp_buffer);
555 return result;
556 }
557
558 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8_data,const uint16_t * utf16_data,uint32_t utf16_data_length)559 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8_data, const uint16_t *utf16_data, uint32_t utf16_data_length)
560 {
561 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
562 auto tmp_buffer = allocator->AllocArray<uint16_t>(utf16_data_length);
563 utf::ConvertMUtf8ToUtf16(utf8_data, utf::Mutf8Size(utf8_data), tmp_buffer);
564
565 Span<const uint16_t> data1(tmp_buffer, utf16_data_length);
566 Span<const uint16_t> data2(utf16_data, utf16_data_length);
567 bool result = String::StringsAreEquals(data1, data2);
568 allocator->Delete(tmp_buffer);
569 return result;
570 }
571
572 /* static */
573 template <typename T>
StringsAreEquals(Span<const T> & str1,Span<const T> & str2)574 bool String::StringsAreEquals(Span<const T> &str1, Span<const T> &str2)
575 {
576 return std::memcmp(str1.Data(), str2.Data(), str1.SizeBytes()) == 0;
577 }
578
ToCharArray(const LanguageContext & ctx)579 Array *String::ToCharArray(const LanguageContext &ctx)
580 {
581 // allocator may trig gc and move 'this', need to hold it
582 auto thread = ManagedThread::GetCurrent();
583 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
584 VMHandle<String> str(thread, this);
585 auto *klass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::ARRAY_U16);
586 Array *array = Array::Create(klass, GetLength());
587 if (array == nullptr) {
588 return nullptr;
589 }
590
591 if (str->IsUtf16()) {
592 Span<uint16_t> sp(str->GetDataUtf16(), str->GetLength());
593 for (size_t i = 0; i < sp.size(); i++) {
594 array->Set<uint16_t>(i, sp[i]);
595 }
596 } else {
597 Span<uint8_t> sp(str->GetDataMUtf8(), str->GetLength());
598 for (size_t i = 0; i < sp.size(); i++) {
599 array->Set<uint16_t>(i, sp[i]);
600 }
601 }
602
603 return array;
604 }
605
/**
 * Computes the polynomial hash h = 31 * h + c over @p size elements of @p data, in 32-bit unsigned
 * arithmetic (wrapping on overflow).
 *
 * The elements are read directly through the pointer. The previous version declared its Span inside
 * an "#if defined(__GNUC__)" section while using it unconditionally, which could not compile when
 * __GNUC__ was not defined.
 *
 * @param data first element to hash
 * @param size number of elements to hash; 0 yields a hash of 0
 * @return the hash reinterpreted as int32_t
 */
template <class T>
static int32_t ComputeHashForData(const T *data, size_t size)
{
    constexpr size_t SHIFT = 5;
    uint32_t hash = 0;
    for (size_t idx = 0; idx < size; ++idx) {
        // (hash << 5) - hash == hash * 31
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        hash = (hash << SHIFT) - hash + data[idx];
    }
    return static_cast<int32_t>(hash);
}
622
/**
 * Computes the polynomial hash h = 31 * h + c over the bytes of a zero-terminated buffer,
 * in 32-bit unsigned arithmetic. The terminating zero byte is not hashed.
 *
 * @param mutf8_data zero-terminated input
 * @return the hash reinterpreted as int32_t
 */
static int32_t ComputeHashForMutf8(const uint8_t *mutf8_data)
{
    constexpr size_t SHIFT = 5;
    uint32_t acc = 0;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    for (const uint8_t *cursor = mutf8_data; *cursor != '\0'; ++cursor) {
        // (acc << 5) - acc == acc * 31
        acc = (acc << SHIFT) - acc + *cursor;
    }
    return static_cast<int32_t>(acc);
}
632
ComputeHashcode()633 uint32_t String::ComputeHashcode()
634 {
635 uint32_t hash;
636 if (compressed_strings_enabled) {
637 if (!IsUtf16()) {
638 hash = static_cast<uint32_t>(ComputeHashForData(GetDataMUtf8(), GetLength()));
639 } else {
640 hash = static_cast<uint32_t>(ComputeHashForData(GetDataUtf16(), GetLength()));
641 }
642 } else {
643 ASSERT(static_cast<size_t>(GetLength()) > (std::numeric_limits<size_t>::max() >> 1U));
644 hash = static_cast<uint32_t>(ComputeHashForData(GetDataUtf16(), GetLength()));
645 }
646 return hash;
647 }
648
649 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8_data,uint32_t utf16_length)650 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length)
651 {
652 bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data);
653 return ComputeHashcodeMutf8(mutf8_data, utf16_length, can_be_compressed);
654 }
655
656 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8_data,uint32_t utf16_length,bool can_be_compressed)657 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed)
658 {
659 uint32_t hash;
660 if (can_be_compressed) {
661 hash = static_cast<uint32_t>(ComputeHashForMutf8(mutf8_data));
662 } else {
663 // TODO(alovkov): optimize it without allocation a temporary buffer
664 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
665 auto tmp_buffer = allocator->AllocArray<uint16_t>(utf16_length);
666 utf::ConvertMUtf8ToUtf16(mutf8_data, utf::Mutf8Size(mutf8_data), tmp_buffer);
667 hash = static_cast<uint32_t>(ComputeHashForData(tmp_buffer, utf16_length));
668 allocator->Delete(tmp_buffer);
669 }
670 return hash;
671 }
672
673 /* static */
ComputeHashcodeUtf16(const uint16_t * utf16_data,uint32_t length)674 uint32_t String::ComputeHashcodeUtf16(const uint16_t *utf16_data, uint32_t length)
675 {
676 return ComputeHashForData(utf16_data, length);
677 }
678
679 /* static */
DoReplace(String * src,uint16_t old_c,uint16_t new_c,const LanguageContext & ctx,PandaVM * vm)680 String *String::DoReplace(String *src, uint16_t old_c, uint16_t new_c, const LanguageContext &ctx, PandaVM *vm)
681 {
682 ASSERT(src != nullptr);
683 auto length = static_cast<int32_t>(src->GetLength());
684 bool can_be_compressed = IsASCIICharacter(new_c);
685 if (src->IsUtf16()) {
686 can_be_compressed = can_be_compressed && CanBeCompressedUtf16(src->GetDataUtf16(), length, old_c);
687 } else {
688 can_be_compressed = can_be_compressed && CanBeCompressedMUtf8(src->GetDataMUtf8(), length, old_c);
689 }
690
691 // allocator may trig gc and move src, need to hold it
692 auto thread = ManagedThread::GetCurrent();
693 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
694 VMHandle<String> src_handle(thread, src);
695 auto string = AllocStringObject(length, can_be_compressed, ctx, vm);
696 if (string == nullptr) {
697 return nullptr;
698 }
699
700 // retrieve src after gc
701 src = src_handle.GetPtr();
702 ASSERT(string->hashcode_ == 0);
703
704 // After replacing we should have a full barrier, so this writes should happen-before barrier
705 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
706 if (src->IsUtf16()) {
707 if (can_be_compressed) {
708 auto replace = [old_c, new_c](uint16_t c) { return static_cast<uint8_t>((old_c != c) ? c : new_c); };
709 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
710 std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataMUtf8(), replace);
711 } else {
712 auto replace = [old_c, new_c](uint16_t c) { return (old_c != c) ? c : new_c; };
713 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
714 std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataUtf16(), replace);
715 }
716 } else {
717 if (can_be_compressed) {
718 auto replace = [old_c, new_c](uint16_t c) { return static_cast<uint8_t>((old_c != c) ? c : new_c); };
719 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
720 std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataMUtf8(), replace);
721 } else {
722 auto replace = [old_c, new_c](uint16_t c) { return (old_c != c) ? c : new_c; };
723 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
724 std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataUtf16(), replace);
725 }
726 }
727 TSAN_ANNOTATE_IGNORE_WRITES_END();
728 // String is supposed to be a constant object, so all its data should be visible by all threads
729 arch::FullMemoryBarrier();
730 return string;
731 }
732
733 /* static */
FastSubString(String * src,uint32_t start,uint32_t utf16_length,const LanguageContext & ctx,PandaVM * vm)734 String *String::FastSubString(String *src, uint32_t start, uint32_t utf16_length, const LanguageContext &ctx,
735 PandaVM *vm)
736 {
737 ASSERT(src != nullptr);
738 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
739 bool can_be_compressed = !src->IsUtf16() || CanBeCompressed(src->GetDataUtf16() + start, utf16_length);
740
741 // allocator may trig gc and move src, need to hold it
742 auto thread = ManagedThread::GetCurrent();
743 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
744 VMHandle<String> src_handle(thread, src);
745 auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm);
746 if (string == nullptr) {
747 return nullptr;
748 }
749
750 // retrieve src after gc
751 src = src_handle.GetPtr();
752 ASSERT(string->hashcode_ == 0);
753
754 // After copying we should have a full barrier, so this writes should happen-before barrier
755 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
756 if (src->IsUtf16()) {
757 if (can_be_compressed) {
758 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
759 CopyUtf16AsMUtf8(src->GetDataUtf16() + start, string->GetDataMUtf8(), utf16_length);
760 } else {
761 memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()),
762 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
763 src->GetDataUtf16() + start, utf16_length << 1UL);
764 }
765 } else {
766 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
767 memcpy_s(string->GetDataMUtf8(), string->GetLength(), src->GetDataMUtf8() + start, utf16_length);
768 }
769 TSAN_ANNOTATE_IGNORE_WRITES_END();
770 // String is supposed to be a constant object, so all its data should be visible by all threads
771 arch::FullMemoryBarrier();
772 return string;
773 }
774
775 /* static */
Concat(String * string1,String * string2,const LanguageContext & ctx,PandaVM * vm)776 String *String::Concat(String *string1, String *string2, const LanguageContext &ctx, PandaVM *vm)
777 {
778 ASSERT(string1 != nullptr);
779 ASSERT(string2 != nullptr);
780 // allocator may trig gc and move src, need to hold it
781 auto thread = ManagedThread::GetCurrent();
782 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
783 VMHandle<String> str1_handle(thread, string1);
784 VMHandle<String> str2_handle(thread, string2);
785
786 uint32_t length1 = string1->GetLength();
787 uint32_t length2 = string2->GetLength();
788 uint32_t new_length = length1 + length2;
789 bool compressed = compressed_strings_enabled && (!string1->IsUtf16() && !string2->IsUtf16());
790 auto new_string = AllocStringObject(new_length, compressed, ctx, vm);
791 if (UNLIKELY(new_string == nullptr)) {
792 return nullptr;
793 }
794
795 ASSERT(new_string->hashcode_ == 0);
796
797 // retrieve strings after gc
798 string1 = str1_handle.GetPtr();
799 string2 = str2_handle.GetPtr();
800
801 // After copying we should have a full barrier, so this writes should happen-before barrier
802 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
803 if (compressed) {
804 Span<uint8_t> sp(new_string->GetDataMUtf8(), new_length);
805 memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataMUtf8(), length1);
806 sp = sp.SubSpan(length1);
807 memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataMUtf8(), length2);
808 } else {
809 Span<uint16_t> sp(new_string->GetDataUtf16(), new_length);
810 if (!string1->IsUtf16()) {
811 for (uint32_t i = 0; i < length1; ++i) {
812 sp[i] = string1->At<false>(i);
813 }
814 } else {
815 memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataUtf16(), length1 << 1U);
816 }
817 sp = sp.SubSpan(length1);
818 if (!string2->IsUtf16()) {
819 for (uint32_t i = 0; i < length2; ++i) {
820 sp[i] = string2->At<false>(i);
821 }
822 } else {
823 memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataUtf16(), length2 << 1U);
824 }
825 }
826 TSAN_ANNOTATE_IGNORE_WRITES_END();
827 // String is supposed to be a constant object, so all its data should be visible by all threads
828 arch::FullMemoryBarrier();
829
830 return new_string;
831 }
832
833 /* static */
AllocStringObject(size_t length,bool compressed,const LanguageContext & ctx,PandaVM * vm,bool movable)834 String *String::AllocStringObject(size_t length, bool compressed, const LanguageContext &ctx, PandaVM *vm, bool movable)
835 {
836 ASSERT(vm != nullptr);
837 auto *string_class = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::STRING);
838 size_t size = compressed ? String::ComputeSizeMUtf8(length) : String::ComputeSizeUtf16(length);
839 auto string = movable
840 ? reinterpret_cast<String *>(vm->GetHeapManager()->AllocateObject(string_class, size))
841 : reinterpret_cast<String *>(vm->GetHeapManager()->AllocateNonMovableObject(string_class, size));
842 if (string != nullptr) {
843 // After setting length we should have a full barrier, so this write should happens-before barrier
844 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
845 string->SetLength(length, compressed);
846 TSAN_ANNOTATE_IGNORE_WRITES_END();
847 // Witout full memory barrier it is possible that architectures with weak memory order can try fetching string
848 // legth before it's set
849 arch::FullMemoryBarrier();
850 }
851 return string;
852 }
853
854 } // namespace panda::coretypes
855