1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include <cstddef>
17 #include <cstring>
18 #include <limits>
19
20 #include "libpandabase/utils/hash.h"
21 #include "libpandabase/utils/span.h"
22 #include "runtime/arch/memory_helpers.h"
23 #include "runtime/include/coretypes/array.h"
24 #include "runtime/include/coretypes/string-inl.h"
25 #include "runtime/include/runtime.h"
26 #include "runtime/handle_base-inl.h"
27 #include "runtime/include/panda_vm.h"
28
29 namespace panda::coretypes {
30
// Switch for the compressed (one byte per character) string layout: while it is false, every CanBeCompressed*
// helper in this file reports false.
bool String::compressed_strings_enabled = true;
32
33 /* static */
CreateFromString(String * str,LanguageContext ctx,PandaVM * vm)34 String *String::CreateFromString(String *str, LanguageContext ctx, PandaVM *vm)
35 {
36 // Allocator may trig gc and move str, need to hold it
37 auto thread = ManagedThread::GetCurrent();
38 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
39 VMHandle<String> str_handle(thread, str);
40 auto string = AllocStringObject(str_handle->GetLength(), !str_handle->IsUtf16(), ctx, vm);
41 if (string == nullptr) {
42 return nullptr;
43 }
44
45 // Retrieve str after gc
46 str = str_handle.GetPtr();
47 string->length_ = str->length_;
48 string->hashcode_ = str->hashcode_;
49
50 uint32_t length = str->GetLength();
51 // After memcpy we should have a full barrier, so this writes should happen-before barrier
52 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
53 if (str->IsUtf16()) {
54 if (memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), str->GetDataUtf16(),
55 ComputeDataSizeUtf16(length)) != EOK) {
56 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
57 UNREACHABLE();
58 }
59 } else {
60 if (memcpy_s(string->GetDataMUtf8(), string->GetLength(), str->GetDataMUtf8(), length) != EOK) {
61 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
62 UNREACHABLE();
63 }
64 }
65 TSAN_ANNOTATE_IGNORE_WRITES_END();
66 // String is supposed to be a constant object, so all its data should be visible to all threads
67 arch::FullMemoryBarrier();
68
69 return string;
70 }
71
72 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,size_t mutf8_length,uint32_t utf16_length,bool can_be_compressed,LanguageContext ctx,PandaVM * vm,bool movable)73 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, size_t mutf8_length, uint32_t utf16_length,
74 bool can_be_compressed, LanguageContext ctx, PandaVM *vm, bool movable)
75 {
76 auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm, movable);
77 if (string == nullptr) {
78 return nullptr;
79 }
80
81 ASSERT(string->hashcode_ == 0);
82 // After copying we should have a full barrier, so this writes should happen-before barrier
83 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
84 if (can_be_compressed) {
85 if (utf16_length != 0 &&
86 memcpy_s(string->GetDataMUtf8(), string->GetLength(), mutf8_data, utf16_length) != EOK) {
87 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
88 UNREACHABLE();
89 }
90 } else {
91 utf::ConvertMUtf8ToUtf16(mutf8_data, mutf8_length, string->GetDataUtf16());
92 }
93 TSAN_ANNOTATE_IGNORE_WRITES_END();
94 // String is supposed to be a constant object, so all its data should be visible to all threads
95 arch::FullMemoryBarrier();
96 return string;
97 }
98
99 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,uint32_t utf16_length,LanguageContext ctx,PandaVM * vm,bool movable)100 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm,
101 bool movable)
102 {
103 bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data);
104 return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, can_be_compressed, ctx, vm, movable);
105 }
106
107 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,uint32_t utf16_length,bool can_be_compressed,LanguageContext ctx,PandaVM * vm,bool movable)108 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed,
109 LanguageContext ctx, PandaVM *vm, bool movable)
110 {
111 return CreateFromMUtf8(mutf8_data, utf::Mutf8Size(mutf8_data), utf16_length, can_be_compressed, ctx, vm, movable);
112 }
113
114 /* static */
CreateFromMUtf8(const uint8_t * mutf8_data,LanguageContext ctx,PandaVM * vm,bool movable)115 String *String::CreateFromMUtf8(const uint8_t *mutf8_data, LanguageContext ctx, PandaVM *vm, bool movable)
116 {
117 size_t mutf8_length = utf::Mutf8Size(mutf8_data);
118 size_t utf16_length = utf::MUtf8ToUtf16Size(mutf8_data, mutf8_length);
119 bool can_be_compressed = CanBeCompressedMUtf8(mutf8_data);
120 return CreateFromMUtf8(mutf8_data, mutf8_length, utf16_length, can_be_compressed, ctx, vm, movable);
121 }
122
123 /* static */
CreateFromUtf16(const uint16_t * utf16_data,uint32_t utf16_length,LanguageContext ctx,PandaVM * vm,bool movable)124 String *String::CreateFromUtf16(const uint16_t *utf16_data, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm,
125 bool movable)
126 {
127 bool can_be_compressed = CanBeCompressed(utf16_data, utf16_length);
128 auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm, movable);
129 if (string == nullptr) {
130 return nullptr;
131 }
132
133 ASSERT(string->hashcode_ == 0);
134 // After copying we should have a full barrier, so this writes should happen-before barrier
135 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
136 if (can_be_compressed) {
137 CopyUtf16AsMUtf8(utf16_data, string->GetDataMUtf8(), utf16_length);
138 } else {
139 if (utf16_length != 0 && memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), utf16_data,
140 utf16_length << 1UL) != EOK) {
141 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
142 UNREACHABLE();
143 }
144 }
145 TSAN_ANNOTATE_IGNORE_WRITES_END();
146 // String is supposed to be a constant object, so all its data should be visible to all threads
147 arch::FullMemoryBarrier();
148 return string;
149 }
150
151 /* static */
CreateEmptyString(LanguageContext ctx,PandaVM * vm)152 String *String::CreateEmptyString(LanguageContext ctx, PandaVM *vm)
153 {
154 uint16_t data = 0;
155 return CreateFromUtf16(&data, 0, ctx, vm);
156 }
157
158 /* static */
CopyUtf16AsMUtf8(const uint16_t * utf16_from,uint8_t * mutf8_to,uint32_t utf16_length)159 void String::CopyUtf16AsMUtf8(const uint16_t *utf16_from, uint8_t *mutf8_to, uint32_t utf16_length)
160 {
161 Span<const uint16_t> from(utf16_from, utf16_length);
162 Span<uint8_t> to(mutf8_to, utf16_length);
163 for (uint32_t i = 0; i < utf16_length; i++) {
164 to[i] = from[i];
165 }
166 }
167
168 // static
CreateNewStringFromChars(uint32_t offset,uint32_t length,Array * chararray,LanguageContext ctx,PandaVM * vm)169 String *String::CreateNewStringFromChars(uint32_t offset, uint32_t length, Array *chararray, LanguageContext ctx,
170 PandaVM *vm)
171 {
172 // Allocator may trig gc and move array, need to hold it
173 auto thread = ManagedThread::GetCurrent();
174 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
175 VMHandle<Array> array_handle(thread, chararray);
176
177 // NOLINTNEXTLINE(readability-identifier-naming)
178 const uint16_t *src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(chararray->GetData()) + (offset << 1UL));
179 bool can_be_compressed = CanBeCompressed(src, length);
180 auto string = AllocStringObject(length, can_be_compressed, ctx, vm);
181 if (string == nullptr) {
182 return nullptr;
183 }
184
185 // Retrieve src since gc may move it
186 src = reinterpret_cast<uint16_t *>(ToUintPtr<uint32_t>(array_handle->GetData()) + (offset << 1UL));
187 ASSERT(string->hashcode_ == 0);
188 // After copying we should have a full barrier, so this writes should happen-before barrier
189 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
190 if (can_be_compressed) {
191 CopyUtf16AsMUtf8(src, string->GetDataMUtf8(), length);
192 } else {
193 if (memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()), src, length << 1UL) != EOK) {
194 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
195 UNREACHABLE();
196 }
197 }
198 TSAN_ANNOTATE_IGNORE_WRITES_END();
199 // String is supposed to be a constant object, so all its data should be visible to all threads
200 arch::FullMemoryBarrier();
201 return string;
202 }
203
204 // static
CreateNewStringFromBytes(uint32_t offset,uint32_t length,uint32_t high_byte,Array * bytearray,LanguageContext ctx,PandaVM * vm)205 String *String::CreateNewStringFromBytes(uint32_t offset, uint32_t length, uint32_t high_byte, Array *bytearray,
206 LanguageContext ctx, PandaVM *vm)
207 {
208 // Allocator may trig gc and move array, need to hold it
209 auto thread = ManagedThread::GetCurrent();
210 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
211 VMHandle<Array> array_handle(thread, bytearray);
212
213 constexpr size_t BYTE_MASK = 0xFF;
214
215 // NOLINTNEXTLINE(readability-identifier-naming)
216 const uint8_t *src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(bytearray->GetData()) + offset);
217 high_byte &= BYTE_MASK;
218 bool can_be_compressed = CanBeCompressedMUtf8(src, length) && (high_byte == 0);
219 auto string = AllocStringObject(length, can_be_compressed, ctx, vm);
220 if (string == nullptr) {
221 return nullptr;
222 }
223
224 // Retrieve src since gc may move it
225 src = reinterpret_cast<uint8_t *>(ToUintPtr<uint32_t>(array_handle->GetData()) + offset);
226 ASSERT(string->hashcode_ == 0);
227 // After copying we should have a full barrier, so this writes should happen-before barrier
228 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
229 if (can_be_compressed) {
230 Span<const uint8_t> from(src, length);
231 Span<uint8_t> to(string->GetDataMUtf8(), length);
232 for (uint32_t i = 0; i < length; ++i) {
233 to[i] = (from[i] & BYTE_MASK);
234 }
235 } else {
236 Span<const uint8_t> from(src, length);
237 Span<uint16_t> to(string->GetDataUtf16(), length);
238 for (uint32_t i = 0; i < length; ++i) {
239 to[i] = (high_byte << 8U) + (from[i] & BYTE_MASK);
240 }
241 }
242 TSAN_ANNOTATE_IGNORE_WRITES_END();
243
244 // String is supposed to be a constant object, so all its data should be visible to all threads
245 arch::FullMemoryBarrier();
246 return string;
247 }
248
249 template <typename T1, typename T2>
CompareStringSpan(Span<T1> & lhs_sp,Span<T2> & rhs_sp,int32_t count)250 int32_t CompareStringSpan(Span<T1> &lhs_sp, Span<T2> &rhs_sp, int32_t count)
251 {
252 for (int32_t i = 0; i < count; ++i) {
253 int32_t char_diff = static_cast<int32_t>(lhs_sp[i]) - static_cast<int32_t>(rhs_sp[i]);
254 if (char_diff != 0) {
255 return char_diff;
256 }
257 }
258 return 0;
259 }
260
Compare(String * rstr)261 int32_t String::Compare(String *rstr)
262 {
263 String *lstr = this;
264 if (lstr == rstr) {
265 return 0;
266 }
267 int32_t lstr_leng = lstr->GetLength();
268 int32_t rstr_leng = rstr->GetLength();
269 int32_t leng_ret = lstr_leng - rstr_leng;
270 int32_t min_count = (leng_ret < 0) ? lstr_leng : rstr_leng;
271 if (!lstr->IsUtf16() && !rstr->IsUtf16()) {
272 Span<uint8_t> lhs_sp(lstr->GetDataMUtf8(), lstr_leng);
273 Span<uint8_t> rhs_sp(rstr->GetDataMUtf8(), rstr_leng);
274 int32_t char_diff = CompareStringSpan(lhs_sp, rhs_sp, min_count);
275 if (char_diff != 0) {
276 return char_diff;
277 }
278 } else if (!lstr->IsUtf16()) {
279 Span<uint8_t> lhs_sp(lstr->GetDataMUtf8(), lstr_leng);
280 Span<uint16_t> rhs_sp(rstr->GetDataUtf16(), rstr_leng);
281 int32_t char_diff = CompareStringSpan(lhs_sp, rhs_sp, min_count);
282 if (char_diff != 0) {
283 return char_diff;
284 }
285 } else if (!rstr->IsUtf16()) {
286 Span<uint16_t> lhs_sp(lstr->GetDataUtf16(), rstr_leng);
287 Span<uint8_t> rhs_sp(rstr->GetDataMUtf8(), lstr_leng);
288 int32_t char_diff = CompareStringSpan(lhs_sp, rhs_sp, min_count);
289 if (char_diff != 0) {
290 return char_diff;
291 }
292 } else {
293 Span<uint16_t> lhs_sp(lstr->GetDataUtf16(), lstr_leng);
294 Span<uint16_t> rhs_sp(rstr->GetDataUtf16(), rstr_leng);
295 int32_t char_diff = CompareStringSpan(lhs_sp, rhs_sp, min_count);
296 if (char_diff != 0) {
297 return char_diff;
298 }
299 }
300 return leng_ret;
301 }
302
303 /* static */
304 template <typename T1, typename T2>
IndexOf(Span<const T1> & lhs_sp,Span<const T2> & rhs_sp,int32_t pos,int32_t max)305 int32_t String::IndexOf(Span<const T1> &lhs_sp, Span<const T2> &rhs_sp, int32_t pos, int32_t max)
306 {
307 auto first = static_cast<int32_t>(rhs_sp[0]);
308 for (int32_t i = pos; i <= max; i++) {
309 if (static_cast<int32_t>(lhs_sp[i]) != first) {
310 i++;
311 while (i <= max && static_cast<int32_t>(lhs_sp[i]) != first) {
312 i++;
313 }
314 }
315 /* Found the first character, now look at the rest of rhs_sp */
316 if (i <= max) {
317 int j = i + 1;
318 int end = j + rhs_sp.size() - 1;
319
320 for (int k = 1; j < end && static_cast<int32_t>(lhs_sp[j]) == static_cast<int32_t>(rhs_sp[k]); j++, k++) {
321 }
322 if (j == end) {
323 /* Found whole string. */
324 return i;
325 }
326 }
327 }
328 return -1;
329 }
330
IndexOf(String * rhs,int32_t pos)331 int32_t String::IndexOf(String *rhs, int32_t pos)
332 {
333 if (rhs == nullptr) {
334 return -1;
335 }
336 String *lhs = this;
337 int32_t lhs_count = lhs->GetLength();
338 int32_t rhs_count = rhs->GetLength();
339
340 if (rhs_count == 0) {
341 return pos;
342 }
343
344 if (pos >= lhs_count) {
345 return -1;
346 }
347
348 if (pos < 0) {
349 pos = 0;
350 }
351
352 int32_t max = lhs_count - rhs_count;
353 if (rhs->IsMUtf8() && lhs->IsMUtf8()) {
354 Span<const uint8_t> lhs_sp(lhs->GetDataMUtf8(), lhs_count);
355 Span<const uint8_t> rhs_sp(rhs->GetDataMUtf8(), rhs_count);
356 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
357 } else if (rhs->IsUtf16() && lhs->IsUtf16()) { // NOLINT(readability-else-after-return)
358 Span<const uint16_t> lhs_sp(lhs->GetDataUtf16(), lhs_count);
359 Span<const uint16_t> rhs_sp(rhs->GetDataUtf16(), rhs_count);
360 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
361 } else if (rhs->IsUtf16()) {
362 Span<const uint8_t> lhs_sp(lhs->GetDataMUtf8(), lhs_count);
363 Span<const uint16_t> rhs_sp(rhs->GetDataUtf16(), rhs_count);
364 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
365 } else { // NOLINT(readability-else-after-return)
366 Span<const uint16_t> lhs_sp(lhs->GetDataUtf16(), lhs_count);
367 Span<const uint8_t> rhs_sp(rhs->GetDataMUtf8(), rhs_count);
368 return String::IndexOf(lhs_sp, rhs_sp, pos, max);
369 }
370
371 return -1;
372 }
373
374 /* static */
CanBeCompressed(const uint16_t * utf16_data,uint32_t utf16_length)375 bool String::CanBeCompressed(const uint16_t *utf16_data, uint32_t utf16_length)
376 {
377 if (!compressed_strings_enabled) {
378 return false;
379 }
380 bool is_compressed = true;
381 Span<const uint16_t> data(utf16_data, utf16_length);
382 for (uint32_t i = 0; i < utf16_length; i++) {
383 if (!IsASCIICharacter(data[i])) {
384 is_compressed = false;
385 break;
386 }
387 }
388 return is_compressed;
389 }
390
391 // static
CanBeCompressedMUtf8(const uint8_t * mutf8_data,uint32_t mutf8_length)392 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length)
393 {
394 if (!compressed_strings_enabled) {
395 return false;
396 }
397 bool is_compressed = true;
398 Span<const uint8_t> data(mutf8_data, mutf8_length);
399 for (uint32_t i = 0; i < mutf8_length; i++) {
400 if (!IsASCIICharacter(data[i])) {
401 is_compressed = false;
402 break;
403 }
404 }
405 return is_compressed;
406 }
407
408 /* static */
CanBeCompressedMUtf8(const uint8_t * mutf8_data)409 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data)
410 {
411 return compressed_strings_enabled ? utf::IsMUtf8OnlySingleBytes(mutf8_data) : false;
412 }
413
414 /* static */
CanBeCompressedUtf16(const uint16_t * utf16_data,uint32_t utf16_length,uint16_t non)415 bool String::CanBeCompressedUtf16(const uint16_t *utf16_data, uint32_t utf16_length, uint16_t non)
416 {
417 if (!compressed_strings_enabled) {
418 return false;
419 }
420 bool is_compressed = true;
421 Span<const uint16_t> data(utf16_data, utf16_length);
422 for (uint32_t i = 0; i < utf16_length; i++) {
423 if (!IsASCIICharacter(data[i]) && data[i] != non) {
424 is_compressed = false;
425 break;
426 }
427 }
428 return is_compressed;
429 }
430
431 /* static */
CanBeCompressedMUtf8(const uint8_t * mutf8_data,uint32_t mutf8_length,uint16_t non)432 bool String::CanBeCompressedMUtf8(const uint8_t *mutf8_data, uint32_t mutf8_length, uint16_t non)
433 {
434 if (!compressed_strings_enabled) {
435 return false;
436 }
437 bool is_compressed = true;
438 Span<const uint8_t> data(mutf8_data, mutf8_length);
439 for (uint32_t i = 0; i < mutf8_length; i++) {
440 if (!IsASCIICharacter(data[i]) && data[i] != non) {
441 is_compressed = false;
442 break;
443 }
444 }
445 return is_compressed;
446 }
447
448 /* static */
StringsAreEqual(String * str1,String * str2)449 bool String::StringsAreEqual(String *str1, String *str2)
450 {
451 if ((str1->IsUtf16() != str2->IsUtf16()) || (str1->GetLength() != str2->GetLength())) {
452 return false;
453 }
454
455 if (str1->IsUtf16()) {
456 Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
457 Span<const uint16_t> data2(str2->GetDataUtf16(), str1->GetLength());
458 return String::StringsAreEquals(data1, data2);
459 } else { // NOLINT(readability-else-after-return)
460 Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
461 Span<const uint8_t> data2(str2->GetDataMUtf8(), str1->GetLength());
462 return String::StringsAreEquals(data1, data2);
463 }
464 }
465
466 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8_data,uint32_t utf16_length)467 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length)
468 {
469 if (str1->GetLength() != utf16_length) {
470 return false;
471 }
472 return StringsAreEqualMUtf8(str1, mutf8_data, utf16_length, CanBeCompressedMUtf8(mutf8_data));
473 }
474
475 /* static */
StringsAreEqualMUtf8(String * str1,const uint8_t * mutf8_data,uint32_t utf16_length,bool can_be_compressed)476 bool String::StringsAreEqualMUtf8(String *str1, const uint8_t *mutf8_data, uint32_t utf16_length,
477 bool can_be_compressed)
478 {
479 bool result = true;
480 if (str1->GetLength() != utf16_length) {
481 result = false;
482 } else {
483 bool str1_can_be_compressed = !str1->IsUtf16();
484 if (str1_can_be_compressed != can_be_compressed) {
485 return false;
486 }
487
488 ASSERT(str1_can_be_compressed == can_be_compressed);
489 if (str1_can_be_compressed) {
490 Span<const uint8_t> data1(str1->GetDataMUtf8(), str1->GetLength());
491 Span<const uint8_t> data2(mutf8_data, utf16_length);
492 result = String::StringsAreEquals(data1, data2);
493 } else {
494 result = IsMutf8EqualsUtf16(mutf8_data, str1->GetDataUtf16(), str1->GetLength());
495 }
496 }
497 return result;
498 }
499
500 /* static */
StringsAreEqualUtf16(String * str1,const uint16_t * utf16_data,uint32_t utf16_data_length)501 bool String::StringsAreEqualUtf16(String *str1, const uint16_t *utf16_data, uint32_t utf16_data_length)
502 {
503 bool result = true;
504 if (str1->GetLength() != utf16_data_length) {
505 result = false;
506 } else if (!str1->IsUtf16()) {
507 result = IsMutf8EqualsUtf16(str1->GetDataMUtf8(), str1->GetLength(), utf16_data, utf16_data_length);
508 } else {
509 Span<const uint16_t> data1(str1->GetDataUtf16(), str1->GetLength());
510 Span<const uint16_t> data2(utf16_data, utf16_data_length);
511 result = String::StringsAreEquals(data1, data2);
512 }
513 return result;
514 }
515
516 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8_data,uint32_t utf8_data_length,const uint16_t * utf16_data,uint32_t utf16_data_length)517 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8_data, uint32_t utf8_data_length, const uint16_t *utf16_data,
518 uint32_t utf16_data_length)
519 {
520 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
521 auto tmp_buffer = allocator->AllocArray<uint16_t>(utf16_data_length);
522 [[maybe_unused]] auto converted_string_size =
523 utf::ConvertRegionMUtf8ToUtf16(utf8_data, tmp_buffer, utf8_data_length, utf16_data_length, 0);
524 ASSERT(converted_string_size == utf16_data_length);
525
526 Span<const uint16_t> data1(tmp_buffer, utf16_data_length);
527 Span<const uint16_t> data2(utf16_data, utf16_data_length);
528 bool result = String::StringsAreEquals(data1, data2);
529 allocator->Delete(tmp_buffer);
530 return result;
531 }
532
533 /* static */
IsMutf8EqualsUtf16(const uint8_t * utf8_data,const uint16_t * utf16_data,uint32_t utf16_data_length)534 bool String::IsMutf8EqualsUtf16(const uint8_t *utf8_data, const uint16_t *utf16_data, uint32_t utf16_data_length)
535 {
536 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
537 auto tmp_buffer = allocator->AllocArray<uint16_t>(utf16_data_length);
538 utf::ConvertMUtf8ToUtf16(utf8_data, utf::Mutf8Size(utf8_data), tmp_buffer);
539
540 Span<const uint16_t> data1(tmp_buffer, utf16_data_length);
541 Span<const uint16_t> data2(utf16_data, utf16_data_length);
542 bool result = String::StringsAreEquals(data1, data2);
543 allocator->Delete(tmp_buffer);
544 return result;
545 }
546
547 /* static */
548 template <typename T>
StringsAreEquals(Span<const T> & str1,Span<const T> & str2)549 bool String::StringsAreEquals(Span<const T> &str1, Span<const T> &str2)
550 {
551 for (size_t i = 0; i < str1.Size(); i++) {
552 if (str1[i] != str2[i]) {
553 return false;
554 }
555 }
556 return true;
557 }
558
ToCharArray(LanguageContext ctx)559 Array *String::ToCharArray(LanguageContext ctx)
560 {
561 // allocator may trig gc and move 'this', need to hold it
562 auto thread = ManagedThread::GetCurrent();
563 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
564 VMHandle<String> str(thread, this);
565 auto *klass = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::ARRAY_U16);
566 Array *array = Array::Create(klass, GetLength());
567 if (array == nullptr) {
568 return nullptr;
569 }
570
571 if (str->IsUtf16()) {
572 Span<uint16_t> sp(str->GetDataUtf16(), str->GetLength());
573 for (size_t i = 0; i < sp.size(); i++) {
574 array->Set<uint16_t>(i, sp[i]);
575 }
576 } else {
577 Span<uint8_t> sp(str->GetDataMUtf8(), str->GetLength());
578 for (size_t i = 0; i < sp.size(); i++) {
579 array->Set<uint16_t>(i, sp[i]);
580 }
581 }
582
583 return array;
584 }
585
// We need to use a Java-compatible hash algorithm because javac relies on it
// when compiling switch-case statements over strings
/**
 * Computes the polynomial hash h = 31 * h + c over |size| elements of |data|, starting from 0 and wrapping modulo
 * 2^32.
 *
 * The elements are read directly through the pointer: the previous version declared its Span only inside
 * `#if defined(__GNUC__)` while using it unconditionally, so it could not compile with other compilers.
 *
 * @param data pointer to the first element; not dereferenced when |size| is 0
 * @param size number of elements to hash
 * @return the hash reinterpreted as a signed 32-bit value
 */
template <class T>
static int32_t ComputeHashForData(const T *data, size_t size)
{
    constexpr size_t SHIFT = 5;
    uint32_t hash = 0;
    for (size_t i = 0; i < size; ++i) {
        // (hash << 5) - hash == hash * 31
        // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
        hash = (hash << SHIFT) - hash + data[i];
    }
    return static_cast<int32_t>(hash);
}
604
// Computes the polynomial hash h = 31 * h + byte over the bytes of |mutf8_data| up to, but not including, the
// terminating zero byte.
static int32_t ComputeHashForMutf8(const uint8_t *mutf8_data)
{
    constexpr size_t SHIFT = 5;
    uint32_t acc = 0;
    // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
    for (const uint8_t *cursor = mutf8_data; *cursor != '\0'; ++cursor) {
        // (acc << 5) - acc == acc * 31
        acc = (acc << SHIFT) - acc + *cursor;
    }
    return static_cast<int32_t>(acc);
}
614
ComputeHashcode()615 uint32_t String::ComputeHashcode()
616 {
617 uint32_t hash;
618 if (compressed_strings_enabled) {
619 if (!IsUtf16()) {
620 hash = ComputeHashForData(GetDataMUtf8(), GetLength());
621 } else {
622 hash = ComputeHashForData(GetDataUtf16(), GetLength());
623 }
624 } else {
625 ASSERT(static_cast<size_t>(GetLength()) > (std::numeric_limits<size_t>::max() >> 1U));
626 hash = ComputeHashForData(GetDataUtf16(), GetLength());
627 }
628 return hash;
629 }
630
631 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8_data,uint32_t utf16_length)632 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length)
633 {
634 return ComputeHashcodeMutf8(mutf8_data, utf16_length, CanBeCompressedMUtf8(mutf8_data));
635 }
636
637 /* static */
ComputeHashcodeMutf8(const uint8_t * mutf8_data,uint32_t utf16_length,bool can_be_compressed)638 uint32_t String::ComputeHashcodeMutf8(const uint8_t *mutf8_data, uint32_t utf16_length, bool can_be_compressed)
639 {
640 uint32_t hash;
641 if (can_be_compressed) {
642 hash = ComputeHashForMutf8(mutf8_data);
643 } else {
644 auto allocator = Runtime::GetCurrent()->GetInternalAllocator();
645 auto tmp_buffer = allocator->AllocArray<uint16_t>(utf16_length);
646 utf::ConvertMUtf8ToUtf16(mutf8_data, utf::Mutf8Size(mutf8_data), tmp_buffer);
647 hash = ComputeHashForData(tmp_buffer, utf16_length);
648 allocator->Delete(tmp_buffer);
649 }
650 return hash;
651 }
652
653 /* static */
ComputeHashcodeUtf16(uint16_t * utf16_data,uint32_t length)654 uint32_t String::ComputeHashcodeUtf16(uint16_t *utf16_data, uint32_t length)
655 {
656 return ComputeHashForData(utf16_data, length);
657 }
658
659 /* static */
DoReplace(String * src,uint16_t old_c,uint16_t new_c,LanguageContext ctx,PandaVM * vm)660 String *String::DoReplace(String *src, uint16_t old_c, uint16_t new_c, LanguageContext ctx, PandaVM *vm)
661 {
662 int32_t length = src->GetLength();
663 bool can_be_compressed = IsASCIICharacter(new_c);
664 if (src->IsUtf16()) {
665 can_be_compressed = can_be_compressed && CanBeCompressedUtf16(src->GetDataUtf16(), length, old_c);
666 } else {
667 can_be_compressed = can_be_compressed && CanBeCompressedMUtf8(src->GetDataMUtf8(), length, old_c);
668 }
669
670 // allocator may trig gc and move src, need to hold it
671 auto thread = ManagedThread::GetCurrent();
672 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
673 VMHandle<String> src_handle(thread, src);
674 auto string = AllocStringObject(length, can_be_compressed, ctx, vm);
675 if (string == nullptr) {
676 return nullptr;
677 }
678
679 // Retrieve src after gc
680 src = src_handle.GetPtr();
681 ASSERT(string->hashcode_ == 0);
682
683 // After replacing we should have a full barrier, so this writes should happen-before barrier
684 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
685 if (src->IsUtf16()) {
686 if (can_be_compressed) {
687 auto replace = [old_c, new_c](uint16_t c) { return static_cast<uint8_t>((old_c != c) ? c : new_c); };
688 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
689 std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataMUtf8(), replace);
690 } else {
691 auto replace = [old_c, new_c](uint16_t c) { return (old_c != c) ? c : new_c; };
692 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
693 std::transform(src->GetDataUtf16(), src->GetDataUtf16() + length, string->GetDataUtf16(), replace);
694 }
695 } else {
696 if (can_be_compressed) {
697 auto replace = [old_c, new_c](uint16_t c) { return static_cast<uint8_t>((old_c != c) ? c : new_c); };
698 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
699 std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataMUtf8(), replace);
700 } else {
701 auto replace = [old_c, new_c](uint16_t c) { return (old_c != c) ? c : new_c; };
702 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
703 std::transform(src->GetDataMUtf8(), src->GetDataMUtf8() + length, string->GetDataUtf16(), replace);
704 }
705 }
706 TSAN_ANNOTATE_IGNORE_WRITES_END();
707 // String is supposed to be a constant object, so all its data should be visible to all threads
708 arch::FullMemoryBarrier();
709 return string;
710 }
711
712 /* static */
FastSubString(String * src,uint32_t start,uint32_t utf16_length,LanguageContext ctx,PandaVM * vm)713 String *String::FastSubString(String *src, uint32_t start, uint32_t utf16_length, LanguageContext ctx, PandaVM *vm)
714 {
715 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
716 bool can_be_compressed = !src->IsUtf16() || CanBeCompressed(src->GetDataUtf16() + start, utf16_length);
717
718 // allocator may trig gc and move src, need to hold it
719 auto thread = ManagedThread::GetCurrent();
720 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
721 VMHandle<String> src_handle(thread, src);
722 auto string = AllocStringObject(utf16_length, can_be_compressed, ctx, vm);
723 if (string == nullptr) {
724 return nullptr;
725 }
726
727 // Retrieve src after gc
728 src = src_handle.GetPtr();
729 ASSERT(string->hashcode_ == 0);
730
731 // After copying we should have a full barrier, so this writes should happen-before barrier
732 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
733 if (src->IsUtf16()) {
734 if (can_be_compressed) {
735 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
736 CopyUtf16AsMUtf8(src->GetDataUtf16() + start, string->GetDataMUtf8(), utf16_length);
737 } else {
738 if (memcpy_s(string->GetDataUtf16(), ComputeDataSizeUtf16(string->GetLength()),
739 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
740 src->GetDataUtf16() + start, utf16_length << 1UL) != EOK) {
741 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
742 UNREACHABLE();
743 }
744 }
745 } else {
746 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
747 if (memcpy_s(string->GetDataMUtf8(), string->GetLength(), src->GetDataMUtf8() + start, utf16_length) != EOK) {
748 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
749 UNREACHABLE();
750 }
751 }
752 TSAN_ANNOTATE_IGNORE_WRITES_END();
753 // String is supposed to be a constant object, so all its data should be visible to all threads
754 arch::FullMemoryBarrier();
755 return string;
756 }
757
758 /* static */
Concat(String * string1,String * string2,LanguageContext ctx,PandaVM * vm)759 String *String::Concat(String *string1, String *string2, LanguageContext ctx, PandaVM *vm)
760 {
761 // allocator may trig gc and move src, need to hold it
762 auto thread = ManagedThread::GetCurrent();
763 [[maybe_unused]] HandleScope<ObjectHeader *> scope(thread);
764 VMHandle<String> str1_handle(thread, string1);
765 VMHandle<String> str2_handle(thread, string2);
766
767 uint32_t length1 = string1->GetLength();
768 uint32_t length2 = string2->GetLength();
769 uint32_t new_length = length1 + length2;
770 bool compressed = compressed_strings_enabled && (!string1->IsUtf16() && !string2->IsUtf16());
771 auto new_string = AllocStringObject(new_length, compressed, ctx, vm);
772 if (UNLIKELY(new_string == nullptr)) {
773 return nullptr;
774 }
775
776 ASSERT(new_string->hashcode_ == 0);
777
778 // Retrieve strings after gc
779 string1 = str1_handle.GetPtr();
780 string2 = str2_handle.GetPtr();
781
782 // After copying we should have a full barrier, so this writes should happen-before barrier
783 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
784 if (compressed) {
785 Span<uint8_t> sp(new_string->GetDataMUtf8(), new_length);
786 if (memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataMUtf8(), length1) != EOK) {
787 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
788 UNREACHABLE();
789 }
790 sp = sp.SubSpan(length1);
791 if (memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataMUtf8(), length2) != EOK) {
792 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
793 UNREACHABLE();
794 }
795 } else {
796 Span<uint16_t> sp(new_string->GetDataUtf16(), new_length);
797 if (!string1->IsUtf16()) {
798 for (uint32_t i = 0; i < length1; ++i) {
799 sp[i] = string1->At<false>(i);
800 }
801 } else {
802 if (memcpy_s(sp.Data(), sp.SizeBytes(), string1->GetDataUtf16(), length1 << 1U) != EOK) {
803 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
804 UNREACHABLE();
805 }
806 }
807 sp = sp.SubSpan(length1);
808 if (!string2->IsUtf16()) {
809 for (uint32_t i = 0; i < length2; ++i) {
810 sp[i] = string2->At<false>(i);
811 }
812 } else {
813 if (memcpy_s(sp.Data(), sp.SizeBytes(), string2->GetDataUtf16(), length2 << 1U) != EOK) {
814 LOG(FATAL, RUNTIME) << __func__ << " memcpy_s failed";
815 UNREACHABLE();
816 }
817 }
818 }
819 TSAN_ANNOTATE_IGNORE_WRITES_END();
820 // String is supposed to be a constant object, so all its data should be visible to all threads
821 arch::FullMemoryBarrier();
822
823 return new_string;
824 }
825
826 /* static */
AllocStringObject(size_t length,bool compressed,LanguageContext ctx,PandaVM * vm,bool movable)827 String *String::AllocStringObject(size_t length, bool compressed, LanguageContext ctx, PandaVM *vm, bool movable)
828 {
829 ASSERT(vm != nullptr);
830 auto *string_class = Runtime::GetCurrent()->GetClassLinker()->GetExtension(ctx)->GetClassRoot(ClassRoot::STRING);
831 size_t size = compressed ? String::ComputeSizeMUtf8(length) : String::ComputeSizeUtf16(length);
832 auto string = movable
833 ? reinterpret_cast<String *>(vm->GetHeapManager()->AllocateObject(string_class, size))
834 : reinterpret_cast<String *>(vm->GetHeapManager()->AllocateNonMovableObject(string_class, size));
835 if (string != nullptr) {
836 // After setting length we should have a full barrier, so this write should happens-before barrier
837 TSAN_ANNOTATE_IGNORE_WRITES_BEGIN();
838 string->SetLength(length, compressed);
839 TSAN_ANNOTATE_IGNORE_WRITES_END();
840 // Witout full memory barrier it is possible that architectures with weak memory order can try fetching string
841 // legth before it's set
842 arch::FullMemoryBarrier();
843 }
844 return string;
845 }
846
847 } // namespace panda::coretypes
848