1 /* 2 * Copyright (c) 2025 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #include <codecvt> 17 #include <locale> 18 19 #include "common_components/base/utf_helper.h" 20 #include "common_interfaces/objects/base_string.h" 21 22 #include "common_components/platform/string_hash.h" 23 #include "common_components/platform/string_hash_helper.h" 24 25 namespace common { 26 constexpr size_t LOW_3BITS = 0x7; 27 constexpr size_t LOW_4BITS = 0xF; 28 constexpr size_t LOW_5BITS = 0x1F; 29 constexpr size_t LOW_6BITS = 0x3F; 30 constexpr size_t L_SURROGATE_START = 0xDC00; 31 constexpr size_t H_SURROGATE_START = 0xD800; 32 constexpr size_t SURROGATE_RAIR_START = 0x10000; 33 constexpr size_t OFFSET_18POS = 18; 34 constexpr size_t OFFSET_12POS = 12; 35 constexpr size_t OFFSET_10POS = 10; 36 constexpr size_t OFFSET_6POS = 6; 37 DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer)38 size_t UtfUtils::DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, 39 size_t utf8Len, size_t start, bool modify, bool isWriteBuffer) 40 { 41 return common::utf_helper::DebuggerConvertRegionUtf16ToUtf8(utf16In, utf8Out, utf16Len, utf8Len, 42 start, modify, isWriteBuffer); 43 } 44 Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)45 size_t UtfUtils::Utf8ToUtf16Size(const uint8_t* utf8, size_t utf8Len) 46 { 47 return common::utf_helper::Utf8ToUtf16Size(utf8, utf8Len); 48 } 49 Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify,bool isGetBufferSize,bool cesu8)50 size_t UtfUtils::Utf16ToUtf8Size(const uint16_t* utf16, uint32_t length, bool modify, bool isGetBufferSize, 51 bool cesu8) 52 { 53 return common::utf_helper::Utf16ToUtf8Size(utf16, length, modify, isGetBufferSize, cesu8); 54 } 55 ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len)56 size_t UtfUtils::ConvertRegionUtf8ToUtf16(const uint8_t* utf8In, uint16_t* utf16Out, size_t utf8Len, 57 size_t utf16Len) 58 { 59 return common::utf_helper::ConvertRegionUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len); 60 } 61 ConvertRegionUtf16ToLatin1(const uint16_t * utf16In,uint8_t * latin1Out,size_t utf16Len,size_t latin1Len)62 size_t UtfUtils::ConvertRegionUtf16ToLatin1(const uint16_t* utf16In, uint8_t* latin1Out, size_t utf16Len, 63 size_t latin1Len) 64 { 65 return common::utf_helper::ConvertRegionUtf16ToLatin1(utf16In, latin1Out, utf16Len, latin1Len); 66 } 67 ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer,bool cesu)68 size_t UtfUtils::ConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, 69 size_t utf8Len, size_t start, bool modify, bool isWriteBuffer, bool cesu) 70 { 71 return common::utf_helper::ConvertRegionUtf16ToUtf8( 72 utf16In, utf8Out, utf16Len, utf8Len, start, modify, isWriteBuffer, cesu); 73 } 74 75 76 // To change the hash algorithm of BaseString, please modify BaseString::CalculateConcatHashCode 77 // and BaseStringHashHelper::ComputeHashForDataPlatform simultaneously!! 78 template<typename T> ComputeHashForDataInternal(const T * data,size_t size,uint32_t hashSeed)79 uint32_t ComputeHashForDataInternal(const T *data, size_t size, uint32_t hashSeed) 80 { 81 if (size <= static_cast<size_t>(StringHash::MIN_SIZE_FOR_UNROLLING)) { 82 uint32_t hash = hashSeed; 83 for (uint32_t i = 0; i < size; i++) { 84 hash = (hash << static_cast<uint32_t>(StringHash::HASH_SHIFT)) - hash + data[i]; 85 } 86 return hash; 87 } 88 return StringHashHelper::ComputeHashForDataPlatform(data, size, hashSeed); 89 } 90 ComputeHashForData(const uint8_t * data,size_t size,uint32_t hashSeed)91 PUBLIC_API uint32_t BaseString::ComputeHashForData(const uint8_t *data, size_t size, uint32_t hashSeed) 92 { 93 return ComputeHashForDataInternal(data, size, hashSeed); 94 } 95 ComputeHashForData(const uint16_t * data,size_t size,uint32_t hashSeed)96 PUBLIC_API uint32_t BaseString::ComputeHashForData(const uint16_t *data, size_t size, uint32_t hashSeed) 97 { 98 return ComputeHashForDataInternal(data, size, hashSeed); 99 } 100 ComputeHashcodeUtf8(const uint8_t * utf8Data,size_t utf8Len,bool canBeCompress)101 uint32_t BaseString::ComputeHashcodeUtf8(const uint8_t *utf8Data, size_t utf8Len, bool canBeCompress) 102 { 103 if (utf8Len == 0) { 104 return MixHashcode(0, NOT_INTEGER); 105 } 106 if (canBeCompress) { 107 uint32_t mixHash = 0; 108 // String using UTF8 encoding, and length smaller than 10, try to compute integer hash. 109 if (utf8Len < MAX_ELEMENT_INDEX_LEN && HashIntegerString(utf8Data, utf8Len, &mixHash, 0)) { 110 return mixHash; 111 } 112 uint32_t hash = ComputeHashForData(utf8Data, utf8Len, 0); 113 return MixHashcode(hash, NOT_INTEGER); 114 } 115 auto utf16Len = UtfUtils::Utf8ToUtf16Size(utf8Data, utf8Len); 116 std::vector<uint16_t> tmpBuffer(utf16Len); 117 [[maybe_unused]] auto len = UtfUtils::ConvertRegionUtf8ToUtf16(utf8Data, tmpBuffer.data(), utf8Len, 118 utf16Len); 119 DCHECK_CC(len == utf16Len); 120 uint32_t hash = ComputeHashForData(tmpBuffer.data(), utf16Len, 0); 121 return MixHashcode(hash, NOT_INTEGER); 122 } 123 124 /* static */ ComputeHashcodeUtf16(const uint16_t * utf16Data,uint32_t length)125 uint32_t BaseString::ComputeHashcodeUtf16(const uint16_t *utf16Data, uint32_t length) 126 { 127 if (length == 0) { 128 return MixHashcode(0, NOT_INTEGER); 129 } 130 uint32_t mixHash = 0; 131 // String length smaller than 10, try to compute integer hash. 132 if (length < MAX_ELEMENT_INDEX_LEN && HashIntegerString(utf16Data, length, &mixHash, 0)) { 133 return mixHash; 134 } 135 uint32_t hash = ComputeHashForData(utf16Data, length, 0); 136 return MixHashcode(hash, NOT_INTEGER); 137 } 138 139 140 // drop the tail bytes if the remain length can't fill the length it represents. FixUtf8Len(const uint8_t * utf8,size_t utf8Len)141 static size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len) 142 { 143 constexpr size_t TWO_BYTES_LENGTH = 2; 144 constexpr size_t THREE_BYTES_LENGTH = 3; 145 size_t trimSize = 0; 146 if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) { 147 // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one. 148 trimSize = 1; 149 } 150 if (utf8Len >= TWO_BYTES_LENGTH && utf8[utf8Len - TWO_BYTES_LENGTH] >= 0xE0) { 151 // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two. 152 trimSize = TWO_BYTES_LENGTH; 153 } 154 if (utf8Len >= THREE_BYTES_LENGTH && utf8[utf8Len - THREE_BYTES_LENGTH] >= 0xF0) { 155 // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last 156 // three. 157 trimSize = THREE_BYTES_LENGTH; 158 } 159 return utf8Len - trimSize; 160 } 161 162 /* static */ IsUtf8EqualsUtf16(const uint8_t * utf8Data,size_t utf8Len,const uint16_t * utf16Data,uint32_t utf16Len)163 bool BaseString::IsUtf8EqualsUtf16(const uint8_t* utf8Data, size_t utf8Len, 164 const uint16_t* utf16Data, uint32_t utf16Len) 165 { 166 size_t safeUtf8Len = FixUtf8Len(utf8Data, utf8Len); 167 const uint8_t* utf8End = utf8Data + utf8Len; 168 const uint8_t* utf8SafeEnd = utf8Data + safeUtf8Len; 169 const uint16_t* utf16End = utf16Data + utf16Len; 170 while (utf8Data < utf8SafeEnd && utf16Data < utf16End) { 171 uint8_t src = *utf8Data; 172 switch (src & 0xF0) { 173 case 0xF0: 174 { 175 const uint8_t c2 = *(++utf8Data); 176 const uint8_t c3 = *(++utf8Data); 177 const uint8_t c4 = *(++utf8Data); 178 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) | 179 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS); 180 if (codePoint >= SURROGATE_RAIR_START) { 181 if (utf16Data >= utf16End - 1) { 182 return false; 183 } 184 codePoint -= SURROGATE_RAIR_START; 185 if (*utf16Data++ != static_cast<uint16_t>((codePoint >> OFFSET_10POS) | 186 H_SURROGATE_START)) { 187 return false; 188 } else if (*utf16Data++ != static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START)) { 189 return false; 190 } 191 } else { 192 if (*utf16Data++ != static_cast<uint16_t>(codePoint)) { 193 return false; 194 } 195 } 196 utf8Data++; 197 break; 198 } 199 case 0xE0: 200 { 201 const uint8_t c2 = *(++utf8Data); 202 const uint8_t c3 = *(++utf8Data); 203 if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) | 204 ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS))) { 205 return false; 206 } 207 utf8Data++; 208 break; 209 } 210 case 0xD0: 211 case 0xC0: 212 { 213 const uint8_t c2 = *(++utf8Data); 214 if (*utf16Data++ != static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & 215 LOW_6BITS))) { 216 return false; 217 } 218 utf8Data++; 219 break; 220 } 221 default: 222 do { 223 if (*utf16Data++ != static_cast<uint16_t>(*utf8Data++)) { 224 return false; 225 } 226 } 227 while (utf8Data < utf8SafeEnd && utf16Data < utf16End && *utf8Data < 0x80); 228 break; 229 } 230 } 231 // The remain chars should be treated as single byte char. 232 while (utf8Data < utf8End && utf16Data < utf16End) { 233 if (*utf16Data++ != static_cast<uint16_t>(*utf8Data++)) { 234 return false; 235 } 236 } 237 return utf8Data == utf8End && utf16Data == utf16End; 238 } 239 240 // static 241 template <typename T1, typename T2> CalculateDataConcatHashCode(const T1 * dataFirst,size_t sizeFirst,const T2 * dataSecond,size_t sizeSecond)242 uint32_t BaseString::CalculateDataConcatHashCode(const T1* dataFirst, size_t sizeFirst, 243 const T2* dataSecond, size_t sizeSecond) 244 { 245 uint32_t totalHash = ComputeHashForData(dataFirst, sizeFirst, 0); 246 totalHash = ComputeHashForData(dataSecond, sizeSecond, totalHash); 247 return MixHashcode(totalHash, NOT_INTEGER); 248 } 249 250 template 251 uint32_t BaseString::CalculateDataConcatHashCode<uint8_t, uint8_t>(const uint8_t* dataFirst, size_t sizeFirst, 252 const uint8_t* dataSecond, size_t sizeSecond); 253 template 254 uint32_t BaseString::CalculateDataConcatHashCode<uint16_t, uint16_t>(const uint16_t* dataFirst, size_t sizeFirst, 255 const uint16_t* dataSecond, size_t sizeSecond); 256 template 257 uint32_t BaseString::CalculateDataConcatHashCode<uint8_t, uint16_t>(const uint8_t* dataFirst, size_t sizeFirst, 258 const uint16_t* dataSecond, size_t sizeSecond); 259 template 260 uint32_t BaseString::CalculateDataConcatHashCode<uint16_t, uint8_t>(const uint16_t* dataFirst, size_t sizeFirst, 261 const uint8_t* dataSecond, size_t sizeSecond); 262 263 CanBeCompressed(const BaseString * string)264 bool BaseString::CanBeCompressed(const BaseString* string) 265 { 266 DCHECK_CC(string->IsLineString()); 267 if (string->IsUtf8()) { 268 return CanBeCompressed(string->GetDataUtf8(), string->GetLength()); 269 } 270 return CanBeCompressed(string->GetDataUtf16(), string->GetLength()); 271 } 272 273 // static CanBeCompressed(const uint8_t * utf8Data,uint32_t utf8Len)274 bool BaseString::CanBeCompressed(const uint8_t* utf8Data, uint32_t utf8Len) 275 { 276 uint32_t index = 0; 277 for (; index + 4 <= utf8Len; index += 4) { 278 // 4: process the data in chunks of 4 elements to improve speed 279 // Check if all four characters in the current block are ASCII characters 280 if (!IsASCIICharacter(utf8Data[index]) || 281 !IsASCIICharacter(utf8Data[index + 1]) || // 1: the second element of the block 282 !IsASCIICharacter(utf8Data[index + 2]) || // 2: the third element of the block 283 !IsASCIICharacter(utf8Data[index + 3])) { 284 // 3: the fourth element of the block 285 return false; 286 } 287 } 288 // Check remaining characters if they are ASCII 289 for (; index < utf8Len; ++index) { 290 if (!IsASCIICharacter(utf8Data[index])) { 291 return false; 292 } 293 } 294 return true; 295 } 296 297 /* static */ CanBeCompressed(const uint16_t * utf16Data,uint32_t utf16Len)298 bool BaseString::CanBeCompressed(const uint16_t* utf16Data, uint32_t utf16Len) 299 { 300 uint32_t index = 0; 301 for (; index + 4 <= utf16Len; index += 4) { 302 // 4: process the data in chunks of 4 elements to improve speed 303 // Check if all four characters in the current block are ASCII characters 304 if (!IsASCIICharacter(utf16Data[index]) || 305 !IsASCIICharacter(utf16Data[index + 1]) || // 1: the second element of the block 306 !IsASCIICharacter(utf16Data[index + 2]) || // 2: the third element of the block 307 !IsASCIICharacter(utf16Data[index + 3])) { 308 // 3: the fourth element of the block 309 return false; 310 } 311 } 312 // Check remaining characters if they are ASCII 313 for (; index < utf16Len; ++index) { 314 if (!IsASCIICharacter(utf16Data[index])) { 315 return false; 316 } 317 } 318 return true; 319 } 320 321 IsASCIICharacter(uint16_t data)322 bool BaseString::IsASCIICharacter(uint16_t data) 323 { 324 if (data == 0) { 325 return false; 326 } 327 // \0 is not considered ASCII in Ecma-Modified-UTF8 [only modify '\u0000'] 328 return data <= UtfUtils::UTF8_1B_MAX; 329 } 330 331 332 /* static */ 333 template <typename T1, typename T2> IndexOf(Span<const T1> & lhsSp,Span<const T2> & rhsSp,int32_t pos,int32_t max)334 int32_t BaseString::IndexOf(Span<const T1>& lhsSp, Span<const T2>& rhsSp, int32_t pos, int32_t max) 335 { 336 DCHECK_CC(rhsSp.size() > 0); 337 auto first = static_cast<int32_t>(rhsSp[0]); 338 for (int32_t i = pos; i <= max; i++) { 339 if (static_cast<int32_t>(lhsSp[i]) != first) { 340 i++; 341 while (i <= max && static_cast<int32_t>(lhsSp[i]) != first) { 342 i++; 343 } 344 } 345 /* Found first character, now look at the rest of rhsSp */ 346 if (i <= max) { 347 int j = i + 1; 348 int end = j + static_cast<int>(rhsSp.size()) - 1; 349 350 for (int k = 1; j < end && static_cast<int32_t>(lhsSp[j]) == static_cast<int32_t>(rhsSp[k]); j++, k++) { 351 } 352 if (j == end) { 353 /* Found whole string. */ 354 return i; 355 } 356 } 357 } 358 return -1; 359 } 360 361 template 362 int32_t BaseString::IndexOf<uint8_t, uint8_t>(Span<const uint8_t>& lhsSp, Span<const uint8_t>& rhsSp, int32_t pos, 363 int32_t max); 364 template 365 int32_t BaseString::IndexOf<uint16_t, uint16_t>(Span<const uint16_t>& lhsSp, Span<const uint16_t>& rhsSp, 366 int32_t pos, int32_t max); 367 368 template 369 int32_t BaseString::IndexOf<uint8_t, uint16_t>(Span<const uint8_t>& lhsSp, Span<const uint16_t>& rhsSp, int32_t pos, 370 int32_t max); 371 372 template 373 int32_t BaseString::IndexOf<uint16_t, uint8_t>(Span<const uint16_t>& lhsSp, Span<const uint8_t>& rhsSp, int32_t pos, 374 int32_t max); 375 376 377 template <typename T1, typename T2> LastIndexOf(Span<const T1> & lhsSp,Span<const T2> & rhsSp,int32_t pos)378 int32_t BaseString::LastIndexOf(Span<const T1>& lhsSp, Span<const T2>& rhsSp, int32_t pos) 379 { 380 int rhsSize = static_cast<int>(rhsSp.size()); 381 DCHECK_CC(rhsSize > 0); 382 auto first = rhsSp[0]; 383 for (int32_t i = pos; i >= 0; i--) { 384 if (lhsSp[i] != first) { 385 continue; 386 } 387 /* Found first character, now look at the rest of rhsSp */ 388 int j = 1; 389 while (j < rhsSize) { 390 if (rhsSp[j] != lhsSp[i + j]) { 391 break; 392 } 393 j++; 394 } 395 if (j == rhsSize) { 396 return i; 397 } 398 } 399 return -1; 400 } 401 402 template 403 int32_t BaseString::LastIndexOf<uint8_t, uint8_t>(Span<const uint8_t>& lhsSp, Span<const uint8_t>& rhsSp, 404 int32_t pos); 405 template 406 int32_t BaseString::LastIndexOf<uint16_t, uint16_t>(Span<const uint16_t>& lhsSp, Span<const uint16_t>& rhsSp, 407 int32_t pos); 408 template 409 int32_t BaseString::LastIndexOf<uint8_t, uint16_t>(Span<const uint8_t>& lhsSp, Span<const uint16_t>& rhsSp, 410 int32_t pos); 411 template 412 int32_t BaseString::LastIndexOf<uint16_t, uint8_t>(Span<const uint16_t>& lhsSp, Span<const uint8_t>& rhsSp, 413 int32_t pos); 414 415 416 template <typename T1, typename T2> CompareStringSpan(Span<T1> & lhsSp,Span<T2> & rhsSp,int32_t count)417 int32_t CompareStringSpan(Span<T1>& lhsSp, Span<T2>& rhsSp, int32_t count) 418 { 419 for (int32_t i = 0; i < count; ++i) { 420 auto left = static_cast<int32_t>(lhsSp[i]); 421 auto right = static_cast<int32_t>(rhsSp[i]); 422 if (left != right) { 423 return left - right; 424 } 425 } 426 return 0; 427 } 428 429 template 430 int32_t CompareStringSpan<const uint8_t, const uint8_t>(Span<const uint8_t>& lhsSp, Span<const uint8_t>& rhsSp, 431 int32_t count); 432 template 433 int32_t CompareStringSpan<const uint16_t, const uint16_t>(Span<const uint16_t>& lhsSp, Span<const uint16_t>& rhsSp, 434 int32_t count); 435 template 436 int32_t CompareStringSpan<const uint8_t, const uint16_t>(Span<const uint8_t>& lhsSp, Span<const uint16_t>& rhsSp, 437 int32_t count); 438 template 439 int32_t CompareStringSpan<const uint16_t, const uint8_t>(Span<const uint16_t>& lhsSp, Span<const uint8_t>& rhsSp, 440 int32_t count); 441 442 443 template <typename T1, typename T2> IsSubStringAtSpan(Span<T1> & lhsSp,Span<T2> & rhsSp,uint32_t offset)444 bool IsSubStringAtSpan(Span<T1>& lhsSp, Span<T2>& rhsSp, uint32_t offset) 445 { 446 int rhsSize = static_cast<int>(rhsSp.size()); 447 DCHECK_CC(rhsSize + offset <= lhsSp.size()); 448 for (int i = 0; i < rhsSize; ++i) { 449 auto left = static_cast<int32_t>(lhsSp[offset + static_cast<uint32_t>(i)]); 450 auto right = static_cast<int32_t>(rhsSp[i]); 451 if (left != right) { 452 return false; 453 } 454 } 455 return true; 456 } 457 458 template 459 bool IsSubStringAtSpan<const uint8_t, const uint8_t>(Span<const uint8_t>& lhsSp, Span<const uint8_t>& rhsSp, 460 uint32_t offset); 461 template 462 bool IsSubStringAtSpan<const uint16_t, const uint16_t>(Span<const uint16_t>& lhsSp, Span<const uint16_t>& rhsSp, 463 uint32_t offset); 464 template 465 bool IsSubStringAtSpan<const uint8_t, const uint16_t>(Span<const uint8_t>& lhsSp, Span<const uint16_t>& rhsSp, 466 uint32_t offset); 467 template 468 bool IsSubStringAtSpan<const uint16_t, const uint8_t>(Span<const uint16_t>& lhsSp, Span<const uint8_t>& rhsSp, 469 uint32_t offset); 470 471 Utf16ToU16String(const uint16_t * utf16Data,uint32_t dataLen)472 std::u16string Utf16ToU16String(const uint16_t* utf16Data, uint32_t dataLen) 473 { 474 auto* char16tData = reinterpret_cast<const char16_t*>(utf16Data); 475 std::u16string u16str(char16tData, dataLen); 476 return u16str; 477 } 478 Utf8ToU16String(const uint8_t * utf8Data,uint32_t dataLen)479 std::u16string Utf8ToU16String(const uint8_t* utf8Data, uint32_t dataLen) 480 { 481 auto* charData = reinterpret_cast<const char*>(utf8Data); 482 std::string str(charData, dataLen); 483 std::u16string u16str = std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.from_bytes(str); 484 return u16str; 485 } 486 } // namespace common 487