1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_BASE_STRING_HELP_H 17 #define ECMASCRIPT_BASE_STRING_HELP_H 18 19 #include <algorithm> 20 #include <codecvt> 21 #include <locale> 22 #include <regex> 23 #include <sstream> 24 #include <string> 25 #include <vector> 26 27 #include "ecmascript/base/utf_helper.h" 28 #include "ecmascript/ecma_vm.h" 29 #include "ecmascript/js_thread.h" 30 #include "ecmascript/mem/assert_scope.h" 31 #include "ecmascript/object_factory.h" 32 33 #include "unicode/unistr.h" 34 35 namespace panda::ecmascript::base { 36 // White Space Code Points and Line Terminators Code Point 37 // NOLINTNEXTLINE(modernize-avoid-c-arrays) 38 static constexpr uint16_t SPACE_OR_LINE_TERMINAL[] = { 39 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 40 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF, 41 }; 42 static constexpr int UICODE_FROM_UTF8[] = { 43 0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd, 44 }; 45 static constexpr int UTF8_MIN_CODE[] = { 46 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, 47 }; 48 static constexpr char UTF8_FIRST_CODE[] = { 49 0x1f, 0xf, 0x7, 0x3, 0x1, 50 }; 51 class StringHelper { 52 public: 53 static constexpr int INVALID_UNICODE_FROM_UTF8 = -1; 54 ReplaceAll(CString str,const CString & oldValue,const CString & newValue)55 static inline CString ReplaceAll(CString str, const CString &oldValue, 56 const CString &newValue) 57 { 58 if (oldValue.empty() || oldValue == newValue) { 59 return str; 60 } 61 CString::size_type pos(0); 62 while ((pos = str.find(oldValue, pos)) != CString::npos) { 63 str.replace(pos, oldValue.length(), newValue); 64 pos += newValue.length(); 65 } 66 return str; 67 } 68 Replace(CString str,const CString & oldValue,const CString & newValue)69 static inline CString Replace(CString str, const CString &oldValue, 70 const CString &newValue) 71 { 72 if (oldValue.empty() || oldValue == newValue) { 73 return str; 74 } 75 CString::size_type pos(0); 76 if ((pos = str.find(oldValue, pos)) != CString::npos) { 77 str.replace(pos, oldValue.length(), newValue); 78 } 79 return str; 80 } 81 Utf16ToU16String(const uint16_t * utf16Data,uint32_t dataLen)82 static inline std::u16string Utf16ToU16String(const uint16_t *utf16Data, uint32_t dataLen) 83 { 84 auto *char16tData = reinterpret_cast<const char16_t *>(utf16Data); 85 std::u16string u16str(char16tData, dataLen); 86 return u16str; 87 } 88 Utf8ToString(const uint8_t * utf8Data,uint32_t dataLen)89 static inline std::string Utf8ToString(const uint8_t *utf8Data, uint32_t dataLen) 90 { 91 auto *charData = reinterpret_cast<const char *>(utf8Data); 92 std::string str(charData, dataLen); 93 return str; 94 } 95 Utf8ToU16String(const uint8_t * utf8Data,uint32_t dataLen)96 static inline std::u16string Utf8ToU16String(const uint8_t *utf8Data, uint32_t dataLen) 97 { 98 auto *charData = reinterpret_cast<const char *>(utf8Data); 99 std::string str(charData, dataLen); 100 std::u16string u16str = std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.from_bytes(str); 101 return u16str; 102 } 103 WstringToString(const std::wstring & wstr)104 static inline std::string WstringToString(const std::wstring &wstr) 105 { 106 return std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t>{}.to_bytes(wstr); 107 } 108 StringToWstring(const std::string & str)109 static inline std::wstring StringToWstring(const std::string &str) 110 { 111 return std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t>{}.from_bytes(str); 112 } 113 U16stringToString(const std::u16string & u16str)114 static inline std::string U16stringToString(const std::u16string &u16str) 115 { 116 return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.to_bytes(u16str); 117 } 118 StringToU16string(const std::string & str)119 static inline std::u16string StringToU16string(const std::string &str) 120 { 121 return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.from_bytes(str); 122 } 123 Find(const std::string & thisStr,const std::string & searchStr,int32_t pos)124 static inline size_t Find(const std::string &thisStr, const std::string &searchStr, int32_t pos) 125 { 126 size_t idx = thisStr.find(searchStr, pos); 127 return idx; 128 } 129 Find(const std::u16string & thisStr,const std::u16string & searchStr,int32_t pos)130 static inline size_t Find(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos) 131 { 132 size_t idx = thisStr.find(searchStr, pos); 133 return idx; 134 } 135 RFind(const std::u16string & thisStr,const std::u16string & searchStr,int32_t pos)136 static inline size_t RFind(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos) 137 { 138 size_t idx = thisStr.rfind(searchStr, pos); 139 return idx; 140 } 141 ToUpper(const std::u16string & str)142 static inline std::string ToUpper(const std::u16string &str) 143 { 144 std::u16string tmpStr = str; 145 const char16_t *constChar16tData = tmpStr.data(); 146 icu::UnicodeString uString(constChar16tData); 147 icu::UnicodeString up = uString.toUpper(); 148 std::string res; 149 up.toUTF8String(res); 150 return res; 151 } 152 ToLocaleUpper(const std::u16string & str,const icu::Locale & locale)153 static inline std::string ToLocaleUpper(const std::u16string &str, const icu::Locale &locale) 154 { 155 std::u16string tmpStr = str; 156 const char16_t *constChar16tData = tmpStr.data(); 157 icu::UnicodeString uString(constChar16tData); 158 icu::UnicodeString up = uString.toUpper(locale); 159 std::string res; 160 up.toUTF8String(res); 161 return res; 162 } 163 ToLower(const std::u16string & str)164 static inline std::string ToLower(const std::u16string &str) 165 { 166 std::u16string tmpStr = str; 167 const char16_t *constChar16tData = tmpStr.data(); 168 icu::UnicodeString uString(constChar16tData); 169 icu::UnicodeString low = uString.toLower(); 170 std::string res; 171 low.toUTF8String(res); 172 return res; 173 } 174 ToLocaleLower(const std::u16string & str,const icu::Locale & locale)175 static inline std::string ToLocaleLower(const std::u16string &str, const icu::Locale &locale) 176 { 177 std::u16string tmpStr = str; 178 const char16_t *constChar16tData = tmpStr.data(); 179 icu::UnicodeString uString(constChar16tData); 180 icu::UnicodeString low = uString.toLower(locale); 181 std::string res; 182 low.toUTF8String(res); 183 return res; 184 } 185 FindFromU16ToUpper(const std::u16string & thisStr,uint16_t * u16Data)186 static inline size_t FindFromU16ToUpper(const std::u16string &thisStr, uint16_t *u16Data) 187 { 188 std::u16string tmpStr = Utf16ToU16String(u16Data, 1); 189 const char16_t *constChar16tData = tmpStr.data(); 190 icu::UnicodeString uString(constChar16tData); 191 icu::UnicodeString up = uString.toUpper(); 192 std::string res; 193 up.toUTF8String(res); 194 std::u16string searchStr = StringToU16string(res); 195 size_t idx = Find(thisStr, searchStr, 0); 196 return idx; 197 } 198 UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)199 static int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp) 200 { 201 int c = *p++; 202 if (c < UICODE_FROM_UTF8[0]) { 203 *pp = p; 204 return c; 205 } 206 int l = 0; 207 if (c >= UICODE_FROM_UTF8[1] && c <= UICODE_FROM_UTF8[2]) { // 1 - 2: 0000 0080 - 0000 07FF 208 l = 1; // 1: 0000 0080 - 0000 07FF Unicode 209 } else if (c >= UICODE_FROM_UTF8[3] && c <= UICODE_FROM_UTF8[4]) { // 3 - 4: 0000 0800 - 0000 FFFF 210 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode 211 } else if (c >= UICODE_FROM_UTF8[5] && c <= UICODE_FROM_UTF8[6]) { // 5 - 6: 0001 0000 - 0010 FFFF 212 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode 213 } else if (c >= UICODE_FROM_UTF8[7] && c <= UICODE_FROM_UTF8[8]) { // 7 - 8: 0020 0000 - 03FF FFFF 214 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode 215 } else if (c == UICODE_FROM_UTF8[9] || c == UICODE_FROM_UTF8[10]) { // 9 - 10: 0400 0000 - 7FFF FFFF 216 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode 217 } else { 218 return INVALID_UNICODE_FROM_UTF8; 219 } 220 /* check that we have enough characters */ 221 if ((l + 1) > maxLen) { 222 return INVALID_UNICODE_FROM_UTF8; 223 } 224 return FromUtf8(c, l, p, pp); 225 } 226 FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)227 static int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp) 228 { 229 uint32_t b; 230 c &= UTF8_FIRST_CODE[l - 1]; 231 for (int i = 0; i < l; i++) { 232 b = *p++; 233 if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) { 234 return INVALID_UNICODE_FROM_UTF8; 235 } 236 c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD); // 6: Maximum Unicode range 237 } 238 if (c < UTF8_MIN_CODE[l - 1]) { 239 return INVALID_UNICODE_FROM_UTF8; 240 } 241 *pp = p; 242 return c; 243 } 244 InplaceAppend(std::u16string & str1,const std::u16string & str2)245 static inline void InplaceAppend(std::u16string &str1, const std::u16string &str2) 246 { 247 str1.append(str2); 248 } 249 Append(const std::u16string & str1,const std::u16string & str2)250 static inline std::u16string Append(const std::u16string &str1, const std::u16string &str2) 251 { 252 std::u16string tmpStr = str1; 253 return tmpStr.append(str2); 254 } 255 Utf8ToU32String(const std::vector<uint8_t> & data)256 static inline uint32_t Utf8ToU32String(const std::vector<uint8_t> &data) 257 { 258 std::string str(data.begin(), data.end()); 259 std::u32string u32str = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>{}.from_bytes(str); 260 auto u32data = reinterpret_cast<uint32_t *>(u32str.data()); 261 return *u32data; 262 } 263 Utf32ToString(uint32_t u32Data)264 static inline std::string Utf32ToString(uint32_t u32Data) 265 { 266 UChar32 charData = static_cast<int32_t>(u32Data); 267 icu::UnicodeString uString(charData); 268 std::string res; 269 uString.toUTF8String(res); 270 return res; 271 } 272 GetSpecifiedLine(const std::string & srcStr,int lineNumber)273 static inline std::string GetSpecifiedLine(const std::string &srcStr, int lineNumber) 274 { 275 if (lineNumber < 1) { 276 return ""; 277 } 278 bool escape = true; 279 if (srcStr.find('\n') == std::string::npos) { 280 escape = false; 281 } 282 size_t prePos = 0; 283 int findPrePos = lineNumber - 1; 284 for (int i = 0; i < findPrePos; i++) { 285 if (escape) { 286 prePos = srcStr.find('\n', prePos); 287 if (prePos == std::string::npos) { 288 return ""; 289 } 290 prePos += 1; 291 } else { 292 prePos = srcStr.find("\\n", prePos); 293 if (prePos == std::string::npos) { 294 return ""; 295 } 296 prePos += 2; // 2 : add the two characters found to start searching again 297 } 298 } 299 size_t findEndPos = 0; 300 if (escape) { 301 findEndPos = srcStr.find('\n', prePos); 302 } else { 303 findEndPos = srcStr.find("\\n", prePos); 304 } 305 if (findEndPos == std::string::npos) { 306 return srcStr.substr(prePos, srcStr.length() - prePos); 307 } 308 ASSERT(findEndPos > prePos); 309 return srcStr.substr(prePos, findEndPos - prePos); 310 } 311 IsNonspace(uint16_t c)312 static inline bool IsNonspace(uint16_t c) 313 { 314 uint32_t len = sizeof(SPACE_OR_LINE_TERMINAL) / sizeof(SPACE_OR_LINE_TERMINAL[0]); 315 for (uint32_t i = 0; i < len; i++) { 316 if (c == SPACE_OR_LINE_TERMINAL[i]) { 317 return true; 318 } 319 if (c < SPACE_OR_LINE_TERMINAL[i]) { 320 return false; 321 } 322 } 323 return false; 324 } 325 326 template<typename T> GetStart(Span<T> & data,uint32_t length)327 static inline uint32_t GetStart(Span<T> &data, uint32_t length) 328 { 329 uint32_t start = 0; 330 while (start < length && IsNonspace(data[start])) { 331 start++; 332 } 333 return start; 334 } 335 336 template<typename T> GetEnd(Span<T> & data,int32_t start,uint32_t length)337 static inline int32_t GetEnd(Span<T> &data, int32_t start, uint32_t length) 338 { 339 if (length == 0U) { 340 return 0; 341 } 342 int32_t end = static_cast<int32_t>(length - 1); 343 while (end >= start && IsNonspace(data[end])) { 344 end--; 345 } 346 return end; 347 } 348 Utf8CharInRange(uint8_t value,char start,char end)349 static bool Utf8CharInRange(uint8_t value, char start, char end) 350 { 351 ASSERT(start <= end); 352 return (value >= static_cast<uint8_t>(start)) && (value <= static_cast<uint8_t>(end)); 353 } 354 Vformat(const char * fmt,va_list args)355 static inline std::string Vformat(const char *fmt, va_list args) 356 { 357 static constexpr size_t SIZE = 1024; 358 359 std::string result; 360 result.resize(SIZE); 361 362 bool is_truncated = true; 363 while (is_truncated) { 364 va_list copy_args; 365 va_copy(copy_args, args); 366 int r = vsnprintf_truncated_s(result.data(), result.size() + 1, fmt, copy_args); 367 va_end(copy_args); 368 369 if (r < 0) { 370 return ""; 371 } 372 373 is_truncated = static_cast<size_t>(r) == result.size(); 374 result.resize(result.size() * 2U); 375 } 376 377 result.erase(std::find(result.begin(), result.end(), '\0'), result.end()); 378 379 return result; 380 } 381 SplitString(const std::string & str,const std::string & delimiter)382 static std::vector<std::string> SplitString(const std::string &str, const std::string &delimiter) 383 { 384 std::size_t strIndex = 0; 385 std::vector<std::string> value; 386 std::size_t pos = str.find_first_of(delimiter, strIndex); 387 while ((pos < str.size()) && (pos > strIndex)) { 388 std::string subStr = str.substr(strIndex, pos - strIndex); 389 value.push_back(std::move(subStr)); 390 strIndex = pos; 391 strIndex = str.find_first_not_of(delimiter, strIndex); 392 pos = str.find_first_of(delimiter, strIndex); 393 } 394 if (pos > strIndex) { 395 std::string subStr = str.substr(strIndex, pos - strIndex); 396 if (!subStr.empty()) { 397 value.push_back(std::move(subStr)); 398 } 399 } 400 return value; 401 } 402 EndsWith(const std::string & str,const std::string & suffix)403 static bool EndsWith(const std::string &str, const std::string &suffix) 404 { 405 if (str.length() < suffix.length()) { 406 return false; 407 } 408 std::string subStr = str.substr(str.length() - suffix.length(), str.length()); 409 return subStr == suffix; 410 } 411 StrToUInt32(const char * content,uint32_t * result)412 static bool StrToUInt32(const char *content, uint32_t *result) 413 { 414 const int DEC = 10; 415 char *endPtr = nullptr; 416 *result = std::strtoul(content, &endPtr, DEC); 417 if (endPtr == content || *endPtr != '\0') { 418 return false; 419 } 420 return true; 421 } 422 StringStartWith(const CString & str,const CString & startStr)423 static bool StringStartWith(const CString& str, const CString& startStr) 424 { 425 size_t startStrLen = startStr.length(); 426 return ((str.length() >= startStrLen) && (str.compare(0, startStrLen, startStr) == 0)); 427 } 428 StringEndWith(const CString & str,const CString & endStr)429 static bool StringEndWith(const CString& str, const CString& endStr) 430 { 431 size_t endStrLen = endStr.length(); 432 size_t len = str.length(); 433 return ((len >= endStrLen) && (str.compare(len - endStrLen, endStrLen, endStr) == 0)); 434 } 435 436 static void SplitString(const CString& str, CVector<CString>& out, size_t startPos, size_t times = 0, char c = '/') 437 { 438 size_t left = startPos; 439 size_t pos = 0; 440 size_t index = 0; 441 while ((pos = str.find(c, left)) != CString::npos) { 442 if (times != 0 && index >= times) { 443 return; 444 } 445 out.emplace_back(str.substr(left, pos - left)); 446 left = pos + 1; 447 index++; 448 } 449 450 if ((times == 0 || index < times) && left < str.length()) { 451 out.emplace_back(str.substr(left)); 452 } 453 } 454 455 static CString JoinString(const CVector<CString>& strs, size_t startIndex, size_t endIndex, char c = '/') 456 { 457 CString out; 458 for (size_t index = startIndex; index < strs.size() && index <= endIndex; ++index) { 459 if (!strs[index].empty()) { 460 out.append(strs[index]) += c; 461 } 462 } 463 if (!out.empty()) { 464 out.pop_back(); 465 } 466 return out; 467 } 468 }; 469 } // namespace panda::ecmascript::base 470 #endif // ECMASCRIPT_BASE_STRING_HELP_H 471