1 /* 2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #ifndef ECMASCRIPT_BASE_STRING_HELP_H 17 #define ECMASCRIPT_BASE_STRING_HELP_H 18 19 #include <algorithm> 20 #include <codecvt> 21 #include <locale> 22 #include <regex> 23 #include <sstream> 24 #include <string> 25 #include <vector> 26 27 #include "ecmascript/base/utf_helper.h" 28 #include "ecmascript/mem/c_containers.h" 29 #include "ecmascript/mem/c_string.h" 30 31 #include "securec.h" 32 #include "unicode/unistr.h" 33 34 namespace panda::ecmascript::base { 35 // White Space Code Points and Line Terminators Code Point 36 // NOLINTNEXTLINE(modernize-avoid-c-arrays) 37 static constexpr uint16_t SPACE_OR_LINE_TERMINAL[] = { 38 0x0009, 0x000A, 0x000B, 0x000C, 0x000D, 0x0020, 0x00A0, 0x1680, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 39 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF, 40 }; 41 static constexpr int UICODE_FROM_UTF8[] = { 42 0x80, 0xc0, 0xdf, 0xe0, 0xef, 0xf0, 0xf7, 0xf8, 0xfb, 0xfc, 0xfd, 43 }; 44 static constexpr int UTF8_MIN_CODE[] = { 45 0x80, 0x800, 0x10000, 0x00200000, 0x04000000, 46 }; 47 static constexpr char UTF8_FIRST_CODE[] = { 48 0x1f, 0xf, 0x7, 0x3, 0x1, 49 }; 50 class StringHelper { 51 public: 52 static constexpr int INVALID_UNICODE_FROM_UTF8 = -1; 53 ReplaceAll(CString str,const CString & oldValue,const CString & newValue)54 static inline CString ReplaceAll(CString str, const CString &oldValue, 55 const CString &newValue) 56 { 57 if (oldValue.empty() || oldValue == newValue) { 58 return str; 59 } 60 CString::size_type pos(0); 61 while ((pos = str.find(oldValue, pos)) != CString::npos) { 62 str.replace(pos, oldValue.length(), newValue); 63 pos += newValue.length(); 64 } 65 return str; 66 } 67 Replace(CString str,const CString & oldValue,const CString & newValue)68 static inline CString Replace(CString str, const CString &oldValue, 69 const CString &newValue) 70 { 71 if (oldValue.empty() || oldValue == newValue) { 72 return str; 73 } 74 CString::size_type pos(0); 75 if ((pos = str.find(oldValue, pos)) != CString::npos) { 76 str.replace(pos, oldValue.length(), newValue); 77 } 78 return str; 79 } 80 Utf16ToU16String(const uint16_t * utf16Data,uint32_t dataLen)81 static inline std::u16string Utf16ToU16String(const uint16_t *utf16Data, uint32_t dataLen) 82 { 83 auto *char16tData = reinterpret_cast<const char16_t *>(utf16Data); 84 std::u16string u16str(char16tData, dataLen); 85 return u16str; 86 } 87 Utf8ToString(const uint8_t * utf8Data,uint32_t dataLen)88 static inline std::string Utf8ToString(const uint8_t *utf8Data, uint32_t dataLen) 89 { 90 auto *charData = reinterpret_cast<const char *>(utf8Data); 91 std::string str(charData, dataLen); 92 return str; 93 } 94 Utf8ToU16String(const uint8_t * utf8Data,uint32_t dataLen)95 static inline std::u16string Utf8ToU16String(const uint8_t *utf8Data, uint32_t dataLen) 96 { 97 auto *charData = reinterpret_cast<const char *>(utf8Data); 98 std::string str(charData, dataLen); 99 std::u16string u16str = std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.from_bytes(str); 100 return u16str; 101 } 102 WstringToString(const std::wstring & wstr)103 static inline std::string WstringToString(const std::wstring &wstr) 104 { 105 return std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t>{}.to_bytes(wstr); 106 } 107 StringToWstring(const std::string & str)108 static inline std::wstring StringToWstring(const std::string &str) 109 { 110 return std::wstring_convert<std::codecvt_utf8<wchar_t>, wchar_t>{}.from_bytes(str); 111 } 112 U16stringToString(const std::u16string & u16str)113 static inline std::string U16stringToString(const std::u16string &u16str) 114 { 115 return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.to_bytes(u16str); 116 } 117 StringToU16string(const std::string & str)118 static inline std::u16string StringToU16string(const std::string &str) 119 { 120 return std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t>{}.from_bytes(str); 121 } 122 Find(const std::string & thisStr,const std::string & searchStr,int32_t pos)123 static inline size_t Find(const std::string &thisStr, const std::string &searchStr, int32_t pos) 124 { 125 size_t idx = thisStr.find(searchStr, pos); 126 return idx; 127 } 128 Find(const std::u16string & thisStr,const std::u16string & searchStr,int32_t pos)129 static inline size_t Find(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos) 130 { 131 size_t idx = thisStr.find(searchStr, pos); 132 return idx; 133 } 134 RFind(const std::u16string & thisStr,const std::u16string & searchStr,int32_t pos)135 static inline size_t RFind(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos) 136 { 137 size_t idx = thisStr.rfind(searchStr, pos); 138 return idx; 139 } 140 ToUpper(const std::u16string & str)141 static inline std::string ToUpper(const std::u16string &str) 142 { 143 std::u16string tmpStr = str; 144 const char16_t *constChar16tData = tmpStr.data(); 145 icu::UnicodeString uString(constChar16tData); 146 icu::UnicodeString up = uString.toUpper(); 147 std::string res; 148 up.toUTF8String(res); 149 return res; 150 } 151 ToLocaleUpper(const std::u16string & str,const icu::Locale & locale)152 static inline std::string ToLocaleUpper(const std::u16string &str, const icu::Locale &locale) 153 { 154 std::u16string tmpStr = str; 155 const char16_t *constChar16tData = tmpStr.data(); 156 icu::UnicodeString uString(constChar16tData); 157 icu::UnicodeString up = uString.toUpper(locale); 158 std::string res; 159 up.toUTF8String(res); 160 return res; 161 } 162 ToLower(const std::u16string & str)163 static inline std::string ToLower(const std::u16string &str) 164 { 165 const char16_t *constChar16tData = str.data(); 166 icu::UnicodeString uString(constChar16tData, str.length()); 167 std::string res; 168 uString.toLower().toUTF8String(res); 169 return res; 170 } 171 ToLocaleLower(const std::u16string & str,const icu::Locale & locale)172 static inline std::string ToLocaleLower(const std::u16string &str, const icu::Locale &locale) 173 { 174 std::u16string tmpStr = str; 175 const char16_t *constChar16tData = tmpStr.data(); 176 icu::UnicodeString uString(constChar16tData); 177 icu::UnicodeString low = uString.toLower(locale); 178 std::string res; 179 low.toUTF8String(res); 180 return res; 181 } 182 FindFromU16ToUpper(const std::u16string & thisStr,uint16_t * u16Data)183 static inline size_t FindFromU16ToUpper(const std::u16string &thisStr, uint16_t *u16Data) 184 { 185 std::u16string tmpStr = Utf16ToU16String(u16Data, 1); 186 const char16_t *constChar16tData = tmpStr.data(); 187 icu::UnicodeString uString(constChar16tData); 188 icu::UnicodeString up = uString.toUpper(); 189 std::string res; 190 up.toUTF8String(res); 191 std::u16string searchStr = StringToU16string(res); 192 size_t idx = Find(thisStr, searchStr, 0); 193 return idx; 194 } 195 UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)196 static int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp) 197 { 198 int c = *p++; 199 if (c < UICODE_FROM_UTF8[0]) { 200 *pp = p; 201 return c; 202 } 203 int l = 0; 204 if (c >= UICODE_FROM_UTF8[1] && c <= UICODE_FROM_UTF8[2]) { // 1 - 2: 0000 0080 - 0000 07FF 205 l = 1; // 1: 0000 0080 - 0000 07FF Unicode 206 } else if (c >= UICODE_FROM_UTF8[3] && c <= UICODE_FROM_UTF8[4]) { // 3 - 4: 0000 0800 - 0000 FFFF 207 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode 208 } else if (c >= UICODE_FROM_UTF8[5] && c <= UICODE_FROM_UTF8[6]) { // 5 - 6: 0001 0000 - 0010 FFFF 209 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode 210 } else if (c >= UICODE_FROM_UTF8[7] && c <= UICODE_FROM_UTF8[8]) { // 7 - 8: 0020 0000 - 03FF FFFF 211 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode 212 } else if (c == UICODE_FROM_UTF8[9] || c == UICODE_FROM_UTF8[10]) { // 9 - 10: 0400 0000 - 7FFF FFFF 213 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode 214 } else { 215 return INVALID_UNICODE_FROM_UTF8; 216 } 217 /* check that we have enough characters */ 218 if ((l + 1) > maxLen) { 219 return INVALID_UNICODE_FROM_UTF8; 220 } 221 return FromUtf8(c, l, p, pp); 222 } 223 FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)224 static int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp) 225 { 226 uint32_t b; 227 c &= UTF8_FIRST_CODE[l - 1]; 228 for (int i = 0; i < l; i++) { 229 b = *p++; 230 if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) { 231 return INVALID_UNICODE_FROM_UTF8; 232 } 233 c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD); // 6: Maximum Unicode range 234 } 235 if (c < UTF8_MIN_CODE[l - 1]) { 236 return INVALID_UNICODE_FROM_UTF8; 237 } 238 *pp = p; 239 return c; 240 } 241 InplaceAppend(std::u16string & str1,const std::u16string & str2)242 static inline void InplaceAppend(std::u16string &str1, const std::u16string &str2) 243 { 244 str1.append(str2); 245 } 246 Append(const std::u16string & str1,const std::u16string & str2)247 static inline std::u16string Append(const std::u16string &str1, const std::u16string &str2) 248 { 249 std::u16string tmpStr = str1; 250 return tmpStr.append(str2); 251 } 252 Utf8ToU32String(const std::vector<uint8_t> & data)253 static inline uint32_t Utf8ToU32String(const std::vector<uint8_t> &data) 254 { 255 std::string str(data.begin(), data.end()); 256 std::u32string u32str = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>{}.from_bytes(str); 257 auto u32data = reinterpret_cast<uint32_t *>(u32str.data()); 258 return *u32data; 259 } 260 Utf32ToString(uint32_t u32Data)261 static inline std::string Utf32ToString(uint32_t u32Data) 262 { 263 UChar32 charData = static_cast<int32_t>(u32Data); 264 icu::UnicodeString uString(charData); 265 std::string res; 266 uString.toUTF8String(res); 267 return res; 268 } 269 GetSpecifiedLine(const std::string & srcStr,int lineNumber)270 static inline std::string GetSpecifiedLine(const std::string &srcStr, int lineNumber) 271 { 272 if (lineNumber < 1) { 273 return ""; 274 } 275 bool escape = true; 276 if (srcStr.find('\n') == std::string::npos) { 277 escape = false; 278 } 279 size_t prePos = 0; 280 int findPrePos = lineNumber - 1; 281 for (int i = 0; i < findPrePos; i++) { 282 if (escape) { 283 prePos = srcStr.find('\n', prePos); 284 if (prePos == std::string::npos) { 285 return ""; 286 } 287 prePos += 1; 288 } else { 289 prePos = srcStr.find("\\n", prePos); 290 if (prePos == std::string::npos) { 291 return ""; 292 } 293 prePos += 2; // 2 : add the two characters found to start searching again 294 } 295 } 296 size_t findEndPos = 0; 297 if (escape) { 298 findEndPos = srcStr.find('\n', prePos); 299 } else { 300 findEndPos = srcStr.find("\\n", prePos); 301 } 302 if (findEndPos == std::string::npos) { 303 return srcStr.substr(prePos, srcStr.length() - prePos); 304 } 305 ASSERT(findEndPos > prePos); 306 return srcStr.substr(prePos, findEndPos - prePos); 307 } 308 IsNonspace(uint16_t c)309 static inline bool IsNonspace(uint16_t c) 310 { 311 uint32_t len = sizeof(SPACE_OR_LINE_TERMINAL) / sizeof(SPACE_OR_LINE_TERMINAL[0]); 312 for (uint32_t i = 0; i < len; i++) { 313 if (c == SPACE_OR_LINE_TERMINAL[i]) { 314 return true; 315 } 316 if (c < SPACE_OR_LINE_TERMINAL[i]) { 317 return false; 318 } 319 } 320 return false; 321 } 322 323 template<typename T> GetStart(Span<T> & data,uint32_t length)324 static inline uint32_t GetStart(Span<T> &data, uint32_t length) 325 { 326 uint32_t start = 0; 327 while (start < length && IsNonspace(data[start])) { 328 start++; 329 } 330 return start; 331 } 332 333 template<typename T> GetEnd(Span<T> & data,int32_t start,uint32_t length)334 static inline int32_t GetEnd(Span<T> &data, int32_t start, uint32_t length) 335 { 336 if (length == 0U) { 337 return 0; 338 } 339 int32_t end = static_cast<int32_t>(length - 1); 340 while (end >= start && IsNonspace(data[end])) { 341 end--; 342 } 343 return end; 344 } 345 Utf8CharInRange(uint8_t value,char start,char end)346 static bool Utf8CharInRange(uint8_t value, char start, char end) 347 { 348 ASSERT(start <= end); 349 return (value >= static_cast<uint8_t>(start)) && (value <= static_cast<uint8_t>(end)); 350 } 351 Vformat(const char * fmt,va_list args)352 static inline std::string Vformat(const char *fmt, va_list args) 353 { 354 static constexpr size_t SIZE = 1024; 355 356 std::string result; 357 result.resize(SIZE); 358 359 bool is_truncated = true; 360 while (is_truncated) { 361 va_list copy_args; 362 va_copy(copy_args, args); 363 int r = vsnprintf_truncated_s(result.data(), result.size() + 1, fmt, copy_args); 364 va_end(copy_args); 365 366 if (r < 0) { 367 return ""; 368 } 369 370 is_truncated = static_cast<size_t>(r) == result.size(); 371 result.resize(result.size() * 2U); 372 } 373 374 result.erase(std::find(result.begin(), result.end(), '\0'), result.end()); 375 376 return result; 377 } 378 SplitString(const std::string & str,const std::string & delimiter)379 static std::vector<std::string> SplitString(const std::string &str, const std::string &delimiter) 380 { 381 std::size_t strIndex = 0; 382 std::vector<std::string> value; 383 std::size_t pos = str.find_first_of(delimiter, strIndex); 384 while ((pos < str.size()) && (pos > strIndex)) { 385 std::string subStr = str.substr(strIndex, pos - strIndex); 386 value.push_back(std::move(subStr)); 387 strIndex = pos; 388 strIndex = str.find_first_not_of(delimiter, strIndex); 389 pos = str.find_first_of(delimiter, strIndex); 390 } 391 if (pos > strIndex) { 392 std::string subStr = str.substr(strIndex, pos - strIndex); 393 if (!subStr.empty()) { 394 value.push_back(std::move(subStr)); 395 } 396 } 397 return value; 398 } 399 EndsWith(const std::string & str,const std::string & suffix)400 static bool EndsWith(const std::string &str, const std::string &suffix) 401 { 402 if (str.length() < suffix.length()) { 403 return false; 404 } 405 std::string subStr = str.substr(str.length() - suffix.length(), str.length()); 406 return subStr == suffix; 407 } 408 StrToUInt32(const char * content,uint32_t * result)409 static bool StrToUInt32(const char *content, uint32_t *result) 410 { 411 const int DEC = 10; 412 char *endPtr = nullptr; 413 *result = std::strtoul(content, &endPtr, DEC); 414 if (endPtr == content || *endPtr != '\0') { 415 return false; 416 } 417 return true; 418 } 419 StringStartWith(const CString & str,const CString & startStr)420 static bool StringStartWith(const CString& str, const CString& startStr) 421 { 422 size_t startStrLen = startStr.length(); 423 return ((str.length() >= startStrLen) && (str.compare(0, startStrLen, startStr) == 0)); 424 } 425 StringEndWith(const CString & str,const CString & endStr)426 static bool StringEndWith(const CString& str, const CString& endStr) 427 { 428 size_t endStrLen = endStr.length(); 429 size_t len = str.length(); 430 return ((len >= endStrLen) && (str.compare(len - endStrLen, endStrLen, endStr) == 0)); 431 } 432 433 static void SplitString(const CString& str, CVector<CString>& out, size_t startPos, size_t times = 0, char c = '/') 434 { 435 size_t left = startPos; 436 size_t pos = 0; 437 size_t index = 0; 438 while ((pos = str.find(c, left)) != CString::npos) { 439 if (times != 0 && index >= times) { 440 return; 441 } 442 out.emplace_back(str.substr(left, pos - left)); 443 left = pos + 1; 444 index++; 445 } 446 447 if ((times == 0 || index < times) && left < str.length()) { 448 out.emplace_back(str.substr(left)); 449 } 450 } 451 452 static CString JoinString(const CVector<CString>& strs, size_t startIndex, size_t endIndex, char c = '/') 453 { 454 CString out; 455 for (size_t index = startIndex; index < strs.size() && index <= endIndex; ++index) { 456 if (!strs[index].empty()) { 457 out.append(strs[index]) += c; 458 } 459 } 460 if (!out.empty()) { 461 out.pop_back(); 462 } 463 return out; 464 } 465 }; 466 } // namespace panda::ecmascript::base 467 #endif // ECMASCRIPT_BASE_STRING_HELP_H 468