1 /*
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ECMASCRIPT_BASE_STRING_HELP_H
17 #define ECMASCRIPT_BASE_STRING_HELP_H
18
#include <algorithm>
#include <cerrno>
#include <codecvt>
#include <cstdint>
#include <cstdlib>
#include <locale>
#include <regex>
#include <sstream>
#include <string>
#include <vector>
26
27 #include "ecmascript/base/utf_helper.h"
28 #include "ecmascript/mem/c_containers.h"
29 #include "ecmascript/mem/c_string.h"
30
31 #include "securec.h"
32 #include "unicode/unistr.h"
33
34 namespace panda::ecmascript::base {
// White Space and Line Terminator code points, as tested by StringHelper::IsNonspace.
// The table must stay sorted in ascending order: the lookup stops at the first entry larger than its input.
// NOLINTNEXTLINE(modernize-avoid-c-arrays)
static constexpr uint16_t SPACE_OR_LINE_TERMINAL[] = {
    0x0009, 0x000A, 0x000B, 0x000C, 0x000D,  // TAB, LF, VT, FF, CR
    0x0020, 0x00A0, 0x1680,                  // SPACE, NO-BREAK SPACE, OGHAM SPACE MARK
    0x2000, 0x2001, 0x2002, 0x2003, 0x2004,  // EN QUAD .. THREE-PER-EM SPACE
    0x2005, 0x2006, 0x2007, 0x2008, 0x2009,  // FOUR-PER-EM SPACE .. THIN SPACE
    0x200A,                                  // HAIR SPACE
    0x2028, 0x2029,                          // LINE SEPARATOR, PARAGRAPH SEPARATOR
    0x202F, 0x205F, 0x3000,                  // NARROW NO-BREAK, MEDIUM MATHEMATICAL, IDEOGRAPHIC SPACE
    0xFEFF,                                  // ZERO WIDTH NO-BREAK SPACE
};

// Lead-byte boundaries consulted by StringHelper::UnicodeFromUtf8.
static constexpr int UICODE_FROM_UTF8[] = {
    0x80,        // [0]       first value that is no longer a single-byte code
    0xc0, 0xdf,  // [1]-[2]   lead byte followed by 1 trailing byte
    0xe0, 0xef,  // [3]-[4]   lead byte followed by 2 trailing bytes
    0xf0, 0xf7,  // [5]-[6]   lead byte followed by 3 trailing bytes
    0xf8, 0xfb,  // [7]-[8]   lead byte followed by 4 trailing bytes
    0xfc, 0xfd,  // [9]-[10]  lead byte followed by 5 trailing bytes
};

// Smallest code point accepted for a sequence with (index + 1) trailing bytes;
// StringHelper::FromUtf8 rejects anything decoded below this bound.
static constexpr int UTF8_MIN_CODE[] = {
    0x80,
    0x800,
    0x10000,
    0x00200000,
    0x04000000,
};

// Mask applied to the lead byte of a sequence with (index + 1) trailing bytes to keep its payload bits.
static constexpr char UTF8_FIRST_CODE[] = {
    0x1f,
    0xf,
    0x7,
    0x3,
    0x1,
};
50
51 // Concatenates multiple arguments into a CString.
52 // Note: For better performance, consider using AppendToBaseString with a pre-allocated CString.
53 template <typename T, typename... Args>
ConcatToCString(T && first,Args &&...args)54 inline CString ConcatToCString(T&& first, Args&&... args)
55 {
56 CString result = CString(std::forward<T>(first));
57 ((result += std::forward<Args>(args)), ...);
58 return result;
59 }
60
// Builds a std::string from `first` and then appends every remaining argument, left to right, with operator+=.
// Note: when the final size is known up front, AppendToBaseString on a pre-reserved std::string avoids regrowth.
template <typename T, typename... Args>
inline std::string ConcatToStdString(T&& first, Args&&... args)
{
    std::string result(std::forward<T>(first));
    // The lambda is unused when the argument pack is empty.
    [[maybe_unused]] auto appendPiece = [&result](auto &&piece) {
        result += std::forward<decltype(piece)>(piece);
    };
    (appendPiece(std::forward<Args>(args)), ...);
    return result;
}
70
71 // Appends multiple arguments to a base string (std::string or CString).
72 // Note: For better performance, pre-allocate the base string using reserve() before calling this function.
73 template <typename T, typename... Args>
AppendToBaseString(T & base,Args &&...args)74 inline void AppendToBaseString(T& base, Args&&... args)
75 {
76 static_assert(std::is_same_v<T, std::string> || std::is_same_v<T, CString>,
77 "base must be std::string or CString");
78 ((base += std::forward<Args>(args)), ...);
79 }
80
81 class StringHelper {
82 public:
83 static constexpr int INVALID_UNICODE_FROM_UTF8 = -1;
84
ReplaceAll(CString str,const CString & oldValue,const CString & newValue)85 static inline CString ReplaceAll(CString str, const CString &oldValue,
86 const CString &newValue)
87 {
88 if (oldValue.empty() || oldValue == newValue) {
89 return str;
90 }
91 CString::size_type pos(0);
92 while ((pos = str.find(oldValue, pos)) != CString::npos) {
93 str.replace(pos, oldValue.length(), newValue);
94 pos += newValue.length();
95 }
96 return str;
97 }
98
99 template<class T>
Replace(CString str,const T & oldValue,const T & newValue)100 static inline CString Replace(CString str, const T &oldValue, const T &newValue)
101 {
102 if (oldValue.empty() || oldValue == newValue) {
103 return str;
104 }
105 CString::size_type pos(0);
106 if ((pos = str.find(oldValue, pos)) != CString::npos) {
107 str.replace(pos, oldValue.length(), newValue);
108 }
109 return str;
110 }
111
// Copies `dataLen` UTF-16 code units starting at `utf16Data` into a std::u16string.
// The units are copied verbatim (embedded zeros included); no validation is performed.
static inline std::u16string Utf16ToU16String(const uint16_t *utf16Data, uint32_t dataLen)
{
    return std::u16string(reinterpret_cast<const char16_t *>(utf16Data), dataLen);
}
118
// Copies `dataLen` bytes starting at `utf8Data` into a std::string.
// The bytes are copied verbatim (embedded zeros included); no UTF-8 validation is performed.
static inline std::string Utf8ToString(const uint8_t *utf8Data, uint32_t dataLen)
{
    return std::string(reinterpret_cast<const char *>(utf8Data), dataLen);
}
125
Utf8ToCString(const uint8_t * utf8Data,uint32_t dataLen)126 static inline CString Utf8ToCString(const uint8_t *utf8Data, uint32_t dataLen)
127 {
128 auto *charData = reinterpret_cast<const char *>(utf8Data);
129 return { charData, dataLen };
130 }
131
// Decodes `dataLen` UTF-8 bytes starting at `utf8Data` into UTF-16.
// Conversion goes through std::wstring_convert, which reports malformed input by throwing std::range_error.
static inline std::u16string Utf8ToU16String(const uint8_t *utf8Data, uint32_t dataLen)
{
    std::wstring_convert<std::codecvt_utf8_utf16<char16_t>, char16_t> converter;
    const char *firstByte = reinterpret_cast<const char *>(utf8Data);
    return converter.from_bytes(firstByte, firstByte + dataLen);
}
139
// Encodes a wide string as UTF-8 via std::codecvt_utf8<wchar_t>.
static inline std::string WstringToString(const std::wstring &wstr)
{
    using WideUtf8Codec = std::codecvt_utf8<wchar_t>;
    std::wstring_convert<WideUtf8Codec, wchar_t> converter;
    return converter.to_bytes(wstr);
}
144
// Decodes a UTF-8 string into a wide string via std::codecvt_utf8<wchar_t>.
// std::wstring_convert reports malformed input by throwing std::range_error.
static inline std::wstring StringToWstring(const std::string &str)
{
    using WideUtf8Codec = std::codecvt_utf8<wchar_t>;
    std::wstring_convert<WideUtf8Codec, wchar_t> converter;
    return converter.from_bytes(str);
}
149
// Encodes a UTF-16 string as UTF-8 via std::codecvt_utf8_utf16<char16_t>.
static inline std::string U16stringToString(const std::u16string &u16str)
{
    using Utf16Utf8Codec = std::codecvt_utf8_utf16<char16_t>;
    std::wstring_convert<Utf16Utf8Codec, char16_t> converter;
    return converter.to_bytes(u16str);
}
154
// Decodes a UTF-8 string into UTF-16 via std::codecvt_utf8_utf16<char16_t>.
// std::wstring_convert reports malformed input by throwing std::range_error.
static inline std::u16string StringToU16string(const std::string &str)
{
    using Utf16Utf8Codec = std::codecvt_utf8_utf16<char16_t>;
    std::wstring_convert<Utf16Utf8Codec, char16_t> converter;
    return converter.from_bytes(str);
}
159
// Returns the index of the first occurrence of `searchStr` in `thisStr` at or after `pos`,
// or std::string::npos when there is none.
// `pos` is converted to an unsigned index, so a negative value becomes a huge offset and yields npos.
static inline size_t Find(const std::string &thisStr, const std::string &searchStr, int32_t pos)
{
    return thisStr.find(searchStr, static_cast<std::string::size_type>(pos));
}
165
// Returns the index of the first occurrence of `searchStr` in `thisStr` at or after `pos`,
// or std::u16string::npos when there is none.
// `pos` is converted to an unsigned index, so a negative value becomes a huge offset and yields npos.
static inline size_t Find(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos)
{
    return thisStr.find(searchStr, static_cast<std::u16string::size_type>(pos));
}
171
// Returns the index of the last occurrence of `searchStr` in `thisStr` that starts at or before `pos`,
// or std::u16string::npos when there is none.
// `pos` is converted to an unsigned index, so a negative value makes the whole string eligible.
static inline size_t RFind(const std::u16string &thisStr, const std::u16string &searchStr, int32_t pos)
{
    return thisStr.rfind(searchStr, static_cast<std::u16string::size_type>(pos));
}
177
ToUpper(const std::u16string & str)178 static inline std::string ToUpper(const std::u16string &str)
179 {
180 std::u16string tmpStr = str;
181 const char16_t *constChar16tData = tmpStr.data();
182 icu::UnicodeString uString(constChar16tData);
183 icu::UnicodeString up = uString.toUpper();
184 std::string res;
185 up.toUTF8String(res);
186 return res;
187 }
188
ToLocaleUpper(const std::u16string & str,const icu::Locale & locale)189 static inline std::string ToLocaleUpper(const std::u16string &str, const icu::Locale &locale)
190 {
191 std::u16string tmpStr = str;
192 const char16_t *constChar16tData = tmpStr.data();
193 icu::UnicodeString uString(constChar16tData);
194 icu::UnicodeString up = uString.toUpper(locale);
195 std::string res;
196 up.toUTF8String(res);
197 return res;
198 }
199
ToLower(const std::u16string & str)200 static inline std::string ToLower(const std::u16string &str)
201 {
202 const char16_t *constChar16tData = str.data();
203 icu::UnicodeString uString(constChar16tData, str.length());
204 std::string res;
205 uString.toLower().toUTF8String(res);
206 return res;
207 }
208
ToLocaleLower(const std::u16string & str,const icu::Locale & locale)209 static inline std::string ToLocaleLower(const std::u16string &str, const icu::Locale &locale)
210 {
211 std::u16string tmpStr = str;
212 const char16_t *constChar16tData = tmpStr.data();
213 icu::UnicodeString uString(constChar16tData);
214 icu::UnicodeString low = uString.toLower(locale);
215 std::string res;
216 low.toUTF8String(res);
217 return res;
218 }
219
FindFromU16ToUpper(const std::u16string & thisStr,uint16_t * u16Data)220 static inline size_t FindFromU16ToUpper(const std::u16string &thisStr, uint16_t *u16Data)
221 {
222 std::u16string tmpStr = Utf16ToU16String(u16Data, 1);
223 const char16_t *constChar16tData = tmpStr.data();
224 icu::UnicodeString uString(constChar16tData);
225 icu::UnicodeString up = uString.toUpper();
226 std::string res;
227 up.toUTF8String(res);
228 std::u16string searchStr = StringToU16string(res);
229 size_t idx = Find(thisStr, searchStr, 0);
230 return idx;
231 }
232
FindFromU8ToUpper(const std::string & thisStr,uint8_t * u8Data)233 static inline size_t FindFromU8ToUpper(const std::string &thisStr, uint8_t *u8Data)
234 {
235 std::string tmpStr = Utf8ToString(u8Data, 1);
236 std::transform(tmpStr.begin(), tmpStr.end(), tmpStr.begin(), [](unsigned char c) { return std::toupper(c); });
237 size_t idx = Find(thisStr, tmpStr, 0);
238 return idx;
239 }
240
UnicodeFromUtf8(const uint8_t * p,int maxLen,const uint8_t ** pp)241 static int UnicodeFromUtf8(const uint8_t *p, int maxLen, const uint8_t **pp)
242 {
243 int c = *p++;
244 if (c < UICODE_FROM_UTF8[0]) {
245 *pp = p;
246 return c;
247 }
248 int l = 0;
249 if (c >= UICODE_FROM_UTF8[1] && c <= UICODE_FROM_UTF8[2]) { // 1 - 2: 0000 0080 - 0000 07FF
250 l = 1; // 1: 0000 0080 - 0000 07FF Unicode
251 } else if (c >= UICODE_FROM_UTF8[3] && c <= UICODE_FROM_UTF8[4]) { // 3 - 4: 0000 0800 - 0000 FFFF
252 l = 2; // 2: 0000 0800 - 0000 FFFF Unicode
253 } else if (c >= UICODE_FROM_UTF8[5] && c <= UICODE_FROM_UTF8[6]) { // 5 - 6: 0001 0000 - 0010 FFFF
254 l = 3; // 3: 0001 0000 - 0010 FFFF Unicode
255 } else if (c >= UICODE_FROM_UTF8[7] && c <= UICODE_FROM_UTF8[8]) { // 7 - 8: 0020 0000 - 03FF FFFF
256 l = 4; // 4: 0020 0000 - 03FF FFFF Unicode
257 } else if (c == UICODE_FROM_UTF8[9] || c == UICODE_FROM_UTF8[10]) { // 9 - 10: 0400 0000 - 7FFF FFFF
258 l = 5; // 5: 0400 0000 - 7FFF FFFF Unicode
259 } else {
260 return INVALID_UNICODE_FROM_UTF8;
261 }
262 /* check that we have enough characters */
263 if ((l + 1) > maxLen) {
264 return INVALID_UNICODE_FROM_UTF8;
265 }
266 return FromUtf8(c, l, p, pp);
267 }
268
FromUtf8(int c,int l,const uint8_t * p,const uint8_t ** pp)269 static int FromUtf8(int c, int l, const uint8_t *p, const uint8_t **pp)
270 {
271 uint32_t b;
272 c &= UTF8_FIRST_CODE[l - 1];
273 for (int i = 0; i < l; i++) {
274 b = *p++;
275 if (b < utf_helper::UTF8_2B_SECOND || b >= utf_helper::UTF8_2B_FIRST) {
276 return INVALID_UNICODE_FROM_UTF8;
277 }
278 c = (c << 6) | (b & utf_helper::UTF8_2B_THIRD); // 6: Maximum Unicode range
279 }
280 if (c < UTF8_MIN_CODE[l - 1]) {
281 return INVALID_UNICODE_FROM_UTF8;
282 }
283 *pp = p;
284 return c;
285 }
286
// Appends `str2` to the end of `str1`, modifying `str1` in place.
static inline void InplaceAppend(std::u16string &str1, const std::u16string &str2)
{
    str1 += str2;
}
291
// Returns a new string made of `str1` followed by `str2`; neither argument is modified.
static inline std::u16string Append(const std::u16string &str1, const std::u16string &str2)
{
    return str1 + str2;
}
297
// Decodes the UTF-8 bytes in `data` and returns the first decoded code point (0 when nothing is decoded).
// Despite its name, the function returns a single code point, not a string.
// std::wstring_convert reports malformed input by throwing std::range_error.
static inline uint32_t Utf8ToU32String(const std::vector<uint8_t> &data)
{
    const std::string bytes(data.begin(), data.end());
    const std::u32string decoded = std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t>{}.from_bytes(bytes);
    // Convert the value itself; reading the char32_t storage through a uint32_t pointer would
    // break the aliasing rules, and an empty result is now handled explicitly.
    return decoded.empty() ? 0U : static_cast<uint32_t>(decoded.front());
}
305
Utf32ToString(uint32_t u32Data)306 static inline std::string Utf32ToString(uint32_t u32Data)
307 {
308 UChar32 charData = static_cast<int32_t>(u32Data);
309 icu::UnicodeString uString(charData);
310 std::string res;
311 uString.toUTF8String(res);
312 return res;
313 }
314
// Returns line number `lineNumber` (1-based) of `srcStr`, without its line separator.
// Lines are separated by real line feeds when `srcStr` contains at least one; otherwise by the
// two-character escaped sequence backslash + 'n'.
// Returns an empty string when `lineNumber` is below 1 or beyond the last line. Empty lines are
// valid and also produce an empty string.
static inline std::string GetSpecifiedLine(const std::string &srcStr, int lineNumber)
{
    if (lineNumber < 1) {
        return "";
    }
    const bool hasLineFeed = srcStr.find('\n') != std::string::npos;
    const std::string separator = hasLineFeed ? "\n" : "\\n";

    // Skip the (lineNumber - 1) separators that precede the requested line.
    size_t lineStart = 0;
    for (int skipped = 1; skipped < lineNumber; skipped++) {
        const size_t separatorPos = srcStr.find(separator, lineStart);
        if (separatorPos == std::string::npos) {
            return "";
        }
        lineStart = separatorPos + separator.length();
    }

    const size_t lineEnd = srcStr.find(separator, lineStart);
    if (lineEnd == std::string::npos) {
        return srcStr.substr(lineStart);
    }
    // find() never returns a position before lineStart; equality means the line is empty, which
    // is legitimate, so no "strictly greater" assertion is made here.
    return srcStr.substr(lineStart, lineEnd - lineStart);
}
353
IsNonspace(uint16_t c)354 static inline bool IsNonspace(uint16_t c)
355 {
356 uint32_t len = sizeof(SPACE_OR_LINE_TERMINAL) / sizeof(SPACE_OR_LINE_TERMINAL[0]);
357 for (uint32_t i = 0; i < len; i++) {
358 if (c == SPACE_OR_LINE_TERMINAL[i]) {
359 return true;
360 }
361 if (c < SPACE_OR_LINE_TERMINAL[i]) {
362 return false;
363 }
364 }
365 return false;
366 }
367
368 template<typename T>
GetStart(Span<T> & data,uint32_t length)369 static inline uint32_t GetStart(Span<T> &data, uint32_t length)
370 {
371 uint32_t start = 0;
372 while (start < length && IsNonspace(data[start])) {
373 start++;
374 }
375 return start;
376 }
377
378 template<typename T>
GetEnd(Span<T> & data,int32_t start,uint32_t length)379 static inline int32_t GetEnd(Span<T> &data, int32_t start, uint32_t length)
380 {
381 if (length == 0U) {
382 return 0;
383 }
384 int32_t end = static_cast<int32_t>(length - 1);
385 while (end >= start && IsNonspace(data[end])) {
386 end--;
387 }
388 return end;
389 }
390
Utf8CharInRange(uint8_t value,char start,char end)391 static bool Utf8CharInRange(uint8_t value, char start, char end)
392 {
393 ASSERT(start <= end);
394 return (value >= static_cast<uint8_t>(start)) && (value <= static_cast<uint8_t>(end));
395 }
396
Vformat(const char * fmt,va_list args)397 static inline std::string Vformat(const char *fmt, va_list args)
398 {
399 static constexpr size_t SIZE = 1024;
400
401 std::string result;
402 result.resize(SIZE);
403
404 bool is_truncated = true;
405 while (is_truncated) {
406 va_list copy_args;
407 va_copy(copy_args, args);
408 int r = vsnprintf_truncated_s(result.data(), result.size() + 1, fmt, copy_args);
409 va_end(copy_args);
410
411 if (r < 0) {
412 return "";
413 }
414
415 is_truncated = static_cast<size_t>(r) == result.size();
416 result.resize(result.size() * 2U);
417 }
418
419 result.erase(std::find(result.begin(), result.end(), '\0'), result.end());
420
421 return result;
422 }
423
// Splits `str` into tokens separated by any character contained in `delimiter`.
// Runs of delimiters are treated as a single separator and empty tokens are never produced.
// This now includes input that starts with a delimiter, which previously yielded no tokens at all.
// An empty `delimiter` yields the whole string as a single token (or nothing for an empty `str`).
static std::vector<std::string> SplitString(const std::string &str, const std::string &delimiter)
{
    std::vector<std::string> value;
    // Start at the first non-delimiter character so leading delimiters are skipped.
    std::size_t tokenStart = str.find_first_not_of(delimiter);
    while (tokenStart != std::string::npos) {
        const std::size_t tokenEnd = str.find_first_of(delimiter, tokenStart);
        if (tokenEnd == std::string::npos) {
            value.push_back(str.substr(tokenStart));
            break;
        }
        value.push_back(str.substr(tokenStart, tokenEnd - tokenStart));
        tokenStart = str.find_first_not_of(delimiter, tokenEnd);
    }
    return value;
}
444
// Returns true when `str` ends with `suffix`. An empty `suffix` always matches.
static bool EndsWith(const std::string &str, const std::string &suffix)
{
    const std::size_t suffixLen = suffix.length();
    if (suffixLen > str.length()) {
        return false;
    }
    return str.compare(str.length() - suffixLen, suffixLen, suffix) == 0;
}
453
// Parses `content` as a base-10 unsigned integer.
//   content : NUL-terminated text; leading white space is accepted by the underlying parser.
//   result  : always receives the low 32 bits of whatever was parsed, even when false is returned.
// Returns true only when at least one digit was consumed, nothing follows the number, and the value
// fits in uint32_t. Values above UINT32_MAX, which used to be truncated silently, are rejected.
static bool StrToUInt32(const char *content, uint32_t *result)
{
    const int DEC = 10;
    char *endPtr = nullptr;
    errno = 0;  // strtoull reports overflow only through errno
    const unsigned long long parsed = std::strtoull(content, &endPtr, DEC);  // NOLINT(google-runtime-int)
    *result = static_cast<uint32_t>(parsed);
    if (endPtr == content || *endPtr != '\0') {
        return false;
    }
    if (errno == ERANGE || parsed > UINT32_MAX) {
        return false;
    }
    return true;
}
464
465 template<class T>
StringStartWith(const CString & str,const T & startStr)466 static bool StringStartWith(const CString& str, const T& startStr)
467 {
468 size_t startStrLen = startStr.length();
469 return str.length() >= startStrLen && str.compare(0, startStrLen, startStr) == 0;
470 }
471
StringStartWith(const CString & str,const char startStr)472 static bool StringStartWith(const CString& str, const char startStr)
473 {
474 return !str.empty() && str[0] == startStr;
475 }
476
477 template<class T>
StringEndWith(const CString & str,const T & endStr)478 static bool StringEndWith(const CString& str, const T& endStr)
479 {
480 size_t endStrLen = endStr.length();
481 size_t len = str.length();
482 return len >= endStrLen && str.compare(len - endStrLen, endStrLen, endStr) == 0;
483 }
484
StringEndWith(const CString & str,const char endStr)485 static bool StringEndWith(const CString& str, const char endStr)
486 {
487 const size_t len = str.length();
488 return len > 0 && str[len - 1] == endStr;
489 }
490
491 static void SplitString(const CString& str, CVector<CString>& out, size_t startPos, size_t times = 0, char c = '/')
492 {
493 size_t left = startPos;
494 size_t pos = 0;
495 size_t index = 0;
496 while ((pos = str.find(c, left)) != CString::npos) {
497 if (times != 0 && index >= times) {
498 return;
499 }
500 out.emplace_back(str.substr(left, pos - left));
501 left = pos + 1;
502 index++;
503 }
504
505 if ((times == 0 || index < times) && left < str.length()) {
506 out.emplace_back(str.substr(left));
507 }
508 }
509
510 static CString JoinString(const CVector<CString>& strs, size_t startIndex, size_t endIndex, char c = '/')
511 {
512 CString out;
513 for (size_t index = startIndex; index < strs.size() && index <= endIndex; ++index) {
514 if (!strs[index].empty()) {
515 out.append(strs[index]) += c;
516 }
517 }
518 if (!out.empty()) {
519 out.pop_back();
520 }
521 return out;
522 }
523 };
524 } // namespace panda::ecmascript::base
525 #endif // ECMASCRIPT_BASE_STRING_HELP_H
526