1 /**
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
19 #include "util/es2pandaMacros.h"
20 #include "utils/arena_containers.h"
21
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27
28 namespace ark::es2panda::util {
29 class StringView {
30 public:
31 explicit StringView() noexcept = default;
StringView(const ArenaString * str)32 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
33 // CC-OFFNXT(G.FMT.06-CPP,G.CLS.03-CPP) same as clang, project code style
34 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35 StringView(std::string_view sv) noexcept : sv_(sv) {}
36 // CC-OFFNXT(G.FMT.06-CPP, G.CLS.03-CPP) same as clang, project code style
37 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)38 StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {}
39 DEFAULT_COPY_SEMANTIC(StringView);
40 DEFAULT_MOVE_SEMANTIC(StringView);
41 ~StringView() = default;
42
43 bool operator==(const StringView &rhs) const noexcept
44 {
45 return sv_ == rhs.sv_;
46 }
47
48 bool operator!=(const StringView &rhs) const noexcept
49 {
50 return sv_ != rhs.sv_;
51 }
52
53 bool operator<(const StringView &rhs) const noexcept
54 {
55 return sv_ < rhs.sv_;
56 }
57
58 bool operator>(const StringView &rhs) const noexcept
59 {
60 return sv_ > rhs.sv_;
61 }
62
63 bool operator<=(const StringView &rhs) const noexcept
64 {
65 return sv_ <= rhs.sv_;
66 }
67
68 bool operator>=(const StringView &rhs) const noexcept
69 {
70 return sv_ >= rhs.sv_;
71 }
72
Compare(const StringView & other)73 int Compare(const StringView &other) const noexcept
74 {
75 return sv_.compare(other.sv_);
76 }
77
Compare(const std::string_view & other)78 int Compare(const std::string_view &other) const noexcept
79 {
80 return sv_.compare(other);
81 }
82
Is(const char * str)83 bool Is(const char *str) const noexcept
84 {
85 return sv_ == str;
86 }
87
Is(const std::string_view str)88 bool Is(const std::string_view str) const noexcept
89 {
90 return sv_ == str;
91 }
92
StartsWith(const std::string_view str)93 bool StartsWith(const std::string_view str) const noexcept
94 {
95 auto const length = str.size();
96 return sv_.size() >= length && sv_.substr(0U, length) == str;
97 }
98
EndsWith(const std::string_view str)99 bool EndsWith(const std::string_view str) const noexcept
100 {
101 auto const myLength = sv_.size();
102 auto const strLength = str.size();
103 return myLength >= strLength && sv_.substr(myLength - strLength, strLength) == str;
104 }
105
Length()106 size_t Length() const noexcept
107 {
108 return sv_.length();
109 }
110
Empty()111 bool Empty() const noexcept
112 {
113 return sv_.empty();
114 }
115
Utf8()116 std::string_view Utf8() const noexcept
117 {
118 return sv_;
119 }
120
string()121 explicit operator std::string() const noexcept
122 {
123 return std::string {sv_};
124 }
125
Bytes()126 const char *Bytes() const noexcept
127 {
128 return sv_.data();
129 }
130
Substr(size_t begin,size_t end)131 StringView Substr(size_t begin, size_t end) const noexcept
132 {
133 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
134 return StringView(std::string_view(sv_.data() + begin, end - begin));
135 }
136
IsHighSurrogate(char32_t cp)137 static bool IsHighSurrogate(char32_t cp)
138 {
139 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
140 }
141
IsLowSurrogate(char32_t cp)142 static bool IsLowSurrogate(char32_t cp)
143 {
144 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
145 }
146
147 std::string Mutf8() const noexcept;
148 static char32_t DecodeSurrogates(char32_t high, char32_t low);
149 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
150
151 template <void ENCODER(std::string *, char32_t)>
152 std::string EscapeSymbol() const;
153
154 template <typename T>
155 static void Utf8Encode(T *str, char32_t cu);
156 template <typename T>
157 static void Mutf8Encode(T *str, char32_t cu);
158
159 class Iterator {
160 public:
161 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
162
Iterator(const StringView & sv)163 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
164 DEFAULT_COPY_SEMANTIC(Iterator);
165 DEFAULT_MOVE_SEMANTIC(Iterator);
166 ~Iterator() = default;
167
Index()168 inline size_t Index() const
169 {
170 return static_cast<size_t>(iter_ - sv_.begin());
171 }
172
Next()173 inline char32_t Next()
174 {
175 return DecodeCP<true>(nullptr);
176 }
177
Peek()178 inline char32_t Peek() const
179 {
180 return HasNext() ? *iter_ : INVALID_CP;
181 }
182
PeekCp()183 inline char32_t PeekCp() const
184 {
185 return DecodeCP<false>(nullptr);
186 }
187
PeekCp(size_t * cpSize)188 inline char32_t PeekCp(size_t *cpSize) const
189 {
190 return DecodeCP<false, true>(cpSize);
191 }
192
Forward(size_t offset)193 inline void Forward(size_t offset)
194 {
195 iter_ += offset;
196 }
197
Backward(size_t offset)198 inline void Backward(size_t offset)
199 {
200 iter_ -= offset;
201 }
202
Reset(size_t offset)203 inline void Reset(size_t offset)
204 {
205 ES2PANDA_ASSERT(sv_.begin() + offset <= sv_.end());
206 iter_ = sv_.begin() + offset;
207 }
208
Rewind(std::string_view::const_iterator pos)209 inline void Rewind(std::string_view::const_iterator pos)
210 {
211 iter_ = pos;
212 }
213
Save()214 inline std::string_view::const_iterator Save() const
215 {
216 return iter_;
217 }
218
HasNext()219 inline bool HasNext() const
220 {
221 return iter_ < sv_.end();
222 }
223
224 void SkipCp();
225
226 private:
227 template <bool MOVE_ITER, bool SET_CP_SIZE = false>
228 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
229
230 std::string_view sv_;
231 mutable std::string_view::const_iterator iter_;
232 };
233
234 class Constants {
235 public:
236 static constexpr uint16_t UTF8_2BYTE_REQUIRED = 2;
237 static constexpr uint16_t UTF8_3BYTE_REQUIRED = 3;
238 static constexpr uint16_t UTF8_4BYTE_REQUIRED = 4;
239
240 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
241 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
242 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
243
244 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
245 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
246 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
247
248 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
249 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
250
251 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
252 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
253 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
254
255 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
256 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
257 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
258
259 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
260 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
261
262 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
263 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
264 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
265 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
266 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
267 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
268 };
269
270 private:
271 friend class Iterator;
272 std::string_view sv_;
273 };
274
275 class UString {
276 public:
277 UString() = default;
UString(ArenaAllocator * allocator)278 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)279 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
280 {
281 Alloc();
282 *str_ = str;
283 }
284
UString(const std::string_view & str,ArenaAllocator * allocator)285 explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator)
286 {
287 Alloc();
288 *str_ = str;
289 }
290
UString(const util::StringView & str,ArenaAllocator * allocator)291 explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {}
292
293 DEFAULT_COPY_SEMANTIC(UString);
294 DEFAULT_MOVE_SEMANTIC(UString);
295 ~UString() = default;
296
View()297 util::StringView View() const
298 {
299 if (str_ == nullptr) {
300 return util::StringView();
301 }
302
303 return util::StringView(str_);
304 }
305
View()306 util::StringView View()
307 {
308 if (str_ == nullptr) {
309 return util::StringView();
310 }
311
312 return util::StringView(str_);
313 }
314
Append(char32_t ch)315 util::UString &Append(char32_t ch) noexcept
316 {
317 if (str_ == nullptr) {
318 Alloc();
319 }
320
321 StringView::Utf8Encode<ArenaString>(str_, ch);
322 return *this;
323 }
324
Append(StringView other)325 util::UString &Append(StringView other) noexcept
326 {
327 if (str_ == nullptr) {
328 Alloc();
329 }
330
331 *str_ += other.Utf8();
332 return *this;
333 }
334
Append(std::string_view other)335 util::UString &Append(std::string_view other) noexcept
336 {
337 return Append(StringView(other));
338 }
339
Append(const char * other)340 util::UString &Append(const char *other) noexcept
341 {
342 if (str_ == nullptr) {
343 Alloc();
344 }
345
346 *str_ += other;
347 return *this;
348 }
349
Append(const std::string & other)350 util::UString &Append(const std::string &other) noexcept
351 {
352 if (str_ == nullptr) {
353 Alloc();
354 }
355
356 *str_ += other;
357 return *this;
358 }
359
360 private:
Alloc()361 void Alloc()
362 {
363 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
364 ES2PANDA_ASSERT(str_ != nullptr);
365 }
366
367 protected:
368 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
369 ArenaString *str_ {};
370 ArenaAllocator *allocator_ {};
371 // NOLINTEND(misc-non-private-member-variables-in-classes)
372 };
373
374 template <bool MOVE_ITER, bool SET_CP_SIZE>
DecodeCP(size_t * cpSize)375 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
376 {
377 if (!HasNext()) {
378 return INVALID_CP;
379 }
380
381 const auto *iterNext = iter_;
382 const auto remain = static_cast<size_t>(sv_.end() - iterNext);
383
384 char32_t cu0 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
385 char32_t res {};
386
387 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
388 res = cu0;
389 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER &&
390 remain >= Constants::UTF8_2BYTE_REQUIRED) {
391 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
392 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
393 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER &&
394 remain >= Constants::UTF8_3BYTE_REQUIRED) {
395 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
396 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
397 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
398 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
399 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
400 cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT && remain >= Constants::UTF8_4BYTE_REQUIRED) {
401 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
402 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
403 char32_t cu3 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
404 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
405 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
406 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
407 } else {
408 res = INVALID_CP;
409 }
410
411 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
412 if constexpr (MOVE_ITER) {
413 iter_ = iterNext;
414 return res;
415 }
416
417 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
418 if constexpr (SET_CP_SIZE) {
419 *cpSize = iterNext - iter_;
420 }
421
422 return res;
423 }
424
425 template <void ENCODER(std::string *, char32_t)>
EscapeSymbol()426 std::string StringView::EscapeSymbol() const
427 {
428 std::string str;
429 str.reserve(Length());
430
431 auto skipNewLine = [](auto &iter) {
432 if (iter.HasNext()) {
433 iter.Forward(1);
434
435 if (iter.Peek() != '\n') {
436 iter.Backward(1);
437 }
438 }
439 };
440
441 Iterator iter(*this);
442 while (iter.HasNext()) {
443 auto cp = iter.Next();
444
445 switch (cp) {
446 case '\r': {
447 skipNewLine(iter);
448 [[fallthrough]];
449 }
450 case '\n': {
451 str += "\\n";
452 break;
453 }
454 case '\b': {
455 str += "\\b";
456 break;
457 }
458 case '\t': {
459 str += "\\t";
460 break;
461 }
462 case '\f': {
463 str += "\\f";
464 break;
465 }
466 case '"': {
467 str += "\\\"";
468 break;
469 }
470 case '\\': {
471 str += "\\\\";
472 break;
473 }
474 default: {
475 ENCODER(&str, cp);
476 }
477 }
478 }
479
480 return str;
481 }
482
483 template <typename T>
Utf8Encode(T * str,char32_t cu)484 void StringView::Utf8Encode(T *str, char32_t cu)
485 {
486 ES2PANDA_ASSERT(str != nullptr);
487 if (cu < Constants::UTF8_1BYTE_LIMIT) {
488 str->push_back(static_cast<char>(cu));
489 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
490 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
491 Constants::UTF8_2BYTE_HEADER));
492 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
493 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
494 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
495 Constants::UTF8_3BYTE_HEADER));
496 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
497 Constants::UTF8_CONT_HEADER));
498 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
499 } else {
500 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
501 Constants::UTF8_4BYTE_HEADER));
502 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
503 Constants::UTF8_CONT_HEADER));
504 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
505 Constants::UTF8_CONT_HEADER));
506 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
507 }
508 }
509
510 template <typename T>
Mutf8Encode(T * str,char32_t cu)511 void StringView::Mutf8Encode(T *str, char32_t cu)
512 {
513 if (cu == 0) {
514 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
515 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
516 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
517 str->push_back(static_cast<char>(cu));
518 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
519 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
520 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
521 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
522 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
523 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
524 Constants::UTF8_CONT_HEADER));
525 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
526 } else {
527 auto [cu1, cu2] = EncodeSurrogate(cu);
528 Mutf8Encode(str, cu1);
529 Mutf8Encode(str, cu2);
530 }
531 }
532 } // namespace ark::es2panda::util
533
534 // NOLINTNEXTLINE(cert-dcl58-cpp)
535 namespace std {
536
537 template <>
538 // NOLINTNEXTLINE(altera-struct-pack-align)
539 struct hash<ark::es2panda::util::StringView> {
540 std::size_t operator()(const ark::es2panda::util::StringView &str) const
541 {
542 return std::hash<std::string_view> {}(str.Utf8());
543 }
544 };
545
// Stream insertion operator for StringView. Declaration only: the definition is not part of
// this header, so what exactly gets written to `os` cannot be stated from here.
ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us);
547
548 } // namespace std
549
550 #endif
551