1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
19 #include "macros.h"
20 #include "utils/arena_containers.h"
21
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27
28 namespace ark::es2panda::util {
29 class StringView {
30 public:
31 explicit StringView() noexcept = default;
StringView(const ArenaString * str)32 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
33 // CC-OFFNXT(G.FMT.06-CPP,G.CLS.03-CPP) same as clang, project code style
34 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35 StringView(std::string_view sv) noexcept : sv_(sv) {}
36 // CC-OFFNXT(G.FMT.06-CPP, G.CLS.03-CPP) same as clang, project code style
37 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)38 StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {}
39 DEFAULT_COPY_SEMANTIC(StringView);
40 DEFAULT_MOVE_SEMANTIC(StringView);
41 ~StringView() = default;
42
43 bool operator==(const StringView &rhs) const noexcept
44 {
45 return sv_ == rhs.sv_;
46 }
47
48 bool operator!=(const StringView &rhs) const noexcept
49 {
50 return sv_ != rhs.sv_;
51 }
52
53 bool operator<(const StringView &rhs) const noexcept
54 {
55 return sv_ < rhs.sv_;
56 }
57
58 bool operator>(const StringView &rhs) const noexcept
59 {
60 return sv_ > rhs.sv_;
61 }
62
Compare(const StringView & other)63 int Compare(const StringView &other) const noexcept
64 {
65 return sv_.compare(other.sv_);
66 }
67
Compare(const std::string_view & other)68 int Compare(const std::string_view &other) const noexcept
69 {
70 return sv_.compare(other);
71 }
72
Is(const char * str)73 bool Is(const char *str) const noexcept
74 {
75 return sv_ == str;
76 }
77
Is(const std::string_view str)78 bool Is(const std::string_view str) const noexcept
79 {
80 return sv_ == str;
81 }
82
StartsWith(const std::string_view str)83 bool StartsWith(const std::string_view str) const noexcept
84 {
85 auto const length = str.size();
86 return sv_.size() >= length && sv_.substr(0U, length) == str;
87 }
88
EndsWith(const std::string_view str)89 bool EndsWith(const std::string_view str) const noexcept
90 {
91 auto const myLength = sv_.size();
92 auto const strLength = str.size();
93 return myLength >= strLength && sv_.substr(myLength - strLength, strLength) == str;
94 }
95
Length()96 size_t Length() const noexcept
97 {
98 return sv_.length();
99 }
100
Empty()101 bool Empty() const noexcept
102 {
103 return sv_.empty();
104 }
105
Utf8()106 const std::string_view &Utf8() const noexcept
107 {
108 return sv_;
109 }
110
string()111 explicit operator std::string() const noexcept
112 {
113 return std::string {sv_};
114 }
115
Bytes()116 const char *Bytes() const noexcept
117 {
118 return sv_.data();
119 }
120
Substr(size_t begin,size_t end)121 StringView Substr(size_t begin, size_t end) const noexcept
122 {
123 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
124 return StringView(std::string_view(sv_.data() + begin, end - begin));
125 }
126
IsHighSurrogate(char32_t cp)127 static bool IsHighSurrogate(char32_t cp)
128 {
129 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
130 }
131
IsLowSurrogate(char32_t cp)132 static bool IsLowSurrogate(char32_t cp)
133 {
134 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
135 }
136
137 std::string Mutf8() const noexcept;
138 static char32_t DecodeSurrogates(char32_t high, char32_t low);
139 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
140
141 template <void ENCODER(std::string *, char32_t)>
142 std::string EscapeSymbol() const;
143
144 template <typename T>
145 static void Utf8Encode(T *str, char32_t cu);
146 template <typename T>
147 static void Mutf8Encode(T *str, char32_t cu);
148
149 bool IsConvertibleToChar() const;
150
151 class Iterator {
152 public:
153 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
154
Iterator(const StringView & sv)155 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
156 DEFAULT_COPY_SEMANTIC(Iterator);
157 DEFAULT_MOVE_SEMANTIC(Iterator);
158 ~Iterator() = default;
159
Index()160 inline size_t Index() const
161 {
162 return static_cast<size_t>(iter_ - sv_.begin());
163 }
164
Next()165 inline char32_t Next()
166 {
167 return DecodeCP<true>(nullptr);
168 }
169
Peek()170 inline char32_t Peek() const
171 {
172 return HasNext() ? *iter_ : INVALID_CP;
173 }
174
PeekCp()175 inline char32_t PeekCp() const
176 {
177 return DecodeCP<false>(nullptr);
178 }
179
PeekCp(size_t * cpSize)180 inline char32_t PeekCp(size_t *cpSize) const
181 {
182 return DecodeCP<false, true>(cpSize);
183 }
184
Forward(size_t offset)185 inline void Forward(size_t offset)
186 {
187 iter_ += offset;
188 }
189
Backward(size_t offset)190 inline void Backward(size_t offset)
191 {
192 iter_ -= offset;
193 }
194
Reset(size_t offset)195 inline void Reset(size_t offset)
196 {
197 iter_ = sv_.begin() + offset;
198 }
199
Rewind(std::string_view::const_iterator pos)200 inline void Rewind(std::string_view::const_iterator pos)
201 {
202 iter_ = pos;
203 }
204
Save()205 inline std::string_view::const_iterator Save() const
206 {
207 return iter_;
208 }
209
HasNext()210 inline bool HasNext() const
211 {
212 return iter_ < sv_.end();
213 }
214
215 void SkipCp();
216
217 private:
218 template <bool MOVE_ITER, bool SET_CP_SIZE = false>
219 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220
221 std::string_view sv_;
222 mutable std::string_view::const_iterator iter_;
223 };
224
225 class Constants {
226 public:
227 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
228 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
229 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
230
231 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
232 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
233 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
234
235 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
236 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
237
238 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
239 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
240 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
241
242 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
243 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
244 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
245
246 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
247 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
248
249 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
250 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
251 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
252 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
253 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
254 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
255 };
256
257 private:
258 friend class Iterator;
259 std::string_view sv_;
260 };
261
262 class UString {
263 public:
264 UString() = default;
UString(ArenaAllocator * allocator)265 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)266 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
267 {
268 Alloc();
269 *str_ = str;
270 }
271
UString(const std::string_view & str,ArenaAllocator * allocator)272 explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator)
273 {
274 Alloc();
275 *str_ = str;
276 }
277
UString(const util::StringView & str,ArenaAllocator * allocator)278 explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {}
279
280 DEFAULT_COPY_SEMANTIC(UString);
281 DEFAULT_MOVE_SEMANTIC(UString);
282 ~UString() = default;
283
View()284 util::StringView View() const
285 {
286 if (str_ == nullptr) {
287 return util::StringView();
288 }
289
290 return util::StringView(str_);
291 }
292
View()293 util::StringView View()
294 {
295 if (str_ == nullptr) {
296 return util::StringView();
297 }
298
299 return util::StringView(str_);
300 }
301
Append(char32_t ch)302 util::UString &Append(char32_t ch) noexcept
303 {
304 if (str_ == nullptr) {
305 Alloc();
306 }
307
308 StringView::Utf8Encode<ArenaString>(str_, ch);
309 return *this;
310 }
311
Append(const StringView & other)312 util::UString &Append(const StringView &other) noexcept
313 {
314 if (str_ == nullptr) {
315 Alloc();
316 }
317
318 *str_ += other.Utf8();
319 return *this;
320 }
321
Append(const char * other)322 util::UString &Append(const char *other) noexcept
323 {
324 if (str_ == nullptr) {
325 Alloc();
326 }
327
328 *str_ += other;
329 return *this;
330 }
331
Append(const std::string & other)332 util::UString &Append(const std::string &other) noexcept
333 {
334 if (str_ == nullptr) {
335 Alloc();
336 }
337
338 *str_ += other;
339 return *this;
340 }
341
342 private:
Alloc()343 void Alloc()
344 {
345 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
346 }
347
348 protected:
349 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
350 ArenaString *str_ {};
351 ArenaAllocator *allocator_ {};
352 // NOLINTEND(misc-non-private-member-variables-in-classes)
353 };
354
355 template <bool MOVE_ITER, bool SET_CP_SIZE>
DecodeCP(size_t * cpSize)356 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
357 {
358 if (!HasNext()) {
359 return INVALID_CP;
360 }
361
362 const auto *iterNext = iter_;
363
364 char32_t cu0 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
365 char32_t res {};
366
367 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
368 res = cu0;
369 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
370 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
371 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
372 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
373 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
375 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
376 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
377 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
378 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
379 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
380 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
381 char32_t cu3 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
382 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
383 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
384 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
385 } else {
386 res = INVALID_CP;
387 }
388
389 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
390 if constexpr (MOVE_ITER) {
391 iter_ = iterNext;
392 return res;
393 }
394
395 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
396 if constexpr (SET_CP_SIZE) {
397 *cpSize = iterNext - iter_;
398 }
399
400 return res;
401 }
402
403 template <void ENCODER(std::string *, char32_t)>
EscapeSymbol()404 std::string StringView::EscapeSymbol() const
405 {
406 std::string str;
407 str.reserve(Length());
408
409 auto skipNewLine = [](auto &iter) {
410 if (iter.HasNext()) {
411 iter.Forward(1);
412
413 if (iter.Peek() != '\n') {
414 iter.Backward(1);
415 }
416 }
417 };
418
419 Iterator iter(*this);
420 while (iter.HasNext()) {
421 auto cp = iter.Next();
422
423 switch (cp) {
424 case '\r': {
425 skipNewLine(iter);
426 [[fallthrough]];
427 }
428 case '\n': {
429 str += "\\n";
430 break;
431 }
432 case '\b': {
433 str += "\\b";
434 break;
435 }
436 case '\t': {
437 str += "\\t";
438 break;
439 }
440 case '\f': {
441 str += "\\f";
442 break;
443 }
444 case '"': {
445 str += "\\\"";
446 break;
447 }
448 case '\\': {
449 str += "\\\\";
450 break;
451 }
452 default: {
453 ENCODER(&str, cp);
454 }
455 }
456 }
457
458 return str;
459 }
460
461 template <typename T>
Utf8Encode(T * str,char32_t cu)462 void StringView::Utf8Encode(T *str, char32_t cu)
463 {
464 if (cu < Constants::UTF8_1BYTE_LIMIT) {
465 str->push_back(static_cast<char>(cu));
466 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
467 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
468 Constants::UTF8_2BYTE_HEADER));
469 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
470 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
471 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
472 Constants::UTF8_3BYTE_HEADER));
473 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
474 Constants::UTF8_CONT_HEADER));
475 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
476 } else {
477 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
478 Constants::UTF8_4BYTE_HEADER));
479 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
480 Constants::UTF8_CONT_HEADER));
481 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
482 Constants::UTF8_CONT_HEADER));
483 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
484 }
485 }
486
487 template <typename T>
Mutf8Encode(T * str,char32_t cu)488 void StringView::Mutf8Encode(T *str, char32_t cu)
489 {
490 if (cu == 0) {
491 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
492 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
493 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
494 str->push_back(static_cast<char>(cu));
495 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
496 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
497 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
498 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
499 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
500 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
501 Constants::UTF8_CONT_HEADER));
502 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
503 } else {
504 auto [cu1, cu2] = EncodeSurrogate(cu);
505 Mutf8Encode(str, cu1);
506 Mutf8Encode(str, cu2);
507 }
508 }
509 } // namespace ark::es2panda::util
510
511 // NOLINTNEXTLINE(cert-dcl58-cpp)
512 namespace std {
513
514 template <>
515 // NOLINTNEXTLINE(altera-struct-pack-align)
516 struct hash<ark::es2panda::util::StringView> {
517 std::size_t operator()(const ark::es2panda::util::StringView &str) const
518 {
519 return std::hash<std::string_view> {}(str.Utf8());
520 }
521 };
522
523 ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us);
524
525 } // namespace std
526
527 #endif
528