1 /*
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
19 #include <macros.h>
20 #include <utils/arena_containers.h>
21
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27
28 namespace panda::es2panda::util {
29
30 class StringView {
31 public:
32 explicit StringView() noexcept = default;
StringView(const ArenaString * str)33 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const std::string_view & sv)35 StringView(const std::string_view &sv) noexcept : sv_(sv) {}
36 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)37 StringView(const char *str) noexcept : sv_(str) {}
38 DEFAULT_COPY_SEMANTIC(StringView);
39 DEFAULT_MOVE_SEMANTIC(StringView);
40 ~StringView() = default;
41
42 bool operator==(const StringView &rhs) const noexcept
43 {
44 return sv_ == rhs.sv_;
45 }
46
47 bool operator!=(const StringView &rhs) const noexcept
48 {
49 return sv_ != rhs.sv_;
50 }
51
52 bool operator<(const StringView &rhs) const noexcept
53 {
54 return sv_ < rhs.sv_;
55 }
56
57 bool operator>(const StringView &rhs) const noexcept
58 {
59 return sv_ > rhs.sv_;
60 }
61
Compare(const StringView & other)62 int Compare(const StringView &other) const noexcept
63 {
64 return sv_.compare(other.sv_);
65 }
66
Compare(const std::string_view & other)67 int Compare(const std::string_view &other) const noexcept
68 {
69 return sv_.compare(other);
70 }
71
Is(const char * str)72 bool Is(const char *str) const noexcept
73 {
74 return sv_ == str;
75 }
76
Is(const std::string_view & str)77 bool Is(const std::string_view &str) const noexcept
78 {
79 return sv_ == str;
80 }
81
Length()82 size_t Length() const noexcept
83 {
84 return sv_.length();
85 }
86
Empty()87 bool Empty() const noexcept
88 {
89 return sv_.empty();
90 }
91
Utf8()92 const std::string_view &Utf8() const noexcept
93 {
94 return sv_;
95 }
96
string()97 explicit operator std::string() const noexcept
98 {
99 return std::string {sv_};
100 }
101
Bytes()102 const char *Bytes() const noexcept
103 {
104 return sv_.data();
105 }
106
Substr(size_t begin,size_t end)107 StringView Substr(size_t begin, size_t end) const noexcept
108 {
109 return StringView(std::string_view(sv_.data() + begin, end - begin));
110 }
111
Find(const char * str)112 constexpr size_t Find(const char *str) const
113 {
114 return sv_.find(str);
115 }
116
StartsWith(const std::string_view str)117 bool StartsWith(const std::string_view str) const noexcept
118 {
119 auto const length = str.size();
120 return sv_.size() >= length && sv_.substr(0U, length) == str;
121 }
122
IsHighSurrogate(char32_t cp)123 static bool IsHighSurrogate(char32_t cp)
124 {
125 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
126 }
127
IsLowSurrogate(char32_t cp)128 static bool IsLowSurrogate(char32_t cp)
129 {
130 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
131 }
132
133 std::string Mutf8() const noexcept;
134 static char32_t DecodeSurrogates(char32_t high, char32_t low);
135 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
136
137 template <void encoder(std::string *, char32_t)>
138 std::string EscapeSymbol() const;
139
140 template <typename T>
141 static void Utf8Encode(T *str, char32_t cu);
142 template <typename T>
143 static void Mutf8Encode(T *str, char32_t cu);
144
145 class Iterator {
146 public:
147 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
148
Iterator(const StringView & sv)149 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
150 DEFAULT_COPY_SEMANTIC(Iterator);
151 DEFAULT_MOVE_SEMANTIC(Iterator);
152 ~Iterator() = default;
153
Index()154 inline size_t Index() const
155 {
156 return static_cast<size_t>(iter_ - sv_.begin());
157 }
158
Next()159 inline char32_t Next()
160 {
161 return DecodeCP<true>(nullptr);
162 }
163
Peek()164 inline char32_t Peek() const
165 {
166 return HasNext() ? *iter_ : INVALID_CP;
167 }
168
PeekCp()169 inline char32_t PeekCp() const
170 {
171 return DecodeCP<false>(nullptr);
172 }
173
PeekCp(size_t * cpSize)174 inline char32_t PeekCp(size_t *cpSize) const
175 {
176 return DecodeCP<false, true>(cpSize);
177 }
178
Forward(size_t offset)179 inline void Forward(size_t offset) const
180 {
181 iter_ += offset;
182 }
183
Backward(size_t offset)184 inline void Backward(size_t offset) const
185 {
186 iter_ -= offset;
187 }
188
Reset(size_t offset)189 inline void Reset(size_t offset)
190 {
191 iter_ = sv_.begin() + offset;
192 }
193
Rewind(std::string_view::const_iterator pos)194 inline void Rewind(std::string_view::const_iterator pos) const
195 {
196 iter_ = pos;
197 }
198
Save()199 inline std::string_view::const_iterator Save() const
200 {
201 return iter_;
202 }
203
HasNext()204 inline bool HasNext() const
205 {
206 return iter_ != sv_.end();
207 }
208
HasExpectedNumberOfBytes(size_t count)209 bool HasExpectedNumberOfBytes(size_t count) const
210 {
211 for (size_t i = 0; i < count; ++i) {
212 if (!HasNext()) {
213 return false;
214 }
215 ++iter_;
216 }
217 iter_ -= count;
218 return true;
219 }
220
221 void SkipCp() const;
222
223 private:
224 template <bool moveIter, bool setCpSize = false>
225 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
226
227 std::string_view sv_;
228 mutable std::string_view::const_iterator iter_;
229 };
230
231 private:
232 class Constants {
233 public:
234 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
235 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
236 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
237
238 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
239 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
240 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
241
242 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
243 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
244
245 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
246 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
247 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
248
249 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
250 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
251 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
252
253 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
254 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
255
256 static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
257 static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
258 static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
259 static constexpr size_t UTF8_NEXT_FOUR_BYTE = 4;
260
261 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
262 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
263 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
264 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
265 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
266 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
267 };
268
269 friend class Iterator;
270 std::string_view sv_;
271 };
272
273 class UString {
274 public:
275 UString() = default;
UString(ArenaAllocator * allocator)276 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)277 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
278 {
279 Alloc();
280 *str_ = str;
281 }
282
283 DEFAULT_COPY_SEMANTIC(UString);
284 DEFAULT_MOVE_SEMANTIC(UString);
285 ~UString() = default;
286
View()287 util::StringView View() const
288 {
289 if (!str_) {
290 return util::StringView();
291 }
292
293 return util::StringView(str_);
294 }
295
Append(char32_t ch)296 void Append(char32_t ch) noexcept
297 {
298 if (!str_) {
299 Alloc();
300 }
301
302 StringView::Utf8Encode<ArenaString>(str_, ch);
303 }
304
Append(const StringView & other)305 void Append(const StringView &other) noexcept
306 {
307 if (!str_) {
308 Alloc();
309 }
310
311 *str_ += other.Utf8();
312 }
313
Append(const char * other)314 void Append(const char *other) noexcept
315 {
316 if (!str_) {
317 Alloc();
318 }
319 *str_ += other;
320 }
321
322 private:
Alloc()323 void Alloc()
324 {
325 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
326 CHECK_NOT_NULL(str_);
327 }
328
329 protected:
330 ArenaString *str_ {};
331 ArenaAllocator *allocator_ {};
332 };
333
334 template <bool moveIter, bool setCpSize>
DecodeCP(size_t * cpSize)335 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
336 {
337 if (!HasNext()) {
338 return INVALID_CP;
339 }
340
341 const auto *iterNext = iter_;
342
343 char32_t cu0 = static_cast<uint8_t>(*iterNext++);
344 char32_t res {};
345
346 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
347 res = cu0;
348 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
349 // Should be 2 bytes decoded in UTF-8
350 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
351 return INVALID_CP;
352 }
353 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
354 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
355 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
356 // Should be 3 bytes decoded in UTF-8
357 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
358 return INVALID_CP;
359 }
360 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
361 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
362 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
363 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
364 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
365 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
366 // Should be 4 bytes decoded in UTF-8
367 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_FOUR_BYTE)) {
368 return INVALID_CP;
369 }
370 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
371 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
372 char32_t cu3 = static_cast<uint8_t>(*iterNext++);
373 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
374 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
375 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
376 } else {
377 res = INVALID_CP;
378 }
379
380 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
381 if constexpr (moveIter) {
382 iter_ = iterNext;
383 return res;
384 }
385
386 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
387 if constexpr (setCpSize) {
388 *cpSize = iterNext - iter_;
389 }
390
391 return res;
392 }
393
394 template <void encoder(std::string *, char32_t)>
EscapeSymbol()395 std::string StringView::EscapeSymbol() const
396 {
397 std::string str;
398 str.reserve(Length());
399
400 Iterator iter(*this);
401
402 while (iter.HasNext()) {
403 auto cp = iter.Next();
404
405 switch (cp) {
406 case '\r': {
407 if (iter.HasNext()) {
408 iter.Forward(1);
409
410 if (iter.Peek() != '\n') {
411 iter.Backward(1);
412 }
413 }
414
415 [[fallthrough]];
416 }
417 case '\n': {
418 str += "\\n";
419 break;
420 }
421 case '\b': {
422 str += "\\b";
423 break;
424 }
425 case '\t': {
426 str += "\\t";
427 break;
428 }
429 case '\f': {
430 str += "\\f";
431 break;
432 }
433 case '"': {
434 str += "\\\"";
435 break;
436 }
437 case '\\': {
438 str += "\\\\";
439 break;
440 }
441 default: {
442 encoder(&str, cp);
443 }
444 }
445 }
446
447 return str;
448 }
449
450 template <typename T>
Utf8Encode(T * str,char32_t cu)451 void StringView::Utf8Encode(T *str, char32_t cu)
452 {
453 if (cu < Constants::UTF8_1BYTE_LIMIT) {
454 str->push_back(static_cast<char>(cu));
455 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
456 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
457 Constants::UTF8_2BYTE_HEADER));
458 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
459 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
460 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
461 Constants::UTF8_3BYTE_HEADER));
462 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463 Constants::UTF8_CONT_HEADER));
464 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
465 } else {
466 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
467 Constants::UTF8_4BYTE_HEADER));
468 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
469 Constants::UTF8_CONT_HEADER));
470 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
471 Constants::UTF8_CONT_HEADER));
472 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
473 }
474 }
475
476 template <typename T>
Mutf8Encode(T * str,char32_t cu)477 void StringView::Mutf8Encode(T *str, char32_t cu)
478 {
479 if (cu == 0) {
480 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
481 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
482 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
483 str->push_back(static_cast<char>(cu));
484 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
485 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
486 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
487 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
488 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
489 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
490 Constants::UTF8_CONT_HEADER));
491 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
492 } else {
493 auto [cu1, cu2] = EncodeSurrogate(cu);
494 Mutf8Encode(str, cu1);
495 Mutf8Encode(str, cu2);
496 }
497 }
498
499 } // namespace panda::es2panda::util
500
501 // NOLINTNEXTLINE(cert-dcl58-cpp)
502 namespace std {
503
504 template <>
505 // NOLINTNEXTLINE(altera-struct-pack-align)
506 struct hash<panda::es2panda::util::StringView> {
507 std::size_t operator()(const panda::es2panda::util::StringView &str) const
508 {
509 return std::hash<std::string_view> {}(str.Utf8());
510 }
511 };
512
513 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
514
515 } // namespace std
516
// DCOUT: debug-only output stream.
// Without NDEBUG it is std::cout. With NDEBUG it expands to `false && std::cout`; because `<<` binds
// tighter than `&&`, `DCOUT << x` becomes `false && (std::cout << x)` and the output expression is
// never evaluated. The expansion must therefore stay unparenthesized.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
522
523 #endif
524