• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18 
#include <macros.h>
#include <utils/arena_containers.h>

#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
27 
28 namespace panda::es2panda::util {
29 
30 class StringView {
31 public:
32     explicit StringView() noexcept = default;
StringView(const ArenaString * str)33     explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const std::string_view & sv)35     StringView(const std::string_view &sv) noexcept : sv_(sv) {}
36     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)37     StringView(const char *str) noexcept : sv_(str) {}
38     DEFAULT_COPY_SEMANTIC(StringView);
39     DEFAULT_MOVE_SEMANTIC(StringView);
40     ~StringView() = default;
41 
42     bool operator==(const StringView &rhs) const noexcept
43     {
44         return sv_ == rhs.sv_;
45     }
46 
47     bool operator!=(const StringView &rhs) const noexcept
48     {
49         return sv_ != rhs.sv_;
50     }
51 
52     bool operator<(const StringView &rhs) const noexcept
53     {
54         return sv_ < rhs.sv_;
55     }
56 
57     bool operator>(const StringView &rhs) const noexcept
58     {
59         return sv_ > rhs.sv_;
60     }
61 
Compare(const StringView & other)62     int Compare(const StringView &other) const noexcept
63     {
64         return sv_.compare(other.sv_);
65     }
66 
Compare(const std::string_view & other)67     int Compare(const std::string_view &other) const noexcept
68     {
69         return sv_.compare(other);
70     }
71 
Is(const char * str)72     bool Is(const char *str) const noexcept
73     {
74         return sv_ == str;
75     }
76 
Is(const std::string_view & str)77     bool Is(const std::string_view &str) const noexcept
78     {
79         return sv_ == str;
80     }
81 
Length()82     size_t Length() const noexcept
83     {
84         return sv_.length();
85     }
86 
Empty()87     bool Empty() const noexcept
88     {
89         return sv_.empty();
90     }
91 
Utf8()92     const std::string_view &Utf8() const noexcept
93     {
94         return sv_;
95     }
96 
string()97     explicit operator std::string() const noexcept
98     {
99         return std::string {sv_};
100     }
101 
Bytes()102     const char *Bytes() const noexcept
103     {
104         return sv_.data();
105     }
106 
Substr(size_t begin,size_t end)107     StringView Substr(size_t begin, size_t end) const noexcept
108     {
109         return StringView(std::string_view(sv_.data() + begin, end - begin));
110     }
111 
Find(const char * str)112     constexpr size_t Find(const char *str) const
113     {
114         return sv_.find(str);
115     }
116 
StartsWith(const std::string_view str)117     bool StartsWith(const std::string_view str) const noexcept
118     {
119         auto const length = str.size();
120         return sv_.size() >= length && sv_.substr(0U, length) == str;
121     }
122 
IsHighSurrogate(char32_t cp)123     static bool IsHighSurrogate(char32_t cp)
124     {
125         return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
126     }
127 
IsLowSurrogate(char32_t cp)128     static bool IsLowSurrogate(char32_t cp)
129     {
130         return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
131     }
132 
133     std::string Mutf8() const noexcept;
134     static char32_t DecodeSurrogates(char32_t high, char32_t low);
135     static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
136 
137     template <void encoder(std::string *, char32_t)>
138     std::string EscapeSymbol() const;
139 
140     template <typename T>
141     static void Utf8Encode(T *str, char32_t cu);
142     template <typename T>
143     static void Mutf8Encode(T *str, char32_t cu);
144 
145     class Iterator {
146     public:
147         static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
148 
Iterator(const StringView & sv)149         explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
150         DEFAULT_COPY_SEMANTIC(Iterator);
151         DEFAULT_MOVE_SEMANTIC(Iterator);
152         ~Iterator() = default;
153 
Index()154         inline size_t Index() const
155         {
156             return static_cast<size_t>(iter_ - sv_.begin());
157         }
158 
Next()159         inline char32_t Next()
160         {
161             return DecodeCP<true>(nullptr);
162         }
163 
Peek()164         inline char32_t Peek() const
165         {
166             return HasNext() ? *iter_ : INVALID_CP;
167         }
168 
PeekCp()169         inline char32_t PeekCp() const
170         {
171             return DecodeCP<false>(nullptr);
172         }
173 
PeekCp(size_t * cpSize)174         inline char32_t PeekCp(size_t *cpSize) const
175         {
176             return DecodeCP<false, true>(cpSize);
177         }
178 
Forward(size_t offset)179         inline void Forward(size_t offset) const
180         {
181             iter_ += offset;
182         }
183 
Backward(size_t offset)184         inline void Backward(size_t offset) const
185         {
186             iter_ -= offset;
187         }
188 
Reset(size_t offset)189         inline void Reset(size_t offset)
190         {
191             iter_ = sv_.begin() + offset;
192         }
193 
Rewind(std::string_view::const_iterator pos)194         inline void Rewind(std::string_view::const_iterator pos) const
195         {
196             iter_ = pos;
197         }
198 
Save()199         inline std::string_view::const_iterator Save() const
200         {
201             return iter_;
202         }
203 
HasNext()204         inline bool HasNext() const
205         {
206             return iter_ != sv_.end();
207         }
208 
HasExpectedNumberOfBytes(size_t count)209         bool HasExpectedNumberOfBytes(size_t count) const
210         {
211             for (size_t i = 0; i < count; ++i) {
212                 if (!HasNext()) {
213                     return false;
214                 }
215                 ++iter_;
216             }
217             iter_ -= count;
218             return true;
219         }
220 
221         void SkipCp() const;
222 
223     private:
224         template <bool moveIter, bool setCpSize = false>
225         char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
226 
227         std::string_view sv_;
228         mutable std::string_view::const_iterator iter_;
229     };
230 
231 private:
232     class Constants {
233     public:
234         static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
235         static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
236         static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
237 
238         static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
239         static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
240         static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
241 
242         static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
243         static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
244 
245         static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
246         static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
247         static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
248 
249         static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
250         static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
251         static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
252 
253         static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
254         static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
255 
256         static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
257         static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
258         static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
259         static constexpr size_t UTF8_NEXT_FOUR_BYTE = 4;
260 
261         static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
262         static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
263         static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
264         static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
265         static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
266         static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
267     };
268 
269     friend class Iterator;
270     std::string_view sv_;
271 };
272 
273 class UString {
274 public:
275     UString() = default;
UString(ArenaAllocator * allocator)276     explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)277     explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
278     {
279         Alloc();
280         *str_ = str;
281     }
282 
283     DEFAULT_COPY_SEMANTIC(UString);
284     DEFAULT_MOVE_SEMANTIC(UString);
285     ~UString() = default;
286 
View()287     util::StringView View() const
288     {
289         if (!str_) {
290             return util::StringView();
291         }
292 
293         return util::StringView(str_);
294     }
295 
Append(char32_t ch)296     void Append(char32_t ch) noexcept
297     {
298         if (!str_) {
299             Alloc();
300         }
301 
302         StringView::Utf8Encode<ArenaString>(str_, ch);
303     }
304 
Append(const StringView & other)305     void Append(const StringView &other) noexcept
306     {
307         if (!str_) {
308             Alloc();
309         }
310 
311         *str_ += other.Utf8();
312     }
313 
Append(const char * other)314     void Append(const char *other) noexcept
315     {
316         if (!str_) {
317             Alloc();
318         }
319         *str_ += other;
320     }
321 
322 private:
Alloc()323     void Alloc()
324     {
325         str_ = allocator_->New<ArenaString>(allocator_->Adapter());
326         CHECK_NOT_NULL(str_);
327     }
328 
329 protected:
330     ArenaString *str_ {};
331     ArenaAllocator *allocator_ {};
332 };
333 
334 template <bool moveIter, bool setCpSize>
DecodeCP(size_t * cpSize)335 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
336 {
337     if (!HasNext()) {
338         return INVALID_CP;
339     }
340 
341     const auto *iterNext = iter_;
342 
343     char32_t cu0 = static_cast<uint8_t>(*iterNext++);
344     char32_t res {};
345 
346     if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
347         res = cu0;
348     } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
349         // Should be 2 bytes decoded in UTF-8
350         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
351             return INVALID_CP;
352         }
353         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
354         res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
355     } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
356         // Should be 3 bytes decoded in UTF-8
357         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
358             return INVALID_CP;
359         }
360         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
361         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
362         res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
363               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
364     } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
365                (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
366         // Should be 4 bytes decoded in UTF-8
367         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_FOUR_BYTE)) {
368             return INVALID_CP;
369         }
370         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
371         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
372         char32_t cu3 = static_cast<uint8_t>(*iterNext++);
373         res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
374               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
375               ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
376     } else {
377         res = INVALID_CP;
378     }
379 
380     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
381     if constexpr (moveIter) {
382         iter_ = iterNext;
383         return res;
384     }
385 
386     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
387     if constexpr (setCpSize) {
388         *cpSize = iterNext - iter_;
389     }
390 
391     return res;
392 }
393 
394 template <void encoder(std::string *, char32_t)>
EscapeSymbol()395 std::string StringView::EscapeSymbol() const
396 {
397     std::string str;
398     str.reserve(Length());
399 
400     Iterator iter(*this);
401 
402     while (iter.HasNext()) {
403         auto cp = iter.Next();
404 
405         switch (cp) {
406             case '\r': {
407                 if (iter.HasNext()) {
408                     iter.Forward(1);
409 
410                     if (iter.Peek() != '\n') {
411                         iter.Backward(1);
412                     }
413                 }
414 
415                 [[fallthrough]];
416             }
417             case '\n': {
418                 str += "\\n";
419                 break;
420             }
421             case '\b': {
422                 str += "\\b";
423                 break;
424             }
425             case '\t': {
426                 str += "\\t";
427                 break;
428             }
429             case '\f': {
430                 str += "\\f";
431                 break;
432             }
433             case '"': {
434                 str += "\\\"";
435                 break;
436             }
437             case '\\': {
438                 str += "\\\\";
439                 break;
440             }
441             default: {
442                 encoder(&str, cp);
443             }
444         }
445     }
446 
447     return str;
448 }
449 
450 template <typename T>
Utf8Encode(T * str,char32_t cu)451 void StringView::Utf8Encode(T *str, char32_t cu)
452 {
453     if (cu < Constants::UTF8_1BYTE_LIMIT) {
454         str->push_back(static_cast<char>(cu));
455     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
456         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
457                                          Constants::UTF8_2BYTE_HEADER));
458         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
459     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
460         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
461                                          Constants::UTF8_3BYTE_HEADER));
462         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463                                          Constants::UTF8_CONT_HEADER));
464         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
465     } else {
466         str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
467                                          Constants::UTF8_4BYTE_HEADER));
468         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
469                                          Constants::UTF8_CONT_HEADER));
470         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
471                                          Constants::UTF8_CONT_HEADER));
472         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
473     }
474 }
475 
476 template <typename T>
Mutf8Encode(T * str,char32_t cu)477 void StringView::Mutf8Encode(T *str, char32_t cu)
478 {
479     if (cu == 0) {
480         str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
481         str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
482     } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
483         str->push_back(static_cast<char>(cu));
484     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
485         str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
486         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
487     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
488         str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
489         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
490                                          Constants::UTF8_CONT_HEADER));
491         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
492     } else {
493         auto [cu1, cu2] = EncodeSurrogate(cu);
494         Mutf8Encode(str, cu1);
495         Mutf8Encode(str, cu2);
496     }
497 }
498 
499 }  // namespace panda::es2panda::util
500 
501 // NOLINTNEXTLINE(cert-dcl58-cpp)
502 namespace std {
503 
504 template <>
505 // NOLINTNEXTLINE(altera-struct-pack-align)
506 struct hash<panda::es2panda::util::StringView> {
507     std::size_t operator()(const panda::es2panda::util::StringView &str) const
508     {
509         return std::hash<std::string_view> {}(str.Utf8());
510     }
511 };
512 
513 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
514 
515 }  // namespace std
516 
// Debug-only output stream. With NDEBUG defined, `DCOUT << x` parses as `false && (std::cout << x)`,
// so the short-circuit keeps the stream expression from being evaluated.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
522 
523 #endif
524