• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18 
19 #include "macros.h"
20 #include "utils/arena_containers.h"
21 
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27 
28 namespace ark::es2panda::util {
29 class StringView {
30 public:
31     explicit StringView() noexcept = default;
StringView(const ArenaString * str)32     explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
33     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)34     StringView(std::string_view sv) noexcept : sv_(sv) {}
35     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)36     StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {}
37     DEFAULT_COPY_SEMANTIC(StringView);
38     DEFAULT_MOVE_SEMANTIC(StringView);
39     ~StringView() = default;
40 
41     bool operator==(const StringView &rhs) const noexcept
42     {
43         return sv_ == rhs.sv_;
44     }
45 
46     bool operator!=(const StringView &rhs) const noexcept
47     {
48         return sv_ != rhs.sv_;
49     }
50 
51     bool operator<(const StringView &rhs) const noexcept
52     {
53         return sv_ < rhs.sv_;
54     }
55 
56     bool operator>(const StringView &rhs) const noexcept
57     {
58         return sv_ > rhs.sv_;
59     }
60 
Compare(const StringView & other)61     int Compare(const StringView &other) const noexcept
62     {
63         return sv_.compare(other.sv_);
64     }
65 
Compare(const std::string_view & other)66     int Compare(const std::string_view &other) const noexcept
67     {
68         return sv_.compare(other);
69     }
70 
Is(const char * str)71     bool Is(const char *str) const noexcept
72     {
73         return sv_ == str;
74     }
75 
Is(const std::string_view & str)76     bool Is(const std::string_view &str) const noexcept
77     {
78         return sv_ == str;
79     }
80 
Length()81     size_t Length() const noexcept
82     {
83         return sv_.length();
84     }
85 
Empty()86     bool Empty() const noexcept
87     {
88         return sv_.empty();
89     }
90 
Utf8()91     const std::string_view &Utf8() const noexcept
92     {
93         return sv_;
94     }
95 
string()96     explicit operator std::string() const noexcept
97     {
98         return std::string {sv_};
99     }
100 
Bytes()101     const char *Bytes() const noexcept
102     {
103         return sv_.data();
104     }
105 
Substr(size_t begin,size_t end)106     StringView Substr(size_t begin, size_t end) const noexcept
107     {
108         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
109         return StringView(std::string_view(sv_.data() + begin, end - begin));
110     }
111 
IsHighSurrogate(char32_t cp)112     static bool IsHighSurrogate(char32_t cp)
113     {
114         return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
115     }
116 
IsLowSurrogate(char32_t cp)117     static bool IsLowSurrogate(char32_t cp)
118     {
119         return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
120     }
121 
122     std::string Mutf8() const noexcept;
123     static char32_t DecodeSurrogates(char32_t high, char32_t low);
124     static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
125 
126     template <void ENCODER(std::string *, char32_t)>
127     std::string EscapeSymbol() const;
128 
129     template <typename T>
130     static void Utf8Encode(T *str, char32_t cu);
131     template <typename T>
132     static void Mutf8Encode(T *str, char32_t cu);
133 
134     bool IsConvertibleToChar() const;
135 
136     class Iterator {
137     public:
138         static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
139 
Iterator(const StringView & sv)140         explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
141         DEFAULT_COPY_SEMANTIC(Iterator);
142         DEFAULT_MOVE_SEMANTIC(Iterator);
143         ~Iterator() = default;
144 
Index()145         inline size_t Index() const
146         {
147             return static_cast<size_t>(iter_ - sv_.begin());
148         }
149 
Next()150         inline char32_t Next()
151         {
152             return DecodeCP<true>(nullptr);
153         }
154 
Peek()155         inline char32_t Peek() const
156         {
157             return HasNext() ? *iter_ : INVALID_CP;
158         }
159 
PeekCp()160         inline char32_t PeekCp() const
161         {
162             return DecodeCP<false>(nullptr);
163         }
164 
PeekCp(size_t * cpSize)165         inline char32_t PeekCp(size_t *cpSize) const
166         {
167             return DecodeCP<false, true>(cpSize);
168         }
169 
Forward(size_t offset)170         inline void Forward(size_t offset)
171         {
172             iter_ += offset;
173         }
174 
Backward(size_t offset)175         inline void Backward(size_t offset)
176         {
177             iter_ -= offset;
178         }
179 
Reset(size_t offset)180         inline void Reset(size_t offset)
181         {
182             iter_ = sv_.begin() + offset;
183         }
184 
Rewind(std::string_view::const_iterator pos)185         inline void Rewind(std::string_view::const_iterator pos)
186         {
187             iter_ = pos;
188         }
189 
Save()190         inline std::string_view::const_iterator Save() const
191         {
192             return iter_;
193         }
194 
HasNext()195         inline bool HasNext() const
196         {
197             return iter_ != sv_.end();
198         }
199 
200         void SkipCp();
201 
202     private:
203         template <bool MOVE_ITER, bool SET_CP_SIZE = false>
204         char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
205 
206         std::string_view sv_;
207         mutable std::string_view::const_iterator iter_;
208     };
209 
210     class Constants {
211     public:
212         static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
213         static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
214         static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
215 
216         static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
217         static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
218         static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
219 
220         static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
221         static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
222 
223         static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
224         static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
225         static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
226 
227         static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
228         static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
229         static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
230 
231         static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
232         static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
233 
234         static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
235         static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
236         static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
237         static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
238         static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
239         static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
240     };
241 
242 private:
243     friend class Iterator;
244     std::string_view sv_;
245 };
246 
247 class UString {
248 public:
249     UString() = default;
UString(ArenaAllocator * allocator)250     explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)251     explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
252     {
253         Alloc();
254         *str_ = str;
255     }
256 
UString(const std::string_view & str,ArenaAllocator * allocator)257     explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator)
258     {
259         Alloc();
260         *str_ = str;
261     }
262 
UString(const util::StringView & str,ArenaAllocator * allocator)263     explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {}
264 
265     DEFAULT_COPY_SEMANTIC(UString);
266     DEFAULT_MOVE_SEMANTIC(UString);
267     ~UString() = default;
268 
View()269     util::StringView View() const
270     {
271         if (str_ == nullptr) {
272             return util::StringView();
273         }
274 
275         return util::StringView(str_);
276     }
277 
View()278     util::StringView View()
279     {
280         if (str_ == nullptr) {
281             return util::StringView();
282         }
283 
284         return util::StringView(str_);
285     }
286 
Append(char32_t ch)287     void Append(char32_t ch) noexcept
288     {
289         if (str_ == nullptr) {
290             Alloc();
291         }
292 
293         StringView::Utf8Encode<ArenaString>(str_, ch);
294     }
295 
Append(const StringView & other)296     void Append(const StringView &other) noexcept
297     {
298         if (str_ == nullptr) {
299             Alloc();
300         }
301 
302         *str_ += other.Utf8();
303     }
304 
Append(const char * other)305     void Append(const char *other) noexcept
306     {
307         if (str_ == nullptr) {
308             Alloc();
309         }
310         *str_ += other;
311     }
312 
Append(const std::string & other)313     void Append(const std::string &other) noexcept
314     {
315         if (str_ == nullptr) {
316             Alloc();
317         }
318         *str_ += other;
319     }
320 
321 private:
Alloc()322     void Alloc()
323     {
324         str_ = allocator_->New<ArenaString>(allocator_->Adapter());
325     }
326 
327 protected:
328     // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
329     ArenaString *str_ {};
330     ArenaAllocator *allocator_ {};
331     // NOLINTEND(misc-non-private-member-variables-in-classes)
332 };
333 
334 template <bool MOVE_ITER, bool SET_CP_SIZE>
DecodeCP(size_t * cpSize)335 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
336 {
337     if (!HasNext()) {
338         return INVALID_CP;
339     }
340 
341     const auto *iterNext = iter_;
342 
343     char32_t cu0 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344     char32_t res {};
345 
346     if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
347         res = cu0;
348     } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
349         char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
350         res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
351     } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
352         char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
353         char32_t cu2 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
354         res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
355               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
356     } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
357                (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
358         char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359         char32_t cu2 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360         char32_t cu3 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
361         res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
362               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
363               ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
364     } else {
365         res = INVALID_CP;
366     }
367 
368     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
369     if constexpr (MOVE_ITER) {
370         iter_ = iterNext;
371         return res;
372     }
373 
374     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
375     if constexpr (SET_CP_SIZE) {
376         *cpSize = iterNext - iter_;
377     }
378 
379     return res;
380 }
381 
382 template <void ENCODER(std::string *, char32_t)>
EscapeSymbol()383 std::string StringView::EscapeSymbol() const
384 {
385     std::string str;
386     str.reserve(Length());
387 
388     auto skipNewLine = [](auto &iter) {
389         if (iter.HasNext()) {
390             iter.Forward(1);
391 
392             if (iter.Peek() != '\n') {
393                 iter.Backward(1);
394             }
395         }
396     };
397 
398     Iterator iter(*this);
399     while (iter.HasNext()) {
400         auto cp = iter.Next();
401 
402         switch (cp) {
403             case '\r': {
404                 skipNewLine(iter);
405                 [[fallthrough]];
406             }
407             case '\n': {
408                 str += "\\n";
409                 break;
410             }
411             case '\b': {
412                 str += "\\b";
413                 break;
414             }
415             case '\t': {
416                 str += "\\t";
417                 break;
418             }
419             case '\f': {
420                 str += "\\f";
421                 break;
422             }
423             case '"': {
424                 str += "\\\"";
425                 break;
426             }
427             case '\\': {
428                 str += "\\\\";
429                 break;
430             }
431             default: {
432                 ENCODER(&str, cp);
433             }
434         }
435     }
436 
437     return str;
438 }
439 
440 template <typename T>
Utf8Encode(T * str,char32_t cu)441 void StringView::Utf8Encode(T *str, char32_t cu)
442 {
443     if (cu < Constants::UTF8_1BYTE_LIMIT) {
444         str->push_back(static_cast<char>(cu));
445     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
446         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
447                                          Constants::UTF8_2BYTE_HEADER));
448         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
449     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
450         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
451                                          Constants::UTF8_3BYTE_HEADER));
452         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
453                                          Constants::UTF8_CONT_HEADER));
454         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
455     } else {
456         str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
457                                          Constants::UTF8_4BYTE_HEADER));
458         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
459                                          Constants::UTF8_CONT_HEADER));
460         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
461                                          Constants::UTF8_CONT_HEADER));
462         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
463     }
464 }
465 
466 template <typename T>
Mutf8Encode(T * str,char32_t cu)467 void StringView::Mutf8Encode(T *str, char32_t cu)
468 {
469     if (cu == 0) {
470         str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
471         str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
472     } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
473         str->push_back(static_cast<char>(cu));
474     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
475         str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
476         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
477     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
478         str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
479         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
480                                          Constants::UTF8_CONT_HEADER));
481         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
482     } else {
483         auto [cu1, cu2] = EncodeSurrogate(cu);
484         Mutf8Encode(str, cu1);
485         Mutf8Encode(str, cu2);
486     }
487 }
488 }  // namespace ark::es2panda::util
489 
490 // NOLINTNEXTLINE(cert-dcl58-cpp)
491 namespace std {
492 
493 template <>
494 // NOLINTNEXTLINE(altera-struct-pack-align)
495 struct hash<ark::es2panda::util::StringView> {
496     std::size_t operator()(const ark::es2panda::util::StringView &str) const
497     {
498         return std::hash<std::string_view> {}(str.Utf8());
499     }
500 };
501 
502 ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us);
503 
504 }  // namespace std
505 
506 #endif
507