/**
 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18 
19 #include "macros.h"
20 #include "utils/arena_containers.h"
21 
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27 
28 namespace ark::es2panda::util {
29 class StringView {
30 public:
31     explicit StringView() noexcept = default;
StringView(const ArenaString * str)32     explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
33     // CC-OFFNXT(G.FMT.06-CPP,G.CLS.03-CPP) same as clang, project code style
34     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35     StringView(std::string_view sv) noexcept : sv_(sv) {}
36     // CC-OFFNXT(G.FMT.06-CPP, G.CLS.03-CPP) same as clang, project code style
37     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)38     StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {}
39     DEFAULT_COPY_SEMANTIC(StringView);
40     DEFAULT_MOVE_SEMANTIC(StringView);
41     ~StringView() = default;
42 
43     bool operator==(const StringView &rhs) const noexcept
44     {
45         return sv_ == rhs.sv_;
46     }
47 
48     bool operator!=(const StringView &rhs) const noexcept
49     {
50         return sv_ != rhs.sv_;
51     }
52 
53     bool operator<(const StringView &rhs) const noexcept
54     {
55         return sv_ < rhs.sv_;
56     }
57 
58     bool operator>(const StringView &rhs) const noexcept
59     {
60         return sv_ > rhs.sv_;
61     }
62 
Compare(const StringView & other)63     int Compare(const StringView &other) const noexcept
64     {
65         return sv_.compare(other.sv_);
66     }
67 
Compare(const std::string_view & other)68     int Compare(const std::string_view &other) const noexcept
69     {
70         return sv_.compare(other);
71     }
72 
Is(const char * str)73     bool Is(const char *str) const noexcept
74     {
75         return sv_ == str;
76     }
77 
Is(const std::string_view str)78     bool Is(const std::string_view str) const noexcept
79     {
80         return sv_ == str;
81     }
82 
StartsWith(const std::string_view str)83     bool StartsWith(const std::string_view str) const noexcept
84     {
85         auto const length = str.size();
86         return sv_.size() >= length && sv_.substr(0U, length) == str;
87     }
88 
EndsWith(const std::string_view str)89     bool EndsWith(const std::string_view str) const noexcept
90     {
91         auto const myLength = sv_.size();
92         auto const strLength = str.size();
93         return myLength >= strLength && sv_.substr(myLength - strLength, strLength) == str;
94     }
95 
Length()96     size_t Length() const noexcept
97     {
98         return sv_.length();
99     }
100 
Empty()101     bool Empty() const noexcept
102     {
103         return sv_.empty();
104     }
105 
Utf8()106     const std::string_view &Utf8() const noexcept
107     {
108         return sv_;
109     }
110 
string()111     explicit operator std::string() const noexcept
112     {
113         return std::string {sv_};
114     }
115 
Bytes()116     const char *Bytes() const noexcept
117     {
118         return sv_.data();
119     }
120 
Substr(size_t begin,size_t end)121     StringView Substr(size_t begin, size_t end) const noexcept
122     {
123         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
124         return StringView(std::string_view(sv_.data() + begin, end - begin));
125     }
126 
IsHighSurrogate(char32_t cp)127     static bool IsHighSurrogate(char32_t cp)
128     {
129         return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
130     }
131 
IsLowSurrogate(char32_t cp)132     static bool IsLowSurrogate(char32_t cp)
133     {
134         return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
135     }
136 
137     std::string Mutf8() const noexcept;
138     static char32_t DecodeSurrogates(char32_t high, char32_t low);
139     static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
140 
141     template <void ENCODER(std::string *, char32_t)>
142     std::string EscapeSymbol() const;
143 
144     template <typename T>
145     static void Utf8Encode(T *str, char32_t cu);
146     template <typename T>
147     static void Mutf8Encode(T *str, char32_t cu);
148 
149     bool IsConvertibleToChar() const;
150 
151     class Iterator {
152     public:
153         static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
154 
Iterator(const StringView & sv)155         explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
156         DEFAULT_COPY_SEMANTIC(Iterator);
157         DEFAULT_MOVE_SEMANTIC(Iterator);
158         ~Iterator() = default;
159 
Index()160         inline size_t Index() const
161         {
162             return static_cast<size_t>(iter_ - sv_.begin());
163         }
164 
Next()165         inline char32_t Next()
166         {
167             return DecodeCP<true>(nullptr);
168         }
169 
Peek()170         inline char32_t Peek() const
171         {
172             return HasNext() ? *iter_ : INVALID_CP;
173         }
174 
PeekCp()175         inline char32_t PeekCp() const
176         {
177             return DecodeCP<false>(nullptr);
178         }
179 
PeekCp(size_t * cpSize)180         inline char32_t PeekCp(size_t *cpSize) const
181         {
182             return DecodeCP<false, true>(cpSize);
183         }
184 
Forward(size_t offset)185         inline void Forward(size_t offset)
186         {
187             iter_ += offset;
188         }
189 
Backward(size_t offset)190         inline void Backward(size_t offset)
191         {
192             iter_ -= offset;
193         }
194 
Reset(size_t offset)195         inline void Reset(size_t offset)
196         {
197             iter_ = sv_.begin() + offset;
198         }
199 
Rewind(std::string_view::const_iterator pos)200         inline void Rewind(std::string_view::const_iterator pos)
201         {
202             iter_ = pos;
203         }
204 
Save()205         inline std::string_view::const_iterator Save() const
206         {
207             return iter_;
208         }
209 
HasNext()210         inline bool HasNext() const
211         {
212             return iter_ < sv_.end();
213         }
214 
215         void SkipCp();
216 
217     private:
218         template <bool MOVE_ITER, bool SET_CP_SIZE = false>
219         char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220 
221         std::string_view sv_;
222         mutable std::string_view::const_iterator iter_;
223     };
224 
225     class Constants {
226     public:
227         static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
228         static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
229         static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
230 
231         static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
232         static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
233         static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
234 
235         static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
236         static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
237 
238         static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
239         static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
240         static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
241 
242         static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
243         static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
244         static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
245 
246         static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
247         static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
248 
249         static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
250         static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
251         static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
252         static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
253         static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
254         static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
255     };
256 
257 private:
258     friend class Iterator;
259     std::string_view sv_;
260 };
261 
262 class UString {
263 public:
264     UString() = default;
UString(ArenaAllocator * allocator)265     explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)266     explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
267     {
268         Alloc();
269         *str_ = str;
270     }
271 
UString(const std::string_view & str,ArenaAllocator * allocator)272     explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator)
273     {
274         Alloc();
275         *str_ = str;
276     }
277 
UString(const util::StringView & str,ArenaAllocator * allocator)278     explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {}
279 
280     DEFAULT_COPY_SEMANTIC(UString);
281     DEFAULT_MOVE_SEMANTIC(UString);
282     ~UString() = default;
283 
View()284     util::StringView View() const
285     {
286         if (str_ == nullptr) {
287             return util::StringView();
288         }
289 
290         return util::StringView(str_);
291     }
292 
View()293     util::StringView View()
294     {
295         if (str_ == nullptr) {
296             return util::StringView();
297         }
298 
299         return util::StringView(str_);
300     }
301 
Append(char32_t ch)302     util::UString &Append(char32_t ch) noexcept
303     {
304         if (str_ == nullptr) {
305             Alloc();
306         }
307 
308         StringView::Utf8Encode<ArenaString>(str_, ch);
309         return *this;
310     }
311 
Append(const StringView & other)312     util::UString &Append(const StringView &other) noexcept
313     {
314         if (str_ == nullptr) {
315             Alloc();
316         }
317 
318         *str_ += other.Utf8();
319         return *this;
320     }
321 
Append(const char * other)322     util::UString &Append(const char *other) noexcept
323     {
324         if (str_ == nullptr) {
325             Alloc();
326         }
327 
328         *str_ += other;
329         return *this;
330     }
331 
Append(const std::string & other)332     util::UString &Append(const std::string &other) noexcept
333     {
334         if (str_ == nullptr) {
335             Alloc();
336         }
337 
338         *str_ += other;
339         return *this;
340     }
341 
342 private:
Alloc()343     void Alloc()
344     {
345         str_ = allocator_->New<ArenaString>(allocator_->Adapter());
346     }
347 
348 protected:
349     // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
350     ArenaString *str_ {};
351     ArenaAllocator *allocator_ {};
352     // NOLINTEND(misc-non-private-member-variables-in-classes)
353 };
354 
355 template <bool MOVE_ITER, bool SET_CP_SIZE>
DecodeCP(size_t * cpSize)356 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
357 {
358     if (!HasNext()) {
359         return INVALID_CP;
360     }
361 
362     const auto *iterNext = iter_;
363 
364     char32_t cu0 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
365     char32_t res {};
366 
367     if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
368         res = cu0;
369     } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
370         char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
371         res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
372     } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
373         char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
374         char32_t cu2 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
375         res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
376               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
377     } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
378                (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
379         char32_t cu1 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
380         char32_t cu2 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
381         char32_t cu3 = static_cast<uint8_t>(*iterNext++);  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
382         res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
383               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
384               ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
385     } else {
386         res = INVALID_CP;
387     }
388 
389     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
390     if constexpr (MOVE_ITER) {
391         iter_ = iterNext;
392         return res;
393     }
394 
395     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
396     if constexpr (SET_CP_SIZE) {
397         *cpSize = iterNext - iter_;
398     }
399 
400     return res;
401 }
402 
403 template <void ENCODER(std::string *, char32_t)>
EscapeSymbol()404 std::string StringView::EscapeSymbol() const
405 {
406     std::string str;
407     str.reserve(Length());
408 
409     auto skipNewLine = [](auto &iter) {
410         if (iter.HasNext()) {
411             iter.Forward(1);
412 
413             if (iter.Peek() != '\n') {
414                 iter.Backward(1);
415             }
416         }
417     };
418 
419     Iterator iter(*this);
420     while (iter.HasNext()) {
421         auto cp = iter.Next();
422 
423         switch (cp) {
424             case '\r': {
425                 skipNewLine(iter);
426                 [[fallthrough]];
427             }
428             case '\n': {
429                 str += "\\n";
430                 break;
431             }
432             case '\b': {
433                 str += "\\b";
434                 break;
435             }
436             case '\t': {
437                 str += "\\t";
438                 break;
439             }
440             case '\f': {
441                 str += "\\f";
442                 break;
443             }
444             case '"': {
445                 str += "\\\"";
446                 break;
447             }
448             case '\\': {
449                 str += "\\\\";
450                 break;
451             }
452             default: {
453                 ENCODER(&str, cp);
454             }
455         }
456     }
457 
458     return str;
459 }
460 
461 template <typename T>
Utf8Encode(T * str,char32_t cu)462 void StringView::Utf8Encode(T *str, char32_t cu)
463 {
464     if (cu < Constants::UTF8_1BYTE_LIMIT) {
465         str->push_back(static_cast<char>(cu));
466     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
467         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
468                                          Constants::UTF8_2BYTE_HEADER));
469         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
470     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
471         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
472                                          Constants::UTF8_3BYTE_HEADER));
473         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
474                                          Constants::UTF8_CONT_HEADER));
475         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
476     } else {
477         str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
478                                          Constants::UTF8_4BYTE_HEADER));
479         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
480                                          Constants::UTF8_CONT_HEADER));
481         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
482                                          Constants::UTF8_CONT_HEADER));
483         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
484     }
485 }
486 
487 template <typename T>
Mutf8Encode(T * str,char32_t cu)488 void StringView::Mutf8Encode(T *str, char32_t cu)
489 {
490     if (cu == 0) {
491         str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
492         str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
493     } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
494         str->push_back(static_cast<char>(cu));
495     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
496         str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
497         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
498     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
499         str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
500         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
501                                          Constants::UTF8_CONT_HEADER));
502         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
503     } else {
504         auto [cu1, cu2] = EncodeSurrogate(cu);
505         Mutf8Encode(str, cu1);
506         Mutf8Encode(str, cu2);
507     }
508 }
509 }  // namespace ark::es2panda::util
510 
511 // NOLINTNEXTLINE(cert-dcl58-cpp)
512 namespace std {
513 
514 template <>
515 // NOLINTNEXTLINE(altera-struct-pack-align)
516 struct hash<ark::es2panda::util::StringView> {
517     std::size_t operator()(const ark::es2panda::util::StringView &str) const
518     {
519         return std::hash<std::string_view> {}(str.Utf8());
520     }
521 };
522 
// Stream-insertion overload for StringView. Only the declaration appears here; the definition
// is not visible in this part of the file.
ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us);
524 
525 }  // namespace std
526 
527 #endif
528