• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18 
#include <macros.h>
#include <utils/arena_containers.h>

#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
27 
28 namespace panda::es2panda::util {
29 
30 class StringView {
31 public:
32     explicit StringView() noexcept = default;
StringView(const ArenaString * str)33     explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35     StringView(std::string_view sv) noexcept : sv_(sv) {}
36     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)37     StringView(const char *str) noexcept : sv_(str) {}
38     DEFAULT_COPY_SEMANTIC(StringView);
39     DEFAULT_MOVE_SEMANTIC(StringView);
40     ~StringView() = default;
41 
42     bool operator==(const StringView &rhs) const noexcept
43     {
44         return sv_ == rhs.sv_;
45     }
46 
47     bool operator!=(const StringView &rhs) const noexcept
48     {
49         return sv_ != rhs.sv_;
50     }
51 
52     bool operator<(const StringView &rhs) const noexcept
53     {
54         return sv_ < rhs.sv_;
55     }
56 
57     bool operator>(const StringView &rhs) const noexcept
58     {
59         return sv_ > rhs.sv_;
60     }
61 
Compare(const StringView & other)62     int Compare(const StringView &other) const noexcept
63     {
64         return sv_.compare(other.sv_);
65     }
66 
Compare(const std::string_view & other)67     int Compare(const std::string_view &other) const noexcept
68     {
69         return sv_.compare(other);
70     }
71 
Is(const char * str)72     bool Is(const char *str) const noexcept
73     {
74         return sv_ == str;
75     }
76 
Is(const std::string_view & str)77     bool Is(const std::string_view &str) const noexcept
78     {
79         return sv_ == str;
80     }
81 
Length()82     size_t Length() const noexcept
83     {
84         return sv_.length();
85     }
86 
Empty()87     bool Empty() const noexcept
88     {
89         return sv_.empty();
90     }
91 
Utf8()92     const std::string_view &Utf8() const noexcept
93     {
94         return sv_;
95     }
96 
string()97     explicit operator std::string() const noexcept
98     {
99         return std::string {sv_};
100     }
101 
Bytes()102     const char *Bytes() const noexcept
103     {
104         return sv_.data();
105     }
106 
Substr(size_t begin,size_t end)107     StringView Substr(size_t begin, size_t end) const noexcept
108     {
109         return StringView(std::string_view(sv_.data() + begin, end - begin));
110     }
111 
Find(const char * str)112     constexpr size_t Find(const char *str) const
113     {
114         return sv_.find(str);
115     }
116 
IsHighSurrogate(char32_t cp)117     static bool IsHighSurrogate(char32_t cp)
118     {
119         return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120     }
121 
IsLowSurrogate(char32_t cp)122     static bool IsLowSurrogate(char32_t cp)
123     {
124         return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125     }
126 
127     std::string Mutf8() const noexcept;
128     static char32_t DecodeSurrogates(char32_t high, char32_t low);
129     static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130 
131     template <void encoder(std::string *, char32_t)>
132     std::string EscapeSymbol() const;
133 
134     template <typename T>
135     static void Utf8Encode(T *str, char32_t cu);
136     template <typename T>
137     static void Mutf8Encode(T *str, char32_t cu);
138 
139     class Iterator {
140     public:
141         static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142 
Iterator(const StringView & sv)143         explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144         DEFAULT_COPY_SEMANTIC(Iterator);
145         DEFAULT_MOVE_SEMANTIC(Iterator);
146         ~Iterator() = default;
147 
Index()148         inline size_t Index() const
149         {
150             return static_cast<size_t>(iter_ - sv_.begin());
151         }
152 
Next()153         inline char32_t Next()
154         {
155             return DecodeCP<true>(nullptr);
156         }
157 
Peek()158         inline char32_t Peek() const
159         {
160             return HasNext() ? *iter_ : INVALID_CP;
161         }
162 
PeekCp()163         inline char32_t PeekCp() const
164         {
165             return DecodeCP<false>(nullptr);
166         }
167 
PeekCp(size_t * cpSize)168         inline char32_t PeekCp(size_t *cpSize) const
169         {
170             return DecodeCP<false, true>(cpSize);
171         }
172 
Forward(size_t offset)173         inline void Forward(size_t offset) const
174         {
175             iter_ += offset;
176         }
177 
Backward(size_t offset)178         inline void Backward(size_t offset) const
179         {
180             iter_ -= offset;
181         }
182 
Reset(size_t offset)183         inline void Reset(size_t offset)
184         {
185             iter_ = sv_.begin() + offset;
186         }
187 
Rewind(std::string_view::const_iterator pos)188         inline void Rewind(std::string_view::const_iterator pos) const
189         {
190             iter_ = pos;
191         }
192 
Save()193         inline std::string_view::const_iterator Save() const
194         {
195             return iter_;
196         }
197 
HasNext()198         inline bool HasNext() const
199         {
200             return iter_ != sv_.end();
201         }
202 
203         void SkipCp() const;
204 
205     private:
206         template <bool moveIter, bool setCpSize = false>
207         char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
208 
209         std::string_view sv_;
210         mutable std::string_view::const_iterator iter_;
211     };
212 
213 private:
214     class Constants {
215     public:
216         static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
217         static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
218         static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
219 
220         static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
221         static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
222         static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
223 
224         static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
225         static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
226 
227         static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
228         static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
229         static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
230 
231         static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
232         static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
233         static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
234 
235         static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
236         static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
237 
238         static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
239         static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
240         static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
241         static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
242         static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
243         static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
244     };
245 
246     friend class Iterator;
247     std::string_view sv_;
248 };
249 
250 class UString {
251 public:
252     UString() = default;
UString(ArenaAllocator * allocator)253     explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)254     explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
255     {
256         Alloc();
257         *str_ = str;
258     }
259 
260     DEFAULT_COPY_SEMANTIC(UString);
261     DEFAULT_MOVE_SEMANTIC(UString);
262     ~UString() = default;
263 
View()264     util::StringView View() const
265     {
266         if (!str_) {
267             return util::StringView();
268         }
269 
270         return util::StringView(str_);
271     }
272 
Append(char32_t ch)273     void Append(char32_t ch) noexcept
274     {
275         if (!str_) {
276             Alloc();
277         }
278 
279         StringView::Utf8Encode<ArenaString>(str_, ch);
280     }
281 
Append(const StringView & other)282     void Append(const StringView &other) noexcept
283     {
284         if (!str_) {
285             Alloc();
286         }
287 
288         *str_ += other.Utf8();
289     }
290 
Append(const char * other)291     void Append(const char *other) noexcept
292     {
293         if (!str_) {
294             Alloc();
295         }
296         *str_ += other;
297     }
298 
299 private:
Alloc()300     void Alloc()
301     {
302         str_ = allocator_->New<ArenaString>(allocator_->Adapter());
303     }
304 
305 protected:
306     ArenaString *str_ {};
307     ArenaAllocator *allocator_ {};
308 };
309 
310 template <bool moveIter, bool setCpSize>
DecodeCP(size_t * cpSize)311 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
312 {
313     if (!HasNext()) {
314         return INVALID_CP;
315     }
316 
317     const auto *iterNext = iter_;
318 
319     char32_t cu0 = static_cast<uint8_t>(*iterNext++);
320     char32_t res {};
321 
322     if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
323         res = cu0;
324     } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
325         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
326         res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
327     } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
328         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
329         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
330         res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
331               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
332     } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
333                (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
334         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
335         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
336         char32_t cu3 = static_cast<uint8_t>(*iterNext++);
337         res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
338               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
339               ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
340     } else {
341         res = INVALID_CP;
342     }
343 
344     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
345     if constexpr (moveIter) {
346         iter_ = iterNext;
347         return res;
348     }
349 
350     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
351     if constexpr (setCpSize) {
352         *cpSize = iterNext - iter_;
353     }
354 
355     return res;
356 }
357 
358 template <void encoder(std::string *, char32_t)>
EscapeSymbol()359 std::string StringView::EscapeSymbol() const
360 {
361     std::string str;
362     str.reserve(Length());
363 
364     Iterator iter(*this);
365 
366     while (iter.HasNext()) {
367         auto cp = iter.Next();
368 
369         switch (cp) {
370             case '\r': {
371                 if (iter.HasNext()) {
372                     iter.Forward(1);
373 
374                     if (iter.Peek() != '\n') {
375                         iter.Backward(1);
376                     }
377                 }
378 
379                 [[fallthrough]];
380             }
381             case '\n': {
382                 str += "\\n";
383                 break;
384             }
385             case '\b': {
386                 str += "\\b";
387                 break;
388             }
389             case '\t': {
390                 str += "\\t";
391                 break;
392             }
393             case '\f': {
394                 str += "\\f";
395                 break;
396             }
397             case '"': {
398                 str += "\\\"";
399                 break;
400             }
401             case '\\': {
402                 str += "\\\\";
403                 break;
404             }
405             default: {
406                 encoder(&str, cp);
407             }
408         }
409     }
410 
411     return str;
412 }
413 
414 template <typename T>
Utf8Encode(T * str,char32_t cu)415 void StringView::Utf8Encode(T *str, char32_t cu)
416 {
417     if (cu < Constants::UTF8_1BYTE_LIMIT) {
418         str->push_back(static_cast<char>(cu));
419     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
420         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
421                                          Constants::UTF8_2BYTE_HEADER));
422         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
423     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
424         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
425                                          Constants::UTF8_3BYTE_HEADER));
426         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
427                                          Constants::UTF8_CONT_HEADER));
428         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
429     } else {
430         str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
431                                          Constants::UTF8_4BYTE_HEADER));
432         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
433                                          Constants::UTF8_CONT_HEADER));
434         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
435                                          Constants::UTF8_CONT_HEADER));
436         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
437     }
438 }
439 
440 template <typename T>
Mutf8Encode(T * str,char32_t cu)441 void StringView::Mutf8Encode(T *str, char32_t cu)
442 {
443     if (cu == 0) {
444         str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
445         str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
446     } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
447         str->push_back(static_cast<char>(cu));
448     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
449         str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
450         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
451     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
452         str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
453         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
454                                          Constants::UTF8_CONT_HEADER));
455         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
456     } else {
457         auto [cu1, cu2] = EncodeSurrogate(cu);
458         Mutf8Encode(str, cu1);
459         Mutf8Encode(str, cu2);
460     }
461 }
462 
463 }  // namespace panda::es2panda::util
464 
465 // NOLINTNEXTLINE(cert-dcl58-cpp)
466 namespace std {
467 
468 template <>
469 // NOLINTNEXTLINE(altera-struct-pack-align)
470 struct hash<panda::es2panda::util::StringView> {
471     std::size_t operator()(const panda::es2panda::util::StringView &str) const
472     {
473         return std::hash<std::string_view> {}(str.Utf8());
474     }
475 };
476 
477 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
478 
479 }  // namespace std
480 
// Debug-only output stream: `DCOUT << value;` prints through std::cout in debug builds. When NDEBUG is
// defined the statement becomes `false && std::cout << value;`, which still compiles but short-circuits,
// so nothing is evaluated or printed. Users of the macro have to include <iostream> themselves.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
486 
487 #endif
488