• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18 
19 #include <macros.h>
20 #include <utils/arena_containers.h>
21 
22 #include <cstddef>
23 #include <limits>
24 #include <memory>
25 #include <string>
26 #include <string_view>
27 
28 namespace panda::es2panda::util {
29 
30 class StringView {
31 public:
32     explicit StringView() noexcept = default;
StringView(const ArenaString * str)33     explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35     StringView(std::string_view sv) noexcept : sv_(sv) {}
36     // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)37     StringView(const char *str) noexcept : sv_(str) {}
38     DEFAULT_COPY_SEMANTIC(StringView);
39     DEFAULT_MOVE_SEMANTIC(StringView);
40     ~StringView() = default;
41 
42     bool operator==(const StringView &rhs) const noexcept
43     {
44         return sv_ == rhs.sv_;
45     }
46 
47     bool operator!=(const StringView &rhs) const noexcept
48     {
49         return sv_ != rhs.sv_;
50     }
51 
52     bool operator<(const StringView &rhs) const noexcept
53     {
54         return sv_ < rhs.sv_;
55     }
56 
57     bool operator>(const StringView &rhs) const noexcept
58     {
59         return sv_ > rhs.sv_;
60     }
61 
Compare(const StringView & other)62     int Compare(const StringView &other) const noexcept
63     {
64         return sv_.compare(other.sv_);
65     }
66 
Compare(const std::string_view & other)67     int Compare(const std::string_view &other) const noexcept
68     {
69         return sv_.compare(other);
70     }
71 
Is(const char * str)72     bool Is(const char *str) const noexcept
73     {
74         return sv_ == str;
75     }
76 
Is(const std::string_view & str)77     bool Is(const std::string_view &str) const noexcept
78     {
79         return sv_ == str;
80     }
81 
Length()82     size_t Length() const noexcept
83     {
84         return sv_.length();
85     }
86 
Empty()87     bool Empty() const noexcept
88     {
89         return sv_.empty();
90     }
91 
Utf8()92     const std::string_view &Utf8() const noexcept
93     {
94         return sv_;
95     }
96 
string()97     explicit operator std::string() const noexcept
98     {
99         return std::string {sv_};
100     }
101 
Bytes()102     const char *Bytes() const noexcept
103     {
104         return sv_.data();
105     }
106 
Substr(size_t begin,size_t end)107     StringView Substr(size_t begin, size_t end) const noexcept
108     {
109         return StringView(std::string_view(sv_.data() + begin, end - begin));
110     }
111 
Find(const char * str)112     constexpr size_t Find(const char *str) const
113     {
114         return sv_.find(str);
115     }
116 
IsHighSurrogate(char32_t cp)117     static bool IsHighSurrogate(char32_t cp)
118     {
119         return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120     }
121 
IsLowSurrogate(char32_t cp)122     static bool IsLowSurrogate(char32_t cp)
123     {
124         return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125     }
126 
127     std::string Mutf8() const noexcept;
128     static char32_t DecodeSurrogates(char32_t high, char32_t low);
129     static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130 
131     template <void encoder(std::string *, char32_t)>
132     std::string EscapeSymbol() const;
133 
134     template <typename T>
135     static void Utf8Encode(T *str, char32_t cu);
136     template <typename T>
137     static void Mutf8Encode(T *str, char32_t cu);
138 
139     class Iterator {
140     public:
141         static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142 
Iterator(const StringView & sv)143         explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144         DEFAULT_COPY_SEMANTIC(Iterator);
145         DEFAULT_MOVE_SEMANTIC(Iterator);
146         ~Iterator() = default;
147 
Index()148         inline size_t Index() const
149         {
150             return static_cast<size_t>(iter_ - sv_.begin());
151         }
152 
Next()153         inline char32_t Next()
154         {
155             return DecodeCP<true>(nullptr);
156         }
157 
Peek()158         inline char32_t Peek() const
159         {
160             return HasNext() ? *iter_ : INVALID_CP;
161         }
162 
PeekCp()163         inline char32_t PeekCp() const
164         {
165             return DecodeCP<false>(nullptr);
166         }
167 
PeekCp(size_t * cpSize)168         inline char32_t PeekCp(size_t *cpSize) const
169         {
170             return DecodeCP<false, true>(cpSize);
171         }
172 
Forward(size_t offset)173         inline void Forward(size_t offset) const
174         {
175             iter_ += offset;
176         }
177 
Backward(size_t offset)178         inline void Backward(size_t offset) const
179         {
180             iter_ -= offset;
181         }
182 
Reset(size_t offset)183         inline void Reset(size_t offset)
184         {
185             iter_ = sv_.begin() + offset;
186         }
187 
Rewind(std::string_view::const_iterator pos)188         inline void Rewind(std::string_view::const_iterator pos) const
189         {
190             iter_ = pos;
191         }
192 
Save()193         inline std::string_view::const_iterator Save() const
194         {
195             return iter_;
196         }
197 
HasNext()198         inline bool HasNext() const
199         {
200             return iter_ != sv_.end();
201         }
202 
HasExpectedNumberOfBytes(size_t count)203         bool HasExpectedNumberOfBytes(size_t count) const
204         {
205             for (size_t i = 0; i < count; ++i) {
206                 if (!HasNext()) {
207                     return false;
208                 }
209                 iter_++;
210             }
211             iter_ -= count;
212             return true;
213         }
214 
215         void SkipCp() const;
216 
217     private:
218         template <bool moveIter, bool setCpSize = false>
219         char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220 
221         std::string_view sv_;
222         mutable std::string_view::const_iterator iter_;
223     };
224 
225 private:
226     class Constants {
227     public:
228         static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
229         static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
230         static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
231 
232         static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
233         static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
234         static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
235 
236         static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
237         static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
238 
239         static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
240         static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
241         static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
242 
243         static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
244         static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
245         static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
246 
247         static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
248         static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
249 
250         static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
251         static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
252         static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
253 
254         static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
255         static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
256         static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
257         static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
258         static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
259         static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
260     };
261 
262     friend class Iterator;
263     std::string_view sv_;
264 };
265 
266 class UString {
267 public:
268     UString() = default;
UString(ArenaAllocator * allocator)269     explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)270     explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
271     {
272         Alloc();
273         *str_ = str;
274     }
275 
276     DEFAULT_COPY_SEMANTIC(UString);
277     DEFAULT_MOVE_SEMANTIC(UString);
278     ~UString() = default;
279 
View()280     util::StringView View() const
281     {
282         if (!str_) {
283             return util::StringView();
284         }
285 
286         return util::StringView(str_);
287     }
288 
Append(char32_t ch)289     void Append(char32_t ch) noexcept
290     {
291         if (!str_) {
292             Alloc();
293         }
294 
295         StringView::Utf8Encode<ArenaString>(str_, ch);
296     }
297 
Append(const StringView & other)298     void Append(const StringView &other) noexcept
299     {
300         if (!str_) {
301             Alloc();
302         }
303 
304         *str_ += other.Utf8();
305     }
306 
Append(const char * other)307     void Append(const char *other) noexcept
308     {
309         if (!str_) {
310             Alloc();
311         }
312         *str_ += other;
313     }
314 
315 private:
Alloc()316     void Alloc()
317     {
318         str_ = allocator_->New<ArenaString>(allocator_->Adapter());
319     }
320 
321 protected:
322     ArenaString *str_ {};
323     ArenaAllocator *allocator_ {};
324 };
325 
326 template <bool moveIter, bool setCpSize>
DecodeCP(size_t * cpSize)327 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
328 {
329     if (!HasNext()) {
330         return INVALID_CP;
331     }
332 
333     const auto *iterNext = iter_;
334 
335     char32_t cu0 = static_cast<uint8_t>(*iterNext++);
336     char32_t res {};
337 
338     if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
339         res = cu0;
340     } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
341         // Should be 2 bytes decoded in UTF-8, check if there is one byte following.
342         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_ONE_BYTE)) {
343             return INVALID_CP;
344         }
345         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
346         res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
347     } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
348         // Should be 3 bytes decoded in UTF-8, check if there are 2 bytes following.
349         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
350             return INVALID_CP;
351         }
352         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
353         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
354         res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
355               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
356     } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
357                (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
358         // Should be 4 bytes decoded in UTF-8, check if there are 3 bytes following.
359         if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
360             return INVALID_CP;
361         }
362         char32_t cu1 = static_cast<uint8_t>(*iterNext++);
363         char32_t cu2 = static_cast<uint8_t>(*iterNext++);
364         char32_t cu3 = static_cast<uint8_t>(*iterNext++);
365         res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
366               ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
367               ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
368     } else {
369         res = INVALID_CP;
370     }
371 
372     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
373     if constexpr (moveIter) {
374         iter_ = iterNext;
375         return res;
376     }
377 
378     // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
379     if constexpr (setCpSize) {
380         *cpSize = iterNext - iter_;
381     }
382 
383     return res;
384 }
385 
386 template <void encoder(std::string *, char32_t)>
EscapeSymbol()387 std::string StringView::EscapeSymbol() const
388 {
389     std::string str;
390     str.reserve(Length());
391 
392     Iterator iter(*this);
393 
394     while (iter.HasNext()) {
395         auto cp = iter.Next();
396 
397         switch (cp) {
398             case '\r': {
399                 if (iter.HasNext()) {
400                     iter.Forward(1);
401 
402                     if (iter.Peek() != '\n') {
403                         iter.Backward(1);
404                     }
405                 }
406 
407                 [[fallthrough]];
408             }
409             case '\n': {
410                 str += "\\n";
411                 break;
412             }
413             case '\b': {
414                 str += "\\b";
415                 break;
416             }
417             case '\t': {
418                 str += "\\t";
419                 break;
420             }
421             case '\f': {
422                 str += "\\f";
423                 break;
424             }
425             case '"': {
426                 str += "\\\"";
427                 break;
428             }
429             case '\\': {
430                 str += "\\\\";
431                 break;
432             }
433             default: {
434                 encoder(&str, cp);
435             }
436         }
437     }
438 
439     return str;
440 }
441 
442 template <typename T>
Utf8Encode(T * str,char32_t cu)443 void StringView::Utf8Encode(T *str, char32_t cu)
444 {
445     if (cu < Constants::UTF8_1BYTE_LIMIT) {
446         str->push_back(static_cast<char>(cu));
447     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
448         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
449                                          Constants::UTF8_2BYTE_HEADER));
450         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
451     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
452         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
453                                          Constants::UTF8_3BYTE_HEADER));
454         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
455                                          Constants::UTF8_CONT_HEADER));
456         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
457     } else {
458         str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
459                                          Constants::UTF8_4BYTE_HEADER));
460         str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
461                                          Constants::UTF8_CONT_HEADER));
462         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463                                          Constants::UTF8_CONT_HEADER));
464         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
465     }
466 }
467 
468 template <typename T>
Mutf8Encode(T * str,char32_t cu)469 void StringView::Mutf8Encode(T *str, char32_t cu)
470 {
471     if (cu == 0) {
472         str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
473         str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
474     } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
475         str->push_back(static_cast<char>(cu));
476     } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
477         str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
478         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
479     } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
480         str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
481         str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
482                                          Constants::UTF8_CONT_HEADER));
483         str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
484     } else {
485         auto [cu1, cu2] = EncodeSurrogate(cu);
486         Mutf8Encode(str, cu1);
487         Mutf8Encode(str, cu2);
488     }
489 }
490 
491 }  // namespace panda::es2panda::util
492 
493 // NOLINTNEXTLINE(cert-dcl58-cpp)
494 namespace std {
495 
496 template <>
497 // NOLINTNEXTLINE(altera-struct-pack-align)
498 struct hash<panda::es2panda::util::StringView> {
499     std::size_t operator()(const panda::es2panda::util::StringView &str) const
500     {
501         return std::hash<std::string_view> {}(str.Utf8());
502     }
503 };
504 
505 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
506 
507 }  // namespace std
508 
// DCOUT is a debug-only output stream. With NDEBUG defined, `DCOUT << x` expands to
// `false && std::cout << x`: the insertion still compiles but is short-circuited and never
// executed. Without NDEBUG it is plain std::cout.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
514 
515 #endif
516