1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
#include <macros.h>
#include <utils/arena_containers.h>

#include <cstddef>
#include <cstdint>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
27
28 namespace panda::es2panda::util {
29
30 class StringView {
31 public:
32 explicit StringView() noexcept = default;
StringView(const ArenaString * str)33 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35 StringView(std::string_view sv) noexcept : sv_(sv) {}
36 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)37 StringView(const char *str) noexcept : sv_(str) {}
38 DEFAULT_COPY_SEMANTIC(StringView);
39 DEFAULT_MOVE_SEMANTIC(StringView);
40 ~StringView() = default;
41
42 bool operator==(const StringView &rhs) const noexcept
43 {
44 return sv_ == rhs.sv_;
45 }
46
47 bool operator!=(const StringView &rhs) const noexcept
48 {
49 return sv_ != rhs.sv_;
50 }
51
52 bool operator<(const StringView &rhs) const noexcept
53 {
54 return sv_ < rhs.sv_;
55 }
56
57 bool operator>(const StringView &rhs) const noexcept
58 {
59 return sv_ > rhs.sv_;
60 }
61
Compare(const StringView & other)62 int Compare(const StringView &other) const noexcept
63 {
64 return sv_.compare(other.sv_);
65 }
66
Compare(const std::string_view & other)67 int Compare(const std::string_view &other) const noexcept
68 {
69 return sv_.compare(other);
70 }
71
Is(const char * str)72 bool Is(const char *str) const noexcept
73 {
74 return sv_ == str;
75 }
76
Is(const std::string_view & str)77 bool Is(const std::string_view &str) const noexcept
78 {
79 return sv_ == str;
80 }
81
Length()82 size_t Length() const noexcept
83 {
84 return sv_.length();
85 }
86
Empty()87 bool Empty() const noexcept
88 {
89 return sv_.empty();
90 }
91
Utf8()92 const std::string_view &Utf8() const noexcept
93 {
94 return sv_;
95 }
96
string()97 explicit operator std::string() const noexcept
98 {
99 return std::string {sv_};
100 }
101
Bytes()102 const char *Bytes() const noexcept
103 {
104 return sv_.data();
105 }
106
Substr(size_t begin,size_t end)107 StringView Substr(size_t begin, size_t end) const noexcept
108 {
109 return StringView(std::string_view(sv_.data() + begin, end - begin));
110 }
111
Find(const char * str)112 constexpr size_t Find(const char *str) const
113 {
114 return sv_.find(str);
115 }
116
IsHighSurrogate(char32_t cp)117 static bool IsHighSurrogate(char32_t cp)
118 {
119 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120 }
121
IsLowSurrogate(char32_t cp)122 static bool IsLowSurrogate(char32_t cp)
123 {
124 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125 }
126
127 std::string Mutf8() const noexcept;
128 static char32_t DecodeSurrogates(char32_t high, char32_t low);
129 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130
131 template <void encoder(std::string *, char32_t)>
132 std::string EscapeSymbol() const;
133
134 template <typename T>
135 static void Utf8Encode(T *str, char32_t cu);
136 template <typename T>
137 static void Mutf8Encode(T *str, char32_t cu);
138
139 class Iterator {
140 public:
141 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142
Iterator(const StringView & sv)143 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144 DEFAULT_COPY_SEMANTIC(Iterator);
145 DEFAULT_MOVE_SEMANTIC(Iterator);
146 ~Iterator() = default;
147
Index()148 inline size_t Index() const
149 {
150 return static_cast<size_t>(iter_ - sv_.begin());
151 }
152
Next()153 inline char32_t Next()
154 {
155 return DecodeCP<true>(nullptr);
156 }
157
Peek()158 inline char32_t Peek() const
159 {
160 return HasNext() ? *iter_ : INVALID_CP;
161 }
162
PeekCp()163 inline char32_t PeekCp() const
164 {
165 return DecodeCP<false>(nullptr);
166 }
167
PeekCp(size_t * cpSize)168 inline char32_t PeekCp(size_t *cpSize) const
169 {
170 return DecodeCP<false, true>(cpSize);
171 }
172
Forward(size_t offset)173 inline void Forward(size_t offset) const
174 {
175 iter_ += offset;
176 }
177
Backward(size_t offset)178 inline void Backward(size_t offset) const
179 {
180 iter_ -= offset;
181 }
182
Reset(size_t offset)183 inline void Reset(size_t offset)
184 {
185 iter_ = sv_.begin() + offset;
186 }
187
Rewind(std::string_view::const_iterator pos)188 inline void Rewind(std::string_view::const_iterator pos) const
189 {
190 iter_ = pos;
191 }
192
Save()193 inline std::string_view::const_iterator Save() const
194 {
195 return iter_;
196 }
197
HasNext()198 inline bool HasNext() const
199 {
200 return iter_ != sv_.end();
201 }
202
203 void SkipCp() const;
204
205 private:
206 template <bool moveIter, bool setCpSize = false>
207 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
208
209 std::string_view sv_;
210 mutable std::string_view::const_iterator iter_;
211 };
212
213 private:
214 class Constants {
215 public:
216 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
217 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
218 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
219
220 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
221 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
222 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
223
224 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
225 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
226
227 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
228 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
229 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
230
231 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
232 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
233 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
234
235 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
236 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
237
238 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
239 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
240 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
241 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
242 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
243 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
244 };
245
246 friend class Iterator;
247 std::string_view sv_;
248 };
249
250 class UString {
251 public:
252 UString() = default;
UString(ArenaAllocator * allocator)253 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)254 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
255 {
256 Alloc();
257 *str_ = str;
258 }
259
260 DEFAULT_COPY_SEMANTIC(UString);
261 DEFAULT_MOVE_SEMANTIC(UString);
262 ~UString() = default;
263
View()264 util::StringView View() const
265 {
266 if (!str_) {
267 return util::StringView();
268 }
269
270 return util::StringView(str_);
271 }
272
Append(char32_t ch)273 void Append(char32_t ch) noexcept
274 {
275 if (!str_) {
276 Alloc();
277 }
278
279 StringView::Utf8Encode<ArenaString>(str_, ch);
280 }
281
Append(const StringView & other)282 void Append(const StringView &other) noexcept
283 {
284 if (!str_) {
285 Alloc();
286 }
287
288 *str_ += other.Utf8();
289 }
290
Append(const char * other)291 void Append(const char *other) noexcept
292 {
293 if (!str_) {
294 Alloc();
295 }
296 *str_ += other;
297 }
298
299 private:
Alloc()300 void Alloc()
301 {
302 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
303 }
304
305 protected:
306 ArenaString *str_ {};
307 ArenaAllocator *allocator_ {};
308 };
309
310 template <bool moveIter, bool setCpSize>
DecodeCP(size_t * cpSize)311 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
312 {
313 if (!HasNext()) {
314 return INVALID_CP;
315 }
316
317 const auto *iterNext = iter_;
318
319 char32_t cu0 = static_cast<uint8_t>(*iterNext++);
320 char32_t res {};
321
322 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
323 res = cu0;
324 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
325 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
326 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
327 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
328 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
329 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
330 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
331 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
332 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
333 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
334 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
335 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
336 char32_t cu3 = static_cast<uint8_t>(*iterNext++);
337 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
338 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
339 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
340 } else {
341 res = INVALID_CP;
342 }
343
344 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
345 if constexpr (moveIter) {
346 iter_ = iterNext;
347 return res;
348 }
349
350 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
351 if constexpr (setCpSize) {
352 *cpSize = iterNext - iter_;
353 }
354
355 return res;
356 }
357
358 template <void encoder(std::string *, char32_t)>
EscapeSymbol()359 std::string StringView::EscapeSymbol() const
360 {
361 std::string str;
362 str.reserve(Length());
363
364 Iterator iter(*this);
365
366 while (iter.HasNext()) {
367 auto cp = iter.Next();
368
369 switch (cp) {
370 case '\r': {
371 if (iter.HasNext()) {
372 iter.Forward(1);
373
374 if (iter.Peek() != '\n') {
375 iter.Backward(1);
376 }
377 }
378
379 [[fallthrough]];
380 }
381 case '\n': {
382 str += "\\n";
383 break;
384 }
385 case '\b': {
386 str += "\\b";
387 break;
388 }
389 case '\t': {
390 str += "\\t";
391 break;
392 }
393 case '\f': {
394 str += "\\f";
395 break;
396 }
397 case '"': {
398 str += "\\\"";
399 break;
400 }
401 case '\\': {
402 str += "\\\\";
403 break;
404 }
405 default: {
406 encoder(&str, cp);
407 }
408 }
409 }
410
411 return str;
412 }
413
414 template <typename T>
Utf8Encode(T * str,char32_t cu)415 void StringView::Utf8Encode(T *str, char32_t cu)
416 {
417 if (cu < Constants::UTF8_1BYTE_LIMIT) {
418 str->push_back(static_cast<char>(cu));
419 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
420 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
421 Constants::UTF8_2BYTE_HEADER));
422 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
423 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
424 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
425 Constants::UTF8_3BYTE_HEADER));
426 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
427 Constants::UTF8_CONT_HEADER));
428 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
429 } else {
430 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
431 Constants::UTF8_4BYTE_HEADER));
432 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
433 Constants::UTF8_CONT_HEADER));
434 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
435 Constants::UTF8_CONT_HEADER));
436 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
437 }
438 }
439
440 template <typename T>
Mutf8Encode(T * str,char32_t cu)441 void StringView::Mutf8Encode(T *str, char32_t cu)
442 {
443 if (cu == 0) {
444 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
445 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
446 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
447 str->push_back(static_cast<char>(cu));
448 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
449 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
450 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
451 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
452 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
453 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
454 Constants::UTF8_CONT_HEADER));
455 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
456 } else {
457 auto [cu1, cu2] = EncodeSurrogate(cu);
458 Mutf8Encode(str, cu1);
459 Mutf8Encode(str, cu2);
460 }
461 }
462
463 } // namespace panda::es2panda::util
464
465 // NOLINTNEXTLINE(cert-dcl58-cpp)
466 namespace std {
467
468 template <>
469 // NOLINTNEXTLINE(altera-struct-pack-align)
470 struct hash<panda::es2panda::util::StringView> {
471 std::size_t operator()(const panda::es2panda::util::StringView &str) const
472 {
473 return std::hash<std::string_view> {}(str.Utf8());
474 }
475 };
476
477 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
478
479 } // namespace std
480
// DCOUT is a debug-only output stream.
// With NDEBUG defined, `DCOUT << x` expands to `false && std::cout << x`: operator<< binds tighter
// than &&, so the right-hand side is short-circuited and the streamed operands are never evaluated.
// Without NDEBUG it is plain std::cout.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
486
487 #endif
488