1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
#include <macros.h>
#include <utils/arena_containers.h>

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
27
28 namespace panda::es2panda::util {
29
30 class StringView {
31 public:
32 explicit StringView() noexcept = default;
StringView(const ArenaString * str)33 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
34 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)35 StringView(std::string_view sv) noexcept : sv_(sv) {}
36 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)37 StringView(const char *str) noexcept : sv_(str) {}
38 DEFAULT_COPY_SEMANTIC(StringView);
39 DEFAULT_MOVE_SEMANTIC(StringView);
40 ~StringView() = default;
41
42 bool operator==(const StringView &rhs) const noexcept
43 {
44 return sv_ == rhs.sv_;
45 }
46
47 bool operator!=(const StringView &rhs) const noexcept
48 {
49 return sv_ != rhs.sv_;
50 }
51
52 bool operator<(const StringView &rhs) const noexcept
53 {
54 return sv_ < rhs.sv_;
55 }
56
57 bool operator>(const StringView &rhs) const noexcept
58 {
59 return sv_ > rhs.sv_;
60 }
61
Compare(const StringView & other)62 int Compare(const StringView &other) const noexcept
63 {
64 return sv_.compare(other.sv_);
65 }
66
Compare(const std::string_view & other)67 int Compare(const std::string_view &other) const noexcept
68 {
69 return sv_.compare(other);
70 }
71
Is(const char * str)72 bool Is(const char *str) const noexcept
73 {
74 return sv_ == str;
75 }
76
Is(const std::string_view & str)77 bool Is(const std::string_view &str) const noexcept
78 {
79 return sv_ == str;
80 }
81
Length()82 size_t Length() const noexcept
83 {
84 return sv_.length();
85 }
86
Empty()87 bool Empty() const noexcept
88 {
89 return sv_.empty();
90 }
91
Utf8()92 const std::string_view &Utf8() const noexcept
93 {
94 return sv_;
95 }
96
string()97 explicit operator std::string() const noexcept
98 {
99 return std::string {sv_};
100 }
101
Bytes()102 const char *Bytes() const noexcept
103 {
104 return sv_.data();
105 }
106
Substr(size_t begin,size_t end)107 StringView Substr(size_t begin, size_t end) const noexcept
108 {
109 return StringView(std::string_view(sv_.data() + begin, end - begin));
110 }
111
Find(const char * str)112 constexpr size_t Find(const char *str) const
113 {
114 return sv_.find(str);
115 }
116
IsHighSurrogate(char32_t cp)117 static bool IsHighSurrogate(char32_t cp)
118 {
119 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
120 }
121
IsLowSurrogate(char32_t cp)122 static bool IsLowSurrogate(char32_t cp)
123 {
124 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
125 }
126
127 std::string Mutf8() const noexcept;
128 static char32_t DecodeSurrogates(char32_t high, char32_t low);
129 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
130
131 template <void encoder(std::string *, char32_t)>
132 std::string EscapeSymbol() const;
133
134 template <typename T>
135 static void Utf8Encode(T *str, char32_t cu);
136 template <typename T>
137 static void Mutf8Encode(T *str, char32_t cu);
138
139 class Iterator {
140 public:
141 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
142
Iterator(const StringView & sv)143 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
144 DEFAULT_COPY_SEMANTIC(Iterator);
145 DEFAULT_MOVE_SEMANTIC(Iterator);
146 ~Iterator() = default;
147
Index()148 inline size_t Index() const
149 {
150 return static_cast<size_t>(iter_ - sv_.begin());
151 }
152
Next()153 inline char32_t Next()
154 {
155 return DecodeCP<true>(nullptr);
156 }
157
Peek()158 inline char32_t Peek() const
159 {
160 return HasNext() ? *iter_ : INVALID_CP;
161 }
162
PeekCp()163 inline char32_t PeekCp() const
164 {
165 return DecodeCP<false>(nullptr);
166 }
167
PeekCp(size_t * cpSize)168 inline char32_t PeekCp(size_t *cpSize) const
169 {
170 return DecodeCP<false, true>(cpSize);
171 }
172
Forward(size_t offset)173 inline void Forward(size_t offset) const
174 {
175 iter_ += offset;
176 }
177
Backward(size_t offset)178 inline void Backward(size_t offset) const
179 {
180 iter_ -= offset;
181 }
182
Reset(size_t offset)183 inline void Reset(size_t offset)
184 {
185 iter_ = sv_.begin() + offset;
186 }
187
Rewind(std::string_view::const_iterator pos)188 inline void Rewind(std::string_view::const_iterator pos) const
189 {
190 iter_ = pos;
191 }
192
Save()193 inline std::string_view::const_iterator Save() const
194 {
195 return iter_;
196 }
197
HasNext()198 inline bool HasNext() const
199 {
200 return iter_ != sv_.end();
201 }
202
HasExpectedNumberOfBytes(size_t count)203 bool HasExpectedNumberOfBytes(size_t count) const
204 {
205 for (size_t i = 0; i < count; ++i) {
206 if (!HasNext()) {
207 return false;
208 }
209 iter_++;
210 }
211 iter_ -= count;
212 return true;
213 }
214
215 void SkipCp() const;
216
217 private:
218 template <bool moveIter, bool setCpSize = false>
219 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
220
221 std::string_view sv_;
222 mutable std::string_view::const_iterator iter_;
223 };
224
225 private:
226 class Constants {
227 public:
228 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
229 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
230 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
231
232 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
233 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
234 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
235
236 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
237 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
238
239 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
240 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
241 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
242
243 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
244 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
245 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
246
247 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
248 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
249
250 static constexpr size_t UTF8_NEXT_ONE_BYTE = 1;
251 static constexpr size_t UTF8_NEXT_TWO_BYTE = 2;
252 static constexpr size_t UTF8_NEXT_THREE_BYTE = 3;
253
254 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
255 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
256 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
257 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
258 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
259 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
260 };
261
262 friend class Iterator;
263 std::string_view sv_;
264 };
265
266 class UString {
267 public:
268 UString() = default;
UString(ArenaAllocator * allocator)269 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)270 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
271 {
272 Alloc();
273 *str_ = str;
274 }
275
276 DEFAULT_COPY_SEMANTIC(UString);
277 DEFAULT_MOVE_SEMANTIC(UString);
278 ~UString() = default;
279
View()280 util::StringView View() const
281 {
282 if (!str_) {
283 return util::StringView();
284 }
285
286 return util::StringView(str_);
287 }
288
Append(char32_t ch)289 void Append(char32_t ch) noexcept
290 {
291 if (!str_) {
292 Alloc();
293 }
294
295 StringView::Utf8Encode<ArenaString>(str_, ch);
296 }
297
Append(const StringView & other)298 void Append(const StringView &other) noexcept
299 {
300 if (!str_) {
301 Alloc();
302 }
303
304 *str_ += other.Utf8();
305 }
306
Append(const char * other)307 void Append(const char *other) noexcept
308 {
309 if (!str_) {
310 Alloc();
311 }
312 *str_ += other;
313 }
314
315 private:
Alloc()316 void Alloc()
317 {
318 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
319 }
320
321 protected:
322 ArenaString *str_ {};
323 ArenaAllocator *allocator_ {};
324 };
325
326 template <bool moveIter, bool setCpSize>
DecodeCP(size_t * cpSize)327 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
328 {
329 if (!HasNext()) {
330 return INVALID_CP;
331 }
332
333 const auto *iterNext = iter_;
334
335 char32_t cu0 = static_cast<uint8_t>(*iterNext++);
336 char32_t res {};
337
338 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
339 res = cu0;
340 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
341 // Should be 2 bytes decoded in UTF-8, check if there is one byte following.
342 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_ONE_BYTE)) {
343 return INVALID_CP;
344 }
345 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
346 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
347 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
348 // Should be 3 bytes decoded in UTF-8, check if there are 2 bytes following.
349 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_TWO_BYTE)) {
350 return INVALID_CP;
351 }
352 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
353 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
354 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
355 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
356 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
357 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
358 // Should be 4 bytes decoded in UTF-8, check if there are 3 bytes following.
359 if (!HasExpectedNumberOfBytes(Constants::UTF8_NEXT_THREE_BYTE)) {
360 return INVALID_CP;
361 }
362 char32_t cu1 = static_cast<uint8_t>(*iterNext++);
363 char32_t cu2 = static_cast<uint8_t>(*iterNext++);
364 char32_t cu3 = static_cast<uint8_t>(*iterNext++);
365 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
366 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
367 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
368 } else {
369 res = INVALID_CP;
370 }
371
372 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
373 if constexpr (moveIter) {
374 iter_ = iterNext;
375 return res;
376 }
377
378 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
379 if constexpr (setCpSize) {
380 *cpSize = iterNext - iter_;
381 }
382
383 return res;
384 }
385
386 template <void encoder(std::string *, char32_t)>
EscapeSymbol()387 std::string StringView::EscapeSymbol() const
388 {
389 std::string str;
390 str.reserve(Length());
391
392 Iterator iter(*this);
393
394 while (iter.HasNext()) {
395 auto cp = iter.Next();
396
397 switch (cp) {
398 case '\r': {
399 if (iter.HasNext()) {
400 iter.Forward(1);
401
402 if (iter.Peek() != '\n') {
403 iter.Backward(1);
404 }
405 }
406
407 [[fallthrough]];
408 }
409 case '\n': {
410 str += "\\n";
411 break;
412 }
413 case '\b': {
414 str += "\\b";
415 break;
416 }
417 case '\t': {
418 str += "\\t";
419 break;
420 }
421 case '\f': {
422 str += "\\f";
423 break;
424 }
425 case '"': {
426 str += "\\\"";
427 break;
428 }
429 case '\\': {
430 str += "\\\\";
431 break;
432 }
433 default: {
434 encoder(&str, cp);
435 }
436 }
437 }
438
439 return str;
440 }
441
442 template <typename T>
Utf8Encode(T * str,char32_t cu)443 void StringView::Utf8Encode(T *str, char32_t cu)
444 {
445 if (cu < Constants::UTF8_1BYTE_LIMIT) {
446 str->push_back(static_cast<char>(cu));
447 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
448 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
449 Constants::UTF8_2BYTE_HEADER));
450 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
451 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
452 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
453 Constants::UTF8_3BYTE_HEADER));
454 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
455 Constants::UTF8_CONT_HEADER));
456 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
457 } else {
458 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
459 Constants::UTF8_4BYTE_HEADER));
460 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
461 Constants::UTF8_CONT_HEADER));
462 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
463 Constants::UTF8_CONT_HEADER));
464 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
465 }
466 }
467
468 template <typename T>
Mutf8Encode(T * str,char32_t cu)469 void StringView::Mutf8Encode(T *str, char32_t cu)
470 {
471 if (cu == 0) {
472 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
473 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
474 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
475 str->push_back(static_cast<char>(cu));
476 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
477 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
478 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
479 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
480 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
481 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
482 Constants::UTF8_CONT_HEADER));
483 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
484 } else {
485 auto [cu1, cu2] = EncodeSurrogate(cu);
486 Mutf8Encode(str, cu1);
487 Mutf8Encode(str, cu2);
488 }
489 }
490
491 } // namespace panda::es2panda::util
492
493 // NOLINTNEXTLINE(cert-dcl58-cpp)
494 namespace std {
495
496 template <>
497 // NOLINTNEXTLINE(altera-struct-pack-align)
498 struct hash<panda::es2panda::util::StringView> {
499 std::size_t operator()(const panda::es2panda::util::StringView &str) const
500 {
501 return std::hash<std::string_view> {}(str.Utf8());
502 }
503 };
504
505 ostream &operator<<(ostream &os, const panda::es2panda::util::StringView &us);
506
507 } // namespace std
508
// DCOUT: debug-only output stream.
// Without NDEBUG it is std::cout. With NDEBUG it expands to `false && std::cout`, so the `<<` chain that
// follows still has to compile but is never evaluated.
#ifdef NDEBUG
#define DCOUT false && std::cout
#else
#define DCOUT std::cout
#endif  // NDEBUG
514
515 #endif
516