1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ES2PANDA_UTIL_INCLUDE_USTRING_H
17 #define ES2PANDA_UTIL_INCLUDE_USTRING_H
18
#include "macros.h"
#include "utils/arena_containers.h"

#include <cstddef>
#include <cstdint>
#include <iosfwd>
#include <limits>
#include <memory>
#include <string>
#include <string_view>
#include <tuple>
27
28 namespace ark::es2panda::util {
29 class StringView {
30 public:
31 explicit StringView() noexcept = default;
StringView(const ArenaString * str)32 explicit StringView(const ArenaString *str) noexcept : sv_(*str) {}
33 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(std::string_view sv)34 StringView(std::string_view sv) noexcept : sv_(sv) {}
35 // NOLINTNEXTLINE(google-explicit-constructor)
StringView(const char * str)36 StringView(const char *str) noexcept : sv_(str == nullptr ? "" : str) {}
37 DEFAULT_COPY_SEMANTIC(StringView);
38 DEFAULT_MOVE_SEMANTIC(StringView);
39 ~StringView() = default;
40
41 bool operator==(const StringView &rhs) const noexcept
42 {
43 return sv_ == rhs.sv_;
44 }
45
46 bool operator!=(const StringView &rhs) const noexcept
47 {
48 return sv_ != rhs.sv_;
49 }
50
51 bool operator<(const StringView &rhs) const noexcept
52 {
53 return sv_ < rhs.sv_;
54 }
55
56 bool operator>(const StringView &rhs) const noexcept
57 {
58 return sv_ > rhs.sv_;
59 }
60
Compare(const StringView & other)61 int Compare(const StringView &other) const noexcept
62 {
63 return sv_.compare(other.sv_);
64 }
65
Compare(const std::string_view & other)66 int Compare(const std::string_view &other) const noexcept
67 {
68 return sv_.compare(other);
69 }
70
Is(const char * str)71 bool Is(const char *str) const noexcept
72 {
73 return sv_ == str;
74 }
75
Is(const std::string_view & str)76 bool Is(const std::string_view &str) const noexcept
77 {
78 return sv_ == str;
79 }
80
Length()81 size_t Length() const noexcept
82 {
83 return sv_.length();
84 }
85
Empty()86 bool Empty() const noexcept
87 {
88 return sv_.empty();
89 }
90
Utf8()91 const std::string_view &Utf8() const noexcept
92 {
93 return sv_;
94 }
95
string()96 explicit operator std::string() const noexcept
97 {
98 return std::string {sv_};
99 }
100
Bytes()101 const char *Bytes() const noexcept
102 {
103 return sv_.data();
104 }
105
Substr(size_t begin,size_t end)106 StringView Substr(size_t begin, size_t end) const noexcept
107 {
108 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
109 return StringView(std::string_view(sv_.data() + begin, end - begin));
110 }
111
IsHighSurrogate(char32_t cp)112 static bool IsHighSurrogate(char32_t cp)
113 {
114 return (cp >= Constants::SURROGATE_HIGH_MIN && cp < Constants::SURROGATE_HIGH_MAX);
115 }
116
IsLowSurrogate(char32_t cp)117 static bool IsLowSurrogate(char32_t cp)
118 {
119 return (cp >= Constants::SURROGATE_LOW_MIN && cp < Constants::SURROGATE_LOW_MAX);
120 }
121
122 std::string Mutf8() const noexcept;
123 static char32_t DecodeSurrogates(char32_t high, char32_t low);
124 static std::tuple<char32_t, char32_t> EncodeSurrogate(char32_t cp);
125
126 template <void ENCODER(std::string *, char32_t)>
127 std::string EscapeSymbol() const;
128
129 template <typename T>
130 static void Utf8Encode(T *str, char32_t cu);
131 template <typename T>
132 static void Mutf8Encode(T *str, char32_t cu);
133
134 bool IsConvertibleToChar() const;
135
136 class Iterator {
137 public:
138 static char32_t constexpr INVALID_CP = std::numeric_limits<char32_t>::max();
139
Iterator(const StringView & sv)140 explicit Iterator(const StringView &sv) noexcept : sv_(sv.sv_), iter_(sv_.begin()) {}
141 DEFAULT_COPY_SEMANTIC(Iterator);
142 DEFAULT_MOVE_SEMANTIC(Iterator);
143 ~Iterator() = default;
144
Index()145 inline size_t Index() const
146 {
147 return static_cast<size_t>(iter_ - sv_.begin());
148 }
149
Next()150 inline char32_t Next()
151 {
152 return DecodeCP<true>(nullptr);
153 }
154
Peek()155 inline char32_t Peek() const
156 {
157 return HasNext() ? *iter_ : INVALID_CP;
158 }
159
PeekCp()160 inline char32_t PeekCp() const
161 {
162 return DecodeCP<false>(nullptr);
163 }
164
PeekCp(size_t * cpSize)165 inline char32_t PeekCp(size_t *cpSize) const
166 {
167 return DecodeCP<false, true>(cpSize);
168 }
169
Forward(size_t offset)170 inline void Forward(size_t offset)
171 {
172 iter_ += offset;
173 }
174
Backward(size_t offset)175 inline void Backward(size_t offset)
176 {
177 iter_ -= offset;
178 }
179
Reset(size_t offset)180 inline void Reset(size_t offset)
181 {
182 iter_ = sv_.begin() + offset;
183 }
184
Rewind(std::string_view::const_iterator pos)185 inline void Rewind(std::string_view::const_iterator pos)
186 {
187 iter_ = pos;
188 }
189
Save()190 inline std::string_view::const_iterator Save() const
191 {
192 return iter_;
193 }
194
HasNext()195 inline bool HasNext() const
196 {
197 return iter_ != sv_.end();
198 }
199
200 void SkipCp();
201
202 private:
203 template <bool MOVE_ITER, bool SET_CP_SIZE = false>
204 char32_t DecodeCP([[maybe_unused]] size_t *cpSize) const;
205
206 std::string_view sv_;
207 mutable std::string_view::const_iterator iter_;
208 };
209
210 class Constants {
211 public:
212 static constexpr uint16_t UTF8_1BYTE_LIMIT = 0x80;
213 static constexpr uint16_t UTF8_2BYTE_LIMIT = 0x800;
214 static constexpr uint32_t UTF8_3BYTE_LIMIT = 0x10000;
215
216 static constexpr uint16_t UTF8_2BYTE_MASK = 0x1F;
217 static constexpr uint16_t UTF8_3BYTE_MASK = 0x0F;
218 static constexpr uint16_t UTF8_4BYTE_MASK = 0x07;
219
220 static constexpr uint16_t UTF8_DECODE_4BYTE_MASK = 0xf8;
221 static constexpr uint16_t UTF8_DECODE_4BYTE_LIMIT = 0xf4;
222
223 static constexpr uint16_t UTF8_2BYTE_HEADER = 0xC0;
224 static constexpr uint16_t UTF8_3BYTE_HEADER = 0xE0;
225 static constexpr uint16_t UTF8_4BYTE_HEADER = 0xF0;
226
227 static constexpr uint16_t UTF8_2BYTE_SHIFT = 6U;
228 static constexpr uint16_t UTF8_3BYTE_SHIFT = 12U;
229 static constexpr uint16_t UTF8_4BYTE_SHIFT = 18U;
230
231 static constexpr uint16_t UTF8_CONT_MASK = 0x3F;
232 static constexpr uint16_t UTF8_CONT_HEADER = 0x80;
233
234 static constexpr char32_t SURROGATE_HIGH_MIN = 0xD800;
235 static constexpr char32_t SURROGATE_HIGH_MAX = 0xDC00;
236 static constexpr char32_t SURROGATE_LOW_MIN = 0xDC00;
237 static constexpr char32_t SURROGATE_LOW_MAX = 0xE000;
238 static constexpr char32_t SURROGATE_LOW_MARKER = 0x3ff;
239 static constexpr char32_t CELESTIAL_OFFSET = UTF8_3BYTE_LIMIT;
240 };
241
242 private:
243 friend class Iterator;
244 std::string_view sv_;
245 };
246
247 class UString {
248 public:
249 UString() = default;
UString(ArenaAllocator * allocator)250 explicit UString(ArenaAllocator *allocator) : allocator_(allocator) {}
UString(const std::string & str,ArenaAllocator * allocator)251 explicit UString(const std::string &str, ArenaAllocator *allocator) : UString(allocator)
252 {
253 Alloc();
254 *str_ = str;
255 }
256
UString(const std::string_view & str,ArenaAllocator * allocator)257 explicit UString(const std::string_view &str, ArenaAllocator *allocator) : UString(allocator)
258 {
259 Alloc();
260 *str_ = str;
261 }
262
UString(const util::StringView & str,ArenaAllocator * allocator)263 explicit UString(const util::StringView &str, ArenaAllocator *allocator) : UString(str.Utf8(), allocator) {}
264
265 DEFAULT_COPY_SEMANTIC(UString);
266 DEFAULT_MOVE_SEMANTIC(UString);
267 ~UString() = default;
268
View()269 util::StringView View() const
270 {
271 if (str_ == nullptr) {
272 return util::StringView();
273 }
274
275 return util::StringView(str_);
276 }
277
View()278 util::StringView View()
279 {
280 if (str_ == nullptr) {
281 return util::StringView();
282 }
283
284 return util::StringView(str_);
285 }
286
Append(char32_t ch)287 void Append(char32_t ch) noexcept
288 {
289 if (str_ == nullptr) {
290 Alloc();
291 }
292
293 StringView::Utf8Encode<ArenaString>(str_, ch);
294 }
295
Append(const StringView & other)296 void Append(const StringView &other) noexcept
297 {
298 if (str_ == nullptr) {
299 Alloc();
300 }
301
302 *str_ += other.Utf8();
303 }
304
Append(const char * other)305 void Append(const char *other) noexcept
306 {
307 if (str_ == nullptr) {
308 Alloc();
309 }
310 *str_ += other;
311 }
312
Append(const std::string & other)313 void Append(const std::string &other) noexcept
314 {
315 if (str_ == nullptr) {
316 Alloc();
317 }
318 *str_ += other;
319 }
320
321 private:
Alloc()322 void Alloc()
323 {
324 str_ = allocator_->New<ArenaString>(allocator_->Adapter());
325 }
326
327 protected:
328 // NOLINTBEGIN(misc-non-private-member-variables-in-classes)
329 ArenaString *str_ {};
330 ArenaAllocator *allocator_ {};
331 // NOLINTEND(misc-non-private-member-variables-in-classes)
332 };
333
334 template <bool MOVE_ITER, bool SET_CP_SIZE>
DecodeCP(size_t * cpSize)335 char32_t StringView::Iterator::DecodeCP([[maybe_unused]] size_t *cpSize) const
336 {
337 if (!HasNext()) {
338 return INVALID_CP;
339 }
340
341 const auto *iterNext = iter_;
342
343 char32_t cu0 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344 char32_t res {};
345
346 if (cu0 < Constants::UTF8_1BYTE_LIMIT) {
347 res = cu0;
348 } else if ((cu0 & Constants::UTF8_3BYTE_HEADER) == Constants::UTF8_2BYTE_HEADER) {
349 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
350 res = ((cu0 & Constants::UTF8_2BYTE_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu1 & Constants::UTF8_CONT_MASK);
351 } else if ((cu0 & Constants::UTF8_4BYTE_HEADER) == Constants::UTF8_3BYTE_HEADER) {
352 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
353 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
354 res = ((cu0 & Constants::UTF8_3BYTE_MASK) << Constants::UTF8_3BYTE_SHIFT) |
355 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu2 & Constants::UTF8_CONT_MASK);
356 } else if (((cu0 & Constants::UTF8_DECODE_4BYTE_MASK) == Constants::UTF8_4BYTE_HEADER) &&
357 (cu0 <= Constants::UTF8_DECODE_4BYTE_LIMIT)) {
358 char32_t cu1 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359 char32_t cu2 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360 char32_t cu3 = static_cast<uint8_t>(*iterNext++); // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
361 res = ((cu0 & Constants::UTF8_4BYTE_MASK) << Constants::UTF8_4BYTE_SHIFT) |
362 ((cu1 & Constants::UTF8_CONT_MASK) << Constants::UTF8_3BYTE_SHIFT) |
363 ((cu2 & Constants::UTF8_CONT_MASK) << Constants::UTF8_2BYTE_SHIFT) | (cu3 & Constants::UTF8_CONT_MASK);
364 } else {
365 res = INVALID_CP;
366 }
367
368 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
369 if constexpr (MOVE_ITER) {
370 iter_ = iterNext;
371 return res;
372 }
373
374 // NOLINTNEXTLINE(readability-braces-around-statements,bugprone-suspicious-semicolon)
375 if constexpr (SET_CP_SIZE) {
376 *cpSize = iterNext - iter_;
377 }
378
379 return res;
380 }
381
382 template <void ENCODER(std::string *, char32_t)>
EscapeSymbol()383 std::string StringView::EscapeSymbol() const
384 {
385 std::string str;
386 str.reserve(Length());
387
388 auto skipNewLine = [](auto &iter) {
389 if (iter.HasNext()) {
390 iter.Forward(1);
391
392 if (iter.Peek() != '\n') {
393 iter.Backward(1);
394 }
395 }
396 };
397
398 Iterator iter(*this);
399 while (iter.HasNext()) {
400 auto cp = iter.Next();
401
402 switch (cp) {
403 case '\r': {
404 skipNewLine(iter);
405 [[fallthrough]];
406 }
407 case '\n': {
408 str += "\\n";
409 break;
410 }
411 case '\b': {
412 str += "\\b";
413 break;
414 }
415 case '\t': {
416 str += "\\t";
417 break;
418 }
419 case '\f': {
420 str += "\\f";
421 break;
422 }
423 case '"': {
424 str += "\\\"";
425 break;
426 }
427 case '\\': {
428 str += "\\\\";
429 break;
430 }
431 default: {
432 ENCODER(&str, cp);
433 }
434 }
435 }
436
437 return str;
438 }
439
440 template <typename T>
Utf8Encode(T * str,char32_t cu)441 void StringView::Utf8Encode(T *str, char32_t cu)
442 {
443 if (cu < Constants::UTF8_1BYTE_LIMIT) {
444 str->push_back(static_cast<char>(cu));
445 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
446 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_2BYTE_MASK) |
447 Constants::UTF8_2BYTE_HEADER));
448 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
449 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
450 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_3BYTE_MASK) |
451 Constants::UTF8_3BYTE_HEADER));
452 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
453 Constants::UTF8_CONT_HEADER));
454 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
455 } else {
456 str->push_back(static_cast<char>(((cu >> Constants::UTF8_4BYTE_SHIFT) & Constants::UTF8_4BYTE_MASK) |
457 Constants::UTF8_4BYTE_HEADER));
458 str->push_back(static_cast<char>(((cu >> Constants::UTF8_3BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
459 Constants::UTF8_CONT_HEADER));
460 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
461 Constants::UTF8_CONT_HEADER));
462 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
463 }
464 }
465
466 template <typename T>
Mutf8Encode(T * str,char32_t cu)467 void StringView::Mutf8Encode(T *str, char32_t cu)
468 {
469 if (cu == 0) {
470 str->push_back(static_cast<char>(Constants::UTF8_2BYTE_HEADER));
471 str->push_back(static_cast<char>(Constants::UTF8_CONT_HEADER));
472 } else if (cu < Constants::UTF8_1BYTE_LIMIT) {
473 str->push_back(static_cast<char>(cu));
474 } else if (cu < Constants::UTF8_2BYTE_LIMIT) {
475 str->push_back(static_cast<char>((cu >> Constants::UTF8_2BYTE_SHIFT) | Constants::UTF8_2BYTE_HEADER));
476 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
477 } else if (cu < Constants::UTF8_3BYTE_LIMIT) {
478 str->push_back(static_cast<char>((cu >> Constants::UTF8_3BYTE_SHIFT) | Constants::UTF8_3BYTE_HEADER));
479 str->push_back(static_cast<char>(((cu >> Constants::UTF8_2BYTE_SHIFT) & Constants::UTF8_CONT_MASK) |
480 Constants::UTF8_CONT_HEADER));
481 str->push_back(static_cast<char>((cu & Constants::UTF8_CONT_MASK) | Constants::UTF8_CONT_HEADER));
482 } else {
483 auto [cu1, cu2] = EncodeSurrogate(cu);
484 Mutf8Encode(str, cu1);
485 Mutf8Encode(str, cu2);
486 }
487 }
488 } // namespace ark::es2panda::util
489
490 // NOLINTNEXTLINE(cert-dcl58-cpp)
491 namespace std {
492
493 template <>
494 // NOLINTNEXTLINE(altera-struct-pack-align)
495 struct hash<ark::es2panda::util::StringView> {
496 std::size_t operator()(const ark::es2panda::util::StringView &str) const
497 {
498 return std::hash<std::string_view> {}(str.Utf8());
499 }
500 };
501
502 ostream &operator<<(ostream &os, const ark::es2panda::util::StringView &us);
503
504 } // namespace std
505
506 #endif
507