1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ECMASCRIPT_BASE_UTF_HELPER_H
17 #define ECMASCRIPT_BASE_UTF_HELPER_H
18
#include <array>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

#include "libpandabase/utils/utf.h"
23
24 namespace panda::ecmascript::base::utf_helper {
// UTF-16 surrogate code-unit ranges: lead (high) surrogates occupy 0xD800-0xDBFF and
// trail (low) surrogates occupy 0xDC00-0xDFFF.
// These are `inline constexpr` so that every translation unit including this header refers
// to the same entity instead of owning a private internal-linkage copy.
inline constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
inline constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
inline constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
inline constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
// 0x400 == 1 << 10 and 0x10000 is the first supplementary-plane code point.
// NOTE(review): presumably combined as (lead - LEAD_LOW) * FIRST + (trail - TRAIL_LOW) + SECOND
// by the decoder; confirm against the implementation file.
inline constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
inline constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
// NOTE(review): presumably shift widths (6 payload bits per UTF-8 continuation byte,
// 10 payload bits per UTF-16 surrogate); confirm at the use sites.
inline constexpr uint32_t UTF8_OFFSET = 6;
inline constexpr uint32_t UTF16_OFFSET = 10;
// (unit & 0xF800) == 0xD800 holds exactly for code units in the surrogate range 0xD800-0xDFFF.
inline constexpr uint16_t SURROGATE_MASK = 0xF800;
34
// Masks with the top 1..5 bits of a byte set (0x80, 0xC0, 0xE0, 0xF0, 0xF8).
// NOTE(review): presumably used to classify UTF-8 lead/continuation bytes; confirm at the use sites.
// All constants here are `inline constexpr` so that every translation unit including this
// header shares one definition instead of a private internal-linkage copy.
inline constexpr uint8_t BIT_MASK_1 = 0x80;
inline constexpr uint8_t BIT_MASK_2 = 0xC0;
inline constexpr uint8_t BIT_MASK_3 = 0xE0;
inline constexpr uint8_t BIT_MASK_4 = 0xF0;
inline constexpr uint8_t BIT_MASK_5 = 0xF8;

// 0x7f is the largest code point that fits in a single UTF-8 byte.
inline constexpr uint8_t UTF8_1B_MAX = 0x7f;

// 0x7ff is the largest code point that fits in two UTF-8 bytes; 0xc0 is the 110xxxxx lead
// prefix, 0x80 the 10xxxxxx continuation prefix and 0x3f selects the low six bits.
inline constexpr uint16_t UTF8_2B_MAX = 0x7ff;
inline constexpr uint8_t UTF8_2B_FIRST = 0xc0;
inline constexpr uint8_t UTF8_2B_SECOND = 0x80;
inline constexpr uint8_t UTF8_2B_THIRD = 0x3f;

// 0xffff is the largest code point that fits in three UTF-8 bytes; 0xe0 is the 1110xxxx lead
// prefix and 0x80 the continuation prefix for the second and third bytes.
inline constexpr uint16_t UTF8_3B_MAX = 0xffff;
inline constexpr uint8_t UTF8_3B_FIRST = 0xe0;
inline constexpr uint8_t UTF8_3B_SECOND = 0x80;
inline constexpr uint8_t UTF8_3B_THIRD = 0x80;

// 0xf0 is the 11110xxx lead prefix of a four-byte UTF-8 sequence.
inline constexpr uint8_t UTF8_4B_FIRST = 0xf0;

// NOTE(review): presumably applied as (value | byteMark) & byteMask to form a 10xxxxxx
// continuation byte; confirm against the implementation file.
inline constexpr uint8_t byteMask = 0xbf;
inline constexpr uint8_t byteMark = 0x80;

// 0xFF is the highest value representable in Latin-1.
inline constexpr uint8_t latin1Limit = 0xFF;
59
// Small named integers 1..4 stored in one byte.
// NOTE(review): presumably the number of bytes in a UTF-8 sequence; confirm at the use sites.
enum UtfLength : uint8_t {
    ONE = 1,
    TWO = 2,
    THREE = 3,
    FOUR = 4,
};

// Named shift distances stored in one byte. Kept as an unscoped enum because the values are
// used directly as shift operands (CombineTwoU16 in this header shifts by UtfOffset::TEN).
enum UtfOffset : uint8_t {
    SIX = 6,
    TEN = 10,
    TWELVE = 12,
    EIGHTEEN = 18,
};
62
// Capacity of Utf8Char::ch.
inline constexpr size_t MAX_BYTES = 4;

// Fixed-capacity byte buffer paired with a count; returned by ConvertUtf16ToUtf8.
// Both members carry default initializers so that a default-constructed Utf8Char is fully
// defined (n == 0, all bytes zero). The type remains an aggregate, so brace initialization
// such as Utf8Char{2, {0xC3, 0xA9}} keeps working.
struct Utf8Char {
    // NOTE(review): presumably the number of valid bytes in `ch`; confirm against the producer.
    size_t n = 0;
    std::array<uint8_t, MAX_BYTES> ch{};
};
68
// Lookup table of UTF-8 lead-byte prefixes. Entries 2, 3 and 4 equal the 2-, 3- and 4-byte
// lead prefixes (0xC0, 0xE0, 0xF0); entries 5 and 6 are the legacy 5- and 6-byte prefixes.
// NOTE(review): presumably indexed by the encoded sequence length and OR-ed into the first
// byte; confirm against the implementation file.
// `inline constexpr` gives the table a single definition shared by all translation units
// rather than one private copy per includer.
inline constexpr unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
70
71 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index);
72
73 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index);
74
75 uint32_t UTF16Decode(uint16_t lead, uint16_t trail);
76
77 bool IsValidUTF8(const std::vector<uint8_t> &data);
78
79 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false);
80
81 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true);
82
83 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
84 size_t start, bool modify = true, bool isWriteBuffer = false);
85
86 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);
87
88 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);
89
90 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
91 size_t start);
92
93 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len);
94
CombineTwoU16(uint16_t d0,uint16_t d1)95 static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
96 {
97 uint32_t codePoint = d0 - utf::HI_SURROGATE_MIN;
98 codePoint <<= UtfOffset::TEN;
99 codePoint |= d1 - utf::LO_SURROGATE_MIN;
100 codePoint += utf::LO_SUPPLEMENTS_MIN;
101 return codePoint;
102 }
103 } // namespace panda::ecmascript::base::utf_helper
104
105 #endif // ECMASCRIPT_BASE_UTF_HELPER_H