1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef ECMASCRIPT_BASE_UTF_HELPER_H
17 #define ECMASCRIPT_BASE_UTF_HELPER_H
18
19 #include <array>
20 #include <cstdint>
21 #include <vector>
22
23 #include "common_interfaces/base/common.h"
24
25 namespace common::utf_helper {
// UTF-16 surrogate code-unit ranges: high (lead) units are 0xd800~0xdbff, low (trail) units are 0xdc00~0xdfff.
constexpr size_t HI_SURROGATE_MIN = 0xd800;
constexpr size_t HI_SURROGATE_MAX = 0xdbff;
constexpr size_t LO_SURROGATE_MIN = 0xdc00;
constexpr size_t LO_SURROGATE_MAX = 0xdfff;

// 0x10000 is the first code point above the 16-bit range, i.e. the first one that needs a surrogate pair in UTF-16.
constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;

// Small literals, bit masks and shift amounts.
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;
static constexpr size_t CONST_4 = 4;
static constexpr size_t MASK1 = 0x80;
static constexpr size_t MASK2 = 0x20;
static constexpr size_t MASK3 = 0x10;
static constexpr size_t LOW_3BITS = 0x7;
static constexpr size_t LOW_4BITS = 0xF;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;
static constexpr size_t L_SURROGATE_START = 0xDC00; // same value as LO_SURROGATE_MIN
static constexpr size_t H_SURROGATE_START = 0xD800; // same value as HI_SURROGATE_MIN
static constexpr size_t SURROGATE_RAIR_START = 0x10000; // same value as LO_SUPPLEMENTS_MIN
static constexpr size_t OFFSET_18POS = 18;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_6POS = 6;

// The same surrogate bounds as above, typed as UTF-16 code units.
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
static constexpr uint32_t DECODE_FIRST_FACTOR = 0x400; // 0x400 == 1 << 10
static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
static constexpr uint32_t UTF8_OFFSET = 6;
static constexpr uint32_t UTF16_OFFSET = 10;
static constexpr uint16_t SURROGATE_MASK = 0xF800; // (unit & 0xF800) == 0xD800 holds exactly for 0xD800~0xDFFF
static constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD; // U+FFFD REPLACEMENT CHARACTER
60
// Bit widths: 6 bits and 16 bits.
constexpr size_t DATA_WIDTH = 6;
constexpr size_t PAIR_ELEMENT_WIDTH = 16;

// 0xd7c0 == 0xd800 - (0x10000 >> 10); 0xdc00 is the first low-surrogate value.
constexpr size_t U16_LEAD = 0xd7c0;
constexpr size_t U16_TAIL = 0xdc00;

// Byte masks with the top 1, 2, 3, 4 and 5 bits set, plus the full-byte mask and a 10-bit mask.
static constexpr uint8_t BIT_MASK_1 = 0x80;
static constexpr uint8_t BIT_MASK_2 = 0xC0;
static constexpr uint8_t BIT_MASK_3 = 0xE0;
static constexpr uint8_t BIT_MASK_4 = 0xF0;
static constexpr uint8_t BIT_MASK_5 = 0xF8;
static constexpr uint8_t BIT_MASK_FF = 0xFF;
static constexpr uint16_t BIT16_MASK = 0x3FF;

// Masks that keep the low 4, 5, 6, 10 and 16 bits.
constexpr size_t MASK_4BIT = 0x0f;
constexpr size_t MASK_5BIT = 0x1f;
constexpr size_t MASK_6BIT = 0x3f;
constexpr size_t MASK_10BIT = 0x03ff;
constexpr size_t MASK_16BIT = 0xffff;
80
// Largest code point that fits in a single UTF-8 byte.
static constexpr uint8_t UTF8_1B_MAX = 0x7f;

// Lengths, in bytes, of the four UTF-8 sequence forms.
static constexpr size_t UTF8_SINGLE_BYTE_LENGTH = 1;
static constexpr size_t UTF8_DOUBLE_BYTE_LENGTH = 2;
static constexpr size_t UTF8_TRIPLE_BYTE_LENGTH = 3;
static constexpr size_t UTF8_QUAD_BYTE_LENGTH = 4;
static constexpr uint8_t UTF8_NUL = 0x00U;

// Two-byte sequences.
static constexpr uint16_t UTF8_2B_MAX = 0x7ff;
static constexpr uint8_t UTF8_2B_FIRST = 0xc0;
static constexpr uint8_t UTF8_2B_SECOND = 0x80;
static constexpr uint8_t UTF8_2B_THIRD = 0x3f;
static constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2; // the minimum for 2 bytes is 128, which is 0xc280

// Three-byte sequences.
static constexpr uint16_t UTF8_3B_MAX = 0xffff;
static constexpr uint8_t UTF8_3B_FIRST = 0xe0;
static constexpr uint8_t UTF8_3B_SECOND = 0x80;
static constexpr uint8_t UTF8_3B_THIRD = 0x80;
static constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0; // the minimum for 3 bytes is 2048, which is 0xe0a080
// U+D800~U+DFFF is reserved for UTF-16 surrogate pairs; those code points would encode as 0xED 0xA0~0xBF 0x80~0xBF.
static constexpr uint8_t UTF8_3B_RESERVED_FIRST = 0xED;
static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MIN = 0xA0;
static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MAX = 0xBF;

// Four-byte sequences.
static constexpr uint8_t UTF8_4B_FIRST = 0xf0;
static constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90; // the minimum for 4 bytes is 65536, which is 0xf0908080
// The maximum code point is 1114111 (0x10FFFF), which is 0xf48fbfbf: the lead byte cannot exceed 0xF4, and after a
// 0xF4 lead byte the second byte cannot exceed 0x8F.
static constexpr uint8_t UTF8_4B_FIRST_MAX = 0xF4;
static constexpr uint8_t UTF8_4B_SECOND_MAX = 0x8F;

static constexpr uint8_t byteMask = 0xbf;
static constexpr uint8_t byteMark = 0x80;

// Largest value representable in Latin-1 (one byte).
static constexpr uint8_t latin1Limit = 0xFF;

static constexpr int32_t INVALID_UTF8 = -1;
114
// Named byte counts 1~4. Unscoped on purpose: the enumerators convert implicitly to integers.
enum UtfLength : uint8_t {
    ONE = 1,
    TWO = 2,
    THREE = 3,
    FOUR = 4
};

// Named bit offsets 6, 10, 12 and 18. Unscoped on purpose: the enumerators are usable directly as shift counts
// (e.g. `value <<= UtfOffset::TEN`).
enum UtfOffset : uint8_t {
    SIX = 6,
    TEN = 10,
    TWELVE = 12,
    EIGHTEEN = 18
};
117
// Capacity of Utf8Char::ch, in bytes.
static constexpr size_t MAX_BYTES = 4;

// Fixed-capacity holder for the bytes of one encoded character.
struct Utf8Char {
    size_t n;                          // presumably the number of bytes of `ch` that are in use — TODO confirm
    std::array<uint8_t, MAX_BYTES> ch; // byte storage, MAX_BYTES (4) entries
};
123
// Lead-byte marker table. Entries 2, 3 and 4 (0xC0, 0xE0, 0xF0) equal UTF8_2B_FIRST, UTF8_3B_FIRST and UTF8_4B_FIRST;
// 0xF8 and 0xFC are the lead-byte marks of the obsolete 5- and 6-byte forms.
// NOTE(review): presumably indexed by the encoded length in bytes — confirm at the use sites.
static constexpr unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
125
IsUTF16Surrogate(uint16_t ch)126 inline bool IsUTF16Surrogate(uint16_t ch)
127 {
128 return DECODE_LEAD_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
129 }
130
IsUTF16HighSurrogate(uint16_t ch)131 inline bool IsUTF16HighSurrogate(uint16_t ch)
132 {
133 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
134 }
135
IsUTF16LowSurrogate(uint16_t ch)136 inline bool IsUTF16LowSurrogate(uint16_t ch)
137 {
138 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
139 }
140
141 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false);
142
143 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t index, size_t size);
144
145 uint32_t UTF16Decode(uint16_t lead, uint16_t trail);
146
147 bool IsValidUTF8(const std::vector<uint8_t> &data);
148
149 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false);
150
151 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true,
152 bool isGetBufferSize = false, bool cesu8 = false);
153
154 size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len,
155 size_t utf8Len, size_t start, bool modify = true,
156 bool isWriteBuffer = false, bool cesu = false);
157
158 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
159 size_t start, bool modify = true, bool isWriteBuffer = false);
160
161 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index);
162
163 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);
164
165 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);
166
167 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len);
168
169 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len);
170
CombineTwoU16(uint16_t d0,uint16_t d1)171 static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
172 {
173 uint32_t codePoint = d0 - HI_SURROGATE_MIN;
174 codePoint <<= UtfOffset::TEN;
175 codePoint |= d1 - LO_SURROGATE_MIN;
176 codePoint += LO_SUPPLEMENTS_MIN;
177 return codePoint;
178 }
179
180 std::pair<int32_t, size_t> PUBLIC_API ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen);
181
// Returns true when ch is an ASCII hex digit: '0'~'9', 'A'~'F' or 'a'~'f'.
static inline constexpr bool IsHexDigits(uint16_t ch)
{
    const bool isDecimal = (ch >= '0' && ch <= '9');
    const bool isUpperHex = (ch >= 'A' && ch <= 'F');
    const bool isLowerHex = (ch >= 'a' && ch <= 'f');
    return isDecimal || isUpperHex || isLowerHex;
}
186
187 // Convert a char representing hex digit to the value it represents
HexChar16Value(uint16_t ch)188 static inline size_t HexChar16Value(uint16_t ch)
189 {
190 // Converting table:
191 // +------+-------+-------+
192 // | Char | ASCII | Value |
193 // +------+-------+-------+
194 // | '0' | 0x30 | 0 |
195 // | ... |
196 // | '9' | 0x39 | 9 |
197 // | 'A' | 0x41 | 10 |
198 // | ... |
199 // | 'F' | 0x46 | 15 |
200 // | 'a' | 0x61 | 10 |
201 // | ... |
202 // | 'f' | 0x66 | 15 |
203 // +------+-------+-------+
204 DCHECK_CC(IsHexDigits(ch));
205 size_t res = ch - '0'; // res in [0x0, 0x9], [0x11, 0x16], [0x31, 0x36]
206
207 if (res > 9) { // 9: res in [0x11, 0x16], [0x31, 0x36], which means ch in ['A', 'F'], ['a', 'f']
208 res |= 0x20; // 0x20: [0x11, 0x16] -> [0x31, 0x36], converting ['A' - '0', 'F' - '0'] to ['a' - '0', 'f' - '0']
209 // res = [0x31, 0x36]
210 res -= ('a' - '0'); // res = [0x0, 0x5]
211 res += 10; // 10: res = [10, 15], successfully converts ['A', 'F'] and ['a', 'f'] to [10, 15]
212 }
213
214 return res;
215 }
216
// Convert a hex value to the (upper-case) char representing it.
// val is expected to be in [0x0, 0xF]. No range check is performed: a larger val yields a character past 'F'.
static inline constexpr uint16_t GetHexChar16(uint8_t val)
{
    // Converting table:
    // +-------+-------+------+
    // | Value | ASCII | Char |
    // +-------+-------+------+
    // |   0   | 0x30  | '0'  |
    // | ...                  |
    // |   9   | 0x39  | '9'  |
    // |  10   | 0x41  | 'A'  |
    // | ...                  |
    // |  15   | 0x46  | 'F'  |
    // +-------+-------+------+
    if (val < 10) { // 10: val in [0x0, 0x9], convert to ['0', '9']
        return static_cast<uint16_t>(val + '0');
    }
    return static_cast<uint16_t>(val - 0xA + 'A'); // 0xA: val in [0xA, 0xF], convert to ['A', 'F']
}
236
GetValueFromTwoHex(uint8_t front,uint8_t behind)237 static inline uint8_t GetValueFromTwoHex(uint8_t front, uint8_t behind)
238 {
239 size_t high = HexChar16Value(front);
240 size_t low = HexChar16Value(behind);
241 uint8_t res = ((high << 4U) | low) & common::utf_helper::BIT_MASK_FF; // NOLINT 4: means shift left by 4 digits
242 return res;
243 }
244 } // namespace common::utf_helper
245
246 #endif // ECMASCRIPT_BASE_UTF_HELPER_H
247