/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef ECMASCRIPT_BASE_UTF_HELPER_H
17 #define ECMASCRIPT_BASE_UTF_HELPER_H
18 
#include <array>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

#include "common_interfaces/base/common.h"
24 
25 namespace common::utf_helper {
// UTF-16 surrogate code-unit ranges: high (lead) units occupy 0xd800..0xdbff,
// low (trail) units occupy 0xdc00..0xdfff.
constexpr size_t HI_SURROGATE_MIN = 0xd800;
constexpr size_t HI_SURROGATE_MAX = 0xdbff;
constexpr size_t LO_SURROGATE_MIN = 0xdc00;
constexpr size_t LO_SURROGATE_MAX = 0xdfff;

// First code point of the supplementary planes; CombineTwoU16 adds it after merging a pair.
constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;
32 
// Small numeric literals given names.
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;
static constexpr size_t CONST_4 = 4;
// Single-bit masks: bit 7, bit 5 and bit 4 respectively.
static constexpr size_t MASK1 = 0x80;
static constexpr size_t MASK2 = 0x20;
static constexpr size_t MASK3 = 0x10;
// Masks selecting the lowest 3, 4, 5 and 6 bits of a value.
static constexpr size_t LOW_3BITS = 0x7;
static constexpr size_t LOW_4BITS = 0xF;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;
// Same values as LO_SURROGATE_MIN / HI_SURROGATE_MIN / LO_SUPPLEMENTS_MIN.
static constexpr size_t L_SURROGATE_START = 0xDC00;
static constexpr size_t H_SURROGATE_START = 0xD800;
static constexpr size_t SURROGATE_RAIR_START = 0x10000;  // "RAIR" (sic): spelling kept so existing users still compile
// Shift distances in bits.
static constexpr size_t OFFSET_18POS = 18;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_6POS = 6;
// Bounds used by IsUTF16Surrogate / IsUTF16HighSurrogate / IsUTF16LowSurrogate.
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
// 0x400 == 1 << 10 and 0x10000 == 1 << 16.
// NOTE(review): presumably the multiplier and base used when decoding a surrogate pair; confirm in UTF16Decode.
static constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
static constexpr uint32_t UTF8_OFFSET = 6;
static constexpr uint32_t UTF16_OFFSET = 10;
// 0xF800 keeps the top five bits of a 16-bit unit; every value in 0xD800..0xDFFF masks to 0xD800.
static constexpr uint16_t SURROGATE_MASK = 0xF800;
// U+FFFD, the Unicode replacement character.
static constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD;
60 
// Bit widths: 6 and 16.
// NOTE(review): presumably the payload bits of a UTF-8 continuation byte and the width of one UTF-16 unit; confirm.
constexpr size_t DATA_WIDTH = 6;
constexpr size_t PAIR_ELEMENT_WIDTH = 16;

// 0xd7c0 == 0xd800 - (0x10000 >> 10).
// NOTE(review): presumably lets a lead surrogate be formed as U16_LEAD + (codePoint >> 10); confirm at use sites.
constexpr size_t U16_LEAD = 0xd7c0;
constexpr size_t U16_TAIL = 0xdc00;
66 
// Masks with the top 1, 2, 3, 4 and 5 bits of a byte set, followed by the all-ones byte.
static constexpr uint8_t BIT_MASK_1 = 0x80;
static constexpr uint8_t BIT_MASK_2 = 0xC0;
static constexpr uint8_t BIT_MASK_3 = 0xE0;
static constexpr uint8_t BIT_MASK_4 = 0xF0;
static constexpr uint8_t BIT_MASK_5 = 0xF8;
static constexpr uint8_t BIT_MASK_FF = 0xFF;
// Lowest ten bits of a 16-bit unit.
static constexpr uint16_t BIT16_MASK = 0x3FF;

// Masks selecting the lowest 4, 5, 6, 10 and 16 bits.
constexpr size_t MASK_4BIT = 0x0f;
constexpr size_t MASK_5BIT = 0x1f;
constexpr size_t MASK_6BIT = 0x3f;
constexpr size_t MASK_10BIT = 0x03ff;
constexpr size_t MASK_16BIT = 0xffff;
80 
// Largest value that fits in a one-byte UTF-8 sequence.
static constexpr uint8_t UTF8_1B_MAX = 0x7f;

// Encoded lengths, in bytes, of the four UTF-8 sequence forms.
static constexpr size_t UTF8_SINGLE_BYTE_LENGTH = 1;
static constexpr size_t UTF8_DOUBLE_BYTE_LENGTH = 2;
static constexpr size_t UTF8_TRIPLE_BYTE_LENGTH = 3;
static constexpr size_t UTF8_QUAD_BYTE_LENGTH = 4;
static constexpr uint8_t UTF8_NUL = 0x00U;
// Two-byte sequences: largest code point and byte patterns.
static constexpr uint16_t UTF8_2B_MAX = 0x7ff;
static constexpr uint8_t UTF8_2B_FIRST = 0xc0;
static constexpr uint8_t UTF8_2B_SECOND = 0x80;
static constexpr uint8_t UTF8_2B_THIRD = 0x3f;
static constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2;  // the minimum for 2 bytes is 128, which is 0xc280

// Three-byte sequences: largest code point and byte patterns.
static constexpr uint16_t UTF8_3B_MAX = 0xffff;
static constexpr uint8_t UTF8_3B_FIRST = 0xe0;
static constexpr uint8_t UTF8_3B_SECOND = 0x80;
static constexpr uint8_t UTF8_3B_THIRD = 0x80;
static constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0;  // the minimum for 3 bytes is 2048, which is 0xe0a080
static constexpr uint8_t UTF8_3B_RESERVED_FIRST = 0xED;
static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MIN = 0xA0;
static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MAX = 0xBF; // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs

// Four-byte sequences: lead-byte pattern and the bounds on the first two bytes.
static constexpr uint8_t UTF8_4B_FIRST = 0xf0;
static constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90;  // the minimum for 4 bytes is 65536, which is 0xf0908080
static constexpr uint8_t UTF8_4B_FIRST_MAX = 0xF4; // the maximum for 4 bytes is 1114111, which is 0x10FFFF
static constexpr uint8_t UTF8_4B_SECOND_MAX = 0x8F;
107 
// 0xbf is the largest and 0x80 the smallest byte of the form 10xxxxxx.
// NOTE(review): presumably used as (x | byteMark) & byteMask when emitting UTF-8 continuation bytes; confirm.
static constexpr uint8_t byteMask = 0xbf;
static constexpr uint8_t byteMark = 0x80;

// Largest code unit representable in Latin-1.
static constexpr uint8_t latin1Limit = 0xFF;

// Negative sentinel.
// NOTE(review): presumably what ConvertUtf8ToUnicodeChar reports for malformed input; confirm.
static constexpr int32_t INVALID_UTF8 = -1;
114 
// Byte counts 1..4 as named values. Kept unscoped so the enumerators convert implicitly to integers.
enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 };
// Shift distances in bits. Kept unscoped because CombineTwoU16 uses UtfOffset::TEN directly as a shift count.
enum UtfOffset : uint8_t { SIX = 6, TEN = 10, TWELVE = 12, EIGHTEEN = 18 };
117 
// Capacity of Utf8Char::ch.
static constexpr size_t MAX_BYTES = 4;
// A short byte sequence held in a fixed-size buffer; this is the return type of ConvertUtf16ToUtf8.
struct Utf8Char {
    size_t n;                           // NOTE(review): presumably the number of valid bytes in ch; confirm
    std::array<uint8_t, MAX_BYTES> ch;  // byte storage, MAX_BYTES entries
};
123 
// Seven-entry byte table: 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC.
// NOTE(review): presumably indexed by encoded sequence length to get the UTF-8 lead-byte prefix; confirm.
static constexpr unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
125 
IsUTF16Surrogate(uint16_t ch)126 inline bool IsUTF16Surrogate(uint16_t ch)
127 {
128     return DECODE_LEAD_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
129 }
130 
IsUTF16HighSurrogate(uint16_t ch)131 inline bool IsUTF16HighSurrogate(uint16_t ch)
132 {
133     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
134 }
135 
IsUTF16LowSurrogate(uint16_t ch)136 inline bool IsUTF16LowSurrogate(uint16_t ch)
137 {
138     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
139 }
140 
141 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false);
142 
143 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t index, size_t size);
144 
145 uint32_t UTF16Decode(uint16_t lead, uint16_t trail);
146 
147 bool IsValidUTF8(const std::vector<uint8_t> &data);
148 
149 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false);
150 
151 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true,
152                        bool isGetBufferSize = false, bool cesu8 = false);
153 
154 size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len,
155                                            size_t utf8Len, size_t start, bool modify = true,
156                                            bool isWriteBuffer = false, bool cesu = false);
157 
158 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
159                                         size_t start, bool modify = true, bool isWriteBuffer = false);
160 
161 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index);
162 
163 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);
164 
165 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);
166 
167 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len);
168 
169 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len);
170 
CombineTwoU16(uint16_t d0,uint16_t d1)171 static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
172 {
173     uint32_t codePoint = d0 - HI_SURROGATE_MIN;
174     codePoint <<= UtfOffset::TEN;
175     codePoint |= d1 - LO_SURROGATE_MIN;
176     codePoint += LO_SUPPLEMENTS_MIN;
177     return codePoint;
178 }
179 
180 std::pair<int32_t, size_t> PUBLIC_API ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen);
181 
// Returns true when ch is an ASCII hexadecimal digit: '0'-'9', 'A'-'F' or 'a'-'f'.
// The full 16-bit value is compared, so units above 0x7F are never accepted.
static inline bool IsHexDigits(uint16_t ch)
{
    return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
}
186 
187 // Convert a char representing hex digit to the value it represents
HexChar16Value(uint16_t ch)188 static inline size_t HexChar16Value(uint16_t ch)
189 {
190     // Converting table:
191     // +------+-------+-------+
192     // | Char | ASCII | Value |
193     // +------+-------+-------+
194     // | '0'  | 0x30  |   0   |
195     // |        ...           |
196     // | '9'  | 0x39  |   9   |
197     // | 'A'  | 0x41  |  10   |
198     // |        ...           |
199     // | 'F'  | 0x46  |  15   |
200     // | 'a'  | 0x61  |  10   |
201     // |        ...           |
202     // | 'f'  | 0x66  |  15   |
203     // +------+-------+-------+
204     DCHECK_CC(IsHexDigits(ch));
205     size_t res = ch - '0'; // res in [0x0, 0x9], [0x11, 0x16], [0x31, 0x36]
206 
207     if (res > 9) { // 9: res in [0x11, 0x16], [0x31, 0x36], which means ch in ['A', 'F'], ['a', 'f']
208         res |= 0x20; // 0x20: [0x11, 0x16] -> [0x31, 0x36], converting ['A' - '0', 'F' - '0'] to ['a' - '0', 'f' - '0']
209         // res = [0x31, 0x36]
210         res -= ('a' - '0'); // res = [0x0, 0x5]
211         res += 10; // 10: res = [10, 15], successfully converts ['A', 'F'] and ['a', 'f'] to [10, 15]
212     }
213 
214     return res;
215 }
216 
// Convert a hex value to the char representing it (upper-case for 10..15).
// val is not range-checked: values above 0xF yield the characters that follow 'F'.
static inline uint16_t GetHexChar16(uint8_t val)
{
    // Converting table:
    // +-------+-------+------+
    // | Value | ASCII | Char |
    // +-------+-------+------+
    // |   0   | 0x30  | '0'  |
    // |        ...           |
    // |   9   | 0x39  | '9'  |
    // |  10   | 0x41  | 'A'  |
    // |        ...           |
    // |  15   | 0x46  | 'F'  |
    // +-------+-------+------+
    if (val < 10) { // 10: val in [0x0, 0x9], convert to ['0', '9']
        return static_cast<uint16_t>(val + '0');
    }
    return static_cast<uint16_t>(val - 0xA + 'A'); // 0xA: val in [0xA, 0xF], convert to ['A', 'F']
}
236 
GetValueFromTwoHex(uint8_t front,uint8_t behind)237 static inline uint8_t GetValueFromTwoHex(uint8_t front, uint8_t behind)
238 {
239     size_t high = HexChar16Value(front);
240     size_t low = HexChar16Value(behind);
241     uint8_t res = ((high << 4U) | low) & common::utf_helper::BIT_MASK_FF;  // NOLINT 4: means shift left by 4 digits
242     return res;
243 }
244 }  // namespace common::utf_helper
245 
246 #endif  // ECMASCRIPT_BASE_UTF_HELPER_H
247