1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "ecmascript/base/utf_helper.h"
17
18 #include "ecmascript/log_wrapper.h"
19
// Bias removed when a surrogate pair is folded into one value: it cancels the 0xd800 lead base
// (shifted left by 10) and the 0xdc00 trail base, and leaves the 0x10000 offset in the result.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;
// Folds a lead/trail pair into a single value: (lead << 10) + trail - U16_SURROGATE_OFFSET.
// Both arguments are converted to int32_t before the arithmetic; they are not range-checked here.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)
25
26 namespace panda::ecmascript::base::utf_helper {
UTF16Decode(uint16_t lead,uint16_t trail)27 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
28 {
29 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
30 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
31 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
32 return cp;
33 }
34
IsUTF16HighSurrogate(uint16_t ch)35 bool IsUTF16HighSurrogate(uint16_t ch)
36 {
37 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
38 }
39
IsUTF16LowSurrogate(uint16_t ch)40 bool IsUTF16LowSurrogate(uint16_t ch)
41 {
42 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
43 }
44
// Methods for decoding UTF-16 into Unicode code points
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index)46 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
47 {
48 uint16_t high = utf16[*index];
49 if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
50 return high;
51 }
52 uint16_t low = utf16[*index + 1];
53 if (!IsUTF16LowSurrogate(low)) {
54 return high;
55 }
56 (*index)++;
57 return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
58 }
59
UTF8Length(uint32_t codepoint)60 inline size_t UTF8Length(uint32_t codepoint)
61 {
62 if (codepoint <= UTF8_1B_MAX) {
63 return UtfLength::ONE;
64 }
65 if (codepoint <= UTF8_2B_MAX) {
66 return UtfLength::TWO;
67 }
68 if (codepoint <= UTF8_3B_MAX) {
69 return UtfLength::THREE;
70 }
71 return UtfLength::FOUR;
72 }
73
// Methods for encoding a Unicode code point to UTF-8
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t len,size_t index)75 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
76 {
77 size_t size = UTF8Length(codepoint);
78 if (index + size > len) {
79 return 0;
80 }
81 for (size_t j = size - 1; j > 0; j--) {
82 uint8_t cont = ((codepoint | byteMark) & byteMask);
83 utf8[index + j] = cont;
84 codepoint >>= UTF8_OFFSET;
85 }
86 utf8[index] = codepoint | firstByteMark[size];
87 return size;
88 }
89
IsValidUTF8(const std::vector<uint8_t> & data)90 bool IsValidUTF8(const std::vector<uint8_t> &data)
91 {
92 uint32_t length = data.size();
93 switch (length) {
94 case UtfLength::ONE:
95 if (data.at(0) >= BIT_MASK_1) {
96 return false;
97 }
98 break;
99 case UtfLength::TWO:
100 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
101 return false;
102 }
103 break;
104 case UtfLength::THREE:
105 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
106 return false;
107 }
108 break;
109 case UtfLength::FOUR:
110 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
111 return false;
112 }
113 break;
114 default:
115 LOG_ECMA(FATAL) << "this branch is unreachable";
116 UNREACHABLE();
117 break;
118 }
119
120 for (uint32_t i = 1; i < length; i++) {
121 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
122 return false;
123 }
124 }
125 return true;
126 }
127
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify,bool isWriteBuffer)128 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
129 {
130 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
131 // means that is a single code point, it needs to be represented by three UTF8 code.
132 if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
133 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
134 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
135 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
136 return {UtfLength::THREE, {ch0, ch1, ch2}};
137 }
138
139 if (d0 == 0) {
140 if (isWriteBuffer) {
141 return {1, {0x00U}};
142 }
143 if (modify) {
144 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
145 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
146 }
147 // For print string, just skip '\u0000'
148 return {0, {0x00U}};
149 }
150 if (d0 <= UTF8_1B_MAX) {
151 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
152 }
153 if (d0 <= UTF8_2B_MAX) {
154 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
155 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
156 return {UtfLength::TWO, {ch0, ch1}};
157 }
158 if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
159 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
160 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
161 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
162 return {UtfLength::THREE, {ch0, ch1, ch2}};
163 }
164 if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
165 // Bad sequence
166 LOG_ECMA(FATAL) << "this branch is unreachable";
167 UNREACHABLE();
168 }
169
170 uint32_t codePoint = CombineTwoU16(d0, d1);
171
172 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
173 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
174 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
175 auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
176 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
177 }
178
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)179 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
180 {
181 size_t res = 1; // zero byte
182 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
183 // means that is a single code point, it needs to be represented by three UTF8 code.
184 if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
185 utf16[0] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
186 res += UtfLength::THREE;
187 return res;
188 }
189
190 for (uint32_t i = 0; i < length; ++i) {
191 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192 if (modify) {
193 res += UtfLength::TWO; // special case for U+0000 => C0 80
194 }
195 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196 res += 1;
197 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198 res += UtfLength::TWO;
199 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
200 } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
201 res += UtfLength::THREE;
202 } else {
203 if (i < length - 1 &&
204 utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
205 utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
206 res += UtfLength::FOUR;
207 ++i;
208 } else {
209 res += UtfLength::THREE;
210 }
211 }
212 }
213 return res;
214 }
215
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer)216 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
217 size_t start, bool modify, bool isWriteBuffer)
218 {
219 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
220 return 0;
221 }
222 size_t utf8Pos = 0;
223 size_t end = start + utf16Len;
224 for (size_t i = start; i < end; ++i) {
225 uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
226 if (codepoint == 0) {
227 if (isWriteBuffer) {
228 utf8Out[utf8Pos++] = 0x00U;
229 continue;
230 }
231 if (modify) {
232 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
233 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
234 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
235 }
236 continue;
237 }
238 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
239 }
240 return utf8Pos;
241 }
242
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)243 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
244 {
245 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
246 if ((d0 & utf::MASK1) == 0) {
247 return {d0, 1};
248 }
249
250 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
251 if ((d0 & utf::MASK2) == 0) {
252 return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
253 }
254
255 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
256 if ((d0 & utf::MASK3) == 0) {
257 return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
258 (d2 & utf::MASK_6BIT),
259 UtfLength::THREE};
260 }
261
262 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
263 uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
264 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
265
266 uint32_t pair = 0;
267 if (combine) {
268 uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
269 uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
270 pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail)); // NOLINTNEXTLINE(hicpp-signed-bitwise)
271 } else {
272 pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
273 pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
274 }
275
276 return {pair, UtfLength::FOUR};
277 }
278
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)279 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
280 {
281 return utf::MUtf8ToUtf16Size(utf8, utf8Len);
282 }
283
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)284 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
285 size_t start)
286 {
287 return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
288 }
289
ConvertRegionUtf16ToLatin1(const uint16_t * utf16In,uint8_t * latin1Out,size_t utf16Len,size_t latin1Len)290 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
291 {
292 if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
293 return 0;
294 }
295 size_t latin1Pos = 0;
296 size_t end = utf16Len;
297 for (size_t i = 0; i < end; ++i) {
298 if (latin1Pos == latin1Len) {
299 break;
300 }
301 uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
302 uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit);
303 latin1Out[latin1Pos++] = latin1Code;
304 }
305 return latin1Pos;
306 }
307 } // namespace panda::ecmascript::base::utf_helper
308