• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "ecmascript/base/utf_helper.h"
17 
18 #include "ecmascript/log_wrapper.h"
19 
// Constant folded out of the surrogate-pair formula: (0xd800 << 10) + 0xdc00 strips the lead/trail
// base values, and 0x10000 re-adds the start of the supplementary range.
static constexpr int32_t U16_SURROGATE_OFFSET = ((0xd800 << 10UL) + 0xdc00) - 0x10000;
// Folds a lead/trail surrogate pair into the code point it encodes.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail)) - U16_SURROGATE_OFFSET)
25 
26 namespace panda::ecmascript::base::utf_helper {
UTF16Decode(uint16_t lead,uint16_t trail)27 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
28 {
29     ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
30            (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
31     uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
32     return cp;
33 }
34 
IsUTF16HighSurrogate(uint16_t ch)35 bool IsUTF16HighSurrogate(uint16_t ch)
36 {
37     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
38 }
39 
IsUTF16LowSurrogate(uint16_t ch)40 bool IsUTF16LowSurrogate(uint16_t ch)
41 {
42     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
43 }
44 
45 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index)46 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
47 {
48     uint16_t high = utf16[*index];
49     if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
50         return high;
51     }
52     uint16_t low = utf16[*index + 1];
53     if (!IsUTF16LowSurrogate(low)) {
54         return high;
55     }
56     (*index)++;
57     return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
58 }
59 
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)60 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
61 {
62     uint16_t first = utf16[*index];
63     // A valid surrogate pair should always start with a High Surrogate
64     if (IsUTF16LowSurrogate(first)) {
65         return UTF16_REPLACEMENT_CHARACTER;
66     }
67     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
68         if (*index == len - 1) {
69             // A High surrogate not paired with another surrogate
70             return UTF16_REPLACEMENT_CHARACTER;
71         }
72         uint16_t second = utf16[*index + 1];
73         if (!IsUTF16LowSurrogate(second)) {
74             // A High surrogate not followed by a low surrogate
75             return UTF16_REPLACEMENT_CHARACTER;
76         }
77         // A valid surrogate pair, decode normally
78         (*index)++;
79         return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
80     }
81     // A unicode not fallen into the range of representing by surrogate pair, return as it is
82     return first;
83 }
84 
UTF8Length(uint32_t codepoint)85 inline size_t UTF8Length(uint32_t codepoint)
86 {
87     if (codepoint <= UTF8_1B_MAX) {
88         return UtfLength::ONE;
89     }
90     if (codepoint <= UTF8_2B_MAX) {
91         return UtfLength::TWO;
92     }
93     if (codepoint <= UTF8_3B_MAX) {
94         return UtfLength::THREE;
95     }
96     return UtfLength::FOUR;
97 }
98 
99 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t len,size_t index)100 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
101 {
102     size_t size = UTF8Length(codepoint);
103     if (index + size > len) {
104         return 0;
105     }
106     for (size_t j = size - 1; j > 0; j--) {
107         uint8_t cont = ((codepoint | byteMark) & byteMask);
108         utf8[index + j] = cont;
109         codepoint >>= UTF8_OFFSET;
110     }
111     utf8[index] = codepoint | firstByteMark[size];
112     return size;
113 }
114 
IsValidUTF8(const std::vector<uint8_t> & data)115 bool IsValidUTF8(const std::vector<uint8_t> &data)
116 {
117     uint32_t length = data.size();
118     switch (length) {
119         case UtfLength::ONE:
120             if (data.at(0) >= BIT_MASK_1) {
121                 return false;
122             }
123             break;
124         case UtfLength::TWO:
125             if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
126                 return false;
127             }
128             if (data.at(0) < UTF8_2B_FIRST_MIN) {
129                 return false;
130             }
131             break;
132         case UtfLength::THREE:
133             if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
134                 return false;
135             }
136             if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) {
137                 return false;
138             }
139             break;
140         case UtfLength::FOUR:
141             if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
142                 return false;
143             }
144             if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) {
145                 return false;
146             }
147             break;
148         default:
149             LOG_ECMA(FATAL) << "this branch is unreachable";
150             UNREACHABLE();
151             break;
152     }
153 
154     for (uint32_t i = 1; i < length; i++) {
155         if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
156             return false;
157         }
158     }
159     return true;
160 }
161 
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify,bool isWriteBuffer)162 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
163 {
164     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
165     // means that is a single code point, it needs to be represented by three UTF8 code.
166     if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
167         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
168         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
169         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
170         return {UtfLength::THREE, {ch0, ch1, ch2}};
171     }
172 
173     if (d0 == 0) {
174         if (isWriteBuffer) {
175             return {1, {0x00U}};
176         }
177         if (modify) {
178             // special case for \u0000 ==> C080 - 1100'0000 1000'0000
179             return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
180         }
181         // For print string, just skip '\u0000'
182         return {0, {0x00U}};
183     }
184     if (d0 <= UTF8_1B_MAX) {
185         return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
186     }
187     if (d0 <= UTF8_2B_MAX) {
188         auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
189         auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
190         return {UtfLength::TWO, {ch0, ch1}};
191     }
192     if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
193         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
194         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
195         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
196         return {UtfLength::THREE, {ch0, ch1, ch2}};
197     }
198     if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
199         // Bad sequence
200         LOG_ECMA(FATAL) << "this branch is unreachable";
201         UNREACHABLE();
202     }
203 
204     uint32_t codePoint = CombineTwoU16(d0, d1);
205 
206     auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
207     auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
208     auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
209     auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
210     return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
211 }
212 
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)213 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
214 {
215     size_t res = 1;  // zero byte
216     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
217     // means that is a single code point, it needs to be represented by three UTF8 code.
218     if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
219         utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
220         res += UtfLength::THREE;
221         return res;
222     }
223 
224     for (uint32_t i = 0; i < length; ++i) {
225         if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
226             if (modify) {
227                 res += UtfLength::TWO;  // special case for U+0000 => C0 80
228             }
229         } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
230             res += 1;
231         } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
232             res += UtfLength::TWO;
233             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
234         } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
235             res += UtfLength::THREE;
236         } else {
237             if (i < length - 1 &&
238                 utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239                 utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240                 res += UtfLength::FOUR;
241                 ++i;
242             } else {
243                 res += UtfLength::THREE;
244             }
245         }
246     }
247     return res;
248 }
249 
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer)250 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
251                                 size_t start, bool modify, bool isWriteBuffer)
252 {
253     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
254         return 0;
255     }
256     size_t utf8Pos = 0;
257     size_t end = start + utf16Len;
258     for (size_t i = start; i < end; ++i) {
259         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
260         if (codepoint == 0) {
261             if (isWriteBuffer) {
262                 utf8Out[utf8Pos++] = 0x00U;
263                 continue;
264             }
265             if (modify) {
266                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
267                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
268                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
269             }
270             continue;
271         }
272         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
273     }
274     return utf8Pos;
275 }
276 
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer)277 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
278                                         size_t start, bool modify, bool isWriteBuffer)
279 {
280     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
281         return 0;
282     }
283     size_t utf8Pos = 0;
284     size_t end = start + utf16Len;
285     for (size_t i = start; i < end; ++i) {
286         uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
287         if (codepoint == 0) {
288             if (isWriteBuffer) {
289                 utf8Out[utf8Pos++] = 0x00U;
290                 continue;
291             }
292             if (modify) {
293                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
294                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
295                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
296             }
297             continue;
298         }
299         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
300     }
301     return utf8Pos;
302 }
303 
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)304 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
305 {
306     uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
307     if ((d0 & utf::MASK1) == 0) {
308         return {d0, 1};
309     }
310 
311     uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
312     if ((d0 & utf::MASK2) == 0) {
313         return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
314     }
315 
316     uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
317     if ((d0 & utf::MASK3) == 0) {
318         return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
319                     (d2 & utf::MASK_6BIT),
320                 UtfLength::THREE};
321     }
322 
323     uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
324     uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
325                          ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
326 
327     uint32_t pair = 0;
328     if (combine) {
329         uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
330         uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
331         pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
332     } else {
333         pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
334         pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
335     }
336 
337     return {pair, UtfLength::FOUR};
338 }
339 
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)340 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
341 {
342     return utf::MUtf8ToUtf16Size(utf8, utf8Len);
343 }
344 
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)345 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
346                                 size_t start)
347 {
348     return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
349 }
350 
ConvertRegionUtf16ToLatin1(const uint16_t * utf16In,uint8_t * latin1Out,size_t utf16Len,size_t latin1Len)351 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
352 {
353     if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
354         return 0;
355     }
356     size_t latin1Pos = 0;
357     size_t end = utf16Len;
358     for (size_t i = 0; i < end; ++i) {
359         if (latin1Pos == latin1Len) {
360             break;
361         }
362         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
363         uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit);
364         latin1Out[latin1Pos++] = latin1Code;
365     }
366     return latin1Pos;
367 }
368 
ConvertUtf8ToUnicodeChar(const uint8_t * utf8,size_t maxLen)369 std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)
370 {
371     if (maxLen == 0) {
372         return {INVALID_UTF8, 0};
373     }
374     Span<const uint8_t> sp(utf8, maxLen);
375     // one byte
376     uint8_t d0 = sp[0];
377     if ((d0 & BIT_MASK_1) == 0) {
378         return {d0, UtfLength::ONE};
379     }
380     if (maxLen < UtfLength::TWO) {
381         return {INVALID_UTF8, 0};
382     }
383     // two bytes
384     uint8_t d1 = sp[UtfLength::ONE];
385     if ((d0 & BIT_MASK_3) == BIT_MASK_2) {
386         if ((d1 & BIT_MASK_2) == BIT_MASK_1) {
387             return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
388         } else {
389             return {INVALID_UTF8, 0};
390         }
391     }
392     if (maxLen < UtfLength::THREE) {
393         return {INVALID_UTF8, 0};
394     }
395     // three bytes
396     uint8_t d2 = sp[UtfLength::TWO];
397     if ((d0 & BIT_MASK_4) == BIT_MASK_3) {
398         if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) {
399             return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) |
400                 ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE};
401         } else {
402             return {INVALID_UTF8, 0};
403         }
404     }
405     if (maxLen < UtfLength::FOUR) {
406         return {INVALID_UTF8, 0};
407     }
408     // four bytes
409     uint8_t d3 = sp[UtfLength::THREE];
410     if ((d0 & BIT_MASK_5) == BIT_MASK_4) {
411         if (((d1 & BIT_MASK_2) == BIT_MASK_1) &&
412             ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) {
413             return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
414                 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR};
415         } else {
416             return {INVALID_UTF8, 0};
417         }
418     }
419     return {INVALID_UTF8, 0};
420 }
421 }  // namespace panda::ecmascript::base::utf_helper
422