• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "ecmascript/base/utf_helper.h"
17 
18 #include "ecmascript/log_wrapper.h"
19 
// Constant bias subtracted when a UTF-16 lead/trail pair is folded into a code point:
// (0xd800 << 10) + 0xdc00 - 0x10000.
static constexpr int32_t U16_SURROGATE_OFFSET = 0xdc00 - 0x10000 + (0xd800 << 10UL);
// Folds a lead unit and a trail unit into the supplementary code point they encode.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (static_cast<int32_t>(trail) + (static_cast<int32_t>(lead) << 10UL) - U16_SURROGATE_OFFSET)
25 
26 namespace panda::ecmascript::base::utf_helper {
27 
UTF16Decode(uint16_t lead,uint16_t trail)28 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
29 {
30     ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
31            (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
32     uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
33     return cp;
34 }
35 
IsUTF16HighSurrogate(uint16_t ch)36 bool IsUTF16HighSurrogate(uint16_t ch)
37 {
38     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
39 }
40 
IsUTF16LowSurrogate(uint16_t ch)41 bool IsUTF16LowSurrogate(uint16_t ch)
42 {
43     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
44 }
45 
46 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index,bool cesu8)47 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)
48 {
49     uint16_t high = utf16[*index];
50     if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
51         return high;
52     }
53     uint16_t low = utf16[*index + 1];
54     if (!IsUTF16LowSurrogate(low) || cesu8) {
55         return high;
56     }
57     (*index)++;
58     return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
59 }
60 
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)61 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
62 {
63     uint16_t first = utf16[*index];
64     // A valid surrogate pair should always start with a High Surrogate
65     if (IsUTF16LowSurrogate(first)) {
66         return UTF16_REPLACEMENT_CHARACTER;
67     }
68     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
69         if (*index == len - 1) {
70             // A High surrogate not paired with another surrogate
71             return UTF16_REPLACEMENT_CHARACTER;
72         }
73         uint16_t second = utf16[*index + 1];
74         if (!IsUTF16LowSurrogate(second)) {
75             // A High surrogate not followed by a low surrogate
76             return UTF16_REPLACEMENT_CHARACTER;
77         }
78         // A valid surrogate pair, decode normally
79         (*index)++;
80         return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
81     }
82     // A unicode not fallen into the range of representing by surrogate pair, return as it is
83     return first;
84 }
85 
UTF8Length(uint32_t codepoint)86 inline size_t UTF8Length(uint32_t codepoint)
87 {
88     if (codepoint <= UTF8_1B_MAX) {
89         return UtfLength::ONE;
90     }
91     if (codepoint <= UTF8_2B_MAX) {
92         return UtfLength::TWO;
93     }
94     if (codepoint <= UTF8_3B_MAX) {
95         return UtfLength::THREE;
96     }
97     return UtfLength::FOUR;
98 }
99 
100 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t index,size_t size)101 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t index, size_t size)
102 {
103     for (size_t j = size - 1; j > 0; j--) {
104         uint8_t cont = ((codepoint | byteMark) & byteMask);
105         utf8[index + j] = cont;
106         codepoint >>= UTF8_OFFSET;
107     }
108     utf8[index] = codepoint | firstByteMark[size];
109     return size;
110 }
111 
IsValidUTF8(const std::vector<uint8_t> & data)112 bool IsValidUTF8(const std::vector<uint8_t> &data)
113 {
114     uint32_t length = data.size();
115     switch (length) {
116         case UtfLength::ONE:
117             if (data.at(0) >= BIT_MASK_1) {
118                 return false;
119             }
120             break;
121         case UtfLength::TWO:
122             if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
123                 return false;
124             }
125             if (data.at(0) < UTF8_2B_FIRST_MIN) {
126                 return false;
127             }
128             break;
129         case UtfLength::THREE:
130             if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
131                 return false;
132             }
133             if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) {
134                 return false;
135             }
136             // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs, corresponds to %ED%A0%80~%ED%BF%BF
137             if (data.at(0) == UTF8_3B_RESERVED_FIRST && data.at(1) >= UTF8_3B_RESERVED_SECOND_MIN &&
138                 data.at(1) <= UTF8_3B_RESERVED_SECOND_MAX) {
139                 return false;
140             }
141             break;
142         case UtfLength::FOUR:
143             if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
144                 return false;
145             }
146             if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) {
147                 return false;
148             }
149             // max four length binary: 11110(100) 10(001111) 10(111111) 10(111111), max data[0] is 0xF4, data[1] is 0x8F
150             if (data.at(0) > UTF8_4B_FIRST_MAX ||
151                (data.at(0) == UTF8_4B_FIRST_MAX && data.at(1) > UTF8_4B_SECOND_MAX)) {
152                 return false;
153             }
154             break;
155         default:
156             LOG_ECMA(FATAL) << "this branch is unreachable";
157             UNREACHABLE();
158             break;
159     }
160 
161     for (uint32_t i = 1; i < length; i++) {
162         if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
163             return false;
164         }
165     }
166     return true;
167 }
168 
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify,bool isWriteBuffer)169 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
170 {
171     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
172     // means that is a single code point, it needs to be represented by three UTF8 code.
173     if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
174         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
175         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
176         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
177         return {UtfLength::THREE, {ch0, ch1, ch2}};
178     }
179 
180     if (d0 == 0) {
181         if (isWriteBuffer) {
182             return {1, {0x00U}};
183         }
184         if (modify) {
185             // special case for \u0000 ==> C080 - 1100'0000 1000'0000
186             return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
187         }
188         // For print string, just skip '\u0000'
189         return {0, {0x00U}};
190     }
191     if (d0 <= UTF8_1B_MAX) {
192         return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
193     }
194     if (d0 <= UTF8_2B_MAX) {
195         auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
196         auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
197         return {UtfLength::TWO, {ch0, ch1}};
198     }
199     if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
200         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
201         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
202         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
203         return {UtfLength::THREE, {ch0, ch1, ch2}};
204     }
205     if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
206         // Bad sequence
207         LOG_ECMA(FATAL) << "this branch is unreachable";
208         UNREACHABLE();
209     }
210 
211     uint32_t codePoint = CombineTwoU16(d0, d1);
212 
213     auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
214     auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
215     auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
216     auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
217     return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
218 }
219 
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify,bool isGetBufferSize,bool cesu8)220 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)
221 {
222     size_t res = 1;  // zero byte
223     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
224     // means that is a single code point, it needs to be represented by three UTF8 code.
225     if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
226         utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
227         res += UtfLength::THREE;
228         return res;
229     }
230 
231     for (uint32_t i = 0; i < length; ++i) {
232         if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
233             if (isGetBufferSize) {
234                 res += UtfLength::ONE;
235             } else if (modify) {
236                 res += UtfLength::TWO;  // special case for U+0000 => C0 80
237             }
238         } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239             res += 1;
240         } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
241             res += UtfLength::TWO;
242             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
243         } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
244             res += UtfLength::THREE;
245         } else {
246             if (!cesu8 && i < length - 1 &&
247                 utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
248                 utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
249                 res += UtfLength::FOUR;
250                 ++i;
251             } else {
252                 res += UtfLength::THREE;
253             }
254         }
255     }
256     return res;
257 }
258 
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer,bool cesu8)259 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
260                                 size_t start, bool modify, bool isWriteBuffer, bool cesu8)
261 {
262     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
263         return 0;
264     }
265     size_t utf8Pos = 0;
266     size_t end = start + utf16Len;
267     for (size_t i = start; i < end; ++i) {
268         uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8);
269         if (codepoint == 0) {
270             if (isWriteBuffer) {
271                 utf8Out[utf8Pos++] = 0x00U;
272                 continue;
273             }
274             if (modify) {
275                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
276                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
277                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
278             }
279             continue;
280         }
281         size_t size = UTF8Length(codepoint);
282         if (utf8Pos + size > utf8Len) {
283             break;
284         }
285         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Pos, size);
286     }
287     return utf8Pos;
288 }
289 
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer)290 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
291                                         size_t start, bool modify, bool isWriteBuffer)
292 {
293     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
294         return 0;
295     }
296     size_t utf8Pos = 0;
297     size_t end = start + utf16Len;
298     for (size_t i = start; i < end; ++i) {
299         uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
300         if (codepoint == 0) {
301             if (isWriteBuffer) {
302                 utf8Out[utf8Pos++] = 0x00U;
303                 continue;
304             }
305             if (modify) {
306                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
307                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
308                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
309             }
310             continue;
311         }
312         size_t size = UTF8Length(codepoint);
313         if (utf8Pos + size > utf8Len) {
314             break;
315         }
316         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Pos, size);
317     }
318     return utf8Pos;
319 }
320 
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)321 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
322 {
323     uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
324     if ((d0 & utf::MASK1) == 0) {
325         return {d0, 1};
326     }
327 
328     uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
329     if ((d0 & utf::MASK2) == 0) {
330         return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
331     }
332 
333     uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334     if ((d0 & utf::MASK3) == 0) {
335         return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
336                     (d2 & utf::MASK_6BIT),
337                 UtfLength::THREE};
338     }
339 
340     uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
341     uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
342                          ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
343 
344     uint32_t pair = 0;
345     if (combine) {
346         uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
347         uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
348         pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
349     } else {
350         pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
351         pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
352     }
353 
354     return {pair, UtfLength::FOUR};
355 }
356 
357 // drop the tail bytes if the remain length can't fill the length it represents.
FixUtf8Len(const uint8_t * utf8,size_t utf8Len)358 static inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
359 {
360     size_t trimSize = 0;
361     if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
362         // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one.
363         trimSize = 1;
364     }
365     if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
366         // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two.
367         trimSize = CONST_2;
368     }
369     if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
370         // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three.
371         trimSize = CONST_3;
372     }
373     return utf8Len - trimSize;
374 }
375 
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)376 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
377 {
378     size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
379     size_t in_pos = 0;
380     size_t res = 0;
381     while (in_pos < safeUtf8Len) {
382         uint8_t src = utf8[in_pos];
383         switch (src & 0xF0) {
384             case 0xF0: {
385                 const uint8_t c2 = utf8[++in_pos];
386                 const uint8_t c3 = utf8[++in_pos];
387                 const uint8_t c4 = utf8[++in_pos];
388                 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
389                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
390                 if (codePoint >= SURROGATE_RAIR_START) {
391                     res += CONST_2;
392                 } else {
393                     res++;
394                 }
395                 in_pos++;
396                 break;
397             }
398             case 0xE0: {
399                 in_pos += CONST_3;
400                 res++;
401                 break;
402             }
403             case 0xD0:
404             case 0xC0: {
405                 in_pos += CONST_2;
406                 res++;
407                 break;
408             }
409             default:
410                 do {
411                     in_pos++;
412                     res++;
413                 } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
414                 break;
415         }
416     }
417     // The remain chars should be treated as single byte char.
418     res += utf8Len - in_pos;
419     return res;
420 }
421 
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len)422 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
423 {
424     size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
425     size_t in_pos = 0;
426     size_t out_pos = 0;
427     while (in_pos < safeUtf8Len && out_pos < utf16Len) {
428         uint8_t src = utf8In[in_pos];
429         switch (src & 0xF0) {
430             case 0xF0: {
431                 const uint8_t c2 = utf8In[++in_pos];
432                 const uint8_t c3 = utf8In[++in_pos];
433                 const uint8_t c4 = utf8In[++in_pos];
434                 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
435                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
436                 if (codePoint >= SURROGATE_RAIR_START) {
437                     ASSERT(utf16Len >= 1);
438                     if (out_pos >= utf16Len - 1) {
439                         return out_pos;
440                     }
441                     codePoint -= SURROGATE_RAIR_START;
442                     utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
443                     utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
444                 } else {
445                     utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
446                 }
447                 in_pos++;
448                 break;
449             }
450             case 0xE0: {
451                 const uint8_t c2 = utf8In[++in_pos];
452                 const uint8_t c3 = utf8In[++in_pos];
453                 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
454                     ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
455                 in_pos++;
456                 break;
457             }
458             case 0xD0:
459             case 0xC0: {
460                 const uint8_t c2 = utf8In[++in_pos];
461                 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
462                 in_pos++;
463                 break;
464             }
465             default:
466                 do {
467                     utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
468                 } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
469                 break;
470         }
471     }
472     // The remain chars should be treated as single byte char.
473     while (in_pos < utf8Len && out_pos < utf16Len) {
474         utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
475     }
476     return out_pos;
477 }
478 
ConvertRegionUtf16ToLatin1(const uint16_t * utf16In,uint8_t * latin1Out,size_t utf16Len,size_t latin1Len)479 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
480 {
481     if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
482         return 0;
483     }
484     size_t latin1Pos = 0;
485     size_t end = utf16Len;
486     for (size_t i = 0; i < end; ++i) {
487         if (latin1Pos == latin1Len) {
488             break;
489         }
490         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
491         uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit);
492         latin1Out[latin1Pos++] = latin1Code;
493     }
494     return latin1Pos;
495 }
496 
ConvertUtf8ToUnicodeChar(const uint8_t * utf8,size_t maxLen)497 std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)
498 {
499     if (maxLen == 0) {
500         return {INVALID_UTF8, 0};
501     }
502     Span<const uint8_t> sp(utf8, maxLen);
503     // one byte
504     uint8_t d0 = sp[0];
505     if ((d0 & BIT_MASK_1) == 0) {
506         return {d0, UtfLength::ONE};
507     }
508     if (maxLen < UtfLength::TWO) {
509         return {INVALID_UTF8, 0};
510     }
511     // two bytes
512     uint8_t d1 = sp[UtfLength::ONE];
513     if ((d0 & BIT_MASK_3) == BIT_MASK_2) {
514         if ((d1 & BIT_MASK_2) == BIT_MASK_1) {
515             return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
516         } else {
517             return {INVALID_UTF8, 0};
518         }
519     }
520     if (maxLen < UtfLength::THREE) {
521         return {INVALID_UTF8, 0};
522     }
523     // three bytes
524     uint8_t d2 = sp[UtfLength::TWO];
525     if ((d0 & BIT_MASK_4) == BIT_MASK_3) {
526         if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) {
527             return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) |
528                 ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d2 & utf::MASK_6BIT), UtfLength::THREE};
529         } else {
530             return {INVALID_UTF8, 0};
531         }
532     }
533     if (maxLen < UtfLength::FOUR) {
534         return {INVALID_UTF8, 0};
535     }
536     // four bytes
537     uint8_t d3 = sp[UtfLength::THREE];
538     if ((d0 & BIT_MASK_5) == BIT_MASK_4) {
539         if (((d1 & BIT_MASK_2) == BIT_MASK_1) &&
540             ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) {
541             return {((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
542                 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT), UtfLength::FOUR};
543         } else {
544             return {INVALID_UTF8, 0};
545         }
546     }
547     return {INVALID_UTF8, 0};
548 }
549 }  // namespace panda::ecmascript::base::utf_helper
550