• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "common_components/base/config.h"
17 #include "common_components/base/utf_helper.h"
18 
19 #include "common_components/log/log.h"
20 #include "libpandabase/utils/span.h"
21 
// Constant bias left over when a lead surrogate shifted left by 10 bits is added to its
// trail surrogate; subtracting it from that sum yields the supplementary code point.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;

// Combines a lead/trail surrogate pair into one supplementary code point (as int32_t).
// No range validation is performed on either argument.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)
27 
28 namespace common::utf_helper {
29 
UTF16Decode(uint16_t lead,uint16_t trail)30 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
31 {
32     DCHECK_CC((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
33            (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
34     uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
35     return cp;
36 }
37 
38 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index,bool cesu8)39 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8)
40 {
41     uint16_t high = utf16[*index];
42     if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
43         return high;
44     }
45     uint16_t low = utf16[*index + 1];
46     if (!IsUTF16LowSurrogate(low) || cesu8) {
47         return high;
48     }
49     (*index)++;
50     return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
51 }
52 
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)53 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
54 {
55     uint16_t first = utf16[*index];
56     // A valid surrogate pair should always start with a High Surrogate
57     if (IsUTF16LowSurrogate(first)) {
58         return UTF16_REPLACEMENT_CHARACTER;
59     }
60     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
61         if (*index == len - 1) {
62             // A High surrogate not paired with another surrogate
63             return UTF16_REPLACEMENT_CHARACTER;
64         }
65         uint16_t second = utf16[*index + 1];
66         if (!IsUTF16LowSurrogate(second)) {
67             // A High surrogate not followed by a low surrogate
68             return UTF16_REPLACEMENT_CHARACTER;
69         }
70         // A valid surrogate pair, decode normally
71         (*index)++;
72         return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
73     }
74     // A unicode not fallen into the range of representing by surrogate pair, return as it is
75     return first;
76 }
77 
UTF8Length(uint32_t codepoint)78 inline size_t UTF8Length(uint32_t codepoint)
79 {
80     if (codepoint <= UTF8_1B_MAX) {
81         return UtfLength::ONE;
82     }
83     if (codepoint <= UTF8_2B_MAX) {
84         return UtfLength::TWO;
85     }
86     if (codepoint <= UTF8_3B_MAX) {
87         return UtfLength::THREE;
88     }
89     return UtfLength::FOUR;
90 }
91 
92 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t index,size_t size)93 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t index, size_t size)
94 {
95     for (size_t j = size - 1; j > 0; j--) {
96         uint8_t cont = ((codepoint | byteMark) & byteMask);
97         utf8[index + j] = cont;
98         codepoint >>= UTF8_OFFSET;
99     }
100     utf8[index] = codepoint | firstByteMark[size];
101     return size;
102 }
103 
IsValidUTF8(const std::vector<uint8_t> & data)104 bool IsValidUTF8(const std::vector<uint8_t> &data)
105 {
106     uint32_t length = data.size();
107     switch (length) {
108         case UtfLength::ONE:
109             if (data.at(0) >= BIT_MASK_1) {
110                 return false;
111             }
112             break;
113         case UtfLength::TWO:
114             if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
115                 return false;
116             }
117             if (data.at(0) < UTF8_2B_FIRST_MIN) {
118                 return false;
119             }
120             break;
121         case UtfLength::THREE:
122             if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
123                 return false;
124             }
125             if (data.at(0) == UTF8_3B_FIRST && data.at(1) < UTF8_3B_SECOND_MIN) {
126                 return false;
127             }
128             // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs, corresponds to %ED%A0%80~%ED%BF%BF
129             if (data.at(0) == UTF8_3B_RESERVED_FIRST && data.at(1) >= UTF8_3B_RESERVED_SECOND_MIN &&
130                 data.at(1) <= UTF8_3B_RESERVED_SECOND_MAX) {
131                 return false;
132             }
133             break;
134         case UtfLength::FOUR:
135             if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
136                 return false;
137             }
138             if (data.at(0) == UTF8_4B_FIRST && data.at(1) < UTF8_4B_SECOND_MIN) {
139                 return false;
140             }
141             // max four length binary: 11110(100) 10(001111) 10(111111) 10(111111), max data[0] is 0xF4, data[1] is 0x8F
142             if (data.at(0) > UTF8_4B_FIRST_MAX ||
143                (data.at(0) == UTF8_4B_FIRST_MAX && data.at(1) > UTF8_4B_SECOND_MAX)) {
144                 return false;
145             }
146             break;
147         default: //LCOV_EXCL_BR_LINE
148             LOG_COMMON(FATAL) << "this branch is unreachable";
149             UNREACHABLE_CC();
150             break;
151     }
152 
153     for (uint32_t i = 1; i < length; i++) {
154         if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
155             return false;
156         }
157     }
158     return true;
159 }
160 
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify,bool isWriteBuffer)161 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer)
162 {
163     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
164     // means that is a single code point, it needs to be represented by three UTF8 code.
165     if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
166         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
167         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
168         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
169         return {UtfLength::THREE, {ch0, ch1, ch2}};
170     }
171 
172     if (d0 == 0) {
173         if (isWriteBuffer) {
174             return {1, {0x00U}};
175         }
176         if (modify) {
177             // special case for \u0000 ==> C080 - 1100'0000 1000'0000
178             return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
179         }
180         // For print string, just skip '\u0000'
181         return {0, {0x00U}};
182     }
183     if (d0 <= UTF8_1B_MAX) {
184         return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
185     }
186     if (d0 <= UTF8_2B_MAX) {
187         auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
188         auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & MASK_6BIT));
189         return {UtfLength::TWO, {ch0, ch1}};
190     }
191     if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
192         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
193         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
194         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
195         return {UtfLength::THREE, {ch0, ch1, ch2}};
196     }
197     if (d1 < LO_SURROGATE_MIN || d1 > LO_SURROGATE_MAX) {
198         // Bad sequence
199         LOG_COMMON(FATAL) << "this branch is unreachable";
200         UNREACHABLE_CC();
201     }
202 
203     uint32_t codePoint = CombineTwoU16(d0, d1);
204 
205     auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
206     auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & MASK_6BIT) | MASK1);
207     auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & MASK_6BIT) | MASK1);
208     auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
209     return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
210 }
211 
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify,bool isGetBufferSize,bool cesu8)212 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify, bool isGetBufferSize, bool cesu8)
213 {
214     size_t res = 1;  // zero byte
215     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
216     // means that is a single code point, it needs to be represented by three UTF8 code.
217     if (length == 1 && utf16[0] >= HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
218         utf16[0] <= LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
219         res += UtfLength::THREE;
220         return res;
221     }
222 
223     for (uint32_t i = 0; i < length; ++i) {
224         if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
225             if (isGetBufferSize) {
226                 res += UtfLength::ONE;
227             } else if (modify) {
228                 res += UtfLength::TWO;  // special case for U+0000 => C0 80
229             }
230         } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
231             res += 1;
232         } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
233             res += UtfLength::TWO;
234             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
235         } else if (utf16[i] < HI_SURROGATE_MIN || utf16[i] > HI_SURROGATE_MAX) {
236             res += UtfLength::THREE;
237         } else {
238             if (!cesu8 && i < length - 1 &&
239                 utf16[i + 1] >= LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240                 utf16[i + 1] <= LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
241                 res += UtfLength::FOUR;
242                 ++i;
243             } else {
244                 res += UtfLength::THREE;
245             }
246         }
247     }
248     return res;
249 }
250 
251 #if ENABLE_NEXT_OPTIMIZATION && defined(USE_CMC_GC)
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer,bool cesu8)252 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
253                                 size_t start, bool modify, bool isWriteBuffer, bool cesu8)
254 {
255     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
256         return 0;
257     }
258     size_t utf8Pos = 0;
259     size_t end = start + utf16Len;
260     for (size_t i = start; i < end; ++i) {
261         uint32_t codepoint = utf16In[i];
262         if (codepoint == 0) {
263             if (isWriteBuffer) {
264                 utf8Out[utf8Pos++] = UTF8_NUL;
265                 continue;
266             }
267             if (modify) {
268                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
269                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
270             }
271             continue;
272         }
273         if (codepoint >= DECODE_LEAD_LOW && codepoint <= DECODE_LEAD_HIGH && i + 1 < end) {
274             uint32_t high = utf16In[i];
275             uint32_t low = utf16In[i + 1];
276             if (!cesu8) {
277                 if (low >= DECODE_TRAIL_LOW && low <= DECODE_TRAIL_HIGH) {
278                     codepoint =
279                     SURROGATE_RAIR_START + ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW);
280                     i++;
281                 }
282             }
283         }
284         if (codepoint <= UTF8_1B_MAX) {
285             if (UNLIKELY(utf8Pos + UTF8_SINGLE_BYTE_LENGTH > utf8Len)) {
286                 break;
287             }
288             utf8Out[utf8Pos++] = static_cast<uint8_t>(codepoint);
289         } else if (codepoint <= UTF8_2B_MAX) {
290             if (UNLIKELY(utf8Pos + UTF8_DOUBLE_BYTE_LENGTH > utf8Len)) {
291                 break;
292             }
293             utf8Out[utf8Pos++] = (BIT_MASK_2 | (codepoint >> OFFSET_6POS));
294             utf8Out[utf8Pos++] = (byteMark | (codepoint & LOW_6BITS));
295         } else if (codepoint <= UTF8_3B_MAX) {
296             if (UNLIKELY(utf8Pos + UTF8_TRIPLE_BYTE_LENGTH > utf8Len)) {
297                 break;
298             }
299             utf8Out[utf8Pos++] = (UTF8_3B_FIRST | (codepoint >> OFFSET_12POS));
300             utf8Out[utf8Pos++] = (byteMark | ((codepoint >> OFFSET_6POS) & LOW_6BITS));
301             utf8Out[utf8Pos++] = (byteMark | (codepoint & LOW_6BITS));
302         } else {
303             if (UNLIKELY(utf8Pos + UTF8_QUAD_BYTE_LENGTH > utf8Len)) {
304                 break;
305             }
306             utf8Out[utf8Pos++] = (UTF8_4B_FIRST | (codepoint >> OFFSET_18POS));
307             utf8Out[utf8Pos++] = (byteMark | ((codepoint >> OFFSET_12POS) & LOW_6BITS));
308             utf8Out[utf8Pos++] = (byteMark | ((codepoint >> OFFSET_6POS) & LOW_6BITS));
309             utf8Out[utf8Pos++] = (byteMark | (codepoint & LOW_6BITS));
310         }
311     }
312     return utf8Pos;
313 }
314 #else
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer,bool cesu8)315 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
316                                 size_t start, bool modify, bool isWriteBuffer, bool cesu8)
317 {
318     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
319         return 0;
320     }
321     size_t utf8Pos = 0;
322     size_t end = start + utf16Len;
323     for (size_t i = start; i < end; ++i) {
324         uint32_t codepoint = DecodeUTF16(utf16In, end, &i, cesu8);
325         if (codepoint == 0) {
326             if (isWriteBuffer) {
327                 utf8Out[utf8Pos++] = 0x00U;
328                 continue;
329             }
330             if (modify) {
331                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
332                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
333                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
334             }
335             continue;
336         }
337         size_t size = UTF8Length(codepoint);
338         if (utf8Pos + size > utf8Len) {
339             break;
340         }
341         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Pos, size);
342     }
343     return utf8Pos;
344 }
345 #endif
346 
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify,bool isWriteBuffer)347 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
348                                         size_t start, bool modify, bool isWriteBuffer)
349 {
350     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
351         return 0;
352     }
353     size_t utf8Pos = 0;
354     size_t end = start + utf16Len;
355     for (size_t i = start; i < end; ++i) {
356         uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
357         if (codepoint == 0) {
358             if (isWriteBuffer) {
359                 utf8Out[utf8Pos++] = 0x00U;
360                 continue;
361             }
362             if (modify) {
363                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
364                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
365                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
366             }
367             continue;
368         }
369         size_t size = UTF8Length(codepoint);
370         if (utf8Pos + size > utf8Len) {
371             break;
372         }
373         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Pos, size);
374     }
375     return utf8Pos;
376 }
377 
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)378 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
379 {
380     uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
381     if ((d0 & MASK1) == 0) {
382         return {d0, 1};
383     }
384 
385     uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
386     if ((d0 & MASK2) == 0) {
387         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
388     }
389 
390     uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
391     if ((d0 & MASK3) == 0) {
392         return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & MASK_6BIT) << DATA_WIDTH) |
393                     (d2 & MASK_6BIT),
394                 UtfLength::THREE};
395     }
396 
397     uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
398     uint32_t codePoint = ((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
399                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
400 
401     uint32_t pair = 0;
402     if (combine) {
403         uint32_t lead = ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD);
404         uint32_t tail = ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
405         pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
406     } else {
407         pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) << PAIR_ELEMENT_WIDTH;
408         pair |= ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
409     }
410 
411     return {pair, UtfLength::FOUR};
412 }
413 
414 // drop the tail bytes if the remain length can't fill the length it represents.
FixUtf8Len(const uint8_t * utf8,size_t utf8Len)415 static inline size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
416 {
417     size_t trimSize = 0;
418     if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
419         // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one.
420         trimSize = 1;
421     }
422     if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
423         // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two.
424         trimSize = CONST_2;
425     }
426     if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
427         // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three.
428         trimSize = CONST_3;
429     }
430     return utf8Len - trimSize;
431 }
432 
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)433 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
434 {
435     size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
436     size_t in_pos = 0;
437     size_t res = 0;
438     while (in_pos < safeUtf8Len) {
439         uint8_t src = utf8[in_pos];
440         switch (src & 0xF0) {
441             case 0xF0: {
442                 const uint8_t c2 = utf8[++in_pos];
443                 const uint8_t c3 = utf8[++in_pos];
444                 const uint8_t c4 = utf8[++in_pos];
445                 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
446                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
447                 if (codePoint >= SURROGATE_RAIR_START) {
448                     res += CONST_2;
449                 } else {
450                     res++;
451                 }
452                 in_pos++;
453                 break;
454             }
455             case 0xE0: {
456                 in_pos += CONST_3;
457                 res++;
458                 break;
459             }
460             case 0xD0:
461             case 0xC0: {
462                 in_pos += CONST_2;
463                 res++;
464                 break;
465             }
466             default:
467                 do {
468                     in_pos++;
469                     res++;
470                 } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
471                 break;
472         }
473     }
474     // The remain chars should be treated as single byte char.
475     res += utf8Len - in_pos;
476     return res;
477 }
478 
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len)479 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
480 {
481     size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
482     size_t in_pos = 0;
483     size_t out_pos = 0;
484     while (in_pos < safeUtf8Len && out_pos < utf16Len) {
485         uint8_t src = utf8In[in_pos];
486         switch (src & 0xF0) {
487             case 0xF0: {
488                 const uint8_t c2 = utf8In[++in_pos];
489                 const uint8_t c3 = utf8In[++in_pos];
490                 const uint8_t c4 = utf8In[++in_pos];
491                 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
492                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
493                 if (codePoint >= SURROGATE_RAIR_START) {
494                     DCHECK_CC(utf16Len >= 1);
495                     if (out_pos >= utf16Len - 1) {
496                         return out_pos;
497                     }
498                     codePoint -= SURROGATE_RAIR_START;
499                     utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
500                     utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
501                 } else {
502                     utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
503                 }
504                 in_pos++;
505                 break;
506             }
507             case 0xE0: {
508                 const uint8_t c2 = utf8In[++in_pos];
509                 const uint8_t c3 = utf8In[++in_pos];
510                 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
511                     ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
512                 in_pos++;
513                 break;
514             }
515             case 0xD0:
516             case 0xC0: {
517                 const uint8_t c2 = utf8In[++in_pos];
518                 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
519                 in_pos++;
520                 break;
521             }
522             default:
523                 do {
524                     utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
525                 } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
526                 break;
527         }
528     }
529     // The remain chars should be treated as single byte char.
530     while (in_pos < utf8Len && out_pos < utf16Len) {
531         utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
532     }
533     return out_pos;
534 }
535 
ConvertRegionUtf16ToLatin1(const uint16_t * utf16In,uint8_t * latin1Out,size_t utf16Len,size_t latin1Len)536 size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len)
537 {
538     if (utf16In == nullptr || latin1Out == nullptr || latin1Len == 0) {
539         return 0;
540     }
541     size_t latin1Pos = 0;
542     size_t end = utf16Len;
543     for (size_t i = 0; i < end; ++i) {
544         if (latin1Pos == latin1Len) {
545             break;
546         }
547         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
548         uint8_t latin1Code = static_cast<uint8_t>(codepoint & latin1Limit);
549         latin1Out[latin1Pos++] = latin1Code;
550     }
551     return latin1Pos;
552 }
553 
ConvertUtf8ToUnicodeChar(const uint8_t * utf8,size_t maxLen)554 std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen)
555 {
556     if (maxLen == 0) {
557         return {INVALID_UTF8, 0};
558     }
559     panda::Span<const uint8_t> sp(utf8, maxLen);
560     // one byte
561     uint8_t d0 = sp[0];
562     if ((d0 & BIT_MASK_1) == 0) {
563         return {d0, UtfLength::ONE};
564     }
565     if (maxLen < UtfLength::TWO) {
566         return {INVALID_UTF8, 0};
567     }
568     // two bytes
569     uint8_t d1 = sp[UtfLength::ONE];
570     if ((d0 & BIT_MASK_3) == BIT_MASK_2) {
571         if ((d1 & BIT_MASK_2) == BIT_MASK_1) {
572             return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
573         } else {
574             return {INVALID_UTF8, 0};
575         }
576     }
577     if (maxLen < UtfLength::THREE) {
578         return {INVALID_UTF8, 0};
579     }
580     // three bytes
581     uint8_t d2 = sp[UtfLength::TWO];
582     if ((d0 & BIT_MASK_4) == BIT_MASK_3) {
583         if (((d1 & BIT_MASK_2) == BIT_MASK_1) && ((d2 & BIT_MASK_2) == BIT_MASK_1)) {
584             return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) |
585                 ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT), UtfLength::THREE};
586         } else {
587             return {INVALID_UTF8, 0};
588         }
589     }
590     if (maxLen < UtfLength::FOUR) {
591         return {INVALID_UTF8, 0};
592     }
593     // four bytes
594     uint8_t d3 = sp[UtfLength::THREE];
595     if ((d0 & BIT_MASK_5) == BIT_MASK_4) {
596         if (((d1 & BIT_MASK_2) == BIT_MASK_1) &&
597             ((d2 & BIT_MASK_2) == BIT_MASK_1) && ((d3 & BIT_MASK_2) == BIT_MASK_1)) {
598             return {((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
599                 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT), UtfLength::FOUR};
600         } else {
601             return {INVALID_UTF8, 0};
602         }
603     }
604     return {INVALID_UTF8, 0};
605 }
606 }  // namespace common::utf_helper
607