• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "base/utils/utf_helper.h"
17 #include "unicode/unistr.h"
18 
19 namespace OHOS::Ace::UtfUtils {
20 
// Sentinel strings: the Str*ToStr* helpers in this file translate one sentinel into its
// counterpart of the target encoding instead of running a conversion.
const std::string DEFAULT_STR = "error";
const std::u16string DEFAULT_U16STR = u"error";
const std::u32string DEFAULT_U32STR = U"error";
const std::wstring DEFAULT_WSTR = L"error";

// Inclusive UTF-16 surrogate ranges (size_t flavour).
constexpr size_t HI_SURROGATE_MIN = 0xD800;
constexpr size_t HI_SURROGATE_MAX = 0xDBFF;
constexpr size_t LO_SURROGATE_MIN = 0xDC00;
constexpr size_t LO_SURROGATE_MAX = 0xDFFF;

// Small counts, bit masks and shift amounts used while decoding UTF-8 / building surrogate pairs.
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;
static constexpr size_t LOW_3BITS = 0x07;
static constexpr size_t LOW_4BITS = 0x0F;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;
static constexpr size_t L_SURROGATE_START = 0xDC00;
static constexpr size_t H_SURROGATE_START = 0xD800;
static constexpr size_t SURROGATE_RAIR_START = 0x10000;
static constexpr size_t OFFSET_18POS = 18;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_6POS = 6;

// Fixed-width constants used while decoding UTF-16 and encoding UTF-8.
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
static constexpr uint32_t UTF8_OFFSET = 6;
static constexpr uint32_t UTF16_OFFSET = 10;
static constexpr uint16_t SURROGATE_MASK = 0xF800;
static constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD;

// Largest code point that fits in a 1-, 2- and 3-byte UTF-8 sequence.
static constexpr uint8_t UTF8_1B_MAX = 0x7F;
static constexpr uint16_t UTF8_2B_MAX = 0x7FF;
static constexpr uint16_t UTF8_3B_MAX = 0xFFFF;

// A UTF-8 continuation byte is produced as ((value | BYTE_MARK) & BYTE_MASK), i.e. 10xxxxxx.
static constexpr uint8_t BYTE_MASK = 0xBF;
static constexpr uint8_t BYTE_MARK = 0x80;

// Number of code units / bytes taken by one encoded code point.
enum UtfLength : uint8_t {
    ONE = 1,
    TWO = 2,
    THREE = 3,
    FOUR = 4
};

// Marker bits OR-ed into the first byte of a UTF-8 sequence, indexed by the sequence length.
static constexpr unsigned char FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
64 
IsUTF16HighSurrogate(uint16_t ch)65 bool IsUTF16HighSurrogate(uint16_t ch)
66 {
67     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
68 }
69 
IsUTF16LowSurrogate(uint16_t ch)70 bool IsUTF16LowSurrogate(uint16_t ch)
71 {
72     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
73 }
74 
75 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index)76 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
77 {
78     uint16_t high = utf16[*index];
79     if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
80         return high;
81     }
82     uint16_t low = utf16[*index + 1];
83     if (!IsUTF16LowSurrogate(low)) {
84         return high;
85     }
86     (*index)++;
87     return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
88 }
89 
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)90 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
91 {
92     uint16_t first = utf16[*index];
93     // A valid surrogate pair should always start with a High Surrogate
94     if (IsUTF16LowSurrogate(first)) {
95         return UTF16_REPLACEMENT_CHARACTER;
96     }
97     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
98         if (*index == len - 1) {
99             // A High surrogate not paired with another surrogate
100             return UTF16_REPLACEMENT_CHARACTER;
101         }
102         uint16_t second = utf16[*index + 1];
103         if (!IsUTF16LowSurrogate(second)) {
104             // A High surrogate not followed by a low surrogate
105             return UTF16_REPLACEMENT_CHARACTER;
106         }
107         // A valid surrogate pair, decode normally
108         (*index)++;
109         return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
110     }
111     // A unicode not fallen into the range of representing by surrogate pair, return as it is
112     return first;
113 }
114 
RepalceUnpairedSurrogates(uint16_t * utf16,size_t end,size_t * index)115 static void RepalceUnpairedSurrogates(uint16_t *utf16, size_t end, size_t *index)
116 {
117     uint16_t first = utf16[*index];
118     // A valid surrogate pair should always start with a High Surrogate
119     if (IsUTF16LowSurrogate(first)) {
120         utf16[*index] = UTF16_REPLACEMENT_CHARACTER;
121         return;
122     }
123     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
124         if (*index == end - 1) {
125             // A High surrogate not paired with another surrogate
126             utf16[*index] = UTF16_REPLACEMENT_CHARACTER;
127             return;
128         }
129         uint16_t second = utf16[*index + 1];
130         if (!IsUTF16LowSurrogate(second)) {
131             // A High surrogate not followed by a low surrogate
132             utf16[*index] = UTF16_REPLACEMENT_CHARACTER;
133             return;
134         }
135         // A valid surrogate pair, decode normally
136         (*index)++;
137         return;
138     }
139     // A unicode not fallen into the range of representing by surrogate pair, return as it is
140     return;
141 }
142 
HandleInvalidUTF16(uint16_t * utf16In,size_t utf16Len,size_t start)143 void HandleInvalidUTF16(uint16_t* utf16In, size_t utf16Len, size_t start)
144 {
145     if (utf16In == nullptr) {
146         return;
147     }
148     size_t end = start + utf16Len;
149     for (size_t i = start; i < end; ++i) {
150         RepalceUnpairedSurrogates(utf16In, end, &i);
151     }
152 }
153 
UTF8Length(uint32_t codepoint)154 inline size_t UTF8Length(uint32_t codepoint)
155 {
156     if (codepoint <= UTF8_1B_MAX) {
157         return UtfLength::ONE;
158     }
159     if (codepoint <= UTF8_2B_MAX) {
160         return UtfLength::TWO;
161     }
162     if (codepoint <= UTF8_3B_MAX) {
163         return UtfLength::THREE;
164     }
165     return UtfLength::FOUR;
166 }
167 
168 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t len,size_t index)169 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
170 {
171     size_t size = UTF8Length(codepoint);
172     if (index + size > len) {
173         return 0;
174     }
175     for (size_t j = size - 1; j > 0; j--) {
176         uint8_t cont = ((codepoint | BYTE_MARK) & BYTE_MASK);
177         utf8[index + j] = cont;
178         codepoint >>= UTF8_OFFSET;
179     }
180     utf8[index] = codepoint | FIRST_BYTE_MARK[size];
181     return size;
182 }
183 
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length)184 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length)
185 {
186     size_t res = 1;  // zero byte
187     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
188     // means that is a single code point, it needs to be represented by three UTF8 code.
189     if (length == 1 && utf16[0] >= HI_SURROGATE_MIN &&
190         utf16[0] <= LO_SURROGATE_MAX) {
191         res += UtfLength::THREE;
192         return res;
193     }
194 
195     for (uint32_t i = 0; i < length; ++i) {
196         if (utf16[i] == 0) {
197             // do nothing
198         } else if (utf16[i] <= UTF8_1B_MAX) {
199             res += 1;
200         } else if (utf16[i] <= UTF8_2B_MAX) {
201             res += UtfLength::TWO;
202             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
203         } else if (utf16[i] < HI_SURROGATE_MIN || utf16[i] > HI_SURROGATE_MAX) {
204             res += UtfLength::THREE;
205         } else {
206             if (i < length - 1 &&
207                 utf16[i + 1] >= LO_SURROGATE_MIN &&
208                 utf16[i + 1] <= LO_SURROGATE_MAX) {
209                 res += UtfLength::FOUR;
210                 ++i;
211             } else {
212                 res += UtfLength::THREE;
213             }
214         }
215     }
216     return res;
217 }
218 
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)219 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
220     size_t start)
221 {
222     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
223         return 0;
224     }
225     size_t utf8Pos = 0;
226     size_t end = start + utf16Len;
227     for (size_t i = start; i < end; ++i) {
228         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
229         if (codepoint == 0) {
230             continue;
231         }
232         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
233     }
234     return utf8Pos;
235 }
236 
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)237 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
238     size_t start)
239 {
240     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
241         return 0;
242     }
243     size_t utf8Pos = 0;
244     size_t end = start + utf16Len;
245     for (size_t i = start; i < end; ++i) {
246         uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
247         if (codepoint == 0) {
248             continue;
249         }
250         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
251     }
252     return utf8Pos;
253 }
254 
255 // drop the tail bytes if the remain length can't fill the length it represents.
FixUtf8Len(const uint8_t * utf8,size_t utf8Len)256 static size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
257 {
258     size_t trimSize = 0;
259     if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
260         // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one.
261         trimSize = 1;
262     }
263     if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
264         // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two.
265         trimSize = CONST_2;
266     }
267     if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
268         // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three.
269         trimSize = CONST_3;
270     }
271     return utf8Len - trimSize;
272 }
273 
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)274 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
275 {
276     size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
277     size_t in_pos = 0;
278     size_t res = 0;
279     while (in_pos < safeUtf8Len) {
280         uint8_t src = utf8[in_pos];
281         switch (src & 0xF0) {
282             case 0xF0: {
283                 const uint8_t c2 = utf8[++in_pos];
284                 const uint8_t c3 = utf8[++in_pos];
285                 const uint8_t c4 = utf8[++in_pos];
286                 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
287                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
288                 if (codePoint >= SURROGATE_RAIR_START) {
289                     res += CONST_2;
290                 } else {
291                     res++;
292                 }
293                 in_pos++;
294                 break;
295             }
296             case 0xE0: {
297                 in_pos += CONST_3;
298                 res++;
299                 break;
300             }
301             case 0xD0:
302             case 0xC0: {
303                 in_pos += CONST_2;
304                 res++;
305                 break;
306             }
307             default:
308                 do {
309                     in_pos++;
310                     res++;
311                 } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
312                 break;
313         }
314     }
315     // The remain chars should be treated as single byte char.
316     res += utf8Len - in_pos;
317     return res;
318 }
319 
// Makes the enclosing function return |out_pos| when fewer than two free slots are left in an output
// buffer of |utf16Len| code units, i.e. when a surrogate pair no longer fits.
// The test is written as "out_pos + 1 >= utf16Len" so that it does not wrap around for utf16Len == 0.
#define CHECK_OUT_POS_RETURN(out_pos, utf16Len)     \
    do {                                            \
        if ((out_pos) + 1 >= (utf16Len)) {          \
            return (out_pos);                       \
        }                                           \
    } while (0)
326 
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len)327 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
328 {
329     size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
330     size_t in_pos = 0;
331     size_t out_pos = 0;
332     while (in_pos < safeUtf8Len && out_pos < utf16Len) {
333         uint8_t src = utf8In[in_pos];
334         switch (src & 0xF0) {
335             case 0xF0: {
336                 const uint8_t c2 = utf8In[++in_pos];
337                 const uint8_t c3 = utf8In[++in_pos];
338                 const uint8_t c4 = utf8In[++in_pos];
339                 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
340                     ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
341                 if (codePoint >= SURROGATE_RAIR_START) {
342                     CHECK_OUT_POS_RETURN(out_pos, utf16Len);
343                     codePoint -= SURROGATE_RAIR_START;
344                     utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
345                     utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
346                 } else {
347                     utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
348                 }
349                 in_pos++;
350                 break;
351             }
352             case 0xE0: {
353                 const uint8_t c2 = utf8In[++in_pos];
354                 const uint8_t c3 = utf8In[++in_pos];
355                 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
356                     ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
357                 in_pos++;
358                 break;
359             }
360             case 0xD0:
361             case 0xC0: {
362                 const uint8_t c2 = utf8In[++in_pos];
363                 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
364                 in_pos++;
365                 break;
366             }
367             default:
368                 do {
369                     utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
370                 } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
371                 break;
372         }
373     }
374     // The remain chars should be treated as single byte char.
375     while (in_pos < utf8Len && out_pos < utf16Len) {
376         utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
377     }
378     return out_pos;
379 }
380 
IsIndexInPairedSurrogates(int32_t index,const std::u16string & utf16)381 bool IsIndexInPairedSurrogates(int32_t index, const std::u16string& utf16)
382 {
383     uint16_t len = utf16.length();
384     if (len == 0 || index <= 0 || index >= static_cast<int32_t>(len)) {
385         return false;
386     }
387     // A valid surrogate pair should always start with a High Surrogate
388     if (IsUTF16HighSurrogate(utf16[index - 1]) && IsUTF16LowSurrogate(utf16[index])) {
389         return true;
390     }
391 
392     return false;
393 }
394 
Utf16ToUtf32Size(const uint16_t * utf16,uint32_t length)395 size_t Utf16ToUtf32Size(const uint16_t *utf16, uint32_t length)
396 {
397     size_t res = 1;  // zero byte
398     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
399     // means that is a single code point, it needs to be represented by 1 UTF32 code.
400     if (length == 1 && utf16[0] >= HI_SURROGATE_MIN &&
401         utf16[0] <= LO_SURROGATE_MAX) {
402         res += UtfLength::ONE;
403         return res;
404     }
405 
406     for (uint32_t i = 0; i < length; ++i) {
407         if (utf16[i] == 0) {
408             // do nothing
409             continue;
410         }
411         if (utf16[i] >= HI_SURROGATE_MIN && utf16[i] <= HI_SURROGATE_MAX) {
412             if (i < length - 1 &&
413                 utf16[i + 1] >= LO_SURROGATE_MIN &&
414                 utf16[i + 1] <= LO_SURROGATE_MAX) {
415                 ++i;
416             }
417         }
418         res += UtfLength::ONE;
419     }
420     return res;
421 }
422 
UTF32Length(uint32_t codepoint)423 inline size_t UTF32Length(uint32_t codepoint)
424 {
425     return UtfLength::ONE;
426 }
427 
EncodeUTF32(uint32_t codepoint,uint32_t * utf32,size_t len,size_t index)428 size_t EncodeUTF32(uint32_t codepoint, uint32_t *utf32, size_t len, size_t index)
429 {
430     size_t size = UTF32Length(codepoint);
431     if (index + size > len) {
432         return 0;
433     }
434     utf32[index] = codepoint;
435     return size;
436 }
437 
ConvertRegionUtf16ToUtf32(const uint16_t * utf16In,uint32_t * utf32Out,size_t utf16Len,size_t utf32Len,size_t start)438 size_t ConvertRegionUtf16ToUtf32(const uint16_t *utf16In, uint32_t *utf32Out, size_t utf16Len, size_t utf32Len,
439     size_t start)
440 {
441     if (utf16In == nullptr || utf32Out == nullptr || utf32Len == 0) {
442         return 0;
443     }
444     size_t utf32Pos = 0;
445     size_t end = start + utf16Len;
446     for (size_t i = start; i < end; ++i) {
447         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
448         if (codepoint == 0) {
449             continue;
450         }
451         utf32Pos += EncodeUTF32(codepoint, utf32Out, utf32Len, utf32Pos);
452     }
453     return utf32Pos;
454 }
455 
Utf32ToUtf16Size(const uint32_t * utf32,uint32_t length)456 size_t Utf32ToUtf16Size(const uint32_t *utf32, uint32_t length)
457 {
458     size_t res = 1;  // zero byte
459 
460     for (uint32_t i = 0; i < length; ++i) {
461         if (utf32[i] == 0) {
462             // do nothing
463         } else if (utf32[i] < SURROGATE_RAIR_START) {
464             res += UtfLength::ONE;
465         } else {
466             res += UtfLength::TWO;
467         }
468     }
469     return res;
470 }
471 
ConvertRegionUtf32ToUtf16(const uint32_t * utf32In,uint16_t * utf16Out,size_t utf32Len,size_t utf16Len)472 size_t ConvertRegionUtf32ToUtf16(const uint32_t *utf32In, uint16_t *utf16Out, size_t utf32Len, size_t utf16Len)
473 {
474     size_t in_pos = 0;
475     size_t out_pos = 0;
476     while (in_pos < utf32Len && out_pos < utf16Len) {
477         uint32_t codePoint = utf32In[in_pos];
478         if (codePoint >= SURROGATE_RAIR_START) {
479             CHECK_OUT_POS_RETURN(out_pos, utf16Len);
480             codePoint -= SURROGATE_RAIR_START;
481             utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
482             utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
483         } else {
484             utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
485         }
486         in_pos++;
487     }
488     // The remain chars should be treated as single byte char.
489     while (in_pos < utf32Len && out_pos < utf16Len) {
490         utf16Out[out_pos++] = static_cast<uint16_t>(utf32In[in_pos++]);
491     }
492     return out_pos;
493 }
494 
Str8ToStr16(const std::string & str)495 std::u16string Str8ToStr16(const std::string& str)
496 {
497     if (str.empty()) {
498         return u"";
499     }
500     if (str == DEFAULT_STR) {
501         return DEFAULT_U16STR;
502     }
503     const uint8_t* buf8 = reinterpret_cast<const uint8_t*>(str.c_str());
504     size_t utf8Len = str.size();
505     auto utf16Len = Utf8ToUtf16Size(buf8, utf8Len);
506     std::unique_ptr<uint16_t[]> pBuf16 = std::make_unique<uint16_t[]>(utf16Len);
507     uint16_t *buf16 = pBuf16.get();
508     auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16, utf8Len, utf16Len);
509     if (resultLen == utf16Len) {
510         return std::u16string(reinterpret_cast<const char16_t*>(buf16), utf16Len);
511     }
512     return u"";
513 }
514 
515 // Illegal bytes are replaced with U+FFFD
Str8DebugToStr16(const std::string & str)516 std::u16string Str8DebugToStr16(const std::string& str)
517 {
518     if (str.empty()) {
519         return u"";
520     }
521     if (str == DEFAULT_STR) {
522         return DEFAULT_U16STR;
523     }
524     icu::UnicodeString ustring = icu::UnicodeString::fromUTF8(str);
525     return std::u16string(ustring.getBuffer(), static_cast<size_t>(ustring.length()));
526 }
527 
Str16ToStr8(const std::u16string & str)528 std::string Str16ToStr8(const std::u16string& str)
529 {
530     if (str.empty()) {
531         return "";
532     }
533     if (str == DEFAULT_U16STR) {
534         return DEFAULT_STR;
535     }
536     const uint16_t* buf16 = reinterpret_cast<const uint16_t*>(str.c_str());
537     size_t utf16Len = str.size();
538     auto utf8Len = Utf16ToUtf8Size(buf16, utf16Len) - 1;
539     std::unique_ptr<uint8_t[]> pBuf8 = std::make_unique<uint8_t[]>(utf8Len);
540     uint8_t *buf8 = pBuf8.get();
541     auto resultLen = ConvertRegionUtf16ToUtf8(buf16, buf8, utf16Len, utf8Len, 0);
542     if (resultLen == utf8Len) {
543         return std::string(reinterpret_cast<const char*>(buf8), utf8Len);
544     }
545     return "";
546 }
547 
548 // Unpaired surrogates are replace with U+FFFD
Str16DebugToStr8(const std::u16string & str)549 std::string Str16DebugToStr8(const std::u16string& str)
550 {
551     if (str.empty()) {
552         return "";
553     }
554     if (str == DEFAULT_U16STR) {
555         return DEFAULT_STR;
556     }
557     const uint16_t* buf16 = reinterpret_cast<const uint16_t*>(str.c_str());
558     size_t utf16Len = str.size();
559     auto utf8Len = Utf16ToUtf8Size(buf16, utf16Len) - 1;
560     std::unique_ptr<uint8_t[]> pBuf8 = std::make_unique<uint8_t[]>(utf8Len);
561     uint8_t *buf8 = pBuf8.get();
562     auto resultLen = DebuggerConvertRegionUtf16ToUtf8(buf16, buf8, utf16Len, utf8Len, 0);
563     if (resultLen == utf8Len) {
564         return std::string(reinterpret_cast<const char*>(buf8), utf8Len);
565     }
566     return "";
567 }
568 
Str16ToStr32(const std::u16string & str)569 std::u32string Str16ToStr32(const std::u16string& str)
570 {
571     if (str.empty()) {
572         return U"";
573     }
574     if (str == DEFAULT_U16STR) {
575         return DEFAULT_U32STR;
576     }
577     const uint16_t* buf16 = reinterpret_cast<const uint16_t*>(str.c_str());
578     size_t utf16Len = str.size();
579     auto utf32Len = Utf16ToUtf32Size(buf16, utf16Len) - 1;
580     std::unique_ptr<uint32_t[]> pBuf32 = std::make_unique<uint32_t[]>(utf32Len);
581     uint32_t *buf32 = pBuf32.get();
582     auto resultLen = ConvertRegionUtf16ToUtf32(buf16, buf32, utf16Len, utf32Len, 0);
583     if (resultLen == utf32Len) {
584         return std::u32string(reinterpret_cast<const char32_t*>(buf32), utf32Len);
585     }
586     return U"";
587 }
588 
Str32ToStr16(const std::u32string & str)589 std::u16string Str32ToStr16(const std::u32string& str)
590 {
591     if (str.empty()) {
592         return u"";
593     }
594     if (str == DEFAULT_U32STR) {
595         return DEFAULT_U16STR;
596     }
597     const uint32_t* buf32 = reinterpret_cast<const uint32_t*>(str.c_str());
598     size_t utf32Len = str.size();
599     auto utf16Len = Utf32ToUtf16Size(buf32, utf32Len) - 1;
600     std::unique_ptr<uint16_t[]> pBuf16 = std::make_unique<uint16_t[]>(utf16Len);
601     uint16_t *buf16 = pBuf16.get();
602     auto resultLen = ConvertRegionUtf32ToUtf16(buf32, buf16, utf32Len, utf16Len);
603     if (resultLen == utf16Len) {
604         return std::u16string(reinterpret_cast<const char16_t*>(buf16), utf16Len);
605     }
606     return u"";
607 }
608 
609 }  // namespace OHOS::Ace::UtfUtils
610