1 /*
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "base/utils/utf_helper.h"
17 #include "unicode/unistr.h"
18
19 namespace OHOS::Ace::UtfUtils {
20
// The text "error" in every supported string type. The Str* converters in this file map
// DEFAULT_STR, DEFAULT_U16STR and DEFAULT_U32STR onto each other directly, without transcoding.
const std::string DEFAULT_STR { "error" };
const std::u16string DEFAULT_U16STR { u"error" };
const std::u32string DEFAULT_U32STR { U"error" };
const std::wstring DEFAULT_WSTR { L"error" };
25
// UTF-16 surrogate ranges used by the Utf16To*Size helpers.
constexpr size_t HI_SURROGATE_MIN = 0xD800;
constexpr size_t HI_SURROGATE_MAX = 0xDBFF;
constexpr size_t LO_SURROGATE_MIN = 0xDC00;
constexpr size_t LO_SURROGATE_MAX = 0xDFFF;

// Small byte counts.
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;

// Payload masks applied to UTF-8 lead / continuation bytes.
static constexpr size_t LOW_3BITS = 0x07;
static constexpr size_t LOW_4BITS = 0x0F;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;

// Surrogate bases and the first code point that needs a surrogate pair (encoding side).
static constexpr size_t L_SURROGATE_START = 0xDC00;
static constexpr size_t H_SURROGATE_START = 0xD800;
static constexpr size_t SURROGATE_RAIR_START = 0x10000;

// Shift distances used when assembling / splitting code points.
static constexpr size_t OFFSET_18POS = 18;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_6POS = 6;

// Surrogate ranges and arithmetic constants used when decoding UTF-16.
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
static constexpr uint32_t UTF8_OFFSET = 6;
static constexpr uint32_t UTF16_OFFSET = 10;
// Keeps the top five bits: every surrogate (0xD800..0xDFFF) masks to 0xD800.
static constexpr uint16_t SURROGATE_MASK = 0xF800;
static constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD;

// Largest code point that fits in a 1-, 2- and 3-byte UTF-8 sequence.
static constexpr uint8_t UTF8_1B_MAX = 0x7F;
static constexpr uint16_t UTF8_2B_MAX = 0x7FF;
static constexpr uint16_t UTF8_3B_MAX = 0xFFFF;

// (value | BYTE_MARK) & BYTE_MASK yields a continuation byte of the form 10xxxxxx.
static constexpr uint8_t BYTE_MASK = 0xBF;
static constexpr uint8_t BYTE_MARK = 0x80;

// Number of code units occupied by one encoded code point.
enum UtfLength : uint8_t {
    ONE = 1,
    TWO = 2,
    THREE = 3,
    FOUR = 4
};

// Lead-byte prefix of a UTF-8 sequence, indexed by the sequence's byte count.
static const unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
64
IsUTF16HighSurrogate(uint16_t ch)65 bool IsUTF16HighSurrogate(uint16_t ch)
66 {
67 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
68 }
69
IsUTF16LowSurrogate(uint16_t ch)70 bool IsUTF16LowSurrogate(uint16_t ch)
71 {
72 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
73 }
74
75 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index)76 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
77 {
78 uint16_t high = utf16[*index];
79 if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
80 return high;
81 }
82 uint16_t low = utf16[*index + 1];
83 if (!IsUTF16LowSurrogate(low)) {
84 return high;
85 }
86 (*index)++;
87 return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
88 }
89
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)90 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index)
91 {
92 uint16_t first = utf16[*index];
93 // A valid surrogate pair should always start with a High Surrogate
94 if (IsUTF16LowSurrogate(first)) {
95 return UTF16_REPLACEMENT_CHARACTER;
96 }
97 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
98 if (*index == len - 1) {
99 // A High surrogate not paired with another surrogate
100 return UTF16_REPLACEMENT_CHARACTER;
101 }
102 uint16_t second = utf16[*index + 1];
103 if (!IsUTF16LowSurrogate(second)) {
104 // A High surrogate not followed by a low surrogate
105 return UTF16_REPLACEMENT_CHARACTER;
106 }
107 // A valid surrogate pair, decode normally
108 (*index)++;
109 return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
110 }
111 // A unicode not fallen into the range of representing by surrogate pair, return as it is
112 return first;
113 }
114
RepalceUnpairedSurrogates(uint16_t * utf16,size_t end,size_t * index)115 static void RepalceUnpairedSurrogates(uint16_t *utf16, size_t end, size_t *index)
116 {
117 uint16_t first = utf16[*index];
118 // A valid surrogate pair should always start with a High Surrogate
119 if (IsUTF16LowSurrogate(first)) {
120 utf16[*index] = UTF16_REPLACEMENT_CHARACTER;
121 return;
122 }
123 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
124 if (*index == end - 1) {
125 // A High surrogate not paired with another surrogate
126 utf16[*index] = UTF16_REPLACEMENT_CHARACTER;
127 return;
128 }
129 uint16_t second = utf16[*index + 1];
130 if (!IsUTF16LowSurrogate(second)) {
131 // A High surrogate not followed by a low surrogate
132 utf16[*index] = UTF16_REPLACEMENT_CHARACTER;
133 return;
134 }
135 // A valid surrogate pair, decode normally
136 (*index)++;
137 return;
138 }
139 // A unicode not fallen into the range of representing by surrogate pair, return as it is
140 return;
141 }
142
HandleInvalidUTF16(uint16_t * utf16In,size_t utf16Len,size_t start)143 void HandleInvalidUTF16(uint16_t* utf16In, size_t utf16Len, size_t start)
144 {
145 if (utf16In == nullptr) {
146 return;
147 }
148 size_t end = start + utf16Len;
149 for (size_t i = start; i < end; ++i) {
150 RepalceUnpairedSurrogates(utf16In, end, &i);
151 }
152 }
153
UTF8Length(uint32_t codepoint)154 inline size_t UTF8Length(uint32_t codepoint)
155 {
156 if (codepoint <= UTF8_1B_MAX) {
157 return UtfLength::ONE;
158 }
159 if (codepoint <= UTF8_2B_MAX) {
160 return UtfLength::TWO;
161 }
162 if (codepoint <= UTF8_3B_MAX) {
163 return UtfLength::THREE;
164 }
165 return UtfLength::FOUR;
166 }
167
168 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t len,size_t index)169 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
170 {
171 size_t size = UTF8Length(codepoint);
172 if (index + size > len) {
173 return 0;
174 }
175 for (size_t j = size - 1; j > 0; j--) {
176 uint8_t cont = ((codepoint | BYTE_MARK) & BYTE_MASK);
177 utf8[index + j] = cont;
178 codepoint >>= UTF8_OFFSET;
179 }
180 utf8[index] = codepoint | FIRST_BYTE_MARK[size];
181 return size;
182 }
183
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length)184 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length)
185 {
186 size_t res = 1; // zero byte
187 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
188 // means that is a single code point, it needs to be represented by three UTF8 code.
189 if (length == 1 && utf16[0] >= HI_SURROGATE_MIN &&
190 utf16[0] <= LO_SURROGATE_MAX) {
191 res += UtfLength::THREE;
192 return res;
193 }
194
195 for (uint32_t i = 0; i < length; ++i) {
196 if (utf16[i] == 0) {
197 // do nothing
198 } else if (utf16[i] <= UTF8_1B_MAX) {
199 res += 1;
200 } else if (utf16[i] <= UTF8_2B_MAX) {
201 res += UtfLength::TWO;
202 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
203 } else if (utf16[i] < HI_SURROGATE_MIN || utf16[i] > HI_SURROGATE_MAX) {
204 res += UtfLength::THREE;
205 } else {
206 if (i < length - 1 &&
207 utf16[i + 1] >= LO_SURROGATE_MIN &&
208 utf16[i + 1] <= LO_SURROGATE_MAX) {
209 res += UtfLength::FOUR;
210 ++i;
211 } else {
212 res += UtfLength::THREE;
213 }
214 }
215 }
216 return res;
217 }
218
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)219 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
220 size_t start)
221 {
222 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
223 return 0;
224 }
225 size_t utf8Pos = 0;
226 size_t end = start + utf16Len;
227 for (size_t i = start; i < end; ++i) {
228 uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
229 if (codepoint == 0) {
230 continue;
231 }
232 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
233 }
234 return utf8Pos;
235 }
236
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)237 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
238 size_t start)
239 {
240 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
241 return 0;
242 }
243 size_t utf8Pos = 0;
244 size_t end = start + utf16Len;
245 for (size_t i = start; i < end; ++i) {
246 uint32_t codepoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
247 if (codepoint == 0) {
248 continue;
249 }
250 utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
251 }
252 return utf8Pos;
253 }
254
255 // drop the tail bytes if the remain length can't fill the length it represents.
FixUtf8Len(const uint8_t * utf8,size_t utf8Len)256 static size_t FixUtf8Len(const uint8_t* utf8, size_t utf8Len)
257 {
258 size_t trimSize = 0;
259 if (utf8Len >= 1 && utf8[utf8Len - 1] >= 0xC0) {
260 // The last one char claim there are more than 1 byte next to it, it's invalid, so drop the last one.
261 trimSize = 1;
262 }
263 if (utf8Len >= CONST_2 && utf8[utf8Len - CONST_2] >= 0xE0) {
264 // The second to last char claim there are more than 2 bytes next to it, it's invalid, so drop the last two.
265 trimSize = CONST_2;
266 }
267 if (utf8Len >= CONST_3 && utf8[utf8Len - CONST_3] >= 0xF0) {
268 // The third to last char claim there are more than 3 bytes next to it, it's invalid, so drop the last three.
269 trimSize = CONST_3;
270 }
271 return utf8Len - trimSize;
272 }
273
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)274 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
275 {
276 size_t safeUtf8Len = FixUtf8Len(utf8, utf8Len);
277 size_t in_pos = 0;
278 size_t res = 0;
279 while (in_pos < safeUtf8Len) {
280 uint8_t src = utf8[in_pos];
281 switch (src & 0xF0) {
282 case 0xF0: {
283 const uint8_t c2 = utf8[++in_pos];
284 const uint8_t c3 = utf8[++in_pos];
285 const uint8_t c4 = utf8[++in_pos];
286 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
287 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
288 if (codePoint >= SURROGATE_RAIR_START) {
289 res += CONST_2;
290 } else {
291 res++;
292 }
293 in_pos++;
294 break;
295 }
296 case 0xE0: {
297 in_pos += CONST_3;
298 res++;
299 break;
300 }
301 case 0xD0:
302 case 0xC0: {
303 in_pos += CONST_2;
304 res++;
305 break;
306 }
307 default:
308 do {
309 in_pos++;
310 res++;
311 } while (in_pos < safeUtf8Len && utf8[in_pos] < 0x80);
312 break;
313 }
314 }
315 // The remain chars should be treated as single byte char.
316 res += utf8Len - in_pos;
317 return res;
318 }
319
// Returns out_pos from the enclosing function when fewer than two free slots remain in an
// output buffer of utf16Len code units, i.e. when a surrogate pair would not fit.
// The test is written as "out_pos + 1 >= utf16Len" rather than "out_pos >= utf16Len - 1" so
// that utf16Len == 0 cannot wrap around and let the caller write into an empty buffer.
#define CHECK_OUT_POS_RETURN(out_pos, utf16Len) \
    do {                                        \
        if ((out_pos) + 1 >= (utf16Len)) {      \
            return (out_pos);                   \
        }                                       \
    } while (0)
326
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len)327 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len)
328 {
329 size_t safeUtf8Len = FixUtf8Len(utf8In, utf8Len);
330 size_t in_pos = 0;
331 size_t out_pos = 0;
332 while (in_pos < safeUtf8Len && out_pos < utf16Len) {
333 uint8_t src = utf8In[in_pos];
334 switch (src & 0xF0) {
335 case 0xF0: {
336 const uint8_t c2 = utf8In[++in_pos];
337 const uint8_t c3 = utf8In[++in_pos];
338 const uint8_t c4 = utf8In[++in_pos];
339 uint32_t codePoint = ((src & LOW_3BITS) << OFFSET_18POS) | ((c2 & LOW_6BITS) << OFFSET_12POS) |
340 ((c3 & LOW_6BITS) << OFFSET_6POS) | (c4 & LOW_6BITS);
341 if (codePoint >= SURROGATE_RAIR_START) {
342 CHECK_OUT_POS_RETURN(out_pos, utf16Len);
343 codePoint -= SURROGATE_RAIR_START;
344 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
345 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
346 } else {
347 utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
348 }
349 in_pos++;
350 break;
351 }
352 case 0xE0: {
353 const uint8_t c2 = utf8In[++in_pos];
354 const uint8_t c3 = utf8In[++in_pos];
355 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_4BITS) << OFFSET_12POS) |
356 ((c2 & LOW_6BITS) << OFFSET_6POS) | (c3 & LOW_6BITS));
357 in_pos++;
358 break;
359 }
360 case 0xD0:
361 case 0xC0: {
362 const uint8_t c2 = utf8In[++in_pos];
363 utf16Out[out_pos++] = static_cast<uint16_t>(((src & LOW_5BITS) << OFFSET_6POS) | (c2 & LOW_6BITS));
364 in_pos++;
365 break;
366 }
367 default:
368 do {
369 utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
370 } while (in_pos < safeUtf8Len && out_pos < utf16Len && utf8In[in_pos] < 0x80);
371 break;
372 }
373 }
374 // The remain chars should be treated as single byte char.
375 while (in_pos < utf8Len && out_pos < utf16Len) {
376 utf16Out[out_pos++] = static_cast<uint16_t>(utf8In[in_pos++]);
377 }
378 return out_pos;
379 }
380
IsIndexInPairedSurrogates(int32_t index,const std::u16string & utf16)381 bool IsIndexInPairedSurrogates(int32_t index, const std::u16string& utf16)
382 {
383 uint16_t len = utf16.length();
384 if (len == 0 || index <= 0 || index >= static_cast<int32_t>(len)) {
385 return false;
386 }
387 // A valid surrogate pair should always start with a High Surrogate
388 if (IsUTF16HighSurrogate(utf16[index - 1]) && IsUTF16LowSurrogate(utf16[index])) {
389 return true;
390 }
391
392 return false;
393 }
394
Utf16ToUtf32Size(const uint16_t * utf16,uint32_t length)395 size_t Utf16ToUtf32Size(const uint16_t *utf16, uint32_t length)
396 {
397 size_t res = 1; // zero byte
398 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
399 // means that is a single code point, it needs to be represented by 1 UTF32 code.
400 if (length == 1 && utf16[0] >= HI_SURROGATE_MIN &&
401 utf16[0] <= LO_SURROGATE_MAX) {
402 res += UtfLength::ONE;
403 return res;
404 }
405
406 for (uint32_t i = 0; i < length; ++i) {
407 if (utf16[i] == 0) {
408 // do nothing
409 continue;
410 }
411 if (utf16[i] >= HI_SURROGATE_MIN && utf16[i] <= HI_SURROGATE_MAX) {
412 if (i < length - 1 &&
413 utf16[i + 1] >= LO_SURROGATE_MIN &&
414 utf16[i + 1] <= LO_SURROGATE_MAX) {
415 ++i;
416 }
417 }
418 res += UtfLength::ONE;
419 }
420 return res;
421 }
422
UTF32Length(uint32_t codepoint)423 inline size_t UTF32Length(uint32_t codepoint)
424 {
425 return UtfLength::ONE;
426 }
427
EncodeUTF32(uint32_t codepoint,uint32_t * utf32,size_t len,size_t index)428 size_t EncodeUTF32(uint32_t codepoint, uint32_t *utf32, size_t len, size_t index)
429 {
430 size_t size = UTF32Length(codepoint);
431 if (index + size > len) {
432 return 0;
433 }
434 utf32[index] = codepoint;
435 return size;
436 }
437
ConvertRegionUtf16ToUtf32(const uint16_t * utf16In,uint32_t * utf32Out,size_t utf16Len,size_t utf32Len,size_t start)438 size_t ConvertRegionUtf16ToUtf32(const uint16_t *utf16In, uint32_t *utf32Out, size_t utf16Len, size_t utf32Len,
439 size_t start)
440 {
441 if (utf16In == nullptr || utf32Out == nullptr || utf32Len == 0) {
442 return 0;
443 }
444 size_t utf32Pos = 0;
445 size_t end = start + utf16Len;
446 for (size_t i = start; i < end; ++i) {
447 uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
448 if (codepoint == 0) {
449 continue;
450 }
451 utf32Pos += EncodeUTF32(codepoint, utf32Out, utf32Len, utf32Pos);
452 }
453 return utf32Pos;
454 }
455
Utf32ToUtf16Size(const uint32_t * utf32,uint32_t length)456 size_t Utf32ToUtf16Size(const uint32_t *utf32, uint32_t length)
457 {
458 size_t res = 1; // zero byte
459
460 for (uint32_t i = 0; i < length; ++i) {
461 if (utf32[i] == 0) {
462 // do nothing
463 } else if (utf32[i] < SURROGATE_RAIR_START) {
464 res += UtfLength::ONE;
465 } else {
466 res += UtfLength::TWO;
467 }
468 }
469 return res;
470 }
471
ConvertRegionUtf32ToUtf16(const uint32_t * utf32In,uint16_t * utf16Out,size_t utf32Len,size_t utf16Len)472 size_t ConvertRegionUtf32ToUtf16(const uint32_t *utf32In, uint16_t *utf16Out, size_t utf32Len, size_t utf16Len)
473 {
474 size_t in_pos = 0;
475 size_t out_pos = 0;
476 while (in_pos < utf32Len && out_pos < utf16Len) {
477 uint32_t codePoint = utf32In[in_pos];
478 if (codePoint >= SURROGATE_RAIR_START) {
479 CHECK_OUT_POS_RETURN(out_pos, utf16Len);
480 codePoint -= SURROGATE_RAIR_START;
481 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint >> OFFSET_10POS) | H_SURROGATE_START);
482 utf16Out[out_pos++] = static_cast<uint16_t>((codePoint & 0x3FF) | L_SURROGATE_START);
483 } else {
484 utf16Out[out_pos++] = static_cast<uint16_t>(codePoint);
485 }
486 in_pos++;
487 }
488 // The remain chars should be treated as single byte char.
489 while (in_pos < utf32Len && out_pos < utf16Len) {
490 utf16Out[out_pos++] = static_cast<uint16_t>(utf32In[in_pos++]);
491 }
492 return out_pos;
493 }
494
Str8ToStr16(const std::string & str)495 std::u16string Str8ToStr16(const std::string& str)
496 {
497 if (str.empty()) {
498 return u"";
499 }
500 if (str == DEFAULT_STR) {
501 return DEFAULT_U16STR;
502 }
503 const uint8_t* buf8 = reinterpret_cast<const uint8_t*>(str.c_str());
504 size_t utf8Len = str.size();
505 auto utf16Len = Utf8ToUtf16Size(buf8, utf8Len);
506 std::unique_ptr<uint16_t[]> pBuf16 = std::make_unique<uint16_t[]>(utf16Len);
507 uint16_t *buf16 = pBuf16.get();
508 auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16, utf8Len, utf16Len);
509 if (resultLen == utf16Len) {
510 return std::u16string(reinterpret_cast<const char16_t*>(buf16), utf16Len);
511 }
512 return u"";
513 }
514
515 // Illegal bytes are replaced with U+FFFD
Str8DebugToStr16(const std::string & str)516 std::u16string Str8DebugToStr16(const std::string& str)
517 {
518 if (str.empty()) {
519 return u"";
520 }
521 if (str == DEFAULT_STR) {
522 return DEFAULT_U16STR;
523 }
524 icu::UnicodeString ustring = icu::UnicodeString::fromUTF8(str);
525 return std::u16string(ustring.getBuffer(), static_cast<size_t>(ustring.length()));
526 }
527
Str16ToStr8(const std::u16string & str)528 std::string Str16ToStr8(const std::u16string& str)
529 {
530 if (str.empty()) {
531 return "";
532 }
533 if (str == DEFAULT_U16STR) {
534 return DEFAULT_STR;
535 }
536 const uint16_t* buf16 = reinterpret_cast<const uint16_t*>(str.c_str());
537 size_t utf16Len = str.size();
538 auto utf8Len = Utf16ToUtf8Size(buf16, utf16Len) - 1;
539 std::unique_ptr<uint8_t[]> pBuf8 = std::make_unique<uint8_t[]>(utf8Len);
540 uint8_t *buf8 = pBuf8.get();
541 auto resultLen = ConvertRegionUtf16ToUtf8(buf16, buf8, utf16Len, utf8Len, 0);
542 if (resultLen == utf8Len) {
543 return std::string(reinterpret_cast<const char*>(buf8), utf8Len);
544 }
545 return "";
546 }
547
548 // Unpaired surrogates are replace with U+FFFD
Str16DebugToStr8(const std::u16string & str)549 std::string Str16DebugToStr8(const std::u16string& str)
550 {
551 if (str.empty()) {
552 return "";
553 }
554 if (str == DEFAULT_U16STR) {
555 return DEFAULT_STR;
556 }
557 const uint16_t* buf16 = reinterpret_cast<const uint16_t*>(str.c_str());
558 size_t utf16Len = str.size();
559 auto utf8Len = Utf16ToUtf8Size(buf16, utf16Len) - 1;
560 std::unique_ptr<uint8_t[]> pBuf8 = std::make_unique<uint8_t[]>(utf8Len);
561 uint8_t *buf8 = pBuf8.get();
562 auto resultLen = DebuggerConvertRegionUtf16ToUtf8(buf16, buf8, utf16Len, utf8Len, 0);
563 if (resultLen == utf8Len) {
564 return std::string(reinterpret_cast<const char*>(buf8), utf8Len);
565 }
566 return "";
567 }
568
Str16ToStr32(const std::u16string & str)569 std::u32string Str16ToStr32(const std::u16string& str)
570 {
571 if (str.empty()) {
572 return U"";
573 }
574 if (str == DEFAULT_U16STR) {
575 return DEFAULT_U32STR;
576 }
577 const uint16_t* buf16 = reinterpret_cast<const uint16_t*>(str.c_str());
578 size_t utf16Len = str.size();
579 auto utf32Len = Utf16ToUtf32Size(buf16, utf16Len) - 1;
580 std::unique_ptr<uint32_t[]> pBuf32 = std::make_unique<uint32_t[]>(utf32Len);
581 uint32_t *buf32 = pBuf32.get();
582 auto resultLen = ConvertRegionUtf16ToUtf32(buf16, buf32, utf16Len, utf32Len, 0);
583 if (resultLen == utf32Len) {
584 return std::u32string(reinterpret_cast<const char32_t*>(buf32), utf32Len);
585 }
586 return U"";
587 }
588
Str32ToStr16(const std::u32string & str)589 std::u16string Str32ToStr16(const std::u32string& str)
590 {
591 if (str.empty()) {
592 return u"";
593 }
594 if (str == DEFAULT_U32STR) {
595 return DEFAULT_U16STR;
596 }
597 const uint32_t* buf32 = reinterpret_cast<const uint32_t*>(str.c_str());
598 size_t utf32Len = str.size();
599 auto utf16Len = Utf32ToUtf16Size(buf32, utf32Len) - 1;
600 std::unique_ptr<uint16_t[]> pBuf16 = std::make_unique<uint16_t[]>(utf16Len);
601 uint16_t *buf16 = pBuf16.get();
602 auto resultLen = ConvertRegionUtf32ToUtf16(buf32, buf16, utf32Len, utf16Len);
603 if (resultLen == utf16Len) {
604 return std::u16string(reinterpret_cast<const char16_t*>(buf16), utf16Len);
605 }
606 return u"";
607 }
608
609 } // namespace OHOS::Ace::UtfUtils
610