• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "ecmascript/base/utf_helper.h"
17 #include "ecmascript/tests/test_helper.h"
18 
19 using namespace panda::ecmascript;
20 using namespace panda::ecmascript::base;
21 using namespace panda::ecmascript::base::utf_helper;
22 
23 namespace panda::test {
24 class UtfHelperTest : public BaseTestWithScope<false> {
25 };
26 
27 /*
28 * @tc.name: CombineTwoU16
29 * @tc.desc: Enter a pair of UTF16-encoded surrogate pair corresponding to the lead surrogates and trail surrogates,
30 *           and return the corresponding Unicode codepoint value.
31 * @tc.type: FUNC
32 */
HWTEST_F_L0(UtfHelperTest, CombineTwoU16)
{
    // Reference formula the helper is checked against:
    // ((lead - 0xD800) << 10 | (trail - 0xDC00)) + 0x10000.
    auto referenceCodePoint = [](uint16_t lead, uint16_t trail) -> uint32_t {
        return static_cast<uint32_t>(((lead - 0xD800U) << 10) | (trail - 0xDC00U)) + 0x10000U;
    };

    struct SurrogateCase {
        uint16_t lead;
        uint16_t trail;
        uint32_t codePoint;
    };
    // The four corners of the surrogate ranges plus one value from the middle.
    const SurrogateCase cases[] = {
        {0xD800, 0xDC00, 0x10000U},
        {0xD800, 0xDFFF, 0x103FFU},
        {0xDBFF, 0xDFFF, 0x10FFFFU},
        {0xDBFF, 0xDC00, 0x10FC00U},
        {0xD950, 0xDF21, 0x64321U},
    };

    for (const SurrogateCase &item : cases) {
        const uint32_t expected = referenceCodePoint(item.lead, item.trail);
        // The helper must agree with the reference formula ...
        EXPECT_EQ(expected, CombineTwoU16(item.lead, item.trail));
        // ... and the formula must yield the hand-computed code point.
        EXPECT_EQ(expected, item.codePoint);
    }
}
63 
/*
* @tc.name: UTF16Decode
* @tc.desc: Given the lead surrogate and trail surrogate of a UTF16-encoded surrogate pair,
*           decode them into the corresponding Unicode code point value and return it.
* @tc.type: FUNC
*/
HWTEST_F_L0(UtfHelperTest, UTF16Decode)
{
    struct DecodeCase {
        uint16_t lead;
        uint16_t trail;
        uint32_t decoded;
    };
    const DecodeCase cases[] = {
        {0xD950, 0xDF21, 0x64321U},
        {0xD85D, 0xDFCC, 0x277CCU},
    };

    for (const DecodeCase &item : cases) {
        // Sanity-check that the inputs really are a lead/trail surrogate before decoding them.
        EXPECT_TRUE(item.lead >= DECODE_LEAD_LOW && item.lead <= DECODE_LEAD_HIGH);
        EXPECT_TRUE(item.trail >= DECODE_TRAIL_LOW && item.trail <= DECODE_TRAIL_HIGH);
        const uint32_t codePoint = utf_helper::UTF16Decode(item.lead, item.trail);
        EXPECT_EQ(codePoint, item.decoded);
    }
}
85 
86 /*
87  * @tc.name: IsValidUTF8
88  * @tc.desc: Judge whether an input group of symbols is a valid UTF8 coding sequence.
89  * @tc.type: FUNC
90  */
HWTEST_F_L0(UtfHelperTest,IsValidUTF8)91 HWTEST_F_L0(UtfHelperTest, IsValidUTF8)
92 {
93     // 0xxxxxxx, min:0, max:127
94     const std::vector<uint8_t> utfDataOneBitVaild1 = {0x00};
95     const std::vector<uint8_t> utfDataOneBitVaild2 = {BIT_MASK_1 - 0x01};
96     const std::vector<uint8_t> utfDataOneBitInvaild = {BIT_MASK_1};
97     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataOneBitVaild1));
98     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataOneBitVaild2));
99     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataOneBitInvaild));
100     // 110xxxxx 10xxxxxx, min:128, max:2047
101     const std::vector<uint8_t> utfDataTwoBitVaild1 = {BIT_MASK_2 + 0x02, BIT_MASK_1};
102     const std::vector<uint8_t> utfDataTwoBitVaild2 = {BIT_MASK_3 - 0x01, BIT_MASK_2 - 0x01};
103     const std::vector<uint8_t> utfDataTwoBitInvaild1 = {BIT_MASK_2, BIT_MASK_2};
104     const std::vector<uint8_t> utfDataTwoBitInvaild2 = {BIT_MASK_3, BIT_MASK_1};
105     const std::vector<uint8_t> utfDataTwoBitInvaild3 = {BIT_MASK_2, BIT_MASK_1};
106     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataTwoBitVaild1));
107     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataTwoBitVaild2));
108     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataTwoBitInvaild1));
109     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataTwoBitInvaild2));
110     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataTwoBitInvaild3));
111     // 1110xxxx 10xxxxxx 10xxxxxx, min:2048, max:65535
112     const std::vector<uint8_t> utfDataThreeBitVaild1 = {BIT_MASK_3, BIT_MASK_1 + 0x20, BIT_MASK_1};
113     const std::vector<uint8_t> utfDataThreeBitVaild2 = {BIT_MASK_4 - 0x01, BIT_MASK_2 - 0x01, BIT_MASK_2 - 0x01};
114     const std::vector<uint8_t> utfDataThreeBitVaild3 = {BIT_MASK_3 + 0x01, BIT_MASK_1, BIT_MASK_1};
115     const std::vector<uint8_t> utfDataThreeBitInvaild1 = {BIT_MASK_3, BIT_MASK_1, BIT_MASK_2};
116     const std::vector<uint8_t> utfDataThreeBitInvaild2 = {BIT_MASK_3, BIT_MASK_2, BIT_MASK_1};
117     const std::vector<uint8_t> utfDataThreeBitInvaild3 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_1};
118     const std::vector<uint8_t> utfDataThreeBitInvaild4 = {BIT_MASK_4, BIT_MASK_2, BIT_MASK_2};
119     const std::vector<uint8_t> utfDataThreeBitInvaild5 = {BIT_MASK_3, BIT_MASK_1, BIT_MASK_1};
120     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataThreeBitVaild1));
121     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataThreeBitVaild2));
122     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataThreeBitVaild3));
123     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild1));
124     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild2));
125     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild3));
126     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild4));
127     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataThreeBitInvaild5));
128     // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx, min:65536, max:1114111(0x10FFFF)
129     const std::vector<uint8_t> utfDataFourBitVaild1 = {BIT_MASK_4, BIT_MASK_1 + 0x10, BIT_MASK_1, BIT_MASK_1};
130     const std::vector<uint8_t> utfDataFourBitVaild3 = {BIT_MASK_4 + 0x01, BIT_MASK_1, BIT_MASK_1, BIT_MASK_1};
131     const std::vector<uint8_t> utfDataFourBitInvaild1 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_1, BIT_MASK_2};
132     const std::vector<uint8_t> utfDataFourBitInvaild2 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_2, BIT_MASK_1};
133     const std::vector<uint8_t> utfDataFourBitInvaild3 = {BIT_MASK_4, BIT_MASK_2, BIT_MASK_1, BIT_MASK_1};
134     const std::vector<uint8_t> utfDataFourBitInvaild4 = {BIT_MASK_5, BIT_MASK_1, BIT_MASK_1, BIT_MASK_1};
135     const std::vector<uint8_t> utfDataFourBitInvaild5 = {BIT_MASK_5, BIT_MASK_2, BIT_MASK_2, BIT_MASK_2};
136     const std::vector<uint8_t> utfDataFourBitInvaild6 = {BIT_MASK_4, BIT_MASK_1, BIT_MASK_1, BIT_MASK_1};
137     const std::vector<uint8_t> utfDataFourBitInvaild7 =
138         {BIT_MASK_5 - 0x01, BIT_MASK_2 - 0x01, BIT_MASK_2 - 0x01, BIT_MASK_2 - 0x01};
139     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataFourBitVaild1));
140     EXPECT_TRUE(utf_helper::IsValidUTF8(utfDataFourBitVaild3));
141     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild1));
142     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild2));
143     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild3));
144     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild4));
145     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild5));
146     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild6));
147     EXPECT_FALSE(utf_helper::IsValidUTF8(utfDataFourBitInvaild7));
148 }
149 
150 /*
151 * @tc.name: ConvertUtf16ToUtf8
152 * @tc.desc: Converts a UTF16 encoding sequence encoding a character into a UTF8 encoding sequence,
153 *           and returns the sequence and the byte length of the sequence. The parameter "modify"
154 *           indicates whether to perform special conversion for 0.
155 * @tc.type: FUNC
156 */
HWTEST_F_L0(UtfHelperTest,ConvertUtf16ToUtf8_001)157 HWTEST_F_L0(UtfHelperTest, ConvertUtf16ToUtf8_001)
158 {
159     // codePoint lie in [0,0x7F]--->UTF-8(length:1)
160     {
161         uint16_t utf16Data0 = 0x00;
162         uint16_t utf16Data1 = 0x00;
163         Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false);
164         Utf8Char utf8CharTemp = {0, {0x00U}};
165         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
166         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
167     }
168 
169     // special case for \u0000 ==> Co80- 1100'0000 1000'0000
170     {
171         uint16_t utf16Data0 = 0x00;
172         uint16_t utf16Data1 = 0x00;
173         Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, true);
174         Utf8Char utf8CharTemp = {2, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
175         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
176         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
177         utf16Data0 = 0x7F;
178         utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false);
179         utf8CharTemp = {1, {0x7F}};
180         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
181         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
182 
183         // codePoint lie in [0x80,0x7FF]--> UTF-8(length:2)
184         utf16Data0 = 0x80;
185         utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false);
186         utf8CharTemp = {2, {UTF8_2B_FIRST + 0x02U, UTF8_2B_SECOND}};
187         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
188         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
189         utf16Data0 = 0x7FF;
190         utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false);
191         utf8CharTemp = {2, {BIT_MASK_3 - 0x01, BIT_MASK_2 - 0x01}};
192         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
193         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
194     }
195 
196     // codePoint lie in [0xD800,0xDFFF]--> UTF-8(length:3)
197     {
198         uint16_t utf16Data0 = 0xD800;
199         uint16_t utf16Data1 = 0x00;
200         Utf8Char utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false);
201         Utf8Char utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xD800 >> 12),
202                             UTF8_3B_SECOND | (static_cast<uint8_t>(0xD800 >> 6) & utf::MASK_6BIT),
203                             UTF8_3B_THIRD | (static_cast<uint8_t>(0xD800) & utf::MASK_6BIT)}};
204         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
205         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
206         utf16Data0 = 0xDFFF;
207         utf8Char = ConvertUtf16ToUtf8(utf16Data0, utf16Data1, false);
208         utf8CharTemp = {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xDFFF >> 12),
209                             UTF8_3B_SECOND | (static_cast<uint8_t>(0xDFFF >> 6) & utf::MASK_6BIT),
210                             UTF8_3B_THIRD | (static_cast<uint8_t>(0xDFFF) & utf::MASK_6BIT)}};
211         EXPECT_EQ(utf8Char.n, utf8CharTemp.n);
212         EXPECT_EQ(utf8Char.ch, utf8CharTemp.ch);
213     }
214 }
215 
HWTEST_F_L0(UtfHelperTest, ConvertUtf16ToUtf8_002)
{
    // Encodes one UTF16 code unit with "modify" off; the second unit is always 0 here.
    auto encodeUnit = [](uint16_t codeUnit) -> Utf8Char {
        uint16_t noTrail = 0x00;
        return ConvertUtf16ToUtf8(codeUnit, noTrail, false);
    };
    // Compares both the reported byte count and the byte array.
    auto expectSame = [](const Utf8Char &actual, const Utf8Char &expected) {
        EXPECT_EQ(actual.n, expected.n);
        EXPECT_EQ(actual.ch, expected.ch);
    };

    // codePoint lie in [0x800,0xD7FF]&&[0xE000,0xFFFF] --> UTF-8(length:3); both ends of each range are checked.
    expectSame(encodeUnit(0x800),
               {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0x800 >> 12),
                    UTF8_3B_SECOND | (static_cast<uint8_t>(0x800 >> 6) & utf::MASK_6BIT),
                    UTF8_3B_THIRD | (static_cast<uint8_t>(0x800) & utf::MASK_6BIT)}});
    expectSame(encodeUnit(0xD7FF),
               {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xD7FF >> 12),
                    UTF8_3B_SECOND | (static_cast<uint8_t>(0xD7FF >> 6) & utf::MASK_6BIT),
                    UTF8_3B_THIRD | (static_cast<uint8_t>(0xD7FF) & utf::MASK_6BIT)}});
    expectSame(encodeUnit(0xE000),
               {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xE000 >> 12),
                    UTF8_3B_SECOND | (static_cast<uint8_t>(0xE000 >> 6) & utf::MASK_6BIT),
                    UTF8_3B_THIRD | (static_cast<uint8_t>(0xE000) & utf::MASK_6BIT)}});
    expectSame(encodeUnit(0xFFFF),
               {3, {UTF8_3B_FIRST | static_cast<uint8_t>(0xFFFF >> 12),
                    UTF8_3B_SECOND | (static_cast<uint8_t>(0xFFFF >> 6) & utf::MASK_6BIT),
                    UTF8_3B_THIRD | (static_cast<uint8_t>(0xFFFF) & utf::MASK_6BIT)}});
}
249 
HWTEST_F_L0(UtfHelperTest, ConvertUtf16ToUtf8_003)
{
    // Builds the 4-byte encoding the test expects for a code point in [0x10000,0x10FFFF].
    auto fourByteForm = [](uint32_t codePoint) -> Utf8Char {
        return {4, {static_cast<uint8_t>((codePoint >> 18) | UTF8_4B_FIRST),
                    static_cast<uint8_t>(((codePoint >> 12) & utf::MASK_6BIT) | utf::MASK1),
                    static_cast<uint8_t>(((codePoint >> 6) & utf::MASK_6BIT) | utf::MASK1),
                    static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1)}};
    };
    // Compares both the reported byte count and the byte array.
    auto expectSame = [](const Utf8Char &actual, const Utf8Char &expected) {
        EXPECT_EQ(actual.n, expected.n);
        EXPECT_EQ(actual.ch, expected.ch);
    };

    // codePoint lie in [0x10000,0x10FFFF] --> UTF-8(length:4); lowest surrogate pair.
    {
        uint16_t lead = 0xD800;
        uint16_t trail = 0xDC00;
        const Utf8Char actual = ConvertUtf16ToUtf8(lead, trail, false);
        expectSame(actual, fourByteForm(CombineTwoU16(lead, trail)));
    }

    // 0xD950 0xDF21 --> 0x64321 --> 0xf1 0xa4 0x8c 0xa1
    {
        uint16_t lead = 0xD950;
        uint16_t trail = 0xDF21;
        const Utf8Char actual = ConvertUtf16ToUtf8(lead, trail, false);
        expectSame(actual, fourByteForm(CombineTwoU16(lead, trail)));
        // The same result, spelled out as literal bytes.
        expectSame(actual, {4, {0xf1, 0xa4, 0x8c, 0xa1}});
    }
}
283 
/*
* @tc.name: Utf16ToUtf8Size
* @tc.desc: Enter a string of UTF16 coded sequences and return the length of the sequence converted into UTF8 coded
*           sequences. "length" indicates the length of the input UTF16 sequence, and "modify" indicates whether
*           to perform special conversion for 0.
* @tc.type: FUNC
*/
290 */
HWTEST_F_L0(UtfHelperTest,Utf16ToUtf8Size_001)291 HWTEST_F_L0(UtfHelperTest, Utf16ToUtf8Size_001)
292 {
293     // when utf16 data length is only 1 and code in 0xd800-0xdfff, means that is a single code point, it needs to be
294     // represented by three UTF8 code.
295     uint32_t length = 0;
296     uint16_t utf16Value1[1] = {0xD800};
297     const uint16_t *utf16ValuePtr1 = utf16Value1;
298     length = Utf16ToUtf8Size(utf16ValuePtr1, 1, false);
299     EXPECT_EQ(length - 1, UtfLength::THREE);
300     length = 1;
301     uint16_t utf16Value2[1] = {0xDFFF};
302     const uint16_t *utf16ValuePtr2 = utf16Value2;
303     length = Utf16ToUtf8Size(utf16ValuePtr2, 1, false);
304     EXPECT_EQ(length - 1, UtfLength::THREE);
305 
306     // special case for U+0000 => c0 80
307     uint16_t utf16Value3[1] = {0x00};
308     const uint16_t *utf16ValuePtr3 = utf16Value3;
309     length = Utf16ToUtf8Size(utf16ValuePtr3, 1, false);
310     EXPECT_EQ(length - 1, 0U);
311     length = Utf16ToUtf8Size(utf16ValuePtr3, 1, true);
312     EXPECT_EQ(length - 1, 2U);
313 
314     // if isGetBufferSize is true, special case for U+0000 => 00
315     uint16_t utf16Value12[1] = {0x00};
316     const uint16_t *utf16ValuePtr12 = utf16Value12;
317     length = Utf16ToUtf8Size(utf16ValuePtr12, 1, false, true);
318     EXPECT_EQ(length - 1, 1U);
319     length = Utf16ToUtf8Size(utf16ValuePtr12, 1, true, true);
320     EXPECT_EQ(length - 1, 1U);
321 
322     // code point lie in [0x00, 0x7F], it needs to be represented by one UTF8 code.
323     uint16_t utf16Value4[1] = {0x00};
324     uint16_t utf16Value5[1] = {0x7F};
325     const uint16_t *utf16ValuePtr4 = utf16Value4;
326     const uint16_t *utf16ValuePtr5 = utf16Value5;
327     length = Utf16ToUtf8Size(utf16ValuePtr4, 1, false);
328     EXPECT_EQ(length - 1, 0U);
329     length = Utf16ToUtf8Size(utf16ValuePtr5, 1, false);
330     EXPECT_EQ(length - 1, 1U);
331 
332     // code point lie in [0x80, 0x7FF], it needs to be represented by two UTF8 code,
333     uint16_t utf16Value6[1] = {0x80};
334     uint16_t utf16Value7[1] = {0x7FF};
335     const uint16_t *utf16ValuePtr6 = utf16Value6;
336     const uint16_t *utf16ValuePtr7 = utf16Value7;
337     length = Utf16ToUtf8Size(utf16ValuePtr6, 1, false);
338     EXPECT_EQ(length - 1, 2U);
339     length = Utf16ToUtf8Size(utf16ValuePtr7, 1, false);
340     EXPECT_EQ(length - 1, 2U);
341     // code point lie in [0x800, 0xD7FF] or [0xDCoo, 0xFFFF], it needs to be represented by three UTF8 code.
342     uint16_t utf16Value8[1] = {0x800};
343     uint16_t utf16Value9[1] = {0xD7FF};
344     uint16_t utf16Value10[1] = {0xDC00};
345     uint16_t utf16Value11[1] = {0xFFFF};
346     const uint16_t *utf16ValuePtr8 = utf16Value8;
347     const uint16_t *utf16ValuePtr9 = utf16Value9;
348     const uint16_t *utf16ValuePtr10 = utf16Value10;
349     const uint16_t *utf16ValuePtr11 = utf16Value11;
350     length = Utf16ToUtf8Size(utf16ValuePtr8, 1, false);
351     EXPECT_EQ(length - 1, 3U);
352     length = Utf16ToUtf8Size(utf16ValuePtr9, 1, false);
353     EXPECT_EQ(length - 1, 3U);
354     length = Utf16ToUtf8Size(utf16ValuePtr10, 1, false);
355     EXPECT_EQ(length-1, 3U);
356     length = Utf16ToUtf8Size(utf16ValuePtr11, 1, false);
357     EXPECT_EQ(length - 1, 3U);
358 }
359 
HWTEST_F_L0(UtfHelperTest, Utf16ToUtf8Size_002)
{
    // Size reported for a two-element UTF16 input with "modify" off. Expectations subtract 1 from the result.
    auto sizeOfPair = [](uint16_t first, uint16_t second) -> uint32_t {
        const uint16_t units[2] = {first, second};
        return Utf16ToUtf8Size(units, 2, false);
    };

    // The trail value is valid, located in [0xDC00, 0xDFFF]. It needs to be represented by four UTF8 code.
    EXPECT_EQ(sizeOfPair(0xD800, 0xDc00) - 1, 4U);
    EXPECT_EQ(sizeOfPair(0xD800, 0xDFFF) - 1, 4U);
    EXPECT_EQ(sizeOfPair(0xDBFF, 0xDC00) - 1, 4U);
    EXPECT_EQ(sizeOfPair(0xDBFF, 0xDFFF) - 1, 4U);

    // The trail value of Bad sequence is invalid, not located in [0xDC00, 0xDFFF].
    // Need to return 6 bytes length
    EXPECT_EQ(sizeOfPair(0xD800, 0xDBFF) - 1, 6U);
    EXPECT_EQ(sizeOfPair(0xDC00, 0xDFFF) - 1, 6U);

    // 0(or 2) + 1 + 2 + 3 + 4 = 10(or 12)
    const uint16_t mixedUnits[6] = {0x00, 0x7F, 0x80, 0x800, 0xD800, 0xDC00};
    const uint32_t plainSize = Utf16ToUtf8Size(mixedUnits, 6, false);
    EXPECT_EQ(plainSize - 1, 10U);
    const uint32_t modifiedSize = Utf16ToUtf8Size(mixedUnits, 6, true);
    EXPECT_EQ(modifiedSize - 1, 12U);
}
399 
/*
* @tc.name: ConvertUtf8ToUtf16Pair
* @tc.desc: Converts a UTF8 encoding sequence encoding a character into a UTF16 encoding sequence, and returns the
*           sequence and the byte length of the UTF8 encoding sequence. The parameter "combine" identifies whether
*           to return a surrogate pair for Unicode values in the supplementary planes, or the Unicode value itself.
* @tc.type: FUNC
*/
406 */
HWTEST_F_L0(UtfHelperTest,ConvertUtf8ToUtf16Pair)407 HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUtf16Pair)
408 {
409     // code point lie in [0x00, 0x7F], the length of utf8 code element byte is 1
410     uint8_t utf8Value1[1] = {0x00};
411     uint8_t utf8Value2[1] = {UTF8_1B_MAX};
412     const uint8_t *utf8ValuePtr1 = utf8Value1;
413     const uint8_t *utf8ValuePtr2 = utf8Value2;
414     std::pair<uint32_t, size_t> utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr1);
415     std::pair<uint32_t, size_t> utf16Value = {utf8Value1[0], 1};
416     EXPECT_EQ(utf16Res, utf16Value);
417     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr2);
418     utf16Value = {utf8Value2[0], 1};
419     EXPECT_EQ(utf16Res, utf16Value);
420     // code point lie in [0x80, 0x7FF], the length of utf8 code element byte is 2
421     uint8_t utf8Value3[2] = {0xc2, 0x80}; // 0x80
422     uint8_t utf8Value4[2] = {0xDF, 0xBF}; // 0x7FF
423     const uint8_t *utf8ValuePtr3 = utf8Value3;
424     const uint8_t *utf8ValuePtr4 = utf8Value4;
425     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr3);
426     utf16Value = {0x80, 2};
427     EXPECT_EQ(utf16Res, utf16Value);
428     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr4);
429     utf16Value = {0x7FF, 2};
430     EXPECT_EQ(utf16Res, utf16Value);
431 
432     // code point lie in [0x800, 0xD7FF] or [0xDC00,0xFFFF], the length of utf8 code element byte is 3.
433     // when code point lie in [0xD800, 0xDBFF], due to the use of UCS-2, it corresponds to 3 utf8 symbols.
434     uint8_t utf8Value5[3] = {0xE0, 0xA0, 0x80}; // 0x800
435     uint8_t utf8Value6[3] = {0xEF, 0xBF, 0xBF}; // 0xFFFF
436     const uint8_t *utf8ValuePtr5 = utf8Value5;
437     const uint8_t *utf8ValuePtr6 = utf8Value6;
438     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr5);
439     utf16Value = {0x800, 3};
440     EXPECT_EQ(utf16Res, utf16Value);
441     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr6);
442     utf16Value = {0xFFFF, 3};
443     EXPECT_EQ(utf16Res, utf16Value);
444     // code point lie in [0x10000, 0x10FFFF], the length of utf8 code element byte is 4.
445     uint8_t utf8Value9[4] = {0xF0, 0x90, 0x80, 0x80}; // 0x10000
446     uint8_t utf8Value10[4] = {0xF4, 0x8F, 0xBF, 0xBF}; // 0x10FFFF
447     const uint8_t *utf8ValuePtr9 = utf8Value9;
448     const uint8_t *utf8ValuePtr10 = utf8Value10;
449     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr9);
450     utf16Value = {0xD800 << 16 | 0xDC00U, 4};
451     EXPECT_EQ(utf16Res, utf16Value);
452     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr10);
453     utf16Value = {0xDBFF << 16 | 0xDFFF, 4};
454     EXPECT_EQ(utf16Res, utf16Value);
455     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr9, true);
456     utf16Value = {0x10000, 4};
457     EXPECT_EQ(utf16Res, utf16Value);
458     utf16Res = ConvertUtf8ToUtf16Pair(utf8ValuePtr10, true);
459     utf16Value = {0x10FFFF, 4};
460     EXPECT_EQ(utf16Res, utf16Value);
461 }
462 
463 /*
464 * @tc.name: Utf8ToUtf16Size
465 * @tc.desc: Enter a string of UTF8 coded sequences and return the length of the sequence converted into UTF16 coded
466 *           sequences.
467 * @tc.type: FUNC
468 */
HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size)
{
    // Every multi-byte input below ends with a 0x00 byte that is included in the length passed in,
    // which is why the expected counts are one larger than the character alone would need.

    // when code point lie in (0x00, 0xFFFF], the required utf16 code element length is 1.
    uint8_t singleNul[1] = {0x00};
    EXPECT_EQ(Utf8ToUtf16Size(singleNul, sizeof(singleNul)), 1U);
    uint8_t bmpMax[4] = {0xEF, 0xBF, 0xBF, 0x00}; // 0xFFFF
    EXPECT_EQ(Utf8ToUtf16Size(bmpMax, sizeof(bmpMax)), 2U);

    // when code point lie in [0x10000, 0x10FFFF], the required utf16 code element length is 2.
    const uint8_t supplementaryMin[5] = {0xF0, 0x90, 0x80, 0x80, 0x00}; // 0x10000
    EXPECT_EQ(Utf8ToUtf16Size(supplementaryMin, sizeof(supplementaryMin)), 3U);
    const uint8_t supplementaryMax[5] = {0xF4, 0x8F, 0xBF, 0xBF, 0x00}; // 0x10FFFF
    EXPECT_EQ(Utf8ToUtf16Size(supplementaryMax, sizeof(supplementaryMax)), 3U);

    // The three characters above back to back, followed by 0x00: 1 + 2 + 2 + 1 = 6
    uint8_t combined[12] = {
        0xEF, 0xBF, 0xBF,       // 0xFFFF
        0xF0, 0x90, 0x80, 0x80, // 0x10000
        0xF4, 0x8F, 0xBF, 0xBF, // 0x10FFFF
        0x00};
    EXPECT_EQ(Utf8ToUtf16Size(combined, sizeof(combined)), 6U);
}
497 
ConvertRegionUtf16ToUtf8Test(bool isDebugger=false)498 static void ConvertRegionUtf16ToUtf8Test(bool isDebugger = false)
499 {
500     size_t utf16Len = 8;
501     size_t utf8Len = 100;
502     size_t start = 0;
503     size_t utf8Pos = 0;
504     bool modify = false;
505     uint16_t utf16Value[8] = {
506         0x00, // 0 or 2 (special case for \u0000 ==> C080 - 1100'0000 1000'0000)
507         0x7F, // 1(0x00, 0x7F]
508         0x7FF, // 2 [0x80, 0x7FF]
509         0x800, // 3 [0x800, 0xD7FF]
510         0xD800, // 3 [0xD800, 0xDFFF]
511         0xFFFF, // 3 [0xE000, 0xFFFF]
512         0xD800, 0xDFFF}; // 4 [0x10000, 0x10FFFF]
513     const uint16_t *utf16ValuePtr = utf16Value;
514     uint8_t *utf8Out = (uint8_t*)malloc(utf8Len);
515     if (isDebugger) {
516         utf8Pos = DebuggerConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify);
517     } else {
518         utf8Pos = ConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify);
519     }
520     // 0 + 1 + 2 +(3 *3)+ 4= 16
521     EXPECT_EQ(utf8Pos, 16U);
522     // 2 + 1 + 2 +(3 * 3)+ 4 = 18
523     modify = true;
524     if (isDebugger) {
525         utf8Pos = DebuggerConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify);
526     } else {
527         utf8Pos = ConvertRegionUtf16ToUtf8(utf16ValuePtr, utf8Out, utf16Len, utf8Len, start, modify);
528     }
529     EXPECT_EQ(utf8Pos, 18U);
530     free(utf8Out);
531 }
532 
/*
* @tc.name: ConvertRegionUtf16ToUtf8
* @tc.desc: Input a UTF16-encoded sequence (the length is "utf16Len"), convert part of the sequence into a
*           UTF8-encoded sequence, and save it to "utf8Out" (the maximum length is "utf8Len"). The "start" parameter
*           indicates the start position of the conversion. The "modify" parameter indicates whether to perform
*           special processing for 0.
* @tc.type: FUNC
*/
539 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf16ToUtf8)
{
    // Run the shared scenario against the non-debugger converter (same as the helper's default argument).
    ConvertRegionUtf16ToUtf8Test(false);
}
544 
HWTEST_F_L0(UtfHelperTest, DebuggerConvertRegionUtf16ToUtf8)
{
    // Run the shared scenario against the debugger variant of the converter.
    const bool useDebuggerVariant = true;
    ConvertRegionUtf16ToUtf8Test(useDebuggerVariant);
}
549 
/*
* @tc.name: ConvertRegionUtf8ToUtf16
* @tc.desc: Input a UTF8-encoded sequence, convert part of the sequence into a UTF16-encoded sequence, and save it to
*           "utf16Out" (the maximum length is "utf16Len"). The start parameter indicates the start position of the
*           conversion.
* @tc.type: FUNC
*/
556 */
HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16)
{
    constexpr size_t OUT_CAPACITY = 100;
    size_t utf16Len = OUT_CAPACITY;
    uint8_t utf8Value[10] = {
        0x7F, // 1-length UTF16 encoding
        0xDF, 0xBF, // 1-length UTF16 encoding
        0xEF, 0xBF, 0xBF, // 1-length UTF16 encoding
        0xF4, 0x8F, 0xBF, 0xBF}; // 2-length UTF16 encoding
    const uint8_t *utf8ValuePtr = utf8Value;
    // The output buffer is sized in uint16_t elements so it really holds the utf16Len capacity passed
    // to the converter (the former malloc(utf16Len) reserved only utf16Len bytes and was never freed).
    // A stack array also needs no cleanup.
    uint16_t utf16Out[OUT_CAPACITY] = {0};

    // Whole input: 1 + 1 + 1 + 2 = 5
    size_t outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value), utf16Len);
    EXPECT_EQ(outPos, 5U);

    // Skipping the first three bytes leaves the 3-byte and 4-byte characters: 1 + 2 = 3
    utf8ValuePtr = utf8Value + 3;
    outPos = ConvertRegionUtf8ToUtf16(utf8ValuePtr, utf16Out, sizeof(utf8Value) - 3, utf16Len);
    EXPECT_EQ(outPos, 3U);
}
575 
576 /*
577 * @tc.name: ConvertUtf8ToUnicodeChar
578 * @tc.desc: Converts a UTF8 encoding sequence encoding a character into a unicode point, and returns the
579 *           unicode point and the byte length of the utf8 encoding sequence.
580 * @tc.type: FUNC
581 */
HWTEST_F_L0(UtfHelperTest,ConvertUtf8ToUnicodeChar)582 HWTEST_F_L0(UtfHelperTest, ConvertUtf8ToUnicodeChar)
583 {
584     std::pair<int32_t, size_t> invalidValue = {INVALID_UTF8, 0};
585     // utf-8 is one byte, code point lie in [0x00, 0x7F]
586     uint8_t utf8Value1[1] = {0x00}; // 0x00
587     uint8_t utf8Value2[1] = {0x7F}; // 0x7F
588     const uint8_t *utf8ValuePtr1 = utf8Value1;
589     const uint8_t *utf8ValuePtr2 = utf8Value2;
590     std::pair<int32_t, size_t> unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr1, UtfLength::ONE);
591     std::pair<int32_t, size_t> unicodeValue = {0x00, 1};
592     EXPECT_EQ(unicodeRes, unicodeValue);
593     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr2, UtfLength::ONE);
594     unicodeValue = {0x7F, 1};
595     EXPECT_EQ(unicodeRes, unicodeValue);
596     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr2, 0);
597     EXPECT_EQ(unicodeRes, invalidValue);
598 
599     // utf-8 is two bytes, code point lie in [0x80, 0x7FF]
600     uint8_t utf8Value3[2] = {0xC2, 0x80}; // 0x80
601     uint8_t utf8Value4[2] = {0xDF, 0xBF}; // 0x7FF
602     const uint8_t *utf8ValuePtr3 = utf8Value3;
603     const uint8_t *utf8ValuePtr4 = utf8Value4;
604     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr3, UtfLength::TWO);
605     unicodeValue = {0x80, 2};
606     EXPECT_EQ(unicodeRes, unicodeValue);
607     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr4, UtfLength::TWO);
608     unicodeValue = {0x7FF, 2};
609     EXPECT_EQ(unicodeRes, unicodeValue);
610     uint8_t utf8Value5[2] = {0xD0, 0x00}; // invalid
611     const uint8_t *utf8ValuePtr5 = utf8Value5;
612     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr5, UtfLength::TWO);
613     EXPECT_EQ(unicodeRes, invalidValue);
614     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr4, UtfLength::ONE);
615     EXPECT_EQ(unicodeRes, invalidValue);
616 
617     // utf-8 is three bytes, code point lie in [0x800, 0xFFFF]
618     uint8_t utf8Value6[3] = {0xE0, 0xA0, 0x80}; // 0x800
619     uint8_t utf8Value7[3] = {0xED, 0x9F, 0xBF}; // 0xD7FF
620     const uint8_t *utf8ValuePtr6 = utf8Value6;
621     const uint8_t *utf8ValuePtr7 = utf8Value7;
622     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr6, UtfLength::THREE);
623     unicodeValue = {0x800, 3};
624     EXPECT_EQ(unicodeRes, unicodeValue);
625     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr7, UtfLength::THREE);
626     unicodeValue = {0xD7FF, 3};
627     EXPECT_EQ(unicodeRes, unicodeValue);
628     uint8_t utf8Value8[3] = {0xEB, 0x80, 0x40}; // invalid
629     const uint8_t *utf8ValuePtr8 = utf8Value8;
630     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr8, UtfLength::THREE);
631     EXPECT_EQ(unicodeRes, invalidValue);
632     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr7, UtfLength::TWO);
633     EXPECT_EQ(unicodeRes, invalidValue);
634 
635     // utf-8 is four bytes, code point lie in [0x10000, 0x10FFFF].
636     uint8_t utf8Value9[4] = {0xF0, 0x90, 0x80, 0x80}; // 0x10000
637     uint8_t utf8Value10[4] = {0xF4, 0x8F, 0xBF, 0xBF}; // 0x10FFFF
638     const uint8_t *utf8ValuePtr9 = utf8Value9;
639     const uint8_t *utf8ValuePtr10 = utf8Value10;
640     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr9, UtfLength::FOUR);
641     unicodeValue = {0x10000, 4};
642     EXPECT_EQ(unicodeRes, unicodeValue);
643     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr10, UtfLength::FOUR);
644     unicodeValue = {0x10FFFF, 4};
645     EXPECT_EQ(unicodeRes, unicodeValue);
646     uint8_t utf8Value11[4] = {0xF4, 0x80, 0x80, 0x40}; // invalid
647     const uint8_t *utf8ValuePtr11 = utf8Value11;
648     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr11, UtfLength::FOUR);
649     EXPECT_EQ(unicodeRes, invalidValue);
650     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr10, UtfLength::THREE);
651     EXPECT_EQ(unicodeRes, invalidValue);
652 
653     // other exception
654     uint8_t utf8Value12[2] = {0x90, 0x00}; // invalid
655     const uint8_t *utf8ValuePtr12 = utf8Value12;
656     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr12, UtfLength::FOUR);
657     EXPECT_EQ(unicodeRes, invalidValue);
658     uint8_t utf8Value13[2] = {0xF8, 0x00}; // invalid
659     const uint8_t *utf8ValuePtr13 = utf8Value13;
660     unicodeRes = ConvertUtf8ToUnicodeChar(utf8ValuePtr13, UtfLength::FOUR);
661     EXPECT_EQ(unicodeRes, invalidValue);
662 }
663 
664 /*
665 * @tc.name: Utf8ToUtf16Size
666 * @tc.desc: Test single byte characters
667 * @tc.type: FUNC
668 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_001)669 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_001) {
670     std::string utf8 = "Hello";
671     std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0}; // "Hello"
672     std::vector<uint16_t> utf16(10);
673     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
674     utf16.resize(converted);
675     EXPECT_EQ(utf16, expected_utf16);
676 }
677 
678 /*
679 * @tc.name: Utf8ToUtf16Size
680 * @tc.desc: Test includes Chinese characters
681 * @tc.type: FUNC
682 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_002)683 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_002) {
684     std::string utf8 = "你好,世界!";
685     std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; // "你好,世界!"
686     std::vector<uint16_t> utf16(10);
687     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
688     utf16.resize(converted);
689     EXPECT_EQ(utf16, expected_utf16);
690 }
691 
692 /*
693 * @tc.name: Utf8ToUtf16Size
694 * @tc.desc: Test empty string
695 * @tc.type: FUNC
696 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_003)697 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_003) {
698     std::string utf8 = "";
699     std::vector<uint16_t> expected_utf16 = {}; // empty
700     std::vector<uint16_t> utf16(10);
701     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
702     utf16.resize(converted);
703     EXPECT_EQ(utf16, expected_utf16);
704 }
705 
706 /*
707 * @tc.name: Utf8ToUtf16Size
708 * @tc.desc: Test section conversion
709 * @tc.type: FUNC
710 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_004)711 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_004) {
712     std::string utf8 = "Hello, 你好";
713     std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; // "Hello, 你"
714     std::vector<uint16_t> utf16(10);
715     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
716     utf16.resize(converted);
717     EXPECT_EQ(utf16, expected_utf16);
718 }
719 
720 /*
721 * @tc.name: Utf8ToUtf16Size
722 * @tc.desc: Test buffer length limit
723 * @tc.type: FUNC
724 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_005)725 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_005) {
726     std::string utf8 = "你好,世界!";
727     std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0}; // "你好"
728     std::vector<uint16_t> utf16(2); // Limit buffer length
729     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
730     utf16.resize(converted);
731     EXPECT_EQ(utf16, expected_utf16);
732 }
733 
734 /*
735 * @tc.name: Utf8ToUtf16Size
736 * @tc.desc: Test for incorrect UTF-8 data
737 * @tc.type: FUNC
738 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_006)739 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_006) {
740     std::string utf8 = "\xF0\x28\x8C\x28";
741     std::vector<uint16_t> expected_utf16 = {0x0, 0x0};
742     std::vector<uint16_t> utf16(10);
743     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
744     utf16.resize(converted);
745     EXPECT_EQ(utf16, expected_utf16);
746 }
747 
748 /*
749 * @tc.name: Utf8ToUtf16Size
750 * @tc.desc: Test single byte UTF-8 characters
751 * @tc.type: FUNC
752 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_007)753 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_007) {
754     std::string utf8 = "ABC"; // All are single byte characters
755     std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0}; // ASCII characters: A, B, C
756     std::vector<uint16_t> utf16(10);
757     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
758     utf16.resize(converted);
759     EXPECT_EQ(utf16, expected_utf16);
760 }
761 
762 /*
763 * @tc.name: Utf8ToUtf16Size
764 * @tc.desc: Testing Double Byte UTF-8 Characters
765 * @tc.type: FUNC
766 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_008)767 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_008) {
768     std::string utf8 = "\xC2\xA2\xC3\xBC"; // They are ¢ and ü, respectively
769     std::vector<uint16_t> expected_utf16 = {0x0, 0x0}; // Unicode .
770     std::vector<uint16_t> utf16(10);
771     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
772     utf16.resize(converted);
773     EXPECT_EQ(utf16, expected_utf16);
774 }
775 
776 /*
777 * @tc.name: Utf8ToUtf16Size
778 * @tc.desc: Test three byte UTF-8 characters
779 * @tc.type: FUNC
780 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_009)781 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_009) {
782     std::string utf8 = "\xE2\x82\xAC"; // euro: €
783     std::vector<uint16_t> expected_utf16 = {0x0}; // Unicode .
784     std::vector<uint16_t> utf16(10);
785     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
786     utf16.resize(converted);
787     EXPECT_EQ(utf16, expected_utf16);
788 }
789 
790 /*
791 * @tc.name: Utf8ToUtf16Size
792 * @tc.desc: Test four byte UTF-8 characters and proxy pairs
793 * @tc.type: FUNC
794 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_010)795 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_010) {
796     std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji ��
797     std::vector<uint16_t> expected_utf16 = {0x0, 0x0}; // surrogates
798     std::vector<uint16_t> utf16(10);
799     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
800     utf16.resize(converted);
801     EXPECT_EQ(utf16, expected_utf16);
802 }
803 
804 /*
805 * @tc.name: Utf8ToUtf16Size
806 * @tc.desc: Test UTF-8 data containing zero bytes
807 * @tc.type: FUNC
808 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_011)809 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_011) {
810     std::string utf8 = "Hello\0World", utf8Nul = utf8 + '\0' + "World"; // Clearly including zero bytes
811     std::vector<uint16_t> expected_utf16 = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
812     std::vector<uint16_t> utf16(15);
813     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8Nul.data()), utf8Nul.size());
814     utf16.resize(converted);
815     EXPECT_EQ(utf16, expected_utf16);
816 }
817 
818 /*
819 * @tc.name: Utf8ToUtf16Size
820 * @tc.desc: Test continuous illegal sequences
821 * @tc.type: FUNC
822 */
HWTEST_F_L0(UtfHelperTest,Utf8ToUtf16Size_012)823 HWTEST_F_L0(UtfHelperTest, Utf8ToUtf16Size_012) {
824     std::string utf8 = "\xC0\x80\xC0\x80"; // Continuous illegal sequence
825     std::vector<uint16_t> expected_utf16 = {0x0, 0x0};
826     std::vector<uint16_t> utf16(10);
827     size_t converted = Utf8ToUtf16Size(reinterpret_cast<const uint8_t*>(utf8.data()), utf8.size());
828     utf16.resize(converted);
829     EXPECT_EQ(utf16, expected_utf16);
830 }
831 
832 /*
833 * @tc.name: ConvertRegionUtf8ToUtf16
834 * @tc.desc: Test single byte characters
835 * @tc.type: FUNC
836 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_001)837 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_001) {
838     std::string utf8 = "Hello";
839     std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F}; // "Hello"
840     std::vector<uint16_t> utf16(10);
841     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
842                                                 utf16.data(), utf8.size(), utf16.size());
843     utf16.resize(converted);
844     EXPECT_EQ(utf16, expected_utf16);
845 }
846 
847 /*
848 * @tc.name: ConvertRegionUtf8ToUtf16
849 * @tc.desc: Test includes Chinese characters
850 * @tc.type: FUNC
851 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_002)852 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_002) {
853     std::string utf8 = "你好,世界!";
854     std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D, 0xFF0C, 0x4E16, 0x754C, 0xFF01}; // "你好,世界!"
855     std::vector<uint16_t> utf16(10);
856     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
857                                                 utf16.data(), utf8.size(), utf16.size());
858     utf16.resize(converted);
859     EXPECT_EQ(utf16, expected_utf16);
860 }
861 
862 /*
863 * @tc.name: ConvertRegionUtf8ToUtf16
864 * @tc.desc: Test empty string
865 * @tc.type: FUNC
866 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_003)867 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_003) {
868     std::string utf8 = "";
869     std::vector<uint16_t> expected_utf16 = {}; // Empty
870     std::vector<uint16_t> utf16(10);
871     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
872                                                 utf16.data(), utf8.size(), utf16.size());
873     utf16.resize(converted);
874     EXPECT_EQ(utf16, expected_utf16);
875 }
876 
877 /*
878 * @tc.name: ConvertRegionUtf8ToUtf16
879 * @tc.desc: Test section conversion
880 * @tc.type: FUNC
881 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_004)882 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_004) {
883     std::string utf8 = "Hello, 你好";
884     std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F, 0x002C, 0x20, 0x4F60};
885     std::vector<uint16_t> utf16(10);
886     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
887                                                 utf16.data(), 10, utf16.size()); // Only process the first 9 bytes
888     utf16.resize(converted);
889     EXPECT_EQ(utf16, expected_utf16);
890 }
891 
892 /*
893 * @tc.name: ConvertRegionUtf8ToUtf16
894 * @tc.desc: Test buffer length limit
895 * @tc.type: FUNC
896 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_005)897 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_005) {
898     std::string utf8 = "你好,世界!";
899     std::vector<uint16_t> expected_utf16 = {0x4F60, 0x597D}; // "你好"
900     std::vector<uint16_t> utf16(2); // Limit buffer length
901     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
902                                                 utf16.data(), utf8.size(), utf16.size());
903     utf16.resize(converted);
904     EXPECT_EQ(utf16, expected_utf16);
905 }
906 
907 /*
908 * @tc.name: ConvertRegionUtf8ToUtf16
909 * @tc.desc: Test for incorrect UTF-8 data
910 * @tc.type: FUNC
911 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_006)912 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_006) {
913     std::string utf8 = "\xF0\x28\x8C\x28";
914     std::vector<uint16_t> expected_utf16 = {}; // Expected empty output, handling erroneous data
915     std::vector<uint16_t> utf16(10);
916     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
917                                                 utf16.data(), utf8.size(), utf16.size());
918     utf16.resize(converted);
919     EXPECT_NE(utf16, expected_utf16);
920 }
921 
922 /*
923 * @tc.name: ConvertRegionUtf8ToUtf16
924 * @tc.desc: Test single byte UTF-8 characters
925 * @tc.type: FUNC
926 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_007)927 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_007) {
928     std::string utf8 = "ABC"; // All are single byte characters
929     std::vector<uint16_t> expected_utf16 = {0x0041, 0x0042, 0x0043}; // ASCII characters: A, B, C
930     std::vector<uint16_t> utf16(10);
931     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
932                                                 utf16.data(), utf8.size(), utf16.size());
933     utf16.resize(converted);
934     EXPECT_EQ(utf16, expected_utf16);
935 }
936 
937 /*
938 * @tc.name: ConvertRegionUtf8ToUtf16
939 * @tc.desc: Testing Double Byte UTF-8 Characters
940 * @tc.type: FUNC
941 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_008)942 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_008) {
943     std::string utf8 = "\xC2\xA2\xC3\xBC"; // They are ¢ and ü, respectively
944     std::vector<uint16_t> expected_utf16 = {0x00A2, 0x00FC}; // Unicode .
945     std::vector<uint16_t> utf16(10);
946     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
947                                                 utf16.data(), utf8.size(), utf16.size());
948     utf16.resize(converted);
949     EXPECT_EQ(utf16, expected_utf16);
950 }
951 
952 /*
953 * @tc.name: ConvertRegionUtf8ToUtf16
954 * @tc.desc: Test three byte UTF-8 characters
955 * @tc.type: FUNC
956 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_009)957 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_009) {
958     std::string utf8 = "\xE2\x82\xAC"; // euro €
959     std::vector<uint16_t> expected_utf16 = {0x20AC}; // Unicode .
960     std::vector<uint16_t> utf16(10);
961     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
962                                                 utf16.data(), utf8.size(), utf16.size());
963     utf16.resize(converted);
964     EXPECT_EQ(utf16, expected_utf16);
965 }
966 
967 /*
968 * @tc.name: ConvertRegionUtf8ToUtf16
969 * @tc.desc: Test four byte UTF-8 characters and proxy pairs
970 * @tc.type: FUNC
971 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_010)972 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_010) {
973     std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji ��
974     std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
975     std::vector<uint16_t> utf16(10);
976     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
977                                                 utf16.data(), utf8.size(), utf16.size());
978     utf16.resize(converted);
979     EXPECT_EQ(utf16, expected_utf16);
980 }
981 
982 /*
983 * @tc.name: ConvertRegionUtf8ToUtf16
984 * @tc.desc: Test UTF-8 data containing zero bytes
985 * @tc.type: FUNC
986 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_011)987 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_011) {
988     std::string utf8 = "Hello\0World", utf8Nul = utf8 + '\0' + "World"; // Clearly including zero bytes
989     std::vector<uint16_t> expected_utf16 = {0x0048, 0x0065, 0x006C, 0x006C, 0x006F,
990         0x0000, 0x0057, 0x006F, 0x0072, 0x006C, 0x0064}; // Including NULL characters
991     std::vector<uint16_t> utf16(15);
992     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8Nul.data()),
993                                                 utf16.data(), utf8Nul.size(), utf16.size());
994     utf16.resize(converted);
995     EXPECT_EQ(utf16, expected_utf16);
996 }
997 
998 /*
999 * @tc.name: ConvertRegionUtf8ToUtf16
1000 * @tc.desc: Test continuous illegal sequences
1001 * @tc.type: FUNC
1002 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_012)1003 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_012) {
1004     std::string utf8 = "\xC0\x80\xC0\x80"; // Continuous illegal sequence
1005     std::vector<uint16_t> expected_utf16 = {};
1006     std::vector<uint16_t> utf16(10);
1007     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
1008                                                 utf16.data(), utf8.size(), utf16.size());
1009     utf16.resize(converted);
1010     EXPECT_NE(utf16, expected_utf16);
1011 }
1012 
1013 /*
1014 * @tc.name: ConvertRegionUtf8ToUtf16
1015 * @tc.desc: Test four byte UTF-8 characters and proxy pairs
1016 * @tc.type: FUNC
1017 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_013)1018 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_013) {
1019     std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji ��
1020     std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
1021     std::vector<uint16_t> utf16(0);
1022     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
1023                                                 utf16.data(), utf8.size(), utf16.size());
1024     utf16.resize(converted);
1025     EXPECT_EQ(converted, 0);
1026 }
1027 /*
1028 * @tc.name: ConvertRegionUtf8ToUtf16
1029 * @tc.desc: Test four byte UTF-8 characters and proxy pairs
1030 * @tc.type: FUNC
1031 */
HWTEST_F_L0(UtfHelperTest,ConvertRegionUtf8ToUtf16_014)1032 HWTEST_F_L0(UtfHelperTest, ConvertRegionUtf8ToUtf16_014) {
1033     std::string utf8 = "\xF0\x9F\x98\x8E"; // Emoji ��
1034     std::vector<uint16_t> expected_utf16 = {0xD83D, 0xDE0E}; // surrogates
1035     std::vector<uint16_t> utf16(1);
1036     size_t converted = ConvertRegionUtf8ToUtf16(reinterpret_cast<const uint8_t*>(utf8.data()),
1037                                                 utf16.data(), utf8.size(), utf16.size());
1038     utf16.resize(converted);
1039     EXPECT_EQ(converted, 0);
1040 }
} // namespace panda::test
1042