1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utils/utf.h"
17
18 #include <cstdint>
19
20 #include <vector>
21
22 #include <gtest/gtest.h>
23
24 namespace panda::utf::test {
25
/**
 * Computes the UTF-16 lead (high) surrogate code unit for a code point.
 *
 * The code point is shifted right by 10 bits, the constant 0xd7c0
 * (== 0xd800 - (0x10000 >> 10)) is added, and the low 16 bits of the sum are returned.
 * For inputs in U+10000..U+10FFFF this yields a value in 0xd800..0xdbff; the input range
 * is not validated here.
 *
 * @param codepoint code point to encode
 * @return the 16-bit lead surrogate code unit
 */
[[maybe_unused]] static constexpr uint16_t U16_lead(uint32_t codepoint)
{
    constexpr uint32_t PAYLOAD_SHIFT = 10U;
    constexpr uint32_t LEAD_OFFSET = 0xd7c0U;
    constexpr uint32_t CODE_UNIT_MASK = 0xffffU;
    // The mask guarantees the value fits into 16 bits, so the narrowing cast is lossless.
    return static_cast<uint16_t>(((codepoint >> PAYLOAD_SHIFT) + LEAD_OFFSET) & CODE_UNIT_MASK);
}
30
/**
 * Computes the UTF-16 tail (low) surrogate code unit for a code point.
 *
 * The low 10 bits of the code point are combined with the base value 0xdc00, so the
 * result is always in 0xdc00..0xdfff. The input range is not validated here.
 *
 * @param codepoint code point to encode
 * @return the 16-bit tail surrogate code unit
 */
[[maybe_unused]] static constexpr uint16_t U16_tail(uint32_t codepoint)
{
    constexpr uint32_t PAYLOAD_MASK = 0x3ffU;
    constexpr uint32_t TAIL_BASE = 0xdc00U;
    // At most 0xdfff, so the narrowing cast is lossless.
    return static_cast<uint16_t>((codepoint & PAYLOAD_MASK) | TAIL_BASE);
}
35
// Checks ConvertMUtf8ToUtf16 against hand-computed UTF-16 code units for each encoding length.
TEST(Utf, ConvertMUtf8ToUtf16)
{
    // Converts `mutf8` (length taken from utf::Mutf8Size) into a zero-initialised buffer of
    // `unit_count` UTF-16 code units and hands that buffer back for comparison.
    const auto decode = [](const std::vector<uint8_t> &mutf8, size_t unit_count) {
        std::vector<uint16_t> utf16(unit_count);
        ConvertMUtf8ToUtf16(mutf8.data(), utf::Mutf8Size(mutf8.data()), utf16.data());
        return utf16;
    };

    // U+0000 stored as the 2-byte sequence c0 80
    {
        const std::vector<uint8_t> mutf8 {0xc0, 0x80, 0x00};
        const std::vector<uint16_t> expected {0x0};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }

    // single byte: 0xxxxxxx
    {
        const std::vector<uint8_t> mutf8 {0x7f, 0x00};
        const std::vector<uint16_t> expected {0x7f};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }

    // two bytes: 110xxxxx 10xxxxxx
    {
        const std::vector<uint8_t> mutf8 {0xc2, 0xa7, 0x33, 0x00};
        const std::vector<uint16_t> expected {0xa7, 0x33};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }

    // three bytes: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint8_t> mutf8 {0xef, 0xbf, 0x83, 0x33, 0x00};
        const std::vector<uint16_t> expected {0xffc3, 0x33};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }

    // two consecutive 3-byte sequences: 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
    {
        const std::vector<uint8_t> mutf8 {0xed, 0xa0, 0x81, 0xed, 0xb0, 0xb7, 0x00};
        const std::vector<uint16_t> expected {0xd801, 0xdc37};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }

    // a single 3-byte sequence (ed a3 92 -> 0xd8d2) surrounded by 1-byte characters
    {
        const std::vector<uint8_t> mutf8 {0x5b, 0x61, 0x62, 0x63, 0xed, 0xa3, 0x92, 0x5d, 0x00};
        const std::vector<uint16_t> expected {0x5b, 0x61, 0x62, 0x63, 0xd8d2, 0x5d};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }

    // four bytes (f0 9f 91 b3) expected to come out as the two code units d83d dc73
    {
        const std::vector<uint8_t> mutf8 {0xF0, 0x9F, 0x91, 0xB3, 0x00};
        const std::vector<uint16_t> expected {0xD83D, 0xDC73};
        EXPECT_EQ(decode(mutf8, expected.size()), expected);
    }
}
99
// Checks the size reported by Utf16ToMUtf8Size for each encoding length.
// Every expected value is one more than the number of encoded bytes named in the case comment
// (presumably the extra byte is the terminating zero - TODO confirm against utils/utf.h).
// Expected values use unsigned literals so that EXPECT_EQ does not compare size_t with int.
TEST(Utf, Utf16ToMUtf8Size)
{
    // 2-byte mutf-8 U+0000
    {
        const std::vector<uint16_t> in {0x0};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 3U);
    }

    // 1-byte mutf-8: 0xxxxxxx
    {
        const std::vector<uint16_t> in {0x7f};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 2U);
    }

    // 2-byte mutf-8: 110xxxxx 10xxxxxx (followed by a 1-byte character)
    {
        const std::vector<uint16_t> in {0xa7, 0x33};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 4U);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx (followed by a 1-byte character)
    {
        const std::vector<uint16_t> in {0xffc3, 0x33};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 5U);
    }

    // surrogate pair d801 dc37: the expected size of 5 corresponds to one 4-byte sequence
    // (4 + 1), not to two 3-byte sequences (which would give 7)
    {
        const std::vector<uint16_t> in {0xd801, 0xdc37};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 5U);
    }
}
137
// Checks ConvertRegionUtf16ToMUtf8 output bytes and its returned size for each encoding length.
// In every case the output buffer is one byte larger than the size passed to the converter;
// that last byte is set to '\0' by the test itself before the buffers are compared.
// Expected sizes use unsigned literals so that EXPECT_EQ does not compare size_t with int.
TEST(Utf, ConvertRegionUtf16ToMUtf8)
{
    // 2-byte mutf-8 U+0000
    {
        const std::vector<uint16_t> in {0x0};
        const std::vector<uint8_t> res {0xc0, 0x80, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 2U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 1-byte mutf-8: 0xxxxxxx
    {
        const std::vector<uint16_t> in {0x7f};
        const std::vector<uint8_t> res {0x7f, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 1U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 2-byte mutf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint16_t> in {0xa7, 0x33};
        const std::vector<uint8_t> res {0xc2, 0xa7, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 3U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    {
        const std::vector<uint16_t> in {0xffc3, 0x33};
        const std::vector<uint8_t> res {0xef, 0xbf, 0x83, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    // utf-16 data in 0xd800-0xdfff: unpaired code unit from the 0xd800-0xdbff half
    {
        const std::vector<uint16_t> in {0xd834, 0x33};
        const std::vector<uint8_t> res {0xed, 0xa0, 0xb4, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
    // utf-16 data in 0xd800-0xdfff: unpaired code unit from the 0xdc00-0xdfff half
    {
        const std::vector<uint16_t> in {0xdf06, 0x33};
        const std::vector<uint8_t> res {0xed, 0xbc, 0x86, 0x33, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }

    // surrogate pair d801 dc37: expected as one 4-byte sequence
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (f0 90 90 b7), not as two 3-byte sequences
    {
        const std::vector<uint16_t> in {0xd801, 0xdc37};
        const std::vector<uint8_t> res {0xf0, 0x90, 0x90, 0xb7, 0x00};
        std::vector<uint8_t> out(res.size());
        const size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
        EXPECT_EQ(sz, 4U);
        out[out.size() - 1] = '\0';
        EXPECT_EQ(out, res);
    }
}
219
// Checks the sign of CompareMUtf8ToMUtf8 for zero-terminated byte strings whose lead bytes
// select 1-, 2-, 3- and 4-byte sequences.
// EXPECT_LT / EXPECT_GT / EXPECT_EQ are used so that a failure prints the returned value.
TEST(Utf, CompareMUtf8ToMUtf8)
{
    // 1-byte utf-8: 0xxxxxxx
    {
        const std::vector<uint8_t> v1 {0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x02, 0x00};
        const std::vector<uint8_t> v2 {0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x70, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x71, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x73, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    // 2-byte utf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xdf, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xbf, 0x03, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xdf, 0xb1, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xd1, 0xb2, 0x03, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xd1, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xb0, 0x03, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    // 3-byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
    // NOTE(review): the third byte in these inputs (0x03 / 0x05) does not match 10xxxxxx,
    // so the sequences are not well-formed - confirm this is intentional.
    {
        const std::vector<uint8_t> v1 {0xef, 0xbf, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb2, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xe0, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb0, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x05, 0x04, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    // 4-byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // NOTE(review): the fourth byte in these inputs (0x04 / 0x0a) does not match 10xxxxxx,
    // and the lead byte 0xf8 in the last case does not match 11110xxx - confirm this is intentional.
    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_EQ(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x0a, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_GT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf8, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_LT(CompareMUtf8ToMUtf8(v1.data(), v2.data()), 0);
    }
}
310
// Checks the sign of CompareUtf8ToUtf8 for byte strings whose lead bytes select 1-, 2-, 3-
// and 4-byte sequences.
// The sizes passed are v.size(), i.e. they include the trailing 0x00 byte of every input.
// EXPECT_LT / EXPECT_GT / EXPECT_EQ are used so that a failure prints the returned value.
TEST(Utf, CompareUtf8ToUtf8)
{
    // 1-byte utf-8: 0xxxxxxx
    {
        const std::vector<uint8_t> v1 {0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x02, 0x00};
        const std::vector<uint8_t> v2 {0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x7f, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x7f, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x70, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0x01, 0x71, 0x00};
        const std::vector<uint8_t> v2 {0x01, 0x73, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    // 2-byte utf-8: 110xxxxx 10xxxxxx
    {
        const std::vector<uint8_t> v1 {0xdf, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xbf, 0x03, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xdf, 0xb1, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xd1, 0xb2, 0x03, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xd1, 0xbf, 0x03, 0x00};
        const std::vector<uint8_t> v2 {0xdf, 0xb0, 0x03, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    // 3-byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
    // NOTE(review): the third byte in these inputs (0x03 / 0x05) does not match 10xxxxxx,
    // so the sequences are not well-formed - confirm this is intentional.
    {
        const std::vector<uint8_t> v1 {0xef, 0xbf, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb2, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xe0, 0xbf, 0x03, 0x04, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xef, 0xb0, 0x03, 0x04, 0x00};
        const std::vector<uint8_t> v2 {0xef, 0xbf, 0x05, 0x04, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    // 4-byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // NOTE(review): the fourth byte in these inputs (0x04 / 0x0a) does not match 10xxxxxx,
    // and the lead byte 0xf8 in the last case does not match 11110xxx - confirm this is intentional.
    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_EQ(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x0a, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_GT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }

    {
        const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        const std::vector<uint8_t> v2 {0xf8, 0xbf, 0xbf, 0x04, 0x05, 0x00};
        EXPECT_LT(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()), 0);
    }
}
401
402 } // namespace panda::utf::test
403