/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #include "utils/utf.h"
17 
18 #include <cstdint>
19 
20 #include <vector>
21 
22 #include <gtest/gtest.h>
23 
24 namespace panda::utf::test {
25 
/**
 * Computes the UTF-16 lead (high) surrogate for a code point.
 *
 * For code points in U+10000..U+10FFFF the result lies in 0xd800..0xdbff.
 * The result is truncated to 16 bits for any other input.
 *
 * @param codepoint code point to split
 * @return the lead surrogate code unit
 */
static uint16_t U16_lead(uint32_t codepoint)
{
    // 0xd7c0 == 0xd800 - (0x10000 >> 10): folds the 0x10000 bias into the surrogate base.
    const uint32_t upperBits = codepoint >> 10U;
    return static_cast<uint16_t>(upperBits + 0xd7c0U);
}
30 
/**
 * Computes the UTF-16 tail (low) surrogate for a code point.
 *
 * Only the low ten bits of the code point are used, so the result always
 * lies in 0xdc00..0xdfff.
 *
 * @param codepoint code point to split
 * @return the tail surrogate code unit
 */
static uint16_t U16_tail(uint32_t codepoint)
{
    const uint32_t lowTenBits = codepoint & 0x3ffU;
    return static_cast<uint16_t>(0xdc00U | lowTenBits);
}
35 
TEST(Utf,ConvertMUtf8ToUtf16)36 TEST(Utf, ConvertMUtf8ToUtf16)
37 {
38     // 2-byte mutf-8 U+0000
39     {
40         const std::vector<uint8_t> in {0xc0, 0x80, 0x00};
41         const std::vector<uint16_t> res {0x0};
42         std::vector<uint16_t> out(res.size());
43         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
44         EXPECT_EQ(out, res);
45     }
46 
47     // 1-byte mutf-8: 0xxxxxxx
48     {
49         const std::vector<uint8_t> in {0x7f, 0x00};
50         const std::vector<uint16_t> res {0x7f};
51         std::vector<uint16_t> out(res.size());
52         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
53         EXPECT_EQ(out, res);
54     }
55 
56     // 2-byte mutf-8: 110xxxxx 10xxxxxx
57     {
58         const std::vector<uint8_t> in {0xc2, 0xa7, 0x33, 0x00};
59         const std::vector<uint16_t> res {0xa7, 0x33};
60         std::vector<uint16_t> out(res.size());
61         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
62         EXPECT_EQ(out, res);
63     }
64 
65     // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
66     {
67         const std::vector<uint8_t> in {0xef, 0xbf, 0x83, 0x33, 0x00};
68         const std::vector<uint16_t> res {0xffc3, 0x33};
69         std::vector<uint16_t> out(res.size());
70         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
71         EXPECT_EQ(out, res);
72     }
73 
74     // double 3-byte mutf-8: 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
75     {
76         const std::vector<uint8_t> in {0xed, 0xa0, 0x81, 0xed, 0xb0, 0xb7, 0x00};
77         const std::vector<uint16_t> res {0xd801, 0xdc37};
78         std::vector<uint16_t> out(res.size());
79         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
80         EXPECT_EQ(out, res);
81     }
82 
83     {
84         const std::vector<uint8_t> in {0x5b, 0x61, 0x62, 0x63, 0xed, 0xa3, 0x92, 0x5d, 0x00};
85         const std::vector<uint16_t> res {0x5b, 0x61, 0x62, 0x63, 0xd8d2, 0x5d};
86         std::vector<uint16_t> out(res.size());
87         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
88         EXPECT_EQ(out, res);
89     }
90 
91     {
92         const std::vector<uint8_t> in {0xF0, 0x9F, 0x91, 0xB3, 0x00};
93         const std::vector<uint16_t> res {0xD83D, 0xDC73};
94         std::vector<uint16_t> out(res.size());
95         ConvertMUtf8ToUtf16(in.data(), utf::Mutf8Size(in.data()), out.data());
96         EXPECT_EQ(out, res);
97     }
98 }
99 
// Checks the size reported by Utf16ToMUtf8Size for each encoding length.
// Every expected value is one larger than the number of encoded bytes
// (presumably the terminating NUL is counted; confirm against the implementation).
TEST(Utf, Utf16ToMUtf8Size)
{
    // U+0000 -> 2-byte mutf-8 (0xc0 0x80)
    {
        const std::vector<uint16_t> in {0x0};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 3U);
    }

    // 1-byte mutf-8: 0xxxxxxx
    {
        const std::vector<uint16_t> in {0x7f};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 2U);
    }

    // 2-byte mutf-8: 110xxxxx 10xxxxxx, followed by a 1-byte character
    {
        const std::vector<uint16_t> in {0xa7, 0x33};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 4U);
    }

    // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx, followed by a 1-byte character
    {
        const std::vector<uint16_t> in {0xffc3, 0x33};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 5U);
    }

    // surrogate pair: counted as ONE 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx),
    // not as two 3-byte sequences, hence 4 + 1 = 5
    {
        const std::vector<uint16_t> in {0xd801, 0xdc37};
        const size_t res = Utf16ToMUtf8Size(in.data(), in.size());
        EXPECT_EQ(res, 5U);
    }
}
137 
TEST(Utf,ConvertRegionUtf16ToMUtf8)138 TEST(Utf, ConvertRegionUtf16ToMUtf8)
139 {
140     // 2-byte mutf-8 U+0000
141     {
142         const std::vector<uint16_t> in {0x0};
143         const std::vector<uint8_t> res {0xc0, 0x80, 0x00};
144         std::vector<uint8_t> out(res.size());
145         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
146         EXPECT_EQ(sz, 2);
147         out[out.size() - 1] = '\0';
148         EXPECT_EQ(out, res);
149     }
150 
151     // 1-byte mutf-8: 0xxxxxxx
152     {
153         const std::vector<uint16_t> in {0x7f};
154         const std::vector<uint8_t> res {0x7f, 0x00};
155         std::vector<uint8_t> out(res.size());
156         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
157         EXPECT_EQ(sz, 1);
158         out[out.size() - 1] = '\0';
159         EXPECT_EQ(out, res);
160     }
161 
162     // 2-byte mutf-8: 110xxxxx 10xxxxxx
163     {
164         const std::vector<uint16_t> in {0xa7, 0x33};
165         const std::vector<uint8_t> res {0xc2, 0xa7, 0x33, 0x00};
166         std::vector<uint8_t> out(res.size());
167         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
168         EXPECT_EQ(sz, 3);
169         out[out.size() - 1] = '\0';
170         EXPECT_EQ(out, res);
171     }
172 
173     // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
174     {
175         const std::vector<uint16_t> in {0xffc3, 0x33};
176         const std::vector<uint8_t> res {0xef, 0xbf, 0x83, 0x33, 0x00};
177         std::vector<uint8_t> out(res.size());
178         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
179         EXPECT_EQ(sz, 4);
180         out[out.size() - 1] = '\0';
181         EXPECT_EQ(out, res);
182     }
183 
184     // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
185     // utf-16 data in 0xd800-0xdfff
186     {
187         const std::vector<uint16_t> in {0xd834, 0x33};
188         const std::vector<uint8_t> res {0xed, 0xa0, 0xb4, 0x33, 0x00};
189         std::vector<uint8_t> out(res.size());
190         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
191         EXPECT_EQ(sz, 4);
192         out[out.size() - 1] = '\0';
193         EXPECT_EQ(out, res);
194     }
195 
196     // 3-byte mutf-8: 1110xxxx 10xxxxxx 10xxxxxx
197     // utf-16 data in 0xd800-0xdfff
198     {
199         const std::vector<uint16_t> in {0xdf06, 0x33};
200         const std::vector<uint8_t> res {0xed, 0xbc, 0x86, 0x33, 0x00};
201         std::vector<uint8_t> out(res.size());
202         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
203         EXPECT_EQ(sz, 4);
204         out[out.size() - 1] = '\0';
205         EXPECT_EQ(out, res);
206     }
207 
208     // double 3-byte mutf-8: 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx
209     {
210         const std::vector<uint16_t> in {0xd801, 0xdc37};
211         const std::vector<uint8_t> res {0xf0, 0x90, 0x90, 0xb7, 0x00};
212         std::vector<uint8_t> out(res.size());
213         size_t sz = ConvertRegionUtf16ToMUtf8(in.data(), out.data(), in.size(), out.size() - 1, 0);
214         EXPECT_EQ(sz, 4);
215         out[out.size() - 1] = '\0';
216         EXPECT_EQ(out, res);
217     }
218 }
219 
TEST(Utf,CompareMUtf8ToMUtf8)220 TEST(Utf, CompareMUtf8ToMUtf8)
221 {
222     // 1-byte utf-8: 0xxxxxxx
223     {
224         const std::vector<uint8_t> v1 {0x00};
225         const std::vector<uint8_t> v2 {0x7f, 0x00};
226         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) < 0);
227     }
228 
229     {
230         const std::vector<uint8_t> v1 {0x02, 0x00};
231         const std::vector<uint8_t> v2 {0x00};
232         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) > 0);
233     }
234 
235     {
236         const std::vector<uint8_t> v1 {0x7f, 0x00};
237         const std::vector<uint8_t> v2 {0x7f, 0x00};
238         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) == 0);
239     }
240 
241     {
242         const std::vector<uint8_t> v1 {0x01, 0x7f, 0x00};
243         const std::vector<uint8_t> v2 {0x01, 0x70, 0x00};
244         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) > 0);
245     }
246 
247     {
248         const std::vector<uint8_t> v1 {0x01, 0x71, 0x00};
249         const std::vector<uint8_t> v2 {0x01, 0x73, 0x00};
250         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) < 0);
251     }
252 
253     // 2-byte utf-8: 110xxxxx 10xxxxxx
254     {
255         const std::vector<uint8_t> v1 {0xdf, 0xbf, 0x03, 0x00};
256         const std::vector<uint8_t> v2 {0xdf, 0xbf, 0x03, 0x00};
257         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) == 0);
258     }
259 
260     {
261         const std::vector<uint8_t> v1 {0xdf, 0xb1, 0x03, 0x00};
262         const std::vector<uint8_t> v2 {0xd1, 0xb2, 0x03, 0x00};
263         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) > 0);
264     }
265 
266     {
267         const std::vector<uint8_t> v1 {0xd1, 0xbf, 0x03, 0x00};
268         const std::vector<uint8_t> v2 {0xdf, 0xb0, 0x03, 0x00};
269         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) < 0);
270     }
271 
272     // 3-byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
273     {
274         const std::vector<uint8_t> v1 {0xef, 0xbf, 0x03, 0x04, 0x00};
275         const std::vector<uint8_t> v2 {0xef, 0xbf, 0x03, 0x04, 0x00};
276         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) == 0);
277     }
278 
279     {
280         const std::vector<uint8_t> v1 {0xef, 0xb2, 0x03, 0x04, 0x00};
281         const std::vector<uint8_t> v2 {0xe0, 0xbf, 0x03, 0x04, 0x00};
282         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) > 0);
283     }
284 
285     {
286         const std::vector<uint8_t> v1 {0xef, 0xb0, 0x03, 0x04, 0x00};
287         const std::vector<uint8_t> v2 {0xef, 0xbf, 0x05, 0x04, 0x00};
288         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) < 0);
289     }
290 
291     // 4-byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
292     {
293         const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
294         const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
295         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) == 0);
296     }
297 
298     {
299         const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x0a, 0x05, 0x00};
300         const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
301         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) > 0);
302     }
303 
304     {
305         const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
306         const std::vector<uint8_t> v2 {0xf8, 0xbf, 0xbf, 0x04, 0x05, 0x00};
307         EXPECT_TRUE(CompareMUtf8ToMUtf8(v1.data(), v2.data()) < 0);
308     }
309 }
310 
TEST(Utf,CompareUtf8ToUtf8)311 TEST(Utf, CompareUtf8ToUtf8)
312 {
313     // 1-byte utf-8: 0xxxxxxx
314     {
315         const std::vector<uint8_t> v1 {0x00};
316         const std::vector<uint8_t> v2 {0x7f, 0x00};
317         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) < 0);
318     }
319 
320     {
321         const std::vector<uint8_t> v1 {0x02, 0x00};
322         const std::vector<uint8_t> v2 {0x00};
323         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) > 0);
324     }
325 
326     {
327         const std::vector<uint8_t> v1 {0x7f, 0x00};
328         const std::vector<uint8_t> v2 {0x7f, 0x00};
329         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) == 0);
330     }
331 
332     {
333         const std::vector<uint8_t> v1 {0x01, 0x7f, 0x00};
334         const std::vector<uint8_t> v2 {0x01, 0x70, 0x00};
335         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) > 0);
336     }
337 
338     {
339         const std::vector<uint8_t> v1 {0x01, 0x71, 0x00};
340         const std::vector<uint8_t> v2 {0x01, 0x73, 0x00};
341         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) < 0);
342     }
343 
344     // 2-byte utf-8: 110xxxxx 10xxxxxx
345     {
346         const std::vector<uint8_t> v1 {0xdf, 0xbf, 0x03, 0x00};
347         const std::vector<uint8_t> v2 {0xdf, 0xbf, 0x03, 0x00};
348         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) == 0);
349     }
350 
351     {
352         const std::vector<uint8_t> v1 {0xdf, 0xb1, 0x03, 0x00};
353         const std::vector<uint8_t> v2 {0xd1, 0xb2, 0x03, 0x00};
354         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) > 0);
355     }
356 
357     {
358         const std::vector<uint8_t> v1 {0xd1, 0xbf, 0x03, 0x00};
359         const std::vector<uint8_t> v2 {0xdf, 0xb0, 0x03, 0x00};
360         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) < 0);
361     }
362 
363     // 3-byte utf-8: 1110xxxx 10xxxxxx 10xxxxxx
364     {
365         const std::vector<uint8_t> v1 {0xef, 0xbf, 0x03, 0x04, 0x00};
366         const std::vector<uint8_t> v2 {0xef, 0xbf, 0x03, 0x04, 0x00};
367         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) == 0);
368     }
369 
370     {
371         const std::vector<uint8_t> v1 {0xef, 0xb2, 0x03, 0x04, 0x00};
372         const std::vector<uint8_t> v2 {0xe0, 0xbf, 0x03, 0x04, 0x00};
373         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) > 0);
374     }
375 
376     {
377         const std::vector<uint8_t> v1 {0xef, 0xb0, 0x03, 0x04, 0x00};
378         const std::vector<uint8_t> v2 {0xef, 0xbf, 0x05, 0x04, 0x00};
379         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) < 0);
380     }
381 
382     // 4-byte utf-8: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
383     {
384         const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
385         const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
386         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) == 0);
387     }
388 
389     {
390         const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x0a, 0x05, 0x00};
391         const std::vector<uint8_t> v2 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
392         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) > 0);
393     }
394 
395     {
396         const std::vector<uint8_t> v1 {0xf7, 0xbf, 0xbf, 0x04, 0x05, 0x00};
397         const std::vector<uint8_t> v2 {0xf8, 0xbf, 0xbf, 0x04, 0x05, 0x00};
398         EXPECT_TRUE(CompareUtf8ToUtf8(v1.data(), v1.size(), v2.data(), v2.size()) < 0);
399     }
400 }
401 
402 }  // namespace panda::utf::test
403