• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 
18 #include <cstring>
19 
20 namespace panda::utf {
21 
// Largest value that fits in one 16-bit code unit; a decoded value above it holds two packed UTF-16 units.
constexpr size_t MAX_U16 = 0xffff;
// Named small integers used below as byte counts, array indices and shift amounts/multipliers.
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;
28 
// One encoded character as produced by ConvertUtf16ToMUtf8().
struct MUtf8Char {
    // Number of meaningful bytes stored at the front of `ch` (the encoder in this file produces 1 to 4).
    size_t n;
    // Encoded bytes; only the first `n` entries are meaningful.
    std::array<uint8_t, CONST_4> ch;
};
33 
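/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
 * For U+10000 -- U+10FFFF the 6-byte form encodes (value - 0x10000) as two 3-byte surrogate halves.
 *
 * Note: ConvertUtf16ToMUtf8() in this file does not emit the 6-byte form. For a surrogate pair it emits a
 * single 4-byte sequence (a lead byte built from MUTF8_4B_FIRST followed by three continuation bytes), and
 * ConvertMUtf8ToUtf16Pair() decodes such a 4-byte sequence back into a packed surrogate pair.
 */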
47 
48 /*
49  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
50  * In case of invalid sequence return first byte of it.
51  */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t max_bytes)52 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
53 {
54     // TODO(d.kovalneko): make the function safe
55     Span<const uint8_t> sp(data, max_bytes);
56     uint8_t d0 = sp[0];
57     if ((d0 & MASK1) == 0) {
58         return {d0, 1};
59     }
60 
61     if (max_bytes < CONST_2) {
62         return {d0, 1};
63     }
64     uint8_t d1 = sp[1];
65     if ((d0 & MASK2) == 0) {
66         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
67     }
68 
69     if (max_bytes < CONST_3) {
70         return {d0, 1};
71     }
72     uint8_t d2 = sp[CONST_2];
73     if ((d0 & MASK3) == 0) {
74         return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
75                 CONST_3};
76     }
77 
78     if (max_bytes < CONST_4) {
79         return {d0, 1};
80     }
81     uint8_t d3 = sp[CONST_3];
82     uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
83                           ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
84 
85     uint32_t pair = 0;
86     pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
87     pair <<= PAIR_ELEMENT_WIDTH;
88     pair |= (code_point & MASK_10BIT) + U16_TAIL;
89 
90     return {pair, CONST_4};
91 }
92 
CombineTwoU16(uint16_t d0,uint16_t d1)93 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
94 {
95     uint32_t codePoint = d0 - HI_SURROGATE_MIN;
96     codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
97     codePoint |= d1 - LO_SURROGATE_MIN;
98     codePoint += LO_SUPPLEMENTS_MIN;
99     return codePoint;
100 }
101 
ConvertUtf16ToMUtf8(uint16_t d0,uint16_t d1)102 constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
103 {
104     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
105     // means that is a single code point, it needs to be represented by three MUTF8 code.
106     if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
107         auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
108         auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
109         auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
110         return {CONST_3, {ch0, ch1, ch2}};
111     }
112 
113     if (d0 == 0) {
114         return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
115     }
116     if (d0 <= MUTF8_1B_MAX) {
117         return {1, {static_cast<uint8_t>(d0)}};
118     }
119     if (d0 <= MUTF8_2B_MAX) {
120         auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
121         auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
122         return {CONST_2, {ch0, ch1}};
123     }
124     if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
125         auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
126         auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
127         auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
128         return {CONST_3, {ch0, ch1, ch2}};
129     }
130 
131     uint32_t codePoint = CombineTwoU16(d0, d1);
132 
133     auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
134     auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
135     auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
136     auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
137 
138     return {CONST_4, {ch0, ch1, ch2, ch3}};
139 }
140 
IsMUtf8OnlySingleBytes(const uint8_t * mutf8_in)141 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
142 {
143     while (*mutf8_in != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
144         if (*mutf8_in >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145             return false;
146         }
147         mutf8_in += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
148     }
149     return true;
150 }
151 
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16_in,uint8_t * mutf8_out,size_t utf16_len,size_t mutf8_len,size_t start)152 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
153                                  size_t start)
154 {
155     size_t mutf8_pos = 0;
156     if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
157         return 0;
158     }
159     size_t end = start + utf16_len;
160     for (size_t i = start; i < end; ++i) {
161         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
162         uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
163         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
164         MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
165         if (mutf8_pos + ch.n > mutf8_len) {
166             break;
167         }
168         for (size_t c = 0; c < ch.n; ++c) {
169             mutf8_out[mutf8_pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
170         }
171         if (ch.n == CONST_4) {  // Two UTF-16 chars are used
172             ++i;
173         }
174     }
175     return mutf8_pos;
176 }
177 
ConvertMUtf8ToUtf16(const uint8_t * mutf8_in,size_t mutf8_len,uint16_t * utf16_out)178 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
179 {
180     size_t in_pos = 0;
181     while (in_pos < mutf8_len) {
182         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
183         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
184 
185         if (p_hi != 0) {
186             *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
187         }
188         *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
189 
190         mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191         in_pos += nbytes;
192     }
193 }
194 
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8_in,uint16_t * utf16_out,size_t mutf8_len,size_t utf16_len,size_t start)195 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
196                                  size_t start)
197 {
198     size_t in_pos = 0;
199     size_t out_pos = 0;
200     while (in_pos < mutf8_len) {
201         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
202         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
203 
204         mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
205         in_pos += nbytes;
206         if (start > 0) {
207             start -= nbytes;
208             continue;
209         }
210 
211         if (p_hi != 0) {
212             ASSERT(utf16_len >= 1);
213             if (out_pos++ >= utf16_len - 1) {  // check for place for two uint16
214                 --out_pos;
215                 break;
216             }
217             *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
218         }
219         if (out_pos++ >= utf16_len) {
220             --out_pos;
221             break;
222         }
223         *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
224     }
225     return out_pos;
226 }
227 
CompareMUtf8ToMUtf8(const uint8_t * mutf8_1,const uint8_t * mutf8_2)228 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
229 {
230     uint32_t c1;
231     uint32_t c2;
232     uint32_t n1;
233     uint32_t n2;
234 
235     do {
236         c1 = *mutf8_1;
237         c2 = *mutf8_2;
238 
239         if (c1 == 0 && c2 == 0) {
240             return 0;
241         }
242 
243         if (c1 == 0 && c2 != 0) {
244             return -1;
245         }
246 
247         if (c1 != 0 && c2 == 0) {
248             return 1;
249         }
250 
251         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
252         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
253 
254         mutf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
255         mutf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
256     } while (c1 == c2);
257 
258     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
259     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
260 
261     auto result = static_cast<int>(c1p1 - c2p1);
262     if (result != 0) {
263         return result;
264     }
265 
266     return c1p2 - c2p2;
267 }
268 
269 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf8_1,size_t utf8_1_length,const uint8_t * utf8_2,size_t utf8_2_length)270 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
271 {
272     uint32_t c1;
273     uint32_t c2;
274     uint32_t n1;
275     uint32_t n2;
276 
277     uint32_t utf8_1_index = 0;
278     uint32_t utf8_2_index = 0;
279 
280     do {
281         if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
282             return 0;
283         }
284 
285         if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
286             return -1;
287         }
288 
289         if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
290             return 1;
291         }
292 
293         c1 = *utf8_1;
294         c2 = *utf8_2;
295 
296         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
297         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
298 
299         utf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
300         utf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
301         utf8_1_index += n1;
302         utf8_2_index += n2;
303     } while (c1 == c2);
304 
305     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
306     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
307 
308     auto result = static_cast<int>(c1p1 - c2p1);
309     if (result != 0) {
310         return result;
311     }
312 
313     return c1p2 - c2p2;
314 }
315 
Mutf8Size(const uint8_t * mutf8)316 size_t Mutf8Size(const uint8_t *mutf8)
317 {
318     return strlen(Mutf8AsCString(mutf8));
319 }
320 
MUtf8ToUtf16Size(const uint8_t * mutf8)321 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
322 {
323     // TODO(d.kovalenko): make it faster
324     size_t res = 0;
325     while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
326         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
327         res += pair > MAX_U16 ? CONST_2 : 1;
328         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
329     }
330     return res;
331 }
332 
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8_len)333 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
334 {
335     size_t pos = 0;
336     size_t res = 0;
337     while (pos != mutf8_len) {
338         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
339         if (nbytes == 0) {
340             nbytes = 1;
341         }
342         res += pair > MAX_U16 ? CONST_2 : 1;
343         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344         pos += nbytes;
345     }
346     return res;
347 }
348 
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)349 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
350 {
351     size_t res = 1;  // zero byte
352     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
353     // means that is a single code point, it needs to be represented by three MUTF8 code.
354     if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
355         mutf16[0] <= LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
356         res += CONST_3;
357         return res;
358     }
359 
360     for (uint32_t i = 0; i < length; ++i) {
361         // NOLINTNEXTLINE(bugprone-branch-clone)
362         if (mutf16[i] == 0) {                    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
363             res += CONST_2;                      // special case for U+0000 => C0 80
364         } else if (mutf16[i] <= MUTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
365             res += 1;
366         } else if (mutf16[i] <= MUTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
367             res += CONST_2;
368             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
369         } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
370             res += CONST_3;
371         } else {
372             res += CONST_4;
373             ++i;
374         }
375     }
376     return res;
377 }
378 
IsEqual(Span<const uint8_t> utf8_1,Span<const uint8_t> utf8_2)379 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
380 {
381     if (utf8_1.size() != utf8_2.size()) {
382         return false;
383     }
384 
385     return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
386 }
387 
IsEqual(const uint8_t * mutf8_1,const uint8_t * mutf8_2)388 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
389 {
390     return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
391 }
392 
IsValidModifiedUTF8(const uint8_t * elems)393 bool IsValidModifiedUTF8(const uint8_t *elems)
394 {
395     ASSERT(elems);
396 
397     while (*elems != '\0') {
398         // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
399         switch (*elems & 0xf0) {
400             case 0x00:
401             case 0x10:  // NOLINT(readability-magic-numbers)
402             case 0x20:  // NOLINT(readability-magic-numbers)
403             case 0x30:  // NOLINT(readability-magic-numbers)
404             case 0x40:  // NOLINT(readability-magic-numbers)
405             case 0x50:  // NOLINT(readability-magic-numbers)
406             case 0x60:  // NOLINT(readability-magic-numbers)
407             case 0x70:  // NOLINT(readability-magic-numbers)
408                 // pattern 0xxx
409                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
410                 ++elems;
411                 break;
412             case 0x80:  // NOLINT(readability-magic-numbers)
413             case 0x90:  // NOLINT(readability-magic-numbers)
414             case 0xa0:  // NOLINT(readability-magic-numbers)
415             case 0xb0:  // NOLINT(readability-magic-numbers)
416                 // pattern 10xx is illegal start
417                 return false;
418 
419             case 0xf0:  // NOLINT(readability-magic-numbers)
420                 // pattern 1111 0xxx starts four byte section
421                 if ((*elems & 0x08) == 0) {  // NOLINT(hicpp-signed-bitwise)
422                     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
423                     ++elems;
424                     if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
425                         return false;
426                     }
427                 } else {
428                     return false;
429                 }
430                 // no need break
431                 [[fallthrough]];
432 
433             case 0xe0:  // NOLINT(readability-magic-numbers)
434                 // pattern 1110
435                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
436                 ++elems;
437                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
438                     return false;
439                 }
440                 // no need break
441                 [[fallthrough]];
442 
443             case 0xc0:  // NOLINT(readability-magic-numbers)
444             case 0xd0:  // NOLINT(readability-magic-numbers)
445                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
446                 ++elems;
447                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
448                     return false;
449                 }
450                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451                 ++elems;
452                 break;
453             default:
454                 break;
455         }
456     }
457     return true;
458 }
459 
460 }  // namespace panda::utf
461