• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 
18 #include <cstddef>
19 #include <cstring>
20 
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24 
25 namespace panda::utf {
26 
// Largest value that fits into one 16-bit UTF-16 code unit. Decoded values above it hold two packed units.
constexpr size_t MAX_U16 = 0xffff;

// Named small integers used in this file as byte counts, byte indices and shift amounts.
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;
33 
34 struct MUtf8Char {
35     size_t n;
36     std::array<uint8_t, CONST_4> ch;
37 };
38 
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
 *
 * NOTE: the last table row describes the 6-byte surrogate-pair form. The conversion routines in this file
 * (ConvertMUtf8ToUtf16Pair, ConvertUtf16ToMUtf8) read and write a 4-byte sequence for a surrogate pair instead.
 */
52 
53 /*
54  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
55  * In case of invalid sequence return first byte of it.
56  */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t max_bytes)57 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
58 {
59     Span<const uint8_t> sp(data, max_bytes);
60     uint8_t d0 = sp[0];
61     if ((d0 & MASK1) == 0) {
62         return {d0, 1};
63     }
64 
65     if (max_bytes < CONST_2) {
66         return {d0, 1};
67     }
68     uint8_t d1 = sp[1];
69     if ((d0 & MASK2) == 0) {
70         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
71     }
72 
73     if (max_bytes < CONST_3) {
74         return {d0, 1};
75     }
76     uint8_t d2 = sp[CONST_2];
77     if ((d0 & MASK3) == 0) {
78         return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
79                 CONST_3};
80     }
81 
82     if (max_bytes < CONST_4) {
83         return {d0, 1};
84     }
85     uint8_t d3 = sp[CONST_3];
86     uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
87                           ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
88 
89     uint32_t pair = 0;
90     pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
91     pair <<= PAIR_ELEMENT_WIDTH;
92     pair |= (code_point & MASK_10BIT) + U16_TAIL;
93 
94     return {pair, CONST_4};
95 }
96 
CombineTwoU16(uint16_t d0,uint16_t d1)97 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
98 {
99     uint32_t codePoint = d0 - HI_SURROGATE_MIN;
100     codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
101     codePoint |= d1 - LO_SURROGATE_MIN;
102     codePoint += LO_SUPPLEMENTS_MIN;
103     return codePoint;
104 }
105 
ConvertUtf16ToMUtf8(uint16_t d0,uint16_t d1)106 constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
107 {
108     // When the first utf16 code is in 0xd800-0xdfff and the second utf16 code is 0,
109     // it is a single code point, and it needs to be represented by three MUTF8 code.
110     if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
111         auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
112         auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
113         auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
114         return {CONST_3, {ch0, ch1, ch2}};
115     }
116 
117     if (d0 == 0) {
118         return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
119     }
120     if (d0 <= MUTF8_1B_MAX) {
121         return {1, {static_cast<uint8_t>(d0)}};
122     }
123     if (d0 <= MUTF8_2B_MAX) {
124         auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
125         auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
126         return {CONST_2, {ch0, ch1}};
127     }
128     if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
129         auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
130         auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
131         auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
132         return {CONST_3, {ch0, ch1, ch2}};
133     }
134 
135     uint32_t codePoint = CombineTwoU16(d0, d1);
136 
137     auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
138     auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
139     auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
140     auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
141 
142     return {CONST_4, {ch0, ch1, ch2, ch3}};
143 }
144 
IsMUtf8OnlySingleBytes(const uint8_t * mutf8_in)145 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
146 {
147     while (*mutf8_in != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
148         if (*mutf8_in >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
149             return false;
150         }
151         mutf8_in += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
152     }
153     return true;
154 }
155 
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16_in,uint8_t * mutf8_out,size_t utf16_len,size_t mutf8_len,size_t start)156 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
157                                  size_t start)
158 {
159     size_t mutf8_pos = 0;
160     if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
161         return 0;
162     }
163     size_t end = start + utf16_len;
164     for (size_t i = start; i < end; ++i) {
165         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
166         uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
167         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
168         MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
169         if (mutf8_pos + ch.n > mutf8_len) {
170             break;
171         }
172         for (size_t c = 0; c < ch.n; ++c) {
173             mutf8_out[mutf8_pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
174         }
175         if (ch.n == CONST_4) {  // Two UTF-16 chars are used
176             ++i;
177         }
178     }
179     return mutf8_pos;
180 }
181 
ConvertMUtf8ToUtf16(const uint8_t * mutf8_in,size_t mutf8_len,uint16_t * utf16_out)182 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
183 {
184     size_t in_pos = 0;
185     while (in_pos < mutf8_len) {
186         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
187         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
188 
189         if (p_hi != 0) {
190             *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191         }
192         *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
193 
194         mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195         in_pos += nbytes;
196     }
197 }
198 
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8_in,uint16_t * utf16_out,size_t mutf8_len,size_t utf16_len,size_t start)199 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
200                                  size_t start)
201 {
202     size_t in_pos = 0;
203     size_t out_pos = 0;
204     while (in_pos < mutf8_len) {
205         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
206         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
207 
208         mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
209         in_pos += nbytes;
210         if (start > 0) {
211             start -= nbytes;
212             continue;
213         }
214 
215         if (p_hi != 0) {
216             if (out_pos++ >= utf16_len - 1) {  // check for place for two uint16
217                 --out_pos;
218                 break;
219             }
220             *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
221         }
222         if (out_pos++ >= utf16_len) {
223             --out_pos;
224             break;
225         }
226         *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
227     }
228     return out_pos;
229 }
230 
CompareMUtf8ToMUtf8(const uint8_t * mutf8_1,const uint8_t * mutf8_2)231 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
232 {
233     uint32_t c1;
234     uint32_t c2;
235     uint32_t n1;
236     uint32_t n2;
237 
238     do {
239         c1 = *mutf8_1;
240         c2 = *mutf8_2;
241 
242         if (c1 == 0 && c2 == 0) {
243             return 0;
244         }
245 
246         if (c1 == 0 && c2 != 0) {
247             return -1;
248         }
249 
250         if (c1 != 0 && c2 == 0) {
251             return 1;
252         }
253 
254         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
255         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
256 
257         mutf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
258         mutf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
259     } while (c1 == c2);
260 
261     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
262     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
263 
264     auto result = static_cast<int>(c1p1 - c2p1);
265     if (result != 0) {
266         return result;
267     }
268 
269     return c1p2 - c2p2;
270 }
271 
272 // Compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf8_1,size_t utf8_1_length,const uint8_t * utf8_2,size_t utf8_2_length)273 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
274 {
275     uint32_t c1;
276     uint32_t c2;
277     uint32_t n1;
278     uint32_t n2;
279 
280     uint32_t utf8_1_index = 0;
281     uint32_t utf8_2_index = 0;
282 
283     do {
284         if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
285             return 0;
286         }
287 
288         if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
289             return -1;
290         }
291 
292         if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
293             return 1;
294         }
295 
296         c1 = *utf8_1;
297         c2 = *utf8_2;
298 
299         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
300         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
301 
302         utf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
303         utf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
304         utf8_1_index += n1;
305         utf8_2_index += n2;
306     } while (c1 == c2);
307 
308     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
309     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
310 
311     auto result = static_cast<int>(c1p1 - c2p1);
312     if (result != 0) {
313         return result;
314     }
315 
316     return c1p2 - c2p2;
317 }
318 
Mutf8Size(const uint8_t * mutf8)319 size_t Mutf8Size(const uint8_t *mutf8)
320 {
321     return strlen(Mutf8AsCString(mutf8));
322 }
323 
MUtf8ToUtf16Size(const uint8_t * mutf8)324 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
325 {
326     size_t res = 0;
327     while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
328         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
329         res += pair > MAX_U16 ? CONST_2 : 1;
330         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
331     }
332     return res;
333 }
334 
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8_len)335 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
336 {
337     size_t pos = 0;
338     size_t res = 0;
339     while (pos != mutf8_len) {
340         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
341         if (nbytes == 0) {
342             nbytes = 1;
343         }
344         res += pair > MAX_U16 ? CONST_2 : 1;
345         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
346         pos += nbytes;
347     }
348     return res;
349 }
350 
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)351 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
352 {
353     size_t res = 1;  // zero byte
354     // When the utf16 data length is only 1 and the code is in 0xd800-0xdfff,
355     // it is a single code point, and it needs to be represented by three MUTF8 code.
356     if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
357         mutf16[0] <= LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
358         res += CONST_3;
359         return res;
360     }
361 
362     for (uint32_t i = 0; i < length; ++i) {
363         // NOLINTNEXTLINE(bugprone-branch-clone)
364         if (mutf16[i] == 0) {                    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
365             res += CONST_2;                      // special case for U+0000 => C0 80
366         } else if (mutf16[i] <= MUTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
367             res += 1;
368         } else if (mutf16[i] <= MUTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
369             res += CONST_2;
370             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
371         } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
372             res += CONST_3;
373         } else {
374             res += CONST_4;
375             ++i;
376         }
377     }
378     return res;
379 }
380 
IsEqual(Span<const uint8_t> utf8_1,Span<const uint8_t> utf8_2)381 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
382 {
383     if (utf8_1.size() != utf8_2.size()) {
384         return false;
385     }
386 
387     return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
388 }
389 
IsEqual(const uint8_t * mutf8_1,const uint8_t * mutf8_2)390 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
391 {
392     return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
393 }
394 
395 }  // namespace panda::utf
396