1 /**
2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17
18 #include <cstring>
19
20 namespace panda::utf {
21
// Largest value representable by a single 16-bit UTF-16 code unit; decoded values above it
// occupy two UTF-16 code units.
constexpr size_t MAX_U16 = 0xFFFFU;

// Named small integers used in this file as byte counts, array indices and shift amounts.
constexpr size_t CONST_2 = 2U;
constexpr size_t CONST_3 = 3U;
constexpr size_t CONST_4 = 4U;
constexpr size_t CONST_6 = 6U;
constexpr size_t CONST_12 = 12U;
28
29 struct MUtf8Char {
30 size_t n;
31 std::array<uint8_t, CONST_4> ch;
32 };
33
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for    First       Last        Byte 1    Byte 2    Byte 3    Byte 4    Byte 5    Byte 6
 *    code point  code point  code point
 * 1  7           U+0000      U+007F      0xxxxxxx
 * 2  11          U+0080      U+07FF      110xxxxx  10xxxxxx
 * 3  16          U+0800      U+FFFF      1110xxxx  10xxxxxx  10xxxxxx
 * 6  21          U+10000     U+10FFFF    11101101  1010xxxx  10xxxxxx  11101101  1011xxxx  10xxxxxx
 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
 *
 * NOTE: ConvertUtf16ToMUtf8 below emits a valid surrogate pair as a single 4-byte sequence
 * (first byte MUTF8_4B_FIRST followed by three continuation bytes), not as the 6-byte form
 * shown in the last row.
 */
47
48 /*
49 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
50 * In case of invalid sequence return first byte of it.
51 */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t max_bytes)52 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
53 {
54 // TODO(d.kovalneko): make the function safe
55 Span<const uint8_t> sp(data, max_bytes);
56 uint8_t d0 = sp[0];
57 if ((d0 & MASK1) == 0) {
58 return {d0, 1};
59 }
60
61 if (max_bytes < CONST_2) {
62 return {d0, 1};
63 }
64 uint8_t d1 = sp[1];
65 if ((d0 & MASK2) == 0) {
66 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
67 }
68
69 if (max_bytes < CONST_3) {
70 return {d0, 1};
71 }
72 uint8_t d2 = sp[CONST_2];
73 if ((d0 & MASK3) == 0) {
74 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
75 CONST_3};
76 }
77
78 if (max_bytes < CONST_4) {
79 return {d0, 1};
80 }
81 uint8_t d3 = sp[CONST_3];
82 uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
83 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
84
85 uint32_t pair = 0;
86 pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
87 pair <<= PAIR_ELEMENT_WIDTH;
88 pair |= (code_point & MASK_10BIT) + U16_TAIL;
89
90 return {pair, CONST_4};
91 }
92
CombineTwoU16(uint16_t d0,uint16_t d1)93 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
94 {
95 uint32_t codePoint = d0 - HI_SURROGATE_MIN;
96 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
97 codePoint |= d1 - LO_SURROGATE_MIN;
98 codePoint += LO_SUPPLEMENTS_MIN;
99 return codePoint;
100 }
101
ConvertUtf16ToMUtf8(uint16_t d0,uint16_t d1)102 constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
103 {
104 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
105 // means that is a single code point, it needs to be represented by three MUTF8 code.
106 if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
107 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
108 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
109 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
110 return {CONST_3, {ch0, ch1, ch2}};
111 }
112
113 if (d0 == 0) {
114 return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
115 }
116 if (d0 <= MUTF8_1B_MAX) {
117 return {1, {static_cast<uint8_t>(d0)}};
118 }
119 if (d0 <= MUTF8_2B_MAX) {
120 auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
121 auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
122 return {CONST_2, {ch0, ch1}};
123 }
124 if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
125 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
126 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
127 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
128 return {CONST_3, {ch0, ch1, ch2}};
129 }
130
131 uint32_t codePoint = CombineTwoU16(d0, d1);
132
133 auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
134 auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
135 auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
136 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
137
138 return {CONST_4, {ch0, ch1, ch2, ch3}};
139 }
140
IsMUtf8OnlySingleBytes(const uint8_t * mutf8_in)141 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
142 {
143 while (*mutf8_in != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
144 if (*mutf8_in >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145 return false;
146 }
147 mutf8_in += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
148 }
149 return true;
150 }
151
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16_in,uint8_t * mutf8_out,size_t utf16_len,size_t mutf8_len,size_t start)152 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
153 size_t start)
154 {
155 size_t mutf8_pos = 0;
156 if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
157 return 0;
158 }
159 size_t end = start + utf16_len;
160 for (size_t i = start; i < end; ++i) {
161 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
162 uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
163 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
164 MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
165 if (mutf8_pos + ch.n > mutf8_len) {
166 break;
167 }
168 for (size_t c = 0; c < ch.n; ++c) {
169 mutf8_out[mutf8_pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
170 }
171 if (ch.n == CONST_4) { // Two UTF-16 chars are used
172 ++i;
173 }
174 }
175 return mutf8_pos;
176 }
177
ConvertMUtf8ToUtf16(const uint8_t * mutf8_in,size_t mutf8_len,uint16_t * utf16_out)178 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
179 {
180 size_t in_pos = 0;
181 while (in_pos < mutf8_len) {
182 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
183 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
184
185 if (p_hi != 0) {
186 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
187 }
188 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
189
190 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191 in_pos += nbytes;
192 }
193 }
194
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8_in,uint16_t * utf16_out,size_t mutf8_len,size_t utf16_len,size_t start)195 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
196 size_t start)
197 {
198 size_t in_pos = 0;
199 size_t out_pos = 0;
200 while (in_pos < mutf8_len) {
201 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
202 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
203
204 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
205 in_pos += nbytes;
206 if (start > 0) {
207 start -= nbytes;
208 continue;
209 }
210
211 if (p_hi != 0) {
212 ASSERT(utf16_len >= 1);
213 if (out_pos++ >= utf16_len - 1) { // check for place for two uint16
214 --out_pos;
215 break;
216 }
217 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
218 }
219 if (out_pos++ >= utf16_len) {
220 --out_pos;
221 break;
222 }
223 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
224 }
225 return out_pos;
226 }
227
CompareMUtf8ToMUtf8(const uint8_t * mutf8_1,const uint8_t * mutf8_2)228 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
229 {
230 uint32_t c1;
231 uint32_t c2;
232 uint32_t n1;
233 uint32_t n2;
234
235 do {
236 c1 = *mutf8_1;
237 c2 = *mutf8_2;
238
239 if (c1 == 0 && c2 == 0) {
240 return 0;
241 }
242
243 if (c1 == 0 && c2 != 0) {
244 return -1;
245 }
246
247 if (c1 != 0 && c2 == 0) {
248 return 1;
249 }
250
251 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
252 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
253
254 mutf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
255 mutf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
256 } while (c1 == c2);
257
258 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
259 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
260
261 auto result = static_cast<int>(c1p1 - c2p1);
262 if (result != 0) {
263 return result;
264 }
265
266 return c1p2 - c2p2;
267 }
268
269 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf8_1,size_t utf8_1_length,const uint8_t * utf8_2,size_t utf8_2_length)270 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
271 {
272 uint32_t c1;
273 uint32_t c2;
274 uint32_t n1;
275 uint32_t n2;
276
277 uint32_t utf8_1_index = 0;
278 uint32_t utf8_2_index = 0;
279
280 do {
281 if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
282 return 0;
283 }
284
285 if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
286 return -1;
287 }
288
289 if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
290 return 1;
291 }
292
293 c1 = *utf8_1;
294 c2 = *utf8_2;
295
296 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
297 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
298
299 utf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
300 utf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
301 utf8_1_index += n1;
302 utf8_2_index += n2;
303 } while (c1 == c2);
304
305 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
306 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
307
308 auto result = static_cast<int>(c1p1 - c2p1);
309 if (result != 0) {
310 return result;
311 }
312
313 return c1p2 - c2p2;
314 }
315
Mutf8Size(const uint8_t * mutf8)316 size_t Mutf8Size(const uint8_t *mutf8)
317 {
318 return strlen(Mutf8AsCString(mutf8));
319 }
320
MUtf8ToUtf16Size(const uint8_t * mutf8)321 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
322 {
323 // TODO(d.kovalenko): make it faster
324 size_t res = 0;
325 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
326 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
327 res += pair > MAX_U16 ? CONST_2 : 1;
328 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
329 }
330 return res;
331 }
332
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8_len)333 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
334 {
335 size_t pos = 0;
336 size_t res = 0;
337 while (pos != mutf8_len) {
338 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
339 if (nbytes == 0) {
340 nbytes = 1;
341 }
342 res += pair > MAX_U16 ? CONST_2 : 1;
343 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344 pos += nbytes;
345 }
346 return res;
347 }
348
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)349 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
350 {
351 size_t res = 1; // zero byte
352 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
353 // means that is a single code point, it needs to be represented by three MUTF8 code.
354 if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
355 mutf16[0] <= LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
356 res += CONST_3;
357 return res;
358 }
359
360 for (uint32_t i = 0; i < length; ++i) {
361 // NOLINTNEXTLINE(bugprone-branch-clone)
362 if (mutf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
363 res += CONST_2; // special case for U+0000 => C0 80
364 } else if (mutf16[i] <= MUTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
365 res += 1;
366 } else if (mutf16[i] <= MUTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
367 res += CONST_2;
368 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
369 } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
370 res += CONST_3;
371 } else {
372 res += CONST_4;
373 ++i;
374 }
375 }
376 return res;
377 }
378
IsEqual(Span<const uint8_t> utf8_1,Span<const uint8_t> utf8_2)379 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
380 {
381 if (utf8_1.size() != utf8_2.size()) {
382 return false;
383 }
384
385 return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
386 }
387
IsEqual(const uint8_t * mutf8_1,const uint8_t * mutf8_2)388 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
389 {
390 return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
391 }
392
IsValidModifiedUTF8(const uint8_t * elems)393 bool IsValidModifiedUTF8(const uint8_t *elems)
394 {
395 ASSERT(elems);
396
397 while (*elems != '\0') {
398 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
399 switch (*elems & 0xf0) {
400 case 0x00:
401 case 0x10: // NOLINT(readability-magic-numbers)
402 case 0x20: // NOLINT(readability-magic-numbers)
403 case 0x30: // NOLINT(readability-magic-numbers)
404 case 0x40: // NOLINT(readability-magic-numbers)
405 case 0x50: // NOLINT(readability-magic-numbers)
406 case 0x60: // NOLINT(readability-magic-numbers)
407 case 0x70: // NOLINT(readability-magic-numbers)
408 // pattern 0xxx
409 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
410 ++elems;
411 break;
412 case 0x80: // NOLINT(readability-magic-numbers)
413 case 0x90: // NOLINT(readability-magic-numbers)
414 case 0xa0: // NOLINT(readability-magic-numbers)
415 case 0xb0: // NOLINT(readability-magic-numbers)
416 // pattern 10xx is illegal start
417 return false;
418
419 case 0xf0: // NOLINT(readability-magic-numbers)
420 // pattern 1111 0xxx starts four byte section
421 if ((*elems & 0x08) == 0) { // NOLINT(hicpp-signed-bitwise)
422 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
423 ++elems;
424 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
425 return false;
426 }
427 } else {
428 return false;
429 }
430 // no need break
431 [[fallthrough]];
432
433 case 0xe0: // NOLINT(readability-magic-numbers)
434 // pattern 1110
435 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
436 ++elems;
437 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
438 return false;
439 }
440 // no need break
441 [[fallthrough]];
442
443 case 0xc0: // NOLINT(readability-magic-numbers)
444 case 0xd0: // NOLINT(readability-magic-numbers)
445 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
446 ++elems;
447 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
448 return false;
449 }
450 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
451 ++elems;
452 break;
453 default:
454 break;
455 }
456 }
457 return true;
458 }
459
460 } // namespace panda::utf
461