1 /** 2 * Copyright (c) 2021-2022 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #include "utf.h" 17 18 #include <cstddef> 19 #include <cstring> 20 21 #include <limits> 22 #include <tuple> 23 #include <utility> 24 25 namespace panda::utf { 26 27 constexpr size_t MAX_U16 = 0xffff; 28 constexpr size_t CONST_2 = 2; 29 constexpr size_t CONST_3 = 3; 30 constexpr size_t CONST_4 = 4; 31 constexpr size_t CONST_6 = 6; 32 constexpr size_t CONST_12 = 12; 33 34 struct MUtf8Char { 35 size_t n; 36 std::array<uint8_t, CONST_4> ch; 37 }; 38 39 /* 40 * MUtf-8 41 * 42 * U+0000 => C0 80 43 * 44 * N Bits for First Last Byte 1 Byte 2 Byte 3 Byte 4 Byte 5 Byte 6 45 * code point code point code point 46 * 1 7 U+0000 U+007F 0xxxxxxx 47 * 2 11 U+0080 U+07FF 110xxxxx 10xxxxxx 48 * 3 16 U+0800 U+FFFF 1110xxxx 10xxxxxx 10xxxxxx 49 * 6 21 U+10000 U+10FFFF 11101101 1010xxxx 10xxxxxx 11101101 1011xxxx 10xxxxxx 50 * for U+10000 -- U+10FFFF encodes the following (value - 0x10000) 51 */ 52 53 /* 54 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size]. 55 * In case of invalid sequence return first byte of it. 56 */ ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t max_bytes)57 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes) 58 { 59 // TODO(d.kovalneko): make the function safe 60 Span<const uint8_t> sp(data, max_bytes); 61 uint8_t d0 = sp[0]; 62 if ((d0 & MASK1) == 0) { 63 return {d0, 1}; 64 } 65 66 if (max_bytes < CONST_2) { 67 return {d0, 1}; 68 } 69 uint8_t d1 = sp[1]; 70 if ((d0 & MASK2) == 0) { 71 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2}; 72 } 73 74 if (max_bytes < CONST_3) { 75 return {d0, 1}; 76 } 77 uint8_t d2 = sp[CONST_2]; 78 if ((d0 & MASK3) == 0) { 79 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT), 80 CONST_3}; 81 } 82 83 if (max_bytes < CONST_4) { 84 return {d0, 1}; 85 } 86 uint8_t d3 = sp[CONST_3]; 87 uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) | 88 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT); 89 90 uint32_t pair = 0; 91 pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT; 92 pair <<= PAIR_ELEMENT_WIDTH; 93 pair |= (code_point & MASK_10BIT) + U16_TAIL; 94 95 return {pair, CONST_4}; 96 } 97 CombineTwoU16(uint16_t d0,uint16_t d1)98 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1) 99 { 100 uint32_t codePoint = d0 - HI_SURROGATE_MIN; 101 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH); 102 codePoint |= d1 - LO_SURROGATE_MIN; 103 codePoint += LO_SUPPLEMENTS_MIN; 104 return codePoint; 105 } 106 ConvertUtf16ToMUtf8(uint16_t d0,uint16_t d1)107 constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1) 108 { 109 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0, 110 // means that is a single code point, it needs to be represented by three MUTF8 code. 111 if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) { 112 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12)); 113 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT)); 114 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT)); 115 return {CONST_3, {ch0, ch1, ch2}}; 116 } 117 118 if (d0 == 0) { 119 return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}}; 120 } 121 if (d0 <= MUTF8_1B_MAX) { 122 return {1, {static_cast<uint8_t>(d0)}}; 123 } 124 if (d0 <= MUTF8_2B_MAX) { 125 auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6)); 126 auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT)); 127 return {CONST_2, {ch0, ch1}}; 128 } 129 if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) { 130 auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12)); 131 auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT)); 132 auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT)); 133 return {CONST_3, {ch0, ch1, ch2}}; 134 } 135 136 uint32_t codePoint = CombineTwoU16(d0, d1); 137 138 auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST); 139 auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1); 140 auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1); 141 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1); 142 143 return {CONST_4, {ch0, ch1, ch2, ch3}}; 144 } 145 IsMUtf8OnlySingleBytes(const uint8_t * mutf8_in)146 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in) 147 { 148 while (*mutf8_in != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 149 if (*mutf8_in >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 150 return false; 151 } 152 mutf8_in += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 153 } 154 return true; 155 } 156 ConvertRegionUtf16ToMUtf8(const uint16_t * utf16_in,uint8_t * mutf8_out,size_t utf16_len,size_t mutf8_len,size_t start)157 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len, 158 size_t start) 159 { 160 size_t mutf8_pos = 0; 161 if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) { 162 return 0; 163 } 164 size_t end = start + utf16_len; 165 for (size_t i = start; i < end; ++i) { 166 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 167 uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0; 168 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 169 MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code); 170 if (mutf8_pos + ch.n > mutf8_len) { 171 break; 172 } 173 for (size_t c = 0; c < ch.n; ++c) { 174 mutf8_out[mutf8_pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 175 } 176 if (ch.n == CONST_4) { // Two UTF-16 chars are used 177 ++i; 178 } 179 } 180 return mutf8_pos; 181 } 182 ConvertMUtf8ToUtf16(const uint8_t * mutf8_in,size_t mutf8_len,uint16_t * utf16_out)183 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out) 184 { 185 size_t in_pos = 0; 186 while (in_pos < mutf8_len) { 187 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos); 188 auto [p_hi, p_lo] = SplitUtf16Pair(pair); 189 190 if (p_hi != 0) { 191 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 192 } 193 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 194 195 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 196 in_pos += nbytes; 197 } 198 } 199 ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8_in,uint16_t * utf16_out,size_t mutf8_len,size_t utf16_len,size_t start)200 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len, 201 size_t start) 202 { 203 size_t in_pos = 0; 204 size_t out_pos = 0; 205 while (in_pos < mutf8_len) { 206 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos); 207 auto [p_hi, p_lo] = SplitUtf16Pair(pair); 208 209 mutf8_in += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 210 in_pos += nbytes; 211 if (start > 0) { 212 start -= nbytes; 213 continue; 214 } 215 216 if (p_hi != 0) { 217 if (out_pos++ >= utf16_len - 1) { // check for place for two uint16 218 --out_pos; 219 break; 220 } 221 *utf16_out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 222 } 223 if (out_pos++ >= utf16_len) { 224 --out_pos; 225 break; 226 } 227 *utf16_out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 228 } 229 return out_pos; 230 } 231 CompareMUtf8ToMUtf8(const uint8_t * mutf8_1,const uint8_t * mutf8_2)232 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2) 233 { 234 uint32_t c1; 235 uint32_t c2; 236 uint32_t n1; 237 uint32_t n2; 238 239 do { 240 c1 = *mutf8_1; 241 c2 = *mutf8_2; 242 243 if (c1 == 0 && c2 == 0) { 244 return 0; 245 } 246 247 if (c1 == 0 && c2 != 0) { 248 return -1; 249 } 250 251 if (c1 != 0 && c2 == 0) { 252 return 1; 253 } 254 255 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1); 256 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2); 257 258 mutf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 259 mutf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 260 } while (c1 == c2); 261 262 auto [c1p1, c1p2] = SplitUtf16Pair(c1); 263 auto [c2p1, c2p2] = SplitUtf16Pair(c2); 264 265 auto result = static_cast<int>(c1p1 - c2p1); 266 if (result != 0) { 267 return result; 268 } 269 270 return c1p2 - c2p2; 271 } 272 273 // compare plain utf8, which allows 0 inside a string CompareUtf8ToUtf8(const uint8_t * utf8_1,size_t utf8_1_length,const uint8_t * utf8_2,size_t utf8_2_length)274 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length) 275 { 276 uint32_t c1; 277 uint32_t c2; 278 uint32_t n1; 279 uint32_t n2; 280 281 uint32_t utf8_1_index = 0; 282 uint32_t utf8_2_index = 0; 283 284 do { 285 if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) { 286 return 0; 287 } 288 289 if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) { 290 return -1; 291 } 292 293 if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) { 294 return 1; 295 } 296 297 c1 = *utf8_1; 298 c2 = *utf8_2; 299 300 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1); 301 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2); 302 303 utf8_1 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 304 utf8_2 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 305 utf8_1_index += n1; 306 utf8_2_index += n2; 307 } while (c1 == c2); 308 309 auto [c1p1, c1p2] = SplitUtf16Pair(c1); 310 auto [c2p1, c2p2] = SplitUtf16Pair(c2); 311 312 auto result = static_cast<int>(c1p1 - c2p1); 313 if (result != 0) { 314 return result; 315 } 316 317 return c1p2 - c2p2; 318 } 319 Mutf8Size(const uint8_t * mutf8)320 size_t Mutf8Size(const uint8_t *mutf8) 321 { 322 return strlen(Mutf8AsCString(mutf8)); 323 } 324 MUtf8ToUtf16Size(const uint8_t * mutf8)325 size_t MUtf8ToUtf16Size(const uint8_t *mutf8) 326 { 327 // TODO(d.kovalenko): make it faster 328 size_t res = 0; 329 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 330 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8); 331 res += pair > MAX_U16 ? CONST_2 : 1; 332 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 333 } 334 return res; 335 } 336 MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8_len)337 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len) 338 { 339 size_t pos = 0; 340 size_t res = 0; 341 while (pos != mutf8_len) { 342 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos); 343 if (nbytes == 0) { 344 nbytes = 1; 345 } 346 res += pair > MAX_U16 ? CONST_2 : 1; 347 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 348 pos += nbytes; 349 } 350 return res; 351 } 352 Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)353 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length) 354 { 355 size_t res = 1; // zero byte 356 // when utf16 data length is only 1 and code in 0xd800-0xdfff, 357 // means that is a single code point, it needs to be represented by three MUTF8 code. 358 if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 359 mutf16[0] <= LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 360 res += CONST_3; 361 return res; 362 } 363 364 for (uint32_t i = 0; i < length; ++i) { 365 // NOLINTNEXTLINE(bugprone-branch-clone) 366 if (mutf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 367 res += CONST_2; // special case for U+0000 => C0 80 368 } else if (mutf16[i] <= MUTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 369 res += 1; 370 } else if (mutf16[i] <= MUTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 371 res += CONST_2; 372 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 373 } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) { 374 res += CONST_3; 375 } else { 376 res += CONST_4; 377 ++i; 378 } 379 } 380 return res; 381 } 382 IsEqual(Span<const uint8_t> utf8_1,Span<const uint8_t> utf8_2)383 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2) 384 { 385 if (utf8_1.size() != utf8_2.size()) { 386 return false; 387 } 388 389 return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0; 390 } 391 IsEqual(const uint8_t * mutf8_1,const uint8_t * mutf8_2)392 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2) 393 { 394 return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0; 395 } 396 IsValidModifiedUTF8(const uint8_t * elems)397 bool IsValidModifiedUTF8(const uint8_t *elems) 398 { 399 ASSERT(elems); 400 401 while (*elems != '\0') { 402 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers) 403 switch (*elems & 0xf0) { 404 case 0x00: 405 case 0x10: // NOLINT(readability-magic-numbers) 406 case 0x20: // NOLINT(readability-magic-numbers) 407 case 0x30: // NOLINT(readability-magic-numbers) 408 case 0x40: // NOLINT(readability-magic-numbers) 409 case 0x50: // NOLINT(readability-magic-numbers) 410 case 0x60: // NOLINT(readability-magic-numbers) 411 case 0x70: // NOLINT(readability-magic-numbers) 412 // pattern 0xxx 413 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 414 ++elems; 415 break; 416 case 0x80: // NOLINT(readability-magic-numbers) 417 case 0x90: // NOLINT(readability-magic-numbers) 418 case 0xa0: // NOLINT(readability-magic-numbers) 419 case 0xb0: // NOLINT(readability-magic-numbers) 420 // pattern 10xx is illegal start 421 return false; 422 423 case 0xf0: // NOLINT(readability-magic-numbers) 424 // pattern 1111 0xxx starts four byte section 425 if ((*elems & 0x08) == 0) { // NOLINT(hicpp-signed-bitwise) 426 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 427 ++elems; 428 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers) 429 return false; 430 } 431 } else { 432 return false; 433 } 434 // no need break 435 [[fallthrough]]; 436 437 case 0xe0: // NOLINT(readability-magic-numbers) 438 // pattern 1110 439 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 440 ++elems; 441 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers) 442 return false; 443 } 444 // no need break 445 [[fallthrough]]; 446 447 case 0xc0: // NOLINT(readability-magic-numbers) 448 case 0xd0: // NOLINT(readability-magic-numbers) 449 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 450 ++elems; 451 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers) 452 return false; 453 } 454 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 455 ++elems; 456 break; 457 default: 458 break; 459 } 460 } 461 return true; 462 } 463 464 } // namespace panda::utf 465