1 /**
2 * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17
18 #include <cstddef>
19 #include <cstring>
20
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24
// Bias subtracted when a lead/trail surrogate pair is folded into one code point:
// (0xd800 << 10) + 0xdc00 - 0x10000.
static constexpr uint32_t U16_SURROGATE_OFFSET = (0xd800U << 10U) + 0xdc00U - 0x10000U;

// Folds a UTF-16 lead unit and trail unit into the single 32-bit value they encode.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (((static_cast<uint32_t>(lead)) << 10U) + (static_cast<uint32_t>(trail)) - U16_SURROGATE_OFFSET)
30
31 namespace ark::utf {
32
/*
 *  MUtf-8
 *
 *  U+0000 => C0 80
 *
 *  N  Bits for    First       Last        Byte 1    Byte 2    Byte 3    Byte 4    Byte 5    Byte 6
 *     code point  code point  code point
 *  1  7           U+0000      U+007F      0xxxxxxx
 *  2  11          U+0080      U+07FF      110xxxxx  10xxxxxx
 *  3  16          U+0800      U+FFFF      1110xxxx  10xxxxxx  10xxxxxx
 *  6  21          U+10000     U+10FFFF    11101101  1010xxxx  10xxxxxx  11101101  1011xxxx  10xxxxxx
 *  For U+10000 -- U+10FFFF the six-byte form encodes (value - 0x10000).
 */
46
47 /*
48 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
49 * In case of invalid sequence return first byte of it.
50 */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)51 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)
52 {
53 // NOTE(d.kovalneko): make the function safe
54 Span<const uint8_t> sp(data, maxBytes);
55 uint8_t d0 = sp[0];
56 if ((d0 & MASK1) == 0) {
57 return {d0, 1};
58 }
59
60 if (maxBytes < CONST_2 || sp[1] == 0) {
61 return {d0, 1};
62 }
63 uint8_t d1 = sp[1];
64 if ((d0 & MASK2) == 0) {
65 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
66 }
67
68 if (maxBytes < CONST_3 || sp[CONST_2] == 0) {
69 return {d0, 1};
70 }
71 uint8_t d2 = sp[CONST_2];
72 if ((d0 & MASK3) == 0) {
73 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
74 CONST_3};
75 }
76
77 if (maxBytes < CONST_4 || sp[CONST_3] == 0) {
78 return {d0, 1};
79 }
80 uint8_t d3 = sp[CONST_3];
81 uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
82 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
83
84 uint32_t pair = 0;
85 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
86 pair <<= PAIR_ELEMENT_WIDTH;
87 pair |= (codePoint & MASK_10BIT) + U16_TAIL;
88
89 return {pair, CONST_4};
90 }
91
CombineTwoU16(uint16_t d0,uint16_t d1)92 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
93 {
94 uint32_t codePoint = d0 - DECODE_LEAD_LOW;
95 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
96 codePoint |= d1 - DECODE_TRAIL_LOW; // NOLINT(hicpp-signed-bitwise
97 codePoint += DECODE_SECOND_FACTOR;
98 return codePoint;
99 }
100
IsMUtf8OnlySingleBytes(const uint8_t * mutf8In)101 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)
102 {
103 while (*mutf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104 if (*mutf8In >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105 return false;
106 }
107 mutf8In += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
108 }
109 return true;
110 }
111
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16In,uint8_t * mutf8Out,size_t utf16Len,size_t mutf8Len,size_t start)112 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len,
113 size_t start)
114 {
115 return ConvertRegionUtf16ToUtf8(utf16In, mutf8Out, utf16Len, mutf8Len, start, true);
116 }
117
ConvertMUtf8ToUtf16(const uint8_t * mutf8In,size_t mutf8Len,uint16_t * utf16Out)118 void ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)
119 {
120 size_t inPos = 0;
121 while (inPos < mutf8Len) {
122 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
123 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
124
125 if (p_hi != 0) {
126 *utf16Out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
127 }
128 *utf16Out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
129
130 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
131 inPos += nbytes;
132 }
133 }
134
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)135 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len,
136 size_t start)
137 {
138 size_t inPos = 0;
139 size_t outPos = 0;
140 while (inPos < mutf8Len) {
141 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
142 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
143
144 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145 inPos += nbytes;
146 if (start > 0) {
147 start -= nbytes;
148 continue;
149 }
150
151 if (p_hi != 0) {
152 if (outPos++ >= utf16Len - 1) { // check for place for two uint16
153 --outPos;
154 break;
155 }
156 *utf16Out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
157 }
158 if (outPos++ >= utf16Len) {
159 --outPos;
160 break;
161 }
162 *utf16Out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
163 }
164 return outPos;
165 }
166
CompareMUtf8ToMUtf8(const uint8_t * mutf81,const uint8_t * mutf82)167 int CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)
168 {
169 uint32_t c1;
170 uint32_t c2;
171 uint32_t n1;
172 uint32_t n2;
173
174 do {
175 c1 = *mutf81;
176 c2 = *mutf82;
177
178 if (c1 == 0 && c2 == 0) {
179 return 0;
180 }
181
182 if (c1 == 0 && c2 != 0) {
183 return -1;
184 }
185
186 if (c1 != 0 && c2 == 0) {
187 return 1;
188 }
189
190 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf81);
191 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf82);
192
193 mutf81 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194 mutf82 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195 } while (c1 == c2);
196
197 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
198 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
199
200 auto result = static_cast<int>(c1p1 - c2p1);
201 if (result != 0) {
202 return result;
203 }
204
205 return c1p2 - c2p2;
206 }
207
208 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf81,size_t utf81Length,const uint8_t * utf82,size_t utf82Length)209 int CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)
210 {
211 uint32_t c1;
212 uint32_t c2;
213 uint32_t n1;
214 uint32_t n2;
215
216 uint32_t utf81Index = 0;
217 uint32_t utf82Index = 0;
218
219 do {
220 if (utf81Index == utf81Length && utf82Index == utf82Length) {
221 return 0;
222 }
223
224 if (utf81Index == utf81Length && utf82Index < utf82Length) {
225 return -1;
226 }
227
228 if (utf81Index < utf81Length && utf82Index == utf82Length) {
229 return 1;
230 }
231
232 c1 = *utf81;
233 c2 = *utf82;
234
235 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf81);
236 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf82);
237
238 utf81 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239 utf82 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240 utf81Index += n1;
241 utf82Index += n2;
242 } while (c1 == c2);
243
244 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
245 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
246
247 auto result = static_cast<int>(c1p1 - c2p1);
248 if (result != 0) {
249 return result;
250 }
251
252 return c1p2 - c2p2;
253 }
254
Mutf8Size(const uint8_t * mutf8)255 size_t Mutf8Size(const uint8_t *mutf8)
256 {
257 return strlen(Mutf8AsCString(mutf8));
258 }
259
MUtf8ToUtf16Size(const uint8_t * mutf8)260 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
261 {
262 // NOTE(d.kovalenko): make it faster
263 size_t res = 0;
264 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
266 res += pair > MAX_U16 ? CONST_2 : 1;
267 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
268 }
269 return res;
270 }
271
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)272 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)
273 {
274 size_t pos = 0;
275 size_t res = 0;
276 while (pos != mutf8Len) {
277 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
278 if (nbytes == 0) {
279 nbytes = 1;
280 }
281 res += pair > MAX_U16 ? CONST_2 : 1;
282 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283 pos += nbytes;
284 }
285 return res;
286 }
287
IsEqual(Span<const uint8_t> utf81,Span<const uint8_t> utf82)288 bool IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)
289 {
290 if (utf81.size() != utf82.size()) {
291 return false;
292 }
293
294 return memcmp(utf81.data(), utf82.data(), utf81.size()) == 0;
295 }
296
IsEqual(const uint8_t * mutf81,const uint8_t * mutf82)297 bool IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)
298 {
299 return strcmp(Mutf8AsCString(mutf81), Mutf8AsCString(mutf82)) == 0;
300 }
301
IsValidModifiedUTF8(const uint8_t * elems)302 bool IsValidModifiedUTF8(const uint8_t *elems)
303 {
304 ASSERT(elems);
305
306 while (*elems != '\0') {
307 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
308 switch (*elems & 0xf0) {
309 case 0x00:
310 case 0x10: // NOLINT(readability-magic-numbers)
311 case 0x20: // NOLINT(readability-magic-numbers)
312 case 0x30: // NOLINT(readability-magic-numbers)
313 case 0x40: // NOLINT(readability-magic-numbers)
314 case 0x50: // NOLINT(readability-magic-numbers)
315 case 0x60: // NOLINT(readability-magic-numbers)
316 case 0x70: // NOLINT(readability-magic-numbers)
317 // pattern 0xxx
318 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
319 ++elems;
320 break;
321 case 0x80: // NOLINT(readability-magic-numbers)
322 case 0x90: // NOLINT(readability-magic-numbers)
323 case 0xa0: // NOLINT(readability-magic-numbers)
324 case 0xb0: // NOLINT(readability-magic-numbers)
325 // pattern 10xx is illegal start
326 return false;
327
328 case 0xf0: // NOLINT(readability-magic-numbers)
329 // pattern 1111 0xxx starts four byte section
330 if ((*elems & 0x08) != 0) { // NOLINT(hicpp-signed-bitwise)
331 return false;
332 }
333 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334 ++elems;
335 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
336 return false;
337 }
338 // no need break
339 [[fallthrough]];
340
341 case 0xe0: // NOLINT(readability-magic-numbers)
342 // pattern 1110
343 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344 ++elems;
345 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
346 return false;
347 }
348 // no need break
349 [[fallthrough]];
350
351 case 0xc0: // NOLINT(readability-magic-numbers)
352 case 0xd0: // NOLINT(readability-magic-numbers)
353 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
354 ++elems;
355 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
356 return false;
357 }
358 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359 ++elems;
360 break;
361 default:
362 UNREACHABLE();
363 break;
364 }
365 }
366 return true;
367 }
368
UTF16Decode(uint16_t lead,uint16_t trail)369 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
370 {
371 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
372 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
373 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
374 return cp;
375 }
376
IsValidUTF8(const std::vector<uint8_t> & data)377 bool IsValidUTF8(const std::vector<uint8_t> &data)
378 {
379 uint32_t length = data.size();
380 switch (length) {
381 case UtfLength::ONE:
382 if (data.at(0) >= BIT_MASK_1) {
383 return false;
384 }
385 break;
386 case UtfLength::TWO:
387 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
388 return false;
389 }
390 break;
391 case UtfLength::THREE:
392 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
393 return false;
394 }
395 break;
396 case UtfLength::FOUR:
397 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
398 return false;
399 }
400 break;
401 default:
402 UNREACHABLE();
403 break;
404 }
405
406 for (uint32_t i = 1; i < length; i++) {
407 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
408 return false;
409 }
410 }
411 return true;
412 }
413
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify)414 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
415 {
416 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
417 // means that is a single code point, it needs to be represented by three UTF8 code.
418 if (d1 == 0 && d0 >= DECODE_LEAD_LOW && d0 <= DECODE_TRAIL_HIGH) {
419 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
420 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
421 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
422 return {UtfLength::THREE, {ch0, ch1, ch2}};
423 }
424
425 if (d0 == 0) {
426 if (modify) {
427 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
428 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
429 }
430 return {UtfLength::ONE, {0x00U}};
431 }
432 if (d0 <= UTF8_1B_MAX) {
433 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
434 }
435 if (d0 <= UTF8_2B_MAX) {
436 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
437 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & MASK_6BIT));
438 return {UtfLength::TWO, {ch0, ch1}};
439 }
440 if (d0 < DECODE_LEAD_LOW || d0 > DECODE_LEAD_HIGH) {
441 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
442 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
443 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
444 return {UtfLength::THREE, {ch0, ch1, ch2}};
445 }
446 if (d1 < DECODE_TRAIL_LOW || d1 > DECODE_TRAIL_HIGH) {
447 // Bad sequence
448 UNREACHABLE();
449 }
450
451 uint32_t codePoint = CombineTwoU16(d0, d1);
452
453 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
454 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & MASK_6BIT) | MASK1);
455 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & MASK_6BIT) | MASK1);
456 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
457
458 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
459 }
460
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)461 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
462 {
463 size_t res = 1; // zero byte
464 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
465 // means that is a single code point, it needs to be represented by three UTF8 code.
466 if (length == 1 && utf16[0] >= DECODE_LEAD_LOW && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
467 utf16[0] <= DECODE_TRAIL_HIGH) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
468 res += UtfLength::THREE;
469 return res;
470 }
471
472 for (uint32_t i = 0; i < length; ++i) {
473 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
474 if (modify) {
475 res += UtfLength::TWO; // special case for U+0000 => C0 80
476 } else {
477 res += UtfLength::ONE;
478 }
479 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480 res += 1;
481 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
482 res += UtfLength::TWO;
483 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
484 } else if (utf16[i] < DECODE_LEAD_LOW || utf16[i] > DECODE_LEAD_HIGH) {
485 res += UtfLength::THREE;
486 } else {
487 if (i < length - 1 &&
488 utf16[i + 1] >= DECODE_TRAIL_LOW && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
489 utf16[i + 1] <= DECODE_TRAIL_HIGH) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
490 res += UtfLength::FOUR;
491 ++i;
492 } else {
493 res += UtfLength::THREE;
494 }
495 }
496 }
497 return res;
498 }
499
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)500 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
501 {
502 return Utf16ToUtf8Size(mutf16, length, true);
503 }
504
505 // CC-OFFNXT(G.FUN.01) solid logic
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify)506 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
507 size_t start, bool modify)
508 {
509 size_t utf8Pos = 0;
510 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
511 return 0;
512 }
513 size_t end = start + utf16Len;
514 for (size_t i = start; i < end; ++i) {
515 uint16_t next16Code = 0;
516 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
517 if ((i + 1) != end && IsAvailableNextUtf16Code(utf16In[i + 1])) {
518 next16Code = utf16In[i + 1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
519 }
520 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
521 Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify);
522 if (utf8Pos + ch.n > utf8Len) {
523 break;
524 }
525 for (size_t c = 0; c < ch.n; ++c) {
526 utf8Out[utf8Pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
527 }
528 if (ch.n == UtfLength::FOUR) { // Two UTF-16 chars are used
529 ++i;
530 }
531 }
532 return utf8Pos;
533 }
534
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)535 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
536 {
537 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
538 if ((d0 & MASK1) == 0) {
539 return {d0, 1};
540 }
541
542 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
543 if ((d0 & MASK2) == 0) {
544 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
545 }
546
547 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
548 if ((d0 & MASK3) == 0) {
549 return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
550 UtfLength::THREE};
551 }
552
553 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
554 uint32_t codePoint = ((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
555 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
556
557 uint32_t pair = 0;
558 if (combine) {
559 uint32_t lead = ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD);
560 uint32_t tail = ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
561 pair = U16_GET_SUPPLEMENTARY(lead, tail); // NOLINT(hicpp-signed-bitwise)
562 } else {
563 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) << PAIR_ELEMENT_WIDTH;
564 pair |= ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
565 }
566
567 return {pair, UtfLength::FOUR};
568 }
569
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)570 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
571 {
572 return MUtf8ToUtf16Size(utf8, utf8Len);
573 }
574
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)575 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
576 size_t start)
577 {
578 return ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
579 }
580
IsUTF16SurrogatePair(const uint16_t lead)581 bool IsUTF16SurrogatePair(const uint16_t lead)
582 {
583 return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH;
584 }
585
586 /**
587 * The table below is to translate integer numbers from [0..99] range to pairs of corresponding utf16 codes.
588 * The pairs are packed into utf::BidigitsCode type.
589 *
590 * Example: 0 -> 0x00300030 ("00")
591 * 1 -> 0x00310030 ("01")
592 * ...
593 * 99 -> 0x00390039 ("99")
594 */
595 using BidigitsCode = uint32_t;
596 static constexpr size_t BIDIGITS_CODE_TAB_SIZE = 100U;
597
598 static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> BIDIGITS_CODE_TAB = {
599 0x00300030, 0x00310030, 0x00320030, 0x00330030, 0x00340030, 0x00350030, 0x00360030, 0x00370030, 0x00380030,
600 0x00390030, 0x00300031, 0x00310031, 0x00320031, 0x00330031, 0x00340031, 0x00350031, 0x00360031, 0x00370031,
601 0x00380031, 0x00390031, 0x00300032, 0x00310032, 0x00320032, 0x00330032, 0x00340032, 0x00350032, 0x00360032,
602 0x00370032, 0x00380032, 0x00390032, 0x00300033, 0x00310033, 0x00320033, 0x00330033, 0x00340033, 0x00350033,
603 0x00360033, 0x00370033, 0x00380033, 0x00390033, 0x00300034, 0x00310034, 0x00320034, 0x00330034, 0x00340034,
604 0x00350034, 0x00360034, 0x00370034, 0x00380034, 0x00390034, 0x00300035, 0x00310035, 0x00320035, 0x00330035,
605 0x00340035, 0x00350035, 0x00360035, 0x00370035, 0x00380035, 0x00390035, 0x00300036, 0x00310036, 0x00320036,
606 0x00330036, 0x00340036, 0x00350036, 0x00360036, 0x00370036, 0x00380036, 0x00390036, 0x00300037, 0x00310037,
607 0x00320037, 0x00330037, 0x00340037, 0x00350037, 0x00360037, 0x00370037, 0x00380037, 0x00390037, 0x00300038,
608 0x00310038, 0x00320038, 0x00330038, 0x00340038, 0x00350038, 0x00360038, 0x00370038, 0x00380038, 0x00390038,
609 0x00300039, 0x00310039, 0x00320039, 0x00330039, 0x00340039, 0x00350039, 0x00360039, 0x00370039, 0x00380039,
610 0x00390039};
611
UInt64ToUtf16Array(uint64_t v,uint16_t * outUtf16Buf,uint32_t nDigits,bool negative)612 void UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)
613 {
614 ASSERT(outUtf16Buf != nullptr && nDigits != 0);
615
616 constexpr uint64_t POW10_1 = 10U;
617 constexpr uint64_t POW10_2 = 100U;
618
619 Span<uint16_t> outSpan(outUtf16Buf, nDigits);
620 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
621 auto *out = reinterpret_cast<uint32_t *>(outUtf16Buf + nDigits);
622 int i = 0;
623 while (v >= POW10_2) {
624 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
625 out[--i] = BIDIGITS_CODE_TAB[v % POW10_2];
626 v /= POW10_2;
627 }
628 if (v >= POW10_1) {
629 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
630 out[--i] = BIDIGITS_CODE_TAB[v];
631 } else {
632 outSpan[negative ? 1U : 0] = v + '0';
633 }
634 if (negative) {
635 outSpan[0] = '-';
636 }
637 }
638
static constexpr uint16_t C_SPACE = 0x0020;
static constexpr uint16_t C_0009 = 0x0009;
static constexpr uint16_t C_000D = 0x000D;
static constexpr uint16_t C_000E = 0x000E;
static constexpr uint16_t C_00A0 = 0x00A0;
static constexpr uint16_t C_1680 = 0x1680;
static constexpr uint16_t C_2000 = 0x2000;
static constexpr uint16_t C_200A = 0x200A;
static constexpr uint16_t C_2028 = 0x2028;
static constexpr uint16_t C_2029 = 0x2029;
static constexpr uint16_t C_202F = 0x202F;
static constexpr uint16_t C_205F = 0x205F;
static constexpr uint16_t C_3000 = 0x3000;
static constexpr uint16_t C_FEFF = 0xFEFF;

/*
 * Return true when the utf16 unit `c` is one of:
 *   0x0009..0x000D  tab, line feed, vertical tab, form feed, carriage return
 *   0x0020          space
 *   0x00A0          no-break space
 *   0x1680          Ogham space mark
 *   0x2000..0x200A  en quad .. hair space
 *   0x2028, 0x2029  line separator, paragraph separator
 *   0x202F          narrow no-break space
 *   0x205F          medium mathematical space
 *   0x3000          ideographic space
 *   0xFEFF          byte order mark
 */
bool IsWhiteSpaceChar(uint16_t c)
{
    // The two ranges below only cover everything under 0x00A0 because 0x000E directly follows 0x000D.
    static_assert(C_000D + 1 == C_000E, "control whitespace range must end right before 0x000E");

    // [0x0000, 0x000D]: only 0x0009..0x000D are whitespace
    if (c <= C_000D) {
        return c >= C_0009;
    }
    // [0x000E, 0x009F]: common characters, the plain space is the only whitespace
    if (c < C_00A0) {
        return c == C_SPACE;
    }
    // [0x2000, 0x200A]: the block of typographic spaces
    if (C_2000 <= c && c <= C_200A) {
        return true;
    }
    // Remaining stand-alone whitespace code units
    switch (c) {
        case C_00A0:
        case C_1680:
        case C_2028:
        case C_2029:
        case C_202F:
        case C_205F:
        case C_3000:
        case C_FEFF:
            return true;
        default:
            return false;
    }
}
725
726 } // namespace ark::utf
727