1 /**
2 * Copyright (c) 2021-2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17
18 #include <cstddef>
19 #include <cstring>
20
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24
// Bias removed when folding a surrogate pair into a code point:
// (lead << 10) + trail - U16_SURROGATE_OFFSET maps (0xd800, 0xdc00) to 0x10000.
static constexpr uint32_t U16_SURROGATE_OFFSET = (0xd800U << 10U) + 0xdc00U - 0x10000U;
// Combines a lead and a trail UTF-16 surrogate into the supplementary code point they encode.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<uint32_t>(lead) << 10U) + static_cast<uint32_t>(trail) - U16_SURROGATE_OFFSET)
30
31 namespace ark::utf {
32
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for     First        Last         Byte 1     Byte 2     Byte 3     Byte 4     Byte 5     Byte 6
 *    code point   code point   code point
 * 1  7            U+0000       U+007F       0xxxxxxx
 * 2  11           U+0080       U+07FF       110xxxxx   10xxxxxx
 * 3  16           U+0800       U+FFFF       1110xxxx   10xxxxxx   10xxxxxx
 * 6  21           U+10000      U+10FFFF     11101101   1010xxxx   10xxxxxx   11101101   1011xxxx   10xxxxxx
 *
 * For U+10000 -- U+10FFFF the six-byte form encodes (value - 0x10000).
 */
46
47 /*
48 * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
49 * In case of invalid sequence return first byte of it.
50 */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)51 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)
52 {
53 // NOTE(d.kovalneko): make the function safe
54 Span<const uint8_t> sp(data, maxBytes);
55 uint8_t d0 = sp[0];
56 if ((d0 & MASK1) == 0) {
57 return {d0, 1};
58 }
59
60 if (maxBytes < CONST_2) {
61 return {d0, 1};
62 }
63 uint8_t d1 = sp[1];
64 if ((d0 & MASK2) == 0) {
65 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
66 }
67
68 if (maxBytes < CONST_3) {
69 return {d0, 1};
70 }
71 uint8_t d2 = sp[CONST_2];
72 if ((d0 & MASK3) == 0) {
73 return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
74 CONST_3};
75 }
76
77 if (maxBytes < CONST_4) {
78 return {d0, 1};
79 }
80 uint8_t d3 = sp[CONST_3];
81 uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
82 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
83
84 uint32_t pair = 0;
85 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
86 pair <<= PAIR_ELEMENT_WIDTH;
87 pair |= (codePoint & MASK_10BIT) + U16_TAIL;
88
89 return {pair, CONST_4};
90 }
91
CombineTwoU16(uint16_t d0,uint16_t d1)92 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
93 {
94 uint32_t codePoint = d0 - DECODE_LEAD_LOW;
95 codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
96 codePoint |= d1 - DECODE_TRAIL_LOW; // NOLINT(hicpp-signed-bitwise
97 codePoint += DECODE_SECOND_FACTOR;
98 return codePoint;
99 }
100
IsMUtf8OnlySingleBytes(const uint8_t * mutf8In)101 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)
102 {
103 while (*mutf8In != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104 if (*mutf8In >= MASK1) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105 return false;
106 }
107 mutf8In += 1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
108 }
109 return true;
110 }
111
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16In,uint8_t * mutf8Out,size_t utf16Len,size_t mutf8Len,size_t start)112 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len,
113 size_t start)
114 {
115 return ConvertRegionUtf16ToUtf8(utf16In, mutf8Out, utf16Len, mutf8Len, start, true);
116 }
117
ConvertMUtf8ToUtf16(const uint8_t * mutf8In,size_t mutf8Len,uint16_t * utf16Out)118 void ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)
119 {
120 size_t inPos = 0;
121 while (inPos < mutf8Len) {
122 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
123 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
124
125 if (p_hi != 0) {
126 *utf16Out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
127 }
128 *utf16Out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
129
130 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
131 inPos += nbytes;
132 }
133 }
134
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)135 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len,
136 size_t start)
137 {
138 size_t inPos = 0;
139 size_t outPos = 0;
140 while (inPos < mutf8Len) {
141 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
142 auto [p_hi, p_lo] = SplitUtf16Pair(pair);
143
144 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145 inPos += nbytes;
146 if (start > 0) {
147 start -= nbytes;
148 continue;
149 }
150
151 if (p_hi != 0) {
152 if (outPos++ >= utf16Len - 1) { // check for place for two uint16
153 --outPos;
154 break;
155 }
156 *utf16Out++ = p_hi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
157 }
158 if (outPos++ >= utf16Len) {
159 --outPos;
160 break;
161 }
162 *utf16Out++ = p_lo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
163 }
164 return outPos;
165 }
166
CompareMUtf8ToMUtf8(const uint8_t * mutf81,const uint8_t * mutf82)167 int CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)
168 {
169 uint32_t c1;
170 uint32_t c2;
171 uint32_t n1;
172 uint32_t n2;
173
174 do {
175 c1 = *mutf81;
176 c2 = *mutf82;
177
178 if (c1 == 0 && c2 == 0) {
179 return 0;
180 }
181
182 if (c1 == 0 && c2 != 0) {
183 return -1;
184 }
185
186 if (c1 != 0 && c2 == 0) {
187 return 1;
188 }
189
190 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf81);
191 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf82);
192
193 mutf81 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194 mutf82 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195 } while (c1 == c2);
196
197 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
198 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
199
200 auto result = static_cast<int>(c1p1 - c2p1);
201 if (result != 0) {
202 return result;
203 }
204
205 return c1p2 - c2p2;
206 }
207
208 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf81,size_t utf81Length,const uint8_t * utf82,size_t utf82Length)209 int CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)
210 {
211 uint32_t c1;
212 uint32_t c2;
213 uint32_t n1;
214 uint32_t n2;
215
216 uint32_t utf81Index = 0;
217 uint32_t utf82Index = 0;
218
219 do {
220 if (utf81Index == utf81Length && utf82Index == utf82Length) {
221 return 0;
222 }
223
224 if (utf81Index == utf81Length && utf82Index < utf82Length) {
225 return -1;
226 }
227
228 if (utf81Index < utf81Length && utf82Index == utf82Length) {
229 return 1;
230 }
231
232 c1 = *utf81;
233 c2 = *utf82;
234
235 std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf81);
236 std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf82);
237
238 utf81 += n1; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239 utf82 += n2; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240 utf81Index += n1;
241 utf82Index += n2;
242 } while (c1 == c2);
243
244 auto [c1p1, c1p2] = SplitUtf16Pair(c1);
245 auto [c2p1, c2p2] = SplitUtf16Pair(c2);
246
247 auto result = static_cast<int>(c1p1 - c2p1);
248 if (result != 0) {
249 return result;
250 }
251
252 return c1p2 - c2p2;
253 }
254
Mutf8Size(const uint8_t * mutf8)255 size_t Mutf8Size(const uint8_t *mutf8)
256 {
257 return strlen(Mutf8AsCString(mutf8));
258 }
259
MUtf8ToUtf16Size(const uint8_t * mutf8)260 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
261 {
262 // NOTE(d.kovalenko): make it faster
263 size_t res = 0;
264 while (*mutf8 != '\0') { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
266 res += pair > MAX_U16 ? CONST_2 : 1;
267 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
268 }
269 return res;
270 }
271
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)272 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)
273 {
274 size_t pos = 0;
275 size_t res = 0;
276 while (pos != mutf8Len) {
277 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
278 if (nbytes == 0) {
279 nbytes = 1;
280 }
281 res += pair > MAX_U16 ? CONST_2 : 1;
282 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283 pos += nbytes;
284 }
285 return res;
286 }
287
IsEqual(Span<const uint8_t> utf81,Span<const uint8_t> utf82)288 bool IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)
289 {
290 if (utf81.size() != utf82.size()) {
291 return false;
292 }
293
294 return memcmp(utf81.data(), utf82.data(), utf81.size()) == 0;
295 }
296
IsEqual(const uint8_t * mutf81,const uint8_t * mutf82)297 bool IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)
298 {
299 return strcmp(Mutf8AsCString(mutf81), Mutf8AsCString(mutf82)) == 0;
300 }
301
IsValidModifiedUTF8(const uint8_t * elems)302 bool IsValidModifiedUTF8(const uint8_t *elems)
303 {
304 ASSERT(elems);
305
306 while (*elems != '\0') {
307 // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
308 switch (*elems & 0xf0) {
309 case 0x00:
310 case 0x10: // NOLINT(readability-magic-numbers)
311 case 0x20: // NOLINT(readability-magic-numbers)
312 case 0x30: // NOLINT(readability-magic-numbers)
313 case 0x40: // NOLINT(readability-magic-numbers)
314 case 0x50: // NOLINT(readability-magic-numbers)
315 case 0x60: // NOLINT(readability-magic-numbers)
316 case 0x70: // NOLINT(readability-magic-numbers)
317 // pattern 0xxx
318 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
319 ++elems;
320 break;
321 case 0x80: // NOLINT(readability-magic-numbers)
322 case 0x90: // NOLINT(readability-magic-numbers)
323 case 0xa0: // NOLINT(readability-magic-numbers)
324 case 0xb0: // NOLINT(readability-magic-numbers)
325 // pattern 10xx is illegal start
326 return false;
327
328 case 0xf0: // NOLINT(readability-magic-numbers)
329 // pattern 1111 0xxx starts four byte section
330 if ((*elems & 0x08) != 0) { // NOLINT(hicpp-signed-bitwise)
331 return false;
332 }
333 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334 ++elems;
335 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
336 return false;
337 }
338 // no need break
339 [[fallthrough]];
340
341 case 0xe0: // NOLINT(readability-magic-numbers)
342 // pattern 1110
343 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344 ++elems;
345 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
346 return false;
347 }
348 // no need break
349 [[fallthrough]];
350
351 case 0xc0: // NOLINT(readability-magic-numbers)
352 case 0xd0: // NOLINT(readability-magic-numbers)
353 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
354 ++elems;
355 if ((*elems & 0xc0) != 0x80) { // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
356 return false;
357 }
358 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359 ++elems;
360 break;
361 default:
362 UNREACHABLE();
363 break;
364 }
365 }
366 return true;
367 }
368
UTF16Decode(uint16_t lead,uint16_t trail)369 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
370 {
371 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
372 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
373 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
374 return cp;
375 }
376
IsValidUTF8(const std::vector<uint8_t> & data)377 bool IsValidUTF8(const std::vector<uint8_t> &data)
378 {
379 uint32_t length = data.size();
380 switch (length) {
381 case UtfLength::ONE:
382 if (data.at(0) >= BIT_MASK_1) {
383 return false;
384 }
385 break;
386 case UtfLength::TWO:
387 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
388 return false;
389 }
390 break;
391 case UtfLength::THREE:
392 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
393 return false;
394 }
395 break;
396 case UtfLength::FOUR:
397 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
398 return false;
399 }
400 break;
401 default:
402 UNREACHABLE();
403 break;
404 }
405
406 for (uint32_t i = 1; i < length; i++) {
407 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
408 return false;
409 }
410 }
411 return true;
412 }
413
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify)414 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
415 {
416 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
417 // means that is a single code point, it needs to be represented by three UTF8 code.
418 if (d1 == 0 && d0 >= DECODE_LEAD_LOW && d0 <= DECODE_TRAIL_HIGH) {
419 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
420 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
421 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
422 return {UtfLength::THREE, {ch0, ch1, ch2}};
423 }
424
425 if (d0 == 0) {
426 if (modify) {
427 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
428 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
429 }
430 // For print string, just skip '\u0000'
431 return {0, {0x00U}};
432 }
433 if (d0 <= UTF8_1B_MAX) {
434 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
435 }
436 if (d0 <= UTF8_2B_MAX) {
437 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
438 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & MASK_6BIT));
439 return {UtfLength::TWO, {ch0, ch1}};
440 }
441 if (d0 < DECODE_LEAD_LOW || d0 > DECODE_LEAD_HIGH) {
442 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
443 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
444 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
445 return {UtfLength::THREE, {ch0, ch1, ch2}};
446 }
447 if (d1 < DECODE_TRAIL_LOW || d1 > DECODE_TRAIL_HIGH) {
448 // Bad sequence
449 UNREACHABLE();
450 }
451
452 uint32_t codePoint = CombineTwoU16(d0, d1);
453
454 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
455 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & MASK_6BIT) | MASK1);
456 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & MASK_6BIT) | MASK1);
457 auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
458
459 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
460 }
461
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)462 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
463 {
464 size_t res = 1; // zero byte
465 // when utf16 data length is only 1 and code in 0xd800-0xdfff,
466 // means that is a single code point, it needs to be represented by three UTF8 code.
467 if (length == 1 && utf16[0] >= DECODE_LEAD_LOW && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
468 utf16[0] <= DECODE_TRAIL_HIGH) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
469 res += UtfLength::THREE;
470 return res;
471 }
472
473 for (uint32_t i = 0; i < length; ++i) {
474 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
475 if (modify) {
476 res += UtfLength::TWO; // special case for U+0000 => C0 80
477 }
478 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
479 res += 1;
480 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
481 res += UtfLength::TWO;
482 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
483 } else if (utf16[i] < DECODE_LEAD_LOW || utf16[i] > DECODE_LEAD_HIGH) {
484 res += UtfLength::THREE;
485 } else {
486 if (i < length - 1 &&
487 utf16[i + 1] >= DECODE_TRAIL_LOW && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
488 utf16[i + 1] <= DECODE_TRAIL_HIGH) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
489 res += UtfLength::FOUR;
490 ++i;
491 } else {
492 res += UtfLength::THREE;
493 }
494 }
495 }
496 return res;
497 }
498
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)499 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
500 {
501 return Utf16ToUtf8Size(mutf16, length, true);
502 }
503
504 // CC-OFFNXT(G.FUN.01) solid logic
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify)505 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
506 size_t start, bool modify)
507 {
508 size_t utf8Pos = 0;
509 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
510 return 0;
511 }
512 size_t end = start + utf16Len;
513 for (size_t i = start; i < end; ++i) {
514 uint16_t next16Code = 0;
515 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
516 if ((i + 1) != end && IsAvailableNextUtf16Code(utf16In[i + 1])) {
517 next16Code = utf16In[i + 1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
518 }
519 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
520 Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify);
521 if (utf8Pos + ch.n > utf8Len) {
522 break;
523 }
524 for (size_t c = 0; c < ch.n; ++c) {
525 utf8Out[utf8Pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
526 }
527 if (ch.n == UtfLength::FOUR) { // Two UTF-16 chars are used
528 ++i;
529 }
530 }
531 return utf8Pos;
532 }
533
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)534 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
535 {
536 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
537 if ((d0 & MASK1) == 0) {
538 return {d0, 1};
539 }
540
541 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
542 if ((d0 & MASK2) == 0) {
543 return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
544 }
545
546 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
547 if ((d0 & MASK3) == 0) {
548 return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
549 UtfLength::THREE};
550 }
551
552 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
553 uint32_t codePoint = ((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
554 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
555
556 uint32_t pair = 0;
557 if (combine) {
558 uint32_t lead = ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD);
559 uint32_t tail = ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
560 pair = U16_GET_SUPPLEMENTARY(lead, tail); // NOLINT(hicpp-signed-bitwise)
561 } else {
562 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) << PAIR_ELEMENT_WIDTH;
563 pair |= ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
564 }
565
566 return {pair, UtfLength::FOUR};
567 }
568
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)569 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
570 {
571 return MUtf8ToUtf16Size(utf8, utf8Len);
572 }
573
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)574 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
575 size_t start)
576 {
577 return ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
578 }
579
IsUTF16SurrogatePair(const uint16_t lead)580 bool IsUTF16SurrogatePair(const uint16_t lead)
581 {
582 return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH;
583 }
584
585 /**
586 * The table below is to translate integer numbers from [0..99] range to pairs of corresponding utf16 codes.
587 * The pairs are packed into utf::BidigitsCode type.
588 *
589 * Example: 0 -> 0x00300030 ("00")
590 * 1 -> 0x00310030 ("01")
591 * ...
592 * 99 -> 0x00390039 ("99")
593 */
using BidigitsCode = uint32_t;
static constexpr size_t BIDIGITS_CODE_TAB_SIZE = 100U;

// Builds the table at compile time. Entry i keeps the utf16 code of the tens digit of i
// in the low 16 bits and the utf16 code of the units digit in the high 16 bits.
static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> MakeBidigitsCodeTab()
{
    constexpr size_t RADIX = 10U;
    constexpr uint32_t UNITS_SHIFT = 16U;
    constexpr BidigitsCode ZERO_CODE = 0x0030U;

    std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> tab {};
    for (size_t i = 0; i < BIDIGITS_CODE_TAB_SIZE; ++i) {
        const BidigitsCode tensCode = ZERO_CODE + static_cast<BidigitsCode>(i / RADIX);
        const BidigitsCode unitsCode = ZERO_CODE + static_cast<BidigitsCode>(i % RADIX);
        tab[i] = (unitsCode << UNITS_SHIFT) | tensCode;
    }
    return tab;
}

static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> BIDIGITS_CODE_TAB = MakeBidigitsCodeTab();
610
UInt64ToUtf16Array(uint64_t v,uint16_t * outUtf16Buf,uint32_t nDigits,bool negative)611 void UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)
612 {
613 ASSERT(outUtf16Buf != nullptr && nDigits != 0);
614
615 constexpr uint64_t POW10_1 = 10U;
616 constexpr uint64_t POW10_2 = 100U;
617
618 Span<uint16_t> outSpan(outUtf16Buf, nDigits);
619 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
620 auto *out = reinterpret_cast<uint32_t *>(outUtf16Buf + nDigits);
621 int i = 0;
622 while (v >= POW10_2) {
623 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
624 out[--i] = BIDIGITS_CODE_TAB[v % POW10_2];
625 v /= POW10_2;
626 }
627 if (v >= POW10_1) {
628 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
629 out[--i] = BIDIGITS_CODE_TAB[v];
630 } else {
631 outSpan[negative ? 1U : 0] = v + '0';
632 }
633 if (negative) {
634 outSpan[0] = '-';
635 }
636 }
637
// Boundaries and single values of the code units treated as whitespace below.
static constexpr uint16_t C_SPACE = 0x0020;  // space
static constexpr uint16_t C_0009 = 0x0009;   // horizontal tab (first of the control whitespace run)
static constexpr uint16_t C_000D = 0x000D;   // carriage return (last of the control whitespace run)
static constexpr uint16_t C_000E = 0x000E;   // first code unit after the control whitespace run
static constexpr uint16_t C_00A0 = 0x00A0;   // no-break space
static constexpr uint16_t C_1680 = 0x1680;   // Ogham space mark
static constexpr uint16_t C_2000 = 0x2000;   // en quad (first of the typographic spaces run)
static constexpr uint16_t C_200A = 0x200A;   // hair space (last of the typographic spaces run)
static constexpr uint16_t C_2028 = 0x2028;   // line separator
static constexpr uint16_t C_2029 = 0x2029;   // paragraph separator
static constexpr uint16_t C_202F = 0x202F;   // narrow no-break space
static constexpr uint16_t C_205F = 0x205F;   // medium mathematical space
static constexpr uint16_t C_3000 = 0x3000;   // ideographic space
static constexpr uint16_t C_FEFF = 0xFEFF;   // byte order mark

/*
 * Tell whether the utf16 code unit `c` is a whitespace character.
 * Accepted values: 0x0009-0x000D, 0x0020, 0x00A0, 0x1680, 0x2000-0x200A, 0x2028, 0x2029,
 * 0x202F, 0x205F, 0x3000 and 0xFEFF. Everything else is not whitespace.
 */
bool IsWhiteSpaceChar(uint16_t c)
{
    // [0x000E, 0x009F] -- common non-whitespace characters, the plain space being the only exception
    if (C_000E <= c && c < C_00A0) {
        return c == C_SPACE;
    }
    // below 0x000E only tab, line feed, vertical tab, formfeed and carriage return qualify
    if (c < C_000E) {
        return C_0009 <= c && c <= C_000D;
    }
    // 0x2000 .. 0x200A -- en quad through hair space
    if (C_2000 <= c && c <= C_200A) {
        return true;
    }
    switch (c) {
        case C_00A0:  // no-break space
        case C_1680:  // Ogham space mark
        case C_2028:  // line separator
        case C_2029:  // paragraph separator
        case C_202F:  // narrow no-break space
        case C_205F:  // medium mathematical space
        case C_3000:  // ideographic space
        case C_FEFF:  // byte order mark
            return true;
        default:
            return false;
    }
}
724
725 } // namespace ark::utf
726