• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2025 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 
18 #include <cstddef>
19 #include <cstring>
20 
21 #include <limits>
22 #include <tuple>
23 #include <utility>
24 
// Bias removed when a surrogate pair is folded into a code point:
// the lead base (0xd800) shifted into position, plus the trail base (0xdc00),
// minus the first supplementary code point (0x10000).
// NOLINTNEXTLINE(hicpp-signed-bitwise)
static constexpr uint32_t U16_SURROGATE_OFFSET = (0xd800U << 10U) + 0xdc00U - 0x10000U;
// Folds a UTF-16 lead/trail pair into the single value (lead << 10) + trail - U16_SURROGATE_OFFSET.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<uint32_t>(lead) << 10U) + static_cast<uint32_t>(trail) - U16_SURROGATE_OFFSET)
30 
31 namespace ark::utf {
32 
33 /*
34  * MUtf-8
35  *
36  * U+0000 => C0 80
37  *
38  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
39  *    code point   code point   code point
40  * 1  7            U+0000       U+007F      0xxxxxxx
41  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
42  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
43  * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
44  * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
45  */
46 
47 /*
48  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
49  * In case of invalid sequence return first byte of it.
50  */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)51 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t maxBytes)
52 {
53     // NOTE(d.kovalneko): make the function safe
54     Span<const uint8_t> sp(data, maxBytes);
55     uint8_t d0 = sp[0];
56     if ((d0 & MASK1) == 0) {
57         return {d0, 1};
58     }
59 
60     if (maxBytes < CONST_2 || sp[1] == 0) {
61         return {d0, 1};
62     }
63     uint8_t d1 = sp[1];
64     if ((d0 & MASK2) == 0) {
65         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
66     }
67 
68     if (maxBytes < CONST_3 || sp[CONST_2] == 0) {
69         return {d0, 1};
70     }
71     uint8_t d2 = sp[CONST_2];
72     if ((d0 & MASK3) == 0) {
73         return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
74                 CONST_3};
75     }
76 
77     if (maxBytes < CONST_4 || sp[CONST_3] == 0) {
78         return {d0, 1};
79     }
80     uint8_t d3 = sp[CONST_3];
81     uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
82                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
83 
84     uint32_t pair = 0;
85     pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
86     pair <<= PAIR_ELEMENT_WIDTH;
87     pair |= (codePoint & MASK_10BIT) + U16_TAIL;
88 
89     return {pair, CONST_4};
90 }
91 
CombineTwoU16(uint16_t d0,uint16_t d1)92 static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
93 {
94     uint32_t codePoint = d0 - DECODE_LEAD_LOW;
95     codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
96     codePoint |= d1 - DECODE_TRAIL_LOW;  // NOLINT(hicpp-signed-bitwise
97     codePoint += DECODE_SECOND_FACTOR;
98     return codePoint;
99 }
100 
IsMUtf8OnlySingleBytes(const uint8_t * mutf8In)101 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8In)
102 {
103     while (*mutf8In != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
104         if (*mutf8In >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105             return false;
106         }
107         mutf8In += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
108     }
109     return true;
110 }
111 
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16In,uint8_t * mutf8Out,size_t utf16Len,size_t mutf8Len,size_t start)112 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16In, uint8_t *mutf8Out, size_t utf16Len, size_t mutf8Len,
113                                  size_t start)
114 {
115     return ConvertRegionUtf16ToUtf8(utf16In, mutf8Out, utf16Len, mutf8Len, start, true);
116 }
117 
ConvertMUtf8ToUtf16(const uint8_t * mutf8In,size_t mutf8Len,uint16_t * utf16Out)118 void ConvertMUtf8ToUtf16(const uint8_t *mutf8In, size_t mutf8Len, uint16_t *utf16Out)
119 {
120     size_t inPos = 0;
121     while (inPos < mutf8Len) {
122         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
123         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
124 
125         if (p_hi != 0) {
126             *utf16Out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
127         }
128         *utf16Out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
129 
130         mutf8In += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
131         inPos += nbytes;
132     }
133 }
134 
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)135 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8In, uint16_t *utf16Out, size_t mutf8Len, size_t utf16Len,
136                                  size_t start)
137 {
138     size_t inPos = 0;
139     size_t outPos = 0;
140     while (inPos < mutf8Len) {
141         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
142         auto [p_hi, p_lo] = SplitUtf16Pair(pair);
143 
144         mutf8In += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
145         inPos += nbytes;
146         if (start > 0) {
147             start -= nbytes;
148             continue;
149         }
150 
151         if (p_hi != 0) {
152             if (outPos++ >= utf16Len - 1) {  // check for place for two uint16
153                 --outPos;
154                 break;
155             }
156             *utf16Out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
157         }
158         if (outPos++ >= utf16Len) {
159             --outPos;
160             break;
161         }
162         *utf16Out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
163     }
164     return outPos;
165 }
166 
CompareMUtf8ToMUtf8(const uint8_t * mutf81,const uint8_t * mutf82)167 int CompareMUtf8ToMUtf8(const uint8_t *mutf81, const uint8_t *mutf82)
168 {
169     uint32_t c1;
170     uint32_t c2;
171     uint32_t n1;
172     uint32_t n2;
173 
174     do {
175         c1 = *mutf81;
176         c2 = *mutf82;
177 
178         if (c1 == 0 && c2 == 0) {
179             return 0;
180         }
181 
182         if (c1 == 0 && c2 != 0) {
183             return -1;
184         }
185 
186         if (c1 != 0 && c2 == 0) {
187             return 1;
188         }
189 
190         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf81);
191         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf82);
192 
193         mutf81 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194         mutf82 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
195     } while (c1 == c2);
196 
197     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
198     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
199 
200     auto result = static_cast<int>(c1p1 - c2p1);
201     if (result != 0) {
202         return result;
203     }
204 
205     return c1p2 - c2p2;
206 }
207 
208 // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf81,size_t utf81Length,const uint8_t * utf82,size_t utf82Length)209 int CompareUtf8ToUtf8(const uint8_t *utf81, size_t utf81Length, const uint8_t *utf82, size_t utf82Length)
210 {
211     uint32_t c1;
212     uint32_t c2;
213     uint32_t n1;
214     uint32_t n2;
215 
216     uint32_t utf81Index = 0;
217     uint32_t utf82Index = 0;
218 
219     do {
220         if (utf81Index == utf81Length && utf82Index == utf82Length) {
221             return 0;
222         }
223 
224         if (utf81Index == utf81Length && utf82Index < utf82Length) {
225             return -1;
226         }
227 
228         if (utf81Index < utf81Length && utf82Index == utf82Length) {
229             return 1;
230         }
231 
232         c1 = *utf81;
233         c2 = *utf82;
234 
235         std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf81);
236         std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf82);
237 
238         utf81 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
239         utf82 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240         utf81Index += n1;
241         utf82Index += n2;
242     } while (c1 == c2);
243 
244     auto [c1p1, c1p2] = SplitUtf16Pair(c1);
245     auto [c2p1, c2p2] = SplitUtf16Pair(c2);
246 
247     auto result = static_cast<int>(c1p1 - c2p1);
248     if (result != 0) {
249         return result;
250     }
251 
252     return c1p2 - c2p2;
253 }
254 
Mutf8Size(const uint8_t * mutf8)255 size_t Mutf8Size(const uint8_t *mutf8)
256 {
257     return strlen(Mutf8AsCString(mutf8));
258 }
259 
MUtf8ToUtf16Size(const uint8_t * mutf8)260 size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
261 {
262     // NOTE(d.kovalenko): make it faster
263     size_t res = 0;
264     while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
265         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
266         res += pair > MAX_U16 ? CONST_2 : 1;
267         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
268     }
269     return res;
270 }
271 
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)272 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8Len)
273 {
274     size_t pos = 0;
275     size_t res = 0;
276     while (pos != mutf8Len) {
277         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
278         if (nbytes == 0) {
279             nbytes = 1;
280         }
281         res += pair > MAX_U16 ? CONST_2 : 1;
282         mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
283         pos += nbytes;
284     }
285     return res;
286 }
287 
IsEqual(Span<const uint8_t> utf81,Span<const uint8_t> utf82)288 bool IsEqual(Span<const uint8_t> utf81, Span<const uint8_t> utf82)
289 {
290     if (utf81.size() != utf82.size()) {
291         return false;
292     }
293 
294     return memcmp(utf81.data(), utf82.data(), utf81.size()) == 0;
295 }
296 
IsEqual(const uint8_t * mutf81,const uint8_t * mutf82)297 bool IsEqual(const uint8_t *mutf81, const uint8_t *mutf82)
298 {
299     return strcmp(Mutf8AsCString(mutf81), Mutf8AsCString(mutf82)) == 0;
300 }
301 
IsValidModifiedUTF8(const uint8_t * elems)302 bool IsValidModifiedUTF8(const uint8_t *elems)
303 {
304     ASSERT(elems);
305 
306     while (*elems != '\0') {
307         // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
308         switch (*elems & 0xf0) {
309             case 0x00:
310             case 0x10:  // NOLINT(readability-magic-numbers)
311             case 0x20:  // NOLINT(readability-magic-numbers)
312             case 0x30:  // NOLINT(readability-magic-numbers)
313             case 0x40:  // NOLINT(readability-magic-numbers)
314             case 0x50:  // NOLINT(readability-magic-numbers)
315             case 0x60:  // NOLINT(readability-magic-numbers)
316             case 0x70:  // NOLINT(readability-magic-numbers)
317                 // pattern 0xxx
318                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
319                 ++elems;
320                 break;
321             case 0x80:  // NOLINT(readability-magic-numbers)
322             case 0x90:  // NOLINT(readability-magic-numbers)
323             case 0xa0:  // NOLINT(readability-magic-numbers)
324             case 0xb0:  // NOLINT(readability-magic-numbers)
325                 // pattern 10xx is illegal start
326                 return false;
327 
328             case 0xf0:  // NOLINT(readability-magic-numbers)
329                 // pattern 1111 0xxx starts four byte section
330                 if ((*elems & 0x08) != 0) {  // NOLINT(hicpp-signed-bitwise)
331                     return false;
332                 }
333                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
334                 ++elems;
335                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
336                     return false;
337                 }
338                 // no need break
339                 [[fallthrough]];
340 
341             case 0xe0:  // NOLINT(readability-magic-numbers)
342                 // pattern 1110
343                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
344                 ++elems;
345                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
346                     return false;
347                 }
348                 // no need break
349                 [[fallthrough]];
350 
351             case 0xc0:  // NOLINT(readability-magic-numbers)
352             case 0xd0:  // NOLINT(readability-magic-numbers)
353                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
354                 ++elems;
355                 if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
356                     return false;
357                 }
358                 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359                 ++elems;
360                 break;
361             default:
362                 UNREACHABLE();
363                 break;
364         }
365     }
366     return true;
367 }
368 
UTF16Decode(uint16_t lead,uint16_t trail)369 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
370 {
371     ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
372            (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
373     uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
374     return cp;
375 }
376 
IsValidUTF8(const std::vector<uint8_t> & data)377 bool IsValidUTF8(const std::vector<uint8_t> &data)
378 {
379     uint32_t length = data.size();
380     switch (length) {
381         case UtfLength::ONE:
382             if (data.at(0) >= BIT_MASK_1) {
383                 return false;
384             }
385             break;
386         case UtfLength::TWO:
387             if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
388                 return false;
389             }
390             break;
391         case UtfLength::THREE:
392             if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
393                 return false;
394             }
395             break;
396         case UtfLength::FOUR:
397             if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
398                 return false;
399             }
400             break;
401         default:
402             UNREACHABLE();
403             break;
404     }
405 
406     for (uint32_t i = 1; i < length; i++) {
407         if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
408             return false;
409         }
410     }
411     return true;
412 }
413 
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify)414 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
415 {
416     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
417     // means that is a single code point, it needs to be represented by three UTF8 code.
418     if (d1 == 0 && d0 >= DECODE_LEAD_LOW && d0 <= DECODE_TRAIL_HIGH) {
419         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
420         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
421         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
422         return {UtfLength::THREE, {ch0, ch1, ch2}};
423     }
424 
425     if (d0 == 0) {
426         if (modify) {
427             // special case for \u0000 ==> C080 - 1100'0000 1000'0000
428             return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
429         }
430         return {UtfLength::ONE, {0x00U}};
431     }
432     if (d0 <= UTF8_1B_MAX) {
433         return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
434     }
435     if (d0 <= UTF8_2B_MAX) {
436         auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
437         auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & MASK_6BIT));
438         return {UtfLength::TWO, {ch0, ch1}};
439     }
440     if (d0 < DECODE_LEAD_LOW || d0 > DECODE_LEAD_HIGH) {
441         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
442         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & MASK_6BIT));
443         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & MASK_6BIT));
444         return {UtfLength::THREE, {ch0, ch1, ch2}};
445     }
446     if (d1 < DECODE_TRAIL_LOW || d1 > DECODE_TRAIL_HIGH) {
447         // Bad sequence
448         UNREACHABLE();
449     }
450 
451     uint32_t codePoint = CombineTwoU16(d0, d1);
452 
453     auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
454     auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & MASK_6BIT) | MASK1);
455     auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & MASK_6BIT) | MASK1);
456     auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
457 
458     return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
459 }
460 
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)461 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
462 {
463     size_t res = 1;  // zero byte
464     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
465     // means that is a single code point, it needs to be represented by three UTF8 code.
466     if (length == 1 && utf16[0] >= DECODE_LEAD_LOW &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
467         utf16[0] <= DECODE_TRAIL_HIGH) {               // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
468         res += UtfLength::THREE;
469         return res;
470     }
471 
472     for (uint32_t i = 0; i < length; ++i) {
473         if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
474             if (modify) {
475                 res += UtfLength::TWO;  // special case for U+0000 => C0 80
476             } else {
477                 res += UtfLength::ONE;
478             }
479         } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
480             res += 1;
481         } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
482             res += UtfLength::TWO;
483             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
484         } else if (utf16[i] < DECODE_LEAD_LOW || utf16[i] > DECODE_LEAD_HIGH) {
485             res += UtfLength::THREE;
486         } else {
487             if (i < length - 1 &&
488                 utf16[i + 1] >= DECODE_TRAIL_LOW &&   // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
489                 utf16[i + 1] <= DECODE_TRAIL_HIGH) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
490                 res += UtfLength::FOUR;
491                 ++i;
492             } else {
493                 res += UtfLength::THREE;
494             }
495         }
496     }
497     return res;
498 }
499 
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)500 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
501 {
502     return Utf16ToUtf8Size(mutf16, length, true);
503 }
504 
505 // CC-OFFNXT(G.FUN.01) solid logic
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify)506 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
507                                 size_t start, bool modify)
508 {
509     size_t utf8Pos = 0;
510     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
511         return 0;
512     }
513     size_t end = start + utf16Len;
514     for (size_t i = start; i < end; ++i) {
515         uint16_t next16Code = 0;
516         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
517         if ((i + 1) != end && IsAvailableNextUtf16Code(utf16In[i + 1])) {
518             next16Code = utf16In[i + 1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
519         }
520         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
521         Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify);
522         if (utf8Pos + ch.n > utf8Len) {
523             break;
524         }
525         for (size_t c = 0; c < ch.n; ++c) {
526             utf8Out[utf8Pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
527         }
528         if (ch.n == UtfLength::FOUR) {  // Two UTF-16 chars are used
529             ++i;
530         }
531     }
532     return utf8Pos;
533 }
534 
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)535 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
536 {
537     uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
538     if ((d0 & MASK1) == 0) {
539         return {d0, 1};
540     }
541 
542     uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
543     if ((d0 & MASK2) == 0) {
544         return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), UtfLength::TWO};
545     }
546 
547     uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
548     if ((d0 & MASK3) == 0) {
549         return {((d0 & MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
550                 UtfLength::THREE};
551     }
552 
553     uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
554     uint32_t codePoint = ((d0 & MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & MASK_6BIT) << UtfOffset::TWELVE) |
555                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
556 
557     uint32_t pair = 0;
558     if (combine) {
559         uint32_t lead = ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD);
560         uint32_t tail = ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
561         pair = U16_GET_SUPPLEMENTARY(lead, tail);  // NOLINT(hicpp-signed-bitwise)
562     } else {
563         pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) << PAIR_ELEMENT_WIDTH;
564         pair |= ((codePoint & MASK_10BIT) + U16_TAIL) & MASK_16BIT;
565     }
566 
567     return {pair, UtfLength::FOUR};
568 }
569 
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)570 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
571 {
572     return MUtf8ToUtf16Size(utf8, utf8Len);
573 }
574 
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)575 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
576                                 size_t start)
577 {
578     return ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
579 }
580 
IsUTF16SurrogatePair(const uint16_t lead)581 bool IsUTF16SurrogatePair(const uint16_t lead)
582 {
583     return lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH;
584 }
585 
/**
 * The table below is to translate integer numbers from [0..99] range to pairs of corresponding utf16 codes.
 * The pairs are packed into utf::BidigitsCode type: the code of the tens digit occupies the low 16 bits,
 * the code of the ones digit occupies the high 16 bits.
 *
 * Example: 0  -> 0x00300030 ("00")
 *          1  -> 0x00310030 ("01")
 *          ...
 *          99 -> 0x00390039 ("99")
 */
using BidigitsCode = uint32_t;
static constexpr size_t BIDIGITS_CODE_TAB_SIZE = 100U;

// Builds the table at compile time instead of spelling out all entries by hand.
static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> MakeBidigitsCodeTab()
{
    constexpr BidigitsCode DIGIT_ZERO_CODE = 0x0030U;  // utf16 code of '0'
    constexpr BidigitsCode RADIX = 10U;
    constexpr BidigitsCode HIGH_HALF_SHIFT = 16U;

    std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> tab {};
    for (BidigitsCode number = 0; number < BIDIGITS_CODE_TAB_SIZE; ++number) {
        const BidigitsCode onesCode = DIGIT_ZERO_CODE + number % RADIX;
        const BidigitsCode tensCode = DIGIT_ZERO_CODE + number / RADIX;
        tab[number] = (onesCode << HIGH_HALF_SHIFT) | tensCode;
    }
    return tab;
}

static constexpr std::array<BidigitsCode, BIDIGITS_CODE_TAB_SIZE> BIDIGITS_CODE_TAB = MakeBidigitsCodeTab();
611 
UInt64ToUtf16Array(uint64_t v,uint16_t * outUtf16Buf,uint32_t nDigits,bool negative)612 void UInt64ToUtf16Array(uint64_t v, uint16_t *outUtf16Buf, uint32_t nDigits, bool negative)
613 {
614     ASSERT(outUtf16Buf != nullptr && nDigits != 0);
615 
616     constexpr uint64_t POW10_1 = 10U;
617     constexpr uint64_t POW10_2 = 100U;
618 
619     Span<uint16_t> outSpan(outUtf16Buf, nDigits);
620     // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
621     auto *out = reinterpret_cast<uint32_t *>(outUtf16Buf + nDigits);
622     int i = 0;
623     while (v >= POW10_2) {
624         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
625         out[--i] = BIDIGITS_CODE_TAB[v % POW10_2];
626         v /= POW10_2;
627     }
628     if (v >= POW10_1) {
629         // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
630         out[--i] = BIDIGITS_CODE_TAB[v];
631     } else {
632         outSpan[negative ? 1U : 0] = v + '0';
633     }
634     if (negative) {
635         outSpan[0] = '-';
636     }
637 }
638 
// Boundaries and individual codes used by IsWhiteSpaceChar.
static constexpr uint16_t C_SPACE = 0x0020;  // space
static constexpr uint16_t C_0009 = 0x0009;   // horizontal tab, first of the control whitespace run
static constexpr uint16_t C_000D = 0x000D;   // carriage return, last of the control whitespace run
static constexpr uint16_t C_000E = 0x000E;   // first code of the common non-whitespace range
static constexpr uint16_t C_00A0 = 0x00A0;   // no-break space
static constexpr uint16_t C_1680 = 0x1680;   // Ogham space mark
static constexpr uint16_t C_2000 = 0x2000;   // en quad, first of the 0x2000..0x200A run
static constexpr uint16_t C_200A = 0x200A;   // hair space, last of the 0x2000..0x200A run
static constexpr uint16_t C_2028 = 0x2028;   // line separator
static constexpr uint16_t C_2029 = 0x2029;   // paragraph separator
static constexpr uint16_t C_202F = 0x202F;   // narrow no-break space
static constexpr uint16_t C_205F = 0x205F;   // medium mathematical space
static constexpr uint16_t C_3000 = 0x3000;   // ideographic space
static constexpr uint16_t C_FEFF = 0xFEFF;   // byte order mark

// Reports whether the utf16 unit `c` is one of the whitespace codes listed above:
// 0x0009..0x000D, 0x0020, 0x00A0, 0x1680, 0x2000..0x200A, 0x2028, 0x2029, 0x202F, 0x205F, 0x3000, 0xFEFF.
bool IsWhiteSpaceChar(uint16_t c)
{
    // [0x000E, 0x009F] -- the most common range; the space is the only whitespace in it
    if (C_000E <= c && c < C_00A0) {
        return c == C_SPACE;
    }
    // [0x0000, 0x000D] -- tab, line feed, vertical tab, formfeed and carriage return are whitespace
    if (c <= C_000D) {
        return c >= C_0009;
    }
    // [0x00A0, 0x1FFF] -- only the no-break space and the Ogham space mark
    if (c < C_2000) {
        return c == C_00A0 || c == C_1680;
    }
    // [0x2000, 0x200A] -- en quad .. hair space, all whitespace
    if (c <= C_200A) {
        return true;
    }
    // everything above 0x200A -- a handful of individual codes
    switch (c) {
        case C_2028:
        case C_2029:
        case C_202F:
        case C_205F:
        case C_3000:
        case C_FEFF:
            return true;
        default:
            return false;
    }
}
725 
726 }  // namespace ark::utf
727