• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /**
2   * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3   * Licensed under the Apache License, Version 2.0 (the "License");
4   * you may not use this file except in compliance with the License.
5   * You may obtain a copy of the License at
6   *
7   * http://www.apache.org/licenses/LICENSE-2.0
8   *
9   * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  
16  #include "utf.h"
17  
18  #include <cstddef>
19  #include <cstring>
20  
21  #include <limits>
22  #include <tuple>
23  #include <utility>
24  
25  namespace panda::utf {
26  
// Largest value of a single UTF-16 code unit. Decoded values above it are counted as two UTF-16 units.
constexpr size_t MAX_U16 = 0xffff;
// Named small integers used below as byte counts, indices and shift amounts.
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;

/*
 * One encoded character: the first `n` entries of `ch` hold its bytes.
 * Both members are value-initialized so that a default-constructed object is well defined
 * (zero length, all bytes zero). The struct remains an aggregate, so brace initialization
 * such as {CONST_2, {b0, b1}} keeps working.
 */
struct MUtf8Char {
    size_t n {0};
    std::array<uint8_t, CONST_4> ch {};
};
38  
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
 * For U+10000 -- U+10FFFF the six-byte form encodes (value - 0x10000).
 *
 * Note: ConvertUtf16ToMUtf8 in this file emits a surrogate pair as a single four-byte sequence rather
 * than the six-byte form shown in the last row, and ConvertMUtf8ToUtf16Pair decodes four-byte sequences.
 */
52  
53  /*
54   * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
55   * In case of invalid sequence return first byte of it.
56   */
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t max_bytes)57  std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes)
58  {
59      // TODO(d.kovalneko): make the function safe
60      Span<const uint8_t> sp(data, max_bytes);
61      uint8_t d0 = sp[0];
62      if ((d0 & MASK1) == 0) {
63          return {d0, 1};
64      }
65  
66      if (max_bytes < CONST_2) {
67          return {d0, 1};
68      }
69      uint8_t d1 = sp[1];
70      if ((d0 & MASK2) == 0) {
71          return {((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2};
72      }
73  
74      if (max_bytes < CONST_3) {
75          return {d0, 1};
76      }
77      uint8_t d2 = sp[CONST_2];
78      if ((d0 & MASK3) == 0) {
79          return {((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
80                  CONST_3};
81      }
82  
83      if (max_bytes < CONST_4) {
84          return {d0, 1};
85      }
86      uint8_t d3 = sp[CONST_3];
87      uint32_t code_point = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
88                            ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
89  
90      uint32_t pair = 0;
91      pair |= ((code_point >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
92      pair <<= PAIR_ELEMENT_WIDTH;
93      pair |= (code_point & MASK_10BIT) + U16_TAIL;
94  
95      return {pair, CONST_4};
96  }
97  
CombineTwoU16(uint16_t d0,uint16_t d1)98  static constexpr uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
99  {
100      uint32_t codePoint = d0 - HI_SURROGATE_MIN;
101      codePoint <<= (PAIR_ELEMENT_WIDTH - DATA_WIDTH);
102      codePoint |= d1 - LO_SURROGATE_MIN;
103      codePoint += LO_SUPPLEMENTS_MIN;
104      return codePoint;
105  }
106  
ConvertUtf16ToMUtf8(uint16_t d0,uint16_t d1)107  constexpr MUtf8Char ConvertUtf16ToMUtf8(uint16_t d0, uint16_t d1)
108  {
109      // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
110      // means that is a single code point, it needs to be represented by three MUTF8 code.
111      if (d1 == 0 && d0 >= HI_SURROGATE_MIN && d0 <= LO_SURROGATE_MAX) {
112          auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
113          auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
114          auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
115          return {CONST_3, {ch0, ch1, ch2}};
116      }
117  
118      if (d0 == 0) {
119          return {CONST_2, {MUTF8_2B_FIRST, MUTF8_2B_SECOND}};
120      }
121      if (d0 <= MUTF8_1B_MAX) {
122          return {1, {static_cast<uint8_t>(d0)}};
123      }
124      if (d0 <= MUTF8_2B_MAX) {
125          auto ch0 = static_cast<uint8_t>(MUTF8_2B_FIRST | static_cast<uint8_t>(d0 >> CONST_6));
126          auto ch1 = static_cast<uint8_t>(MUTF8_2B_SECOND | (d0 & MASK_6BIT));
127          return {CONST_2, {ch0, ch1}};
128      }
129      if (d0 < HI_SURROGATE_MIN || d0 > HI_SURROGATE_MAX) {
130          auto ch0 = static_cast<uint8_t>(MUTF8_3B_FIRST | static_cast<uint8_t>(d0 >> CONST_12));
131          auto ch1 = static_cast<uint8_t>(MUTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> CONST_6) & MASK_6BIT));
132          auto ch2 = static_cast<uint8_t>(MUTF8_3B_THIRD | (d0 & MASK_6BIT));
133          return {CONST_3, {ch0, ch1, ch2}};
134      }
135  
136      uint32_t codePoint = CombineTwoU16(d0, d1);
137  
138      auto ch0 = static_cast<uint8_t>((codePoint >> (DATA_WIDTH * CONST_3)) | MUTF8_4B_FIRST);
139      auto ch1 = static_cast<uint8_t>(((codePoint >> (DATA_WIDTH * CONST_2)) & MASK_6BIT) | MASK1);
140      auto ch2 = static_cast<uint8_t>(((codePoint >> DATA_WIDTH) & MASK_6BIT) | MASK1);
141      auto ch3 = static_cast<uint8_t>((codePoint & MASK_6BIT) | MASK1);
142  
143      return {CONST_4, {ch0, ch1, ch2, ch3}};
144  }
145  
IsMUtf8OnlySingleBytes(const uint8_t * mutf8_in)146  bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in)
147  {
148      while (*mutf8_in != '\0') {    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
149          if (*mutf8_in >= MASK1) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
150              return false;
151          }
152          mutf8_in += 1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
153      }
154      return true;
155  }
156  
ConvertRegionUtf16ToMUtf8(const uint16_t * utf16_in,uint8_t * mutf8_out,size_t utf16_len,size_t mutf8_len,size_t start)157  size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
158                                   size_t start)
159  {
160      size_t mutf8_pos = 0;
161      if (utf16_in == nullptr || mutf8_out == nullptr || mutf8_len == 0) {
162          return 0;
163      }
164      size_t end = start + utf16_len;
165      for (size_t i = start; i < end; ++i) {
166          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
167          uint16_t next16Code = (i + 1) != end && IsAvailableNextUtf16Code(utf16_in[i + 1]) ? utf16_in[i + 1] : 0;
168          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
169          MUtf8Char ch = ConvertUtf16ToMUtf8(utf16_in[i], next16Code);
170          if (mutf8_pos + ch.n > mutf8_len) {
171              break;
172          }
173          for (size_t c = 0; c < ch.n; ++c) {
174              mutf8_out[mutf8_pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
175          }
176          if (ch.n == CONST_4) {  // Two UTF-16 chars are used
177              ++i;
178          }
179      }
180      return mutf8_pos;
181  }
182  
ConvertMUtf8ToUtf16(const uint8_t * mutf8_in,size_t mutf8_len,uint16_t * utf16_out)183  void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out)
184  {
185      size_t in_pos = 0;
186      while (in_pos < mutf8_len) {
187          auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
188          auto [p_hi, p_lo] = SplitUtf16Pair(pair);
189  
190          if (p_hi != 0) {
191              *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
192          }
193          *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
194  
195          mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196          in_pos += nbytes;
197      }
198  }
199  
ConvertRegionMUtf8ToUtf16(const uint8_t * mutf8_in,uint16_t * utf16_out,size_t mutf8_len,size_t utf16_len,size_t start)200  size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
201                                   size_t start)
202  {
203      size_t in_pos = 0;
204      size_t out_pos = 0;
205      while (in_pos < mutf8_len) {
206          auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8_in, mutf8_len - in_pos);
207          auto [p_hi, p_lo] = SplitUtf16Pair(pair);
208  
209          mutf8_in += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
210          in_pos += nbytes;
211          if (start > 0) {
212              start -= nbytes;
213              continue;
214          }
215  
216          if (p_hi != 0) {
217              if (out_pos++ >= utf16_len - 1) {  // check for place for two uint16
218                  --out_pos;
219                  break;
220              }
221              *utf16_out++ = p_hi;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
222          }
223          if (out_pos++ >= utf16_len) {
224              --out_pos;
225              break;
226          }
227          *utf16_out++ = p_lo;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
228      }
229      return out_pos;
230  }
231  
CompareMUtf8ToMUtf8(const uint8_t * mutf8_1,const uint8_t * mutf8_2)232  int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
233  {
234      uint32_t c1;
235      uint32_t c2;
236      uint32_t n1;
237      uint32_t n2;
238  
239      do {
240          c1 = *mutf8_1;
241          c2 = *mutf8_2;
242  
243          if (c1 == 0 && c2 == 0) {
244              return 0;
245          }
246  
247          if (c1 == 0 && c2 != 0) {
248              return -1;
249          }
250  
251          if (c1 != 0 && c2 == 0) {
252              return 1;
253          }
254  
255          std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(mutf8_1);
256          std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(mutf8_2);
257  
258          mutf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
259          mutf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
260      } while (c1 == c2);
261  
262      auto [c1p1, c1p2] = SplitUtf16Pair(c1);
263      auto [c2p1, c2p2] = SplitUtf16Pair(c2);
264  
265      auto result = static_cast<int>(c1p1 - c2p1);
266      if (result != 0) {
267          return result;
268      }
269  
270      return c1p2 - c2p2;
271  }
272  
273  // compare plain utf8, which allows 0 inside a string
CompareUtf8ToUtf8(const uint8_t * utf8_1,size_t utf8_1_length,const uint8_t * utf8_2,size_t utf8_2_length)274  int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length)
275  {
276      uint32_t c1;
277      uint32_t c2;
278      uint32_t n1;
279      uint32_t n2;
280  
281      uint32_t utf8_1_index = 0;
282      uint32_t utf8_2_index = 0;
283  
284      do {
285          if (utf8_1_index == utf8_1_length && utf8_2_index == utf8_2_length) {
286              return 0;
287          }
288  
289          if (utf8_1_index == utf8_1_length && utf8_2_index < utf8_2_length) {
290              return -1;
291          }
292  
293          if (utf8_1_index < utf8_1_length && utf8_2_index == utf8_2_length) {
294              return 1;
295          }
296  
297          c1 = *utf8_1;
298          c2 = *utf8_2;
299  
300          std::tie(c1, n1) = ConvertMUtf8ToUtf16Pair(utf8_1);
301          std::tie(c2, n2) = ConvertMUtf8ToUtf16Pair(utf8_2);
302  
303          utf8_1 += n1;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
304          utf8_2 += n2;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
305          utf8_1_index += n1;
306          utf8_2_index += n2;
307      } while (c1 == c2);
308  
309      auto [c1p1, c1p2] = SplitUtf16Pair(c1);
310      auto [c2p1, c2p2] = SplitUtf16Pair(c2);
311  
312      auto result = static_cast<int>(c1p1 - c2p1);
313      if (result != 0) {
314          return result;
315      }
316  
317      return c1p2 - c2p2;
318  }
319  
Mutf8Size(const uint8_t * mutf8)320  size_t Mutf8Size(const uint8_t *mutf8)
321  {
322      return strlen(Mutf8AsCString(mutf8));
323  }
324  
MUtf8ToUtf16Size(const uint8_t * mutf8)325  size_t MUtf8ToUtf16Size(const uint8_t *mutf8)
326  {
327      // TODO(d.kovalenko): make it faster
328      size_t res = 0;
329      while (*mutf8 != '\0') {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
330          auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8);
331          res += pair > MAX_U16 ? CONST_2 : 1;
332          mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
333      }
334      return res;
335  }
336  
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8_len)337  size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len)
338  {
339      size_t pos = 0;
340      size_t res = 0;
341      while (pos != mutf8_len) {
342          auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8_len - pos);
343          if (nbytes == 0) {
344              nbytes = 1;
345          }
346          res += pair > MAX_U16 ? CONST_2 : 1;
347          mutf8 += nbytes;  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
348          pos += nbytes;
349      }
350      return res;
351  }
352  
Utf16ToMUtf8Size(const uint16_t * mutf16,uint32_t length)353  size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length)
354  {
355      size_t res = 1;  // zero byte
356      // when utf16 data length is only 1 and code in 0xd800-0xdfff,
357      // means that is a single code point, it needs to be represented by three MUTF8 code.
358      if (length == 1 && mutf16[0] >= HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
359          mutf16[0] <= LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
360          res += CONST_3;
361          return res;
362      }
363  
364      for (uint32_t i = 0; i < length; ++i) {
365          // NOLINTNEXTLINE(bugprone-branch-clone)
366          if (mutf16[i] == 0) {                    // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
367              res += CONST_2;                      // special case for U+0000 => C0 80
368          } else if (mutf16[i] <= MUTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
369              res += 1;
370          } else if (mutf16[i] <= MUTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
371              res += CONST_2;
372              // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
373          } else if (mutf16[i] < HI_SURROGATE_MIN || mutf16[i] > HI_SURROGATE_MAX) {
374              res += CONST_3;
375          } else {
376              res += CONST_4;
377              ++i;
378          }
379      }
380      return res;
381  }
382  
IsEqual(Span<const uint8_t> utf8_1,Span<const uint8_t> utf8_2)383  bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2)
384  {
385      if (utf8_1.size() != utf8_2.size()) {
386          return false;
387      }
388  
389      return memcmp(utf8_1.data(), utf8_2.data(), utf8_1.size()) == 0;
390  }
391  
IsEqual(const uint8_t * mutf8_1,const uint8_t * mutf8_2)392  bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2)
393  {
394      return strcmp(Mutf8AsCString(mutf8_1), Mutf8AsCString(mutf8_2)) == 0;
395  }
396  
IsValidModifiedUTF8(const uint8_t * elems)397  bool IsValidModifiedUTF8(const uint8_t *elems)
398  {
399      ASSERT(elems);
400  
401      while (*elems != '\0') {
402          // NOLINTNEXTLINE(hicpp-signed-bitwise, readability-magic-numbers)
403          switch (*elems & 0xf0) {
404              case 0x00:
405              case 0x10:  // NOLINT(readability-magic-numbers)
406              case 0x20:  // NOLINT(readability-magic-numbers)
407              case 0x30:  // NOLINT(readability-magic-numbers)
408              case 0x40:  // NOLINT(readability-magic-numbers)
409              case 0x50:  // NOLINT(readability-magic-numbers)
410              case 0x60:  // NOLINT(readability-magic-numbers)
411              case 0x70:  // NOLINT(readability-magic-numbers)
412                  // pattern 0xxx
413                  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
414                  ++elems;
415                  break;
416              case 0x80:  // NOLINT(readability-magic-numbers)
417              case 0x90:  // NOLINT(readability-magic-numbers)
418              case 0xa0:  // NOLINT(readability-magic-numbers)
419              case 0xb0:  // NOLINT(readability-magic-numbers)
420                  // pattern 10xx is illegal start
421                  return false;
422  
423              case 0xf0:  // NOLINT(readability-magic-numbers)
424                  // pattern 1111 0xxx starts four byte section
425                  if ((*elems & 0x08) == 0) {  // NOLINT(hicpp-signed-bitwise)
426                      // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
427                      ++elems;
428                      if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
429                          return false;
430                      }
431                  } else {
432                      return false;
433                  }
434                  // no need break
435                  [[fallthrough]];
436  
437              case 0xe0:  // NOLINT(readability-magic-numbers)
438                  // pattern 1110
439                  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
440                  ++elems;
441                  if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
442                      return false;
443                  }
444                  // no need break
445                  [[fallthrough]];
446  
447              case 0xc0:  // NOLINT(readability-magic-numbers)
448              case 0xd0:  // NOLINT(readability-magic-numbers)
449                  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
450                  ++elems;
451                  if ((*elems & 0xc0) != 0x80) {  // NOLINT(hicpp-signed-bitwise, readability-magic-numbers)
452                      return false;
453                  }
454                  // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
455                  ++elems;
456                  break;
457              default:
458                  break;
459          }
460      }
461      return true;
462  }
463  
464  }  // namespace panda::utf
465