• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1  /*
2   * Copyright (c) 2021 Huawei Device Co., Ltd.
3   * Licensed under the Apache License, Version 2.0 (the "License");
4   * you may not use this file except in compliance with the License.
5   * You may obtain a copy of the License at
6   *
7   *     http://www.apache.org/licenses/LICENSE-2.0
8   *
9   * Unless required by applicable law or agreed to in writing, software
10   * distributed under the License is distributed on an "AS IS" BASIS,
11   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12   * See the License for the specific language governing permissions and
13   * limitations under the License.
14   */
15  
16  #include "ecmascript/base/utf_helper.h"
17  
// Bias subtracted by U16_GET_SUPPLEMENTARY. It is the lowest lead value (0xd800) shifted into position plus
// the lowest trail value (0xdc00), reduced by 0x10000, so that the pair (0xd800, 0xdc00) yields 0x10000.
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + (0xdc00 - 0x10000);

// Recombines a lead/trail surrogate pair into the single code point it stands for.
// Both arguments are converted to int32_t; no check is made that they really are surrogates.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    (static_cast<int32_t>(trail) + (static_cast<int32_t>(lead) << 10UL) - U16_SURROGATE_OFFSET)
23  
24  namespace panda::ecmascript::base::utf_helper {
UTF16Decode(uint16_t lead,uint16_t trail)25  uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
26  {
27      ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
28             (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
29      uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
30      return cp;
31  }
32  
IsValidUTF8(const std::vector<uint8_t> & data)33  bool IsValidUTF8(const std::vector<uint8_t> &data)
34  {
35      uint32_t length = data.size();
36      switch (length) {
37          case UtfLength::ONE:
38              if (data.at(0) >= BIT_MASK_1) {
39                  return false;
40              }
41              break;
42          case UtfLength::TWO:
43              if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
44                  return false;
45              }
46              break;
47          case UtfLength::THREE:
48              if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
49                  return false;
50              }
51              break;
52          case UtfLength::FOUR:
53              if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
54                  return false;
55              }
56              break;
57          default:
58              UNREACHABLE();
59              break;
60      }
61  
62      for (uint32_t i = 1; i < length; i++) {
63          if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
64              return false;
65          }
66      }
67      return true;
68  }
69  
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify)70  Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
71  {
72      // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
73      // means that is a single code point, it needs to be represented by three UTF8 code.
74      if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
75          auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
76          auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
77          auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
78          return {UtfLength::THREE, {ch0, ch1, ch2}};
79      }
80  
81      if (d0 == 0) {
82          if (modify) {
83              // special case for \u0000 ==> C080 - 1100'0000 1000'0000
84              return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
85          }
86          // For print string, just skip '\u0000'
87          return {0, {0x00U}};
88      }
89      if (d0 <= UTF8_1B_MAX) {
90          return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
91      }
92      if (d0 <= UTF8_2B_MAX) {
93          auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
94          auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
95          return {UtfLength::TWO, {ch0, ch1}};
96      }
97      if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
98          auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
99          auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
100          auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
101          return {UtfLength::THREE, {ch0, ch1, ch2}};
102      }
103      if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
104          // Bad sequence
105          UNREACHABLE();
106      }
107  
108      uint32_t codePoint = CombineTwoU16(d0, d1);
109  
110      auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
111      auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
112      auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
113      auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
114      return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
115  }
116  
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)117  size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
118  {
119      size_t res = 1;  // zero byte
120      // when utf16 data length is only 1 and code in 0xd800-0xdfff,
121      // means that is a single code point, it needs to be represented by three UTF8 code.
122      if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
123          utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
124          res += UtfLength::THREE;
125          return res;
126      }
127  
128      for (uint32_t i = 0; i < length; ++i) {
129          if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
130              if (modify) {
131                  res += UtfLength::TWO;  // special case for U+0000 => C0 80
132              }
133          } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
134              res += 1;
135          } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
136              res += UtfLength::TWO;
137              // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
138          } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
139              res += UtfLength::THREE;
140          } else {
141              if (i < length - 1 &&
142                  utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
143                  utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
144                  res += UtfLength::FOUR;
145                  ++i;
146              } else {
147                  res += UtfLength::THREE;
148              }
149          }
150      }
151      return res;
152  }
153  
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify)154  size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
155                                  size_t start, bool modify)
156  {
157      size_t utf8Pos = 0;
158      if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
159          return 0;
160      }
161      size_t end = start + utf16Len;
162      for (size_t i = start; i < end; ++i) {
163          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
164          uint16_t next16Code = 0;
165          if ((i + 1) != end && utf::IsAvailableNextUtf16Code(utf16In[i + 1])) {
166              next16Code = utf16In[i + 1];
167          }
168          // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
169          Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify);
170          if (utf8Pos + ch.n > utf8Len) {
171              break;
172          }
173          for (size_t c = 0; c < ch.n; ++c) {
174              utf8Out[utf8Pos++] = ch.ch[c];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
175          }
176          if (ch.n == UtfLength::FOUR) {  // Two UTF-16 chars are used
177              ++i;
178          }
179      }
180      return utf8Pos;
181  }
182  
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)183  std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
184  {
185      uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
186      if ((d0 & utf::MASK1) == 0) {
187          return {d0, 1};
188      }
189  
190      uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191      if ((d0 & utf::MASK2) == 0) {
192          return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
193      }
194  
195      uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
196      if ((d0 & utf::MASK3) == 0) {
197          return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
198                      (d2 & utf::MASK_6BIT),
199                  UtfLength::THREE};
200      }
201  
202      uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
203      uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
204                           ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
205  
206      uint32_t pair = 0;
207      if (combine) {
208          uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
209          uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
210          pair = U16_GET_SUPPLEMENTARY(lead, tail);  // NOLINTNEXTLINE(hicpp-signed-bitwise)
211      } else {
212          pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
213          pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
214      }
215  
216      return {pair, UtfLength::FOUR};
217  }
218  
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)219  size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
220  {
221      return utf::MUtf8ToUtf16Size(utf8, utf8Len);
222  }
223  
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)224  size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
225                                  size_t start)
226  {
227      return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
228  }
229  }  // namespace panda::ecmascript::base::utf_helper
230