• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
/*
 * Copyright (c) 2021 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #include "ecmascript/base/utf_helper.h"
17 
// Single bias that folds the lead-surrogate base (0xd800, shifted by 10), the trail-surrogate
// base (0xdc00) and the 0x10000 supplementary-plane base together, so that a surrogate pair can
// be turned into its code point with one shift, one add and one subtract.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000;
// Combines a lead/trail surrogate pair into the code point it encodes:
//     (lead << 10) + trail - U16_SURROGATE_OFFSET
// Both arguments are converted to int32_t before the arithmetic; the result is an int32_t.
// NOLINTNEXTLINE(cppcoreguidelines-macro-usage)
#define U16_GET_SUPPLEMENTARY(lead, trail) \
    ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET)
23 
24 namespace panda::ecmascript::base::utf_helper {
UTF16Decode(uint16_t lead,uint16_t trail)25 uint32_t UTF16Decode(uint16_t lead, uint16_t trail)
26 {
27     ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) &&
28            (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH));
29     uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
30     return cp;
31 }
32 
IsUTF16HighSurrogate(uint16_t ch)33 bool IsUTF16HighSurrogate(uint16_t ch)
34 {
35     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
36 }
37 
IsUTF16LowSurrogate(uint16_t ch)38 bool IsUTF16LowSurrogate(uint16_t ch)
39 {
40     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
41 }
42 
43 // Methods for decode utf16 to unicode
DecodeUTF16(uint16_t const * utf16,size_t len,size_t * index)44 uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index)
45 {
46     uint16_t high = utf16[*index];
47     if ((high & SURROGATE_MASK) != DECODE_LEAD_LOW || !IsUTF16HighSurrogate(high) || *index == len - 1) {
48         return high;
49     }
50     uint16_t low = utf16[*index + 1];
51     if (!IsUTF16LowSurrogate(low)) {
52         return high;
53     }
54     (*index)++;
55     return ((high - DECODE_LEAD_LOW) << UTF16_OFFSET) + (low - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
56 }
57 
UTF8Length(uint32_t codepoint)58 inline size_t UTF8Length(uint32_t codepoint)
59 {
60     if (codepoint <= UTF8_1B_MAX) {
61         return UtfLength::ONE;
62     }
63     if (codepoint <= UTF8_2B_MAX) {
64         return UtfLength::TWO;
65     }
66     if (codepoint <= UTF8_3B_MAX) {
67         return UtfLength::THREE;
68     }
69     return UtfLength::FOUR;
70 }
71 
72 // Methods for encode unicode to unicode
EncodeUTF8(uint32_t codepoint,uint8_t * utf8,size_t len,size_t index)73 size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t len, size_t index)
74 {
75     size_t size = UTF8Length(codepoint);
76     if (index + size > len) {
77         return 0;
78     }
79     for (size_t j = size - 1; j > 0; j--) {
80         uint8_t cont = ((codepoint | byteMark) & byteMask);
81         utf8[index + j] = cont;
82         codepoint >>= UTF8_OFFSET;
83     }
84     utf8[index] = codepoint | firstByteMark[size];
85     return size;
86 }
87 
IsValidUTF8(const std::vector<uint8_t> & data)88 bool IsValidUTF8(const std::vector<uint8_t> &data)
89 {
90     uint32_t length = data.size();
91     switch (length) {
92         case UtfLength::ONE:
93             if (data.at(0) >= BIT_MASK_1) {
94                 return false;
95             }
96             break;
97         case UtfLength::TWO:
98             if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) {
99                 return false;
100             }
101             break;
102         case UtfLength::THREE:
103             if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) {
104                 return false;
105             }
106             break;
107         case UtfLength::FOUR:
108             if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) {
109                 return false;
110             }
111             break;
112         default:
113             UNREACHABLE();
114             break;
115     }
116 
117     for (uint32_t i = 1; i < length; i++) {
118         if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) {
119             return false;
120         }
121     }
122     return true;
123 }
124 
ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify)125 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify)
126 {
127     // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0,
128     // means that is a single code point, it needs to be represented by three UTF8 code.
129     if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) {
130         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
131         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
132         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
133         return {UtfLength::THREE, {ch0, ch1, ch2}};
134     }
135 
136     if (d0 == 0) {
137         if (modify) {
138             // special case for \u0000 ==> C080 - 1100'0000 1000'0000
139             return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}};
140         }
141         // For print string, just skip '\u0000'
142         return {0, {0x00U}};
143     }
144     if (d0 <= UTF8_1B_MAX) {
145         return {UtfLength::ONE, {static_cast<uint8_t>(d0)}};
146     }
147     if (d0 <= UTF8_2B_MAX) {
148         auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX));
149         auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT));
150         return {UtfLength::TWO, {ch0, ch1}};
151     }
152     if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) {
153         auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE));
154         auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT));
155         auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT));
156         return {UtfLength::THREE, {ch0, ch1, ch2}};
157     }
158     if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) {
159         // Bad sequence
160         UNREACHABLE();
161     }
162 
163     uint32_t codePoint = CombineTwoU16(d0, d1);
164 
165     auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST);
166     auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1);
167     auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1);
168     auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1);
169     return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}};
170 }
171 
Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)172 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify)
173 {
174     size_t res = 1;  // zero byte
175     // when utf16 data length is only 1 and code in 0xd800-0xdfff,
176     // means that is a single code point, it needs to be represented by three UTF8 code.
177     if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
178         utf16[0] <= utf::LO_SURROGATE_MAX) {                 // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
179         res += UtfLength::THREE;
180         return res;
181     }
182 
183     for (uint32_t i = 0; i < length; ++i) {
184         if (utf16[i] == 0) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
185             if (modify) {
186                 res += UtfLength::TWO;  // special case for U+0000 => C0 80
187             }
188         } else if (utf16[i] <= UTF8_1B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
189             res += 1;
190         } else if (utf16[i] <= UTF8_2B_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
191             res += UtfLength::TWO;
192             // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
193         } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) {
194             res += UtfLength::THREE;
195         } else {
196             if (i < length - 1 &&
197                 utf16[i + 1] >= utf::LO_SURROGATE_MIN &&  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
198                 utf16[i + 1] <= utf::LO_SURROGATE_MAX) {  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
199                 res += UtfLength::FOUR;
200                 ++i;
201             } else {
202                 res += UtfLength::THREE;
203             }
204         }
205     }
206     return res;
207 }
208 
ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify)209 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
210                                 size_t start, bool modify)
211 {
212     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
213         return 0;
214     }
215     size_t utf8Pos = 0;
216     size_t end = start + utf16Len;
217     for (size_t i = start; i < end; ++i) {
218         uint32_t codepoint = DecodeUTF16(utf16In, end, &i);
219         if (codepoint == 0) {
220             if (modify) {
221                 // special case for \u0000 ==> C080 - 1100'0000 1000'0000
222                 utf8Out[utf8Pos++] = UTF8_2B_FIRST;
223                 utf8Out[utf8Pos++] = UTF8_2B_SECOND;
224             }
225             continue;
226         }
227         utf8Pos += EncodeUTF8(codepoint, utf8Out, utf8Len, utf8Pos);
228     }
229     return utf8Pos;
230 }
231 
ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)232 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine)
233 {
234     uint8_t d0 = data[0];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
235     if ((d0 & utf::MASK1) == 0) {
236         return {d0, 1};
237     }
238 
239     uint8_t d1 = data[1];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
240     if ((d0 & utf::MASK2) == 0) {
241         return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO};
242     }
243 
244     uint8_t d2 = data[UtfLength::TWO];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
245     if ((d0 & utf::MASK3) == 0) {
246         return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) |
247                     (d2 & utf::MASK_6BIT),
248                 UtfLength::THREE};
249     }
250 
251     uint8_t d3 = data[UtfLength::THREE];  // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
252     uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) |
253                          ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT);
254 
255     uint32_t pair = 0;
256     if (combine) {
257         uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD);
258         uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
259         pair = static_cast<uint32_t>(U16_GET_SUPPLEMENTARY(lead, tail));  // NOLINTNEXTLINE(hicpp-signed-bitwise)
260     } else {
261         pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH;
262         pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT;
263     }
264 
265     return {pair, UtfLength::FOUR};
266 }
267 
Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)268 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len)
269 {
270     return utf::MUtf8ToUtf16Size(utf8, utf8Len);
271 }
272 
ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)273 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len,
274                                 size_t start)
275 {
276     return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start);
277 }
278 }  // namespace panda::ecmascript::base::utf_helper
279