1 /* 2 * Copyright (c) 2021 Huawei Device Co., Ltd. 3 * Licensed under the Apache License, Version 2.0 (the "License"); 4 * you may not use this file except in compliance with the License. 5 * You may obtain a copy of the License at 6 * 7 * http://www.apache.org/licenses/LICENSE-2.0 8 * 9 * Unless required by applicable law or agreed to in writing, software 10 * distributed under the License is distributed on an "AS IS" BASIS, 11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 * See the License for the specific language governing permissions and 13 * limitations under the License. 14 */ 15 16 #include "ecmascript/base/utf_helper.h" 17 18 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) 19 static constexpr int32_t U16_SURROGATE_OFFSET = (0xd800 << 10UL) + 0xdc00 - 0x10000; 20 // NOLINTNEXTLINE(cppcoreguidelines-macro-usage) 21 #define U16_GET_SUPPLEMENTARY(lead, trail) \ 22 ((static_cast<int32_t>(lead) << 10UL) + static_cast<int32_t>(trail) - U16_SURROGATE_OFFSET) 23 24 namespace panda::ecmascript::base::utf_helper { UTF16Decode(uint16_t lead,uint16_t trail)25 uint32_t UTF16Decode(uint16_t lead, uint16_t trail) 26 { 27 ASSERT((lead >= DECODE_LEAD_LOW && lead <= DECODE_LEAD_HIGH) && 28 (trail >= DECODE_TRAIL_LOW && trail <= DECODE_TRAIL_HIGH)); 29 uint32_t cp = (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR; 30 return cp; 31 } 32 IsValidUTF8(const std::vector<uint8_t> & data)33 bool IsValidUTF8(const std::vector<uint8_t> &data) 34 { 35 uint32_t length = data.size(); 36 switch (length) { 37 case UtfLength::ONE: 38 if (data.at(0) >= BIT_MASK_1) { 39 return false; 40 } 41 break; 42 case UtfLength::TWO: 43 if ((data.at(0) & BIT_MASK_3) != BIT_MASK_2) { 44 return false; 45 } 46 break; 47 case UtfLength::THREE: 48 if ((data.at(0) & BIT_MASK_4) != BIT_MASK_3) { 49 return false; 50 } 51 break; 52 case UtfLength::FOUR: 53 if ((data.at(0) & BIT_MASK_5) != BIT_MASK_4) { 54 return false; 55 } 56 break; 57 default: 58 UNREACHABLE(); 59 break; 60 } 61 62 for (uint32_t i = 1; i < length; i++) { 63 if ((data.at(i) & BIT_MASK_2) != BIT_MASK_1) { 64 return false; 65 } 66 } 67 return true; 68 } 69 ConvertUtf16ToUtf8(uint16_t d0,uint16_t d1,bool modify)70 Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify) 71 { 72 // when first utf16 code is in 0xd800-0xdfff and second utf16 code is 0, 73 // means that is a single code point, it needs to be represented by three UTF8 code. 74 if (d1 == 0 && d0 >= utf::HI_SURROGATE_MIN && d0 <= utf::LO_SURROGATE_MAX) { 75 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE)); 76 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); 77 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); 78 return {UtfLength::THREE, {ch0, ch1, ch2}}; 79 } 80 81 if (d0 == 0) { 82 if (modify) { 83 // special case for \u0000 ==> C080 - 1100'0000 1000'0000 84 return {UtfLength::TWO, {UTF8_2B_FIRST, UTF8_2B_SECOND}}; 85 } 86 // For print string, just skip '\u0000' 87 return {0, {0x00U}}; 88 } 89 if (d0 <= UTF8_1B_MAX) { 90 return {UtfLength::ONE, {static_cast<uint8_t>(d0)}}; 91 } 92 if (d0 <= UTF8_2B_MAX) { 93 auto ch0 = static_cast<uint8_t>(UTF8_2B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::SIX)); 94 auto ch1 = static_cast<uint8_t>(UTF8_2B_SECOND | (d0 & utf::MASK_6BIT)); 95 return {UtfLength::TWO, {ch0, ch1}}; 96 } 97 if (d0 < utf::HI_SURROGATE_MIN || d0 > utf::HI_SURROGATE_MAX) { 98 auto ch0 = static_cast<uint8_t>(UTF8_3B_FIRST | static_cast<uint8_t>(d0 >> UtfOffset::TWELVE)); 99 auto ch1 = static_cast<uint8_t>(UTF8_3B_SECOND | (static_cast<uint8_t>(d0 >> UtfOffset::SIX) & utf::MASK_6BIT)); 100 auto ch2 = static_cast<uint8_t>(UTF8_3B_THIRD | (d0 & utf::MASK_6BIT)); 101 return {UtfLength::THREE, {ch0, ch1, ch2}}; 102 } 103 if (d1 < utf::LO_SURROGATE_MIN || d1 > utf::LO_SURROGATE_MAX) { 104 // Bad sequence 105 UNREACHABLE(); 106 } 107 108 uint32_t codePoint = CombineTwoU16(d0, d1); 109 110 auto ch0 = static_cast<uint8_t>((codePoint >> UtfOffset::EIGHTEEN) | UTF8_4B_FIRST); 111 auto ch1 = static_cast<uint8_t>(((codePoint >> UtfOffset::TWELVE) & utf::MASK_6BIT) | utf::MASK1); 112 auto ch2 = static_cast<uint8_t>(((codePoint >> UtfOffset::SIX) & utf::MASK_6BIT) | utf::MASK1); 113 auto ch3 = static_cast<uint8_t>((codePoint & utf::MASK_6BIT) | utf::MASK1); 114 return {UtfLength::FOUR, {ch0, ch1, ch2, ch3}}; 115 } 116 Utf16ToUtf8Size(const uint16_t * utf16,uint32_t length,bool modify)117 size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify) 118 { 119 size_t res = 1; // zero byte 120 // when utf16 data length is only 1 and code in 0xd800-0xdfff, 121 // means that is a single code point, it needs to be represented by three UTF8 code. 122 if (length == 1 && utf16[0] >= utf::HI_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 123 utf16[0] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 124 res += UtfLength::THREE; 125 return res; 126 } 127 128 for (uint32_t i = 0; i < length; ++i) { 129 if (utf16[i] == 0) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 130 if (modify) { 131 res += UtfLength::TWO; // special case for U+0000 => C0 80 132 } 133 } else if (utf16[i] <= UTF8_1B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 134 res += 1; 135 } else if (utf16[i] <= UTF8_2B_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 136 res += UtfLength::TWO; 137 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 138 } else if (utf16[i] < utf::HI_SURROGATE_MIN || utf16[i] > utf::HI_SURROGATE_MAX) { 139 res += UtfLength::THREE; 140 } else { 141 if (i < length - 1 && 142 utf16[i + 1] >= utf::LO_SURROGATE_MIN && // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 143 utf16[i + 1] <= utf::LO_SURROGATE_MAX) { // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 144 res += UtfLength::FOUR; 145 ++i; 146 } else { 147 res += UtfLength::THREE; 148 } 149 } 150 } 151 return res; 152 } 153 ConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start,bool modify)154 size_t ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len, 155 size_t start, bool modify) 156 { 157 size_t utf8Pos = 0; 158 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) { 159 return 0; 160 } 161 size_t end = start + utf16Len; 162 for (size_t i = start; i < end; ++i) { 163 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 164 uint16_t next16Code = 0; 165 if ((i + 1) != end && utf::IsAvailableNextUtf16Code(utf16In[i + 1])) { 166 next16Code = utf16In[i + 1]; 167 } 168 // NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic) 169 Utf8Char ch = ConvertUtf16ToUtf8(utf16In[i], next16Code, modify); 170 if (utf8Pos + ch.n > utf8Len) { 171 break; 172 } 173 for (size_t c = 0; c < ch.n; ++c) { 174 utf8Out[utf8Pos++] = ch.ch[c]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 175 } 176 if (ch.n == UtfLength::FOUR) { // Two UTF-16 chars are used 177 ++i; 178 } 179 } 180 return utf8Pos; 181 } 182 ConvertUtf8ToUtf16Pair(const uint8_t * data,bool combine)183 std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine) 184 { 185 uint8_t d0 = data[0]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 186 if ((d0 & utf::MASK1) == 0) { 187 return {d0, 1}; 188 } 189 190 uint8_t d1 = data[1]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 191 if ((d0 & utf::MASK2) == 0) { 192 return {((d0 & utf::MASK_5BIT) << utf::DATA_WIDTH) | (d1 & utf::MASK_6BIT), UtfLength::TWO}; 193 } 194 195 uint8_t d2 = data[UtfLength::TWO]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 196 if ((d0 & utf::MASK3) == 0) { 197 return {((d0 & utf::MASK_4BIT) << UtfOffset::TWELVE) | ((d1 & utf::MASK_6BIT) << utf::DATA_WIDTH) | 198 (d2 & utf::MASK_6BIT), 199 UtfLength::THREE}; 200 } 201 202 uint8_t d3 = data[UtfLength::THREE]; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic) 203 uint32_t codePoint = ((d0 & utf::MASK_4BIT) << UtfOffset::EIGHTEEN) | ((d1 & utf::MASK_6BIT) << UtfOffset::TWELVE) | 204 ((d2 & utf::MASK_6BIT) << utf::DATA_WIDTH) | (d3 & utf::MASK_6BIT); 205 206 uint32_t pair = 0; 207 if (combine) { 208 uint32_t lead = ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD); 209 uint32_t tail = ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; 210 pair = U16_GET_SUPPLEMENTARY(lead, tail); // NOLINTNEXTLINE(hicpp-signed-bitwise) 211 } else { 212 pair |= ((codePoint >> (utf::PAIR_ELEMENT_WIDTH - utf::DATA_WIDTH)) + utf::U16_LEAD) << utf::PAIR_ELEMENT_WIDTH; 213 pair |= ((codePoint & utf::MASK_10BIT) + utf::U16_TAIL) & utf::MASK_16BIT; 214 } 215 216 return {pair, UtfLength::FOUR}; 217 } 218 Utf8ToUtf16Size(const uint8_t * utf8,size_t utf8Len)219 size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len) 220 { 221 return utf::MUtf8ToUtf16Size(utf8, utf8Len); 222 } 223 ConvertRegionUtf8ToUtf16(const uint8_t * utf8In,uint16_t * utf16Out,size_t utf8Len,size_t utf16Len,size_t start)224 size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len, 225 size_t start) 226 { 227 return utf::ConvertRegionMUtf8ToUtf16(utf8In, utf16Out, utf8Len, utf16Len, start); 228 } 229 } // namespace panda::ecmascript::base::utf_helper 230