1 /**
2 * Copyright (c) 2024 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "utf.h"
17 #include "base/log/log_wrapper.h"
18 #include <memory>
19
20 namespace OHOS::Ace {
21
/*
 * MUtf-8
 *
 * U+0000 => C0 80
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
 *
 * For U+10000 -- U+10FFFF the six-byte form encodes (value - 0x10000).
 */
35
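/*
 * Note on ConvertMUtf8ToUtf16Pair (defined below, used by the functions in this file):
 * it converts one MUTF-8 sequence to a UTF-16 pair and returns [UTF-16 value, MUTF-8 size in bytes].
 * If the buffer ends before the sequence is complete, it returns the first byte with a size of 1.
 */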
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)40 size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
41 {
42 size_t pos = 0;
43 size_t res = 0;
44 while (pos != mutf8Len) {
45 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
46 if (nbytes == 0) {
47 nbytes = 1;
48 }
49 res += pair > MAX_U16 ? CONST_2 : 1;
50 mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
51 pos += nbytes;
52 }
53 return res;
54 }
55
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)56 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
57 {
58 uint8_t d0 = *data;
59 if ((d0 & MASK1) == 0) {
60 return { d0, 1 };
61 }
62
63 if (maxBytes < CONST_2) {
64 return { d0, 1 };
65 }
66 uint8_t d1 = *(data + 1);
67 if ((d0 & MASK2) == 0) {
68 return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
69 }
70
71 if (maxBytes < CONST_3) {
72 return { d0, 1 };
73 }
74 uint8_t d2 = *(data + CONST_2);
75 if ((d0 & MASK3) == 0) {
76 return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
77 CONST_3 };
78 }
79
80 if (maxBytes < CONST_4) {
81 return { d0, 1 };
82 }
83 uint8_t d3 = *(data + CONST_3);
84 uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
85 ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
86
87 uint32_t pair = 0;
88 pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
89 pair <<= PAIR_ELEMENT_WIDTH;
90 pair |= (codePoint & MASK_10BIT) + U16_TAIL;
91
92 return { pair, CONST_4 };
93 }
94
ConvertRegionUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)95 size_t ConvertRegionUtf8ToUtf16(
96 const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
97 {
98 size_t inPos = 0;
99 size_t outPos = 0;
100 while (inPos < mutf8Len) {
101 auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
102 auto [pHi, pLo] = SplitUtf16Pair(pair);
103
104 mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105 inPos += nbytes;
106 if (start > 0) {
107 start -= nbytes;
108 continue;
109 }
110
111 if (pHi != 0) {
112 if (outPos++ >= utf16Len - 1) { // check for place for two uint16
113 --outPos;
114 break;
115 }
116 *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
117 }
118 if (outPos++ >= utf16Len) {
119 --outPos;
120 break;
121 }
122 *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
123 }
124 return outPos;
125 }
126
IsUTF16HighSurrogate(uint16_t ch)127 bool IsUTF16HighSurrogate(uint16_t ch)
128 {
129 return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
130 }
131
IsUTF16LowSurrogate(uint16_t ch)132 bool IsUTF16LowSurrogate(uint16_t ch)
133 {
134 return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
135 }
136
UTF8Length(uint32_t codePoint)137 size_t UTF8Length(uint32_t codePoint)
138 {
139 if (codePoint <= UTF8_1B_MAX) {
140 return UtfLength::ONE;
141 }
142 if (codePoint <= UTF8_2B_MAX) {
143 return UtfLength::TWO;
144 }
145 if (codePoint <= UTF8_3B_MAX) {
146 return UtfLength::THREE;
147 }
148 return UtfLength::FOUR;
149 }
150
// Methods for encoding Unicode code points as UTF-8
EncodeUTF8(uint32_t codePoint,uint8_t * utf8,size_t len,size_t index)152 size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
153 {
154 size_t size = UTF8Length(codePoint);
155 if (index + size > len) {
156 return 0;
157 }
158 for (size_t j = size - 1; j > 0; j--) {
159 uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
160 utf8[index + j] = cont;
161 codePoint >>= UTF8_OFFSET;
162 }
163 utf8[index] = codePoint | FIRST_BYTE_MARK[size];
164 return size;
165 }
166
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)167 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
168 {
169 uint16_t first = utf16[*index];
170 // A valid surrogate pair should always start with a High Surrogate
171 if (IsUTF16LowSurrogate(first)) {
172 return UTF16_REPLACEMENT_CHARACTER;
173 }
174 if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
175 if (*index == len - 1) {
176 // A High surrogate not paired with another surrogate
177 return UTF16_REPLACEMENT_CHARACTER;
178 }
179 uint16_t second = utf16[*index + 1];
180 if (!IsUTF16LowSurrogate(second)) {
181 // A High surrogate not followed by a low surrogate
182 return UTF16_REPLACEMENT_CHARACTER;
183 }
184 // A valid surrogate pair, decode normally
185 (*index)++;
186 return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
187 }
188 // A unicode not fallen into the range of representing by surrogate pair, return as it is
189 return first;
190 }
191
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)192 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
193 size_t start)
194 {
195 if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
196 return 0;
197 }
198 size_t utf8Pos = 0;
199 size_t end = start + utf16Len;
200 for (size_t i = start; i < end; ++i) {
201 uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
202 if (codePoint == 0) {
203 continue;
204 }
205 utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
206 }
207 return utf8Pos;
208 }
209
IsContinuationByte(const std::string & input,size_t startIndex,uint8_t continueCount)210 bool IsContinuationByte(const std::string& input, size_t startIndex, uint8_t continueCount)
211 {
212 uint8_t i = 0;
213 while (i < continueCount) {
214 unsigned char utfByte = input[startIndex + i];
215 if ((utfByte & MUTF8_2B_FIRST) != MUTF8_2B_SECOND) {
216 return false;
217 }
218 i++;
219 }
220 return true;
221 }
222
IsUTF8(std::string & data)223 bool IsUTF8(std::string& data)
224 {
225 if (data.empty()) {
226 return false;
227 }
228
229 size_t i = 0;
230 while (i < data.size()) {
231 unsigned char byte = data[i];
232 if (byte <= MUTF8_1B_MAX) {
233 i++;
234 } else if ((byte & MUTF8_3B_FIRST) == MUTF8_2B_FIRST) {
235 if (i + INDEX_ONE >= data.size()) {
236 return false;
237 }
238 if (!IsContinuationByte(data, i + 1, 1)) {
239 return false;
240 }
241 i += CONST_2;
242 } else if ((byte & MUTF8_4B_FIRST) == MUTF8_3B_FIRST) {
243 if (i + INDEX_TWO >= data.size()) {
244 return false;
245 }
246 if (!IsContinuationByte(data, i + 1, CONST_2)) {
247 return false;
248 }
249 i += CONST_3;
250 } else if ((byte & MUTF8_4B_FIRST_MASK) == MUTF8_4B_FIRST) {
251 if (i + INDEX_THREE >= data.size()) {
252 return false;
253 }
254 if (!IsContinuationByte(data, i + 1, CONST_3)) {
255 return false;
256 }
257 i += CONST_4;
258 } else {
259 return false;
260 }
261 }
262 return true;
263 }
264
RemoveInvalidUft8Bytes(const std::string & input)265 std::string RemoveInvalidUft8Bytes(const std::string& input)
266 {
267 std::string result;
268 result.reserve(input.size());
269 size_t i = 0;
270
271 while (i < input.size()) {
272 unsigned char byte = input[i];
273 if (byte <= MUTF8_1B_MAX) {
274 result += byte;
275 ++i;
276 } else if ((byte & MUTF8_3B_FIRST) == MUTF8_2B_FIRST) {
277 if (i + 1 < input.size() && IsContinuationByte(input, i + 1, 1)) {
278 result += input.substr(i, CONST_2);
279 i += CONST_2;
280 } else {
281 ++i;
282 }
283 } else if ((byte & MUTF8_4B_FIRST) == MUTF8_3B_FIRST) {
284 if (i + CONST_2 < input.size() && IsContinuationByte(input, i + 1, CONST_2)) {
285 result += input.substr(i, CONST_3);
286 i += CONST_3;
287 } else {
288 ++i;
289 }
290 } else if ((byte & MUTF8_4B_FIRST) == MUTF8_3B_FIRST) {
291 if (i + CONST_3 < input.size() && IsContinuationByte(input, i + 1, CONST_3)) {
292 result += input.substr(i, CONST_4);
293 i += CONST_4;
294 } else {
295 ++i;
296 }
297 } else {
298 ++i;
299 }
300 }
301 return result;
302 }
303
ConvertIllegalStr(std::string & str)304 void ConvertIllegalStr(std::string& str)
305 {
306 bool isRemove = false;
307 if (!IsUTF8(str)) {
308 TAG_LOGW(AceLogTag::ACE_LAYOUT_INSPECTOR, "the str is not valid utf-8 string");
309 str = RemoveInvalidUft8Bytes(str);
310 isRemove = true;
311 }
312 if (!isRemove || IsUTF8(str)) {
313 uint8_t* buf8 = reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));
314 size_t utf8Len = str.size();
315 auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
316 std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
317 auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
318 if (resultLen == utf16Len) {
319 DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
320 } else {
321 TAG_LOGW(AceLogTag::ACE_LAYOUT_INSPECTOR, "resultLen is %{public}d, utf16Len is %{public}d",
322 static_cast<uint16_t>(resultLen), static_cast<uint16_t>(utf16Len));
323 }
324 } else {
325 TAG_LOGW(AceLogTag::ACE_LAYOUT_INSPECTOR, "the str is still not valid utf-8 string");
326 }
327 }
328
329 } // namespace OHOS::Ace
330