• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2024 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "utf.h"
17 #include "base/log/log_wrapper.h"
18 #include <memory>
19 
20 namespace OHOS::Ace {
21 
22 /*
23  * MUtf-8
24  *
25  * U+0000 => C0 80
26  *
27  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4      Byte 5      Byte 6
28  *    code point   code point   code point
29  * 1  7            U+0000       U+007F      0xxxxxxx
30  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
31  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
32  * 6  21           U+10000      U+10FFFF    11101101    1010xxxx    10xxxxxx    11101101    1011xxxx    10xxxxxx
33  * for U+10000 -- U+10FFFF encodes the following (value - 0x10000)
34  */
35 
36 /*
37  * Convert mutf8 sequence to utf16 pair and return pair: [utf16 code point, mutf8 size].
38  * In case of invalid sequence return first byte of it.
39  */
MUtf8ToUtf16Size(const uint8_t * mutf8,size_t mutf8Len)40 size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len)
41 {
42     size_t pos = 0;
43     size_t res = 0;
44     while (pos != mutf8Len) {
45         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8, mutf8Len - pos);
46         if (nbytes == 0) {
47             nbytes = 1;
48         }
49         res += pair > MAX_U16 ? CONST_2 : 1;
50         mutf8 += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
51         pos += nbytes;
52     }
53     return res;
54 }
55 
ConvertMUtf8ToUtf16Pair(const uint8_t * data,size_t maxBytes)56 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes)
57 {
58     uint8_t d0 = *data;
59     if ((d0 & MASK1) == 0) {
60         return { d0, 1 };
61     }
62 
63     if (maxBytes < CONST_2) {
64         return { d0, 1 };
65     }
66     uint8_t d1 = *(data + 1);
67     if ((d0 & MASK2) == 0) {
68         return { ((d0 & MASK_5BIT) << DATA_WIDTH) | (d1 & MASK_6BIT), 2 };
69     }
70 
71     if (maxBytes < CONST_3) {
72         return { d0, 1 };
73     }
74     uint8_t d2 = *(data + CONST_2);
75     if ((d0 & MASK3) == 0) {
76         return { ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_2)) | ((d1 & MASK_6BIT) << DATA_WIDTH) | (d2 & MASK_6BIT),
77             CONST_3 };
78     }
79 
80     if (maxBytes < CONST_4) {
81         return { d0, 1 };
82     }
83     uint8_t d3 = *(data + CONST_3);
84     uint32_t codePoint = ((d0 & MASK_4BIT) << (DATA_WIDTH * CONST_3)) | ((d1 & MASK_6BIT) << (DATA_WIDTH * CONST_2)) |
85                          ((d2 & MASK_6BIT) << DATA_WIDTH) | (d3 & MASK_6BIT);
86 
87     uint32_t pair = 0;
88     pair |= ((codePoint >> (PAIR_ELEMENT_WIDTH - DATA_WIDTH)) + U16_LEAD) & MASK_16BIT;
89     pair <<= PAIR_ELEMENT_WIDTH;
90     pair |= (codePoint & MASK_10BIT) + U16_TAIL;
91 
92     return { pair, CONST_4 };
93 }
94 
ConvertRegionUtf8ToUtf16(const uint8_t * mutf8In,uint16_t * utf16Out,size_t mutf8Len,size_t utf16Len,size_t start)95 size_t ConvertRegionUtf8ToUtf16(
96     const uint8_t* mutf8In, uint16_t* utf16Out, size_t mutf8Len, size_t utf16Len, size_t start)
97 {
98     size_t inPos = 0;
99     size_t outPos = 0;
100     while (inPos < mutf8Len) {
101         auto [pair, nbytes] = ConvertMUtf8ToUtf16Pair(mutf8In, mutf8Len - inPos);
102         auto [pHi, pLo] = SplitUtf16Pair(pair);
103 
104         mutf8In += nbytes; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
105         inPos += nbytes;
106         if (start > 0) {
107             start -= nbytes;
108             continue;
109         }
110 
111         if (pHi != 0) {
112             if (outPos++ >= utf16Len - 1) { // check for place for two uint16
113                 --outPos;
114                 break;
115             }
116             *utf16Out++ = pHi; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
117         }
118         if (outPos++ >= utf16Len) {
119             --outPos;
120             break;
121         }
122         *utf16Out++ = pLo; // NOLINT(cppcoreguidelines-pro-bounds-pointer-arithmetic)
123     }
124     return outPos;
125 }
126 
IsUTF16HighSurrogate(uint16_t ch)127 bool IsUTF16HighSurrogate(uint16_t ch)
128 {
129     return DECODE_LEAD_LOW <= ch && ch <= DECODE_LEAD_HIGH;
130 }
131 
IsUTF16LowSurrogate(uint16_t ch)132 bool IsUTF16LowSurrogate(uint16_t ch)
133 {
134     return DECODE_TRAIL_LOW <= ch && ch <= DECODE_TRAIL_HIGH;
135 }
136 
UTF8Length(uint32_t codePoint)137 size_t UTF8Length(uint32_t codePoint)
138 {
139     if (codePoint <= UTF8_1B_MAX) {
140         return UtfLength::ONE;
141     }
142     if (codePoint <= UTF8_2B_MAX) {
143         return UtfLength::TWO;
144     }
145     if (codePoint <= UTF8_3B_MAX) {
146         return UtfLength::THREE;
147     }
148     return UtfLength::FOUR;
149 }
150 
// Helpers for encoding Unicode code points as UTF-8
EncodeUTF8(uint32_t codePoint,uint8_t * utf8,size_t len,size_t index)152 size_t EncodeUTF8(uint32_t codePoint, uint8_t* utf8, size_t len, size_t index)
153 {
154     size_t size = UTF8Length(codePoint);
155     if (index + size > len) {
156         return 0;
157     }
158     for (size_t j = size - 1; j > 0; j--) {
159         uint8_t cont = ((codePoint | BYTE_MARK) & BYTE_MASK);
160         utf8[index + j] = cont;
161         codePoint >>= UTF8_OFFSET;
162     }
163     utf8[index] = codePoint | FIRST_BYTE_MARK[size];
164     return size;
165 }
166 
HandleAndDecodeInvalidUTF16(uint16_t const * utf16,size_t len,size_t * index)167 uint32_t HandleAndDecodeInvalidUTF16(uint16_t const* utf16, size_t len, size_t* index)
168 {
169     uint16_t first = utf16[*index];
170     // A valid surrogate pair should always start with a High Surrogate
171     if (IsUTF16LowSurrogate(first)) {
172         return UTF16_REPLACEMENT_CHARACTER;
173     }
174     if (IsUTF16HighSurrogate(first) || (first & SURROGATE_MASK) == DECODE_LEAD_LOW) {
175         if (*index == len - 1) {
176             // A High surrogate not paired with another surrogate
177             return UTF16_REPLACEMENT_CHARACTER;
178         }
179         uint16_t second = utf16[*index + 1];
180         if (!IsUTF16LowSurrogate(second)) {
181             // A High surrogate not followed by a low surrogate
182             return UTF16_REPLACEMENT_CHARACTER;
183         }
184         // A valid surrogate pair, decode normally
185         (*index)++;
186         return ((first - DECODE_LEAD_LOW) << UTF16_OFFSET) + (second - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR;
187     }
188     // A unicode not fallen into the range of representing by surrogate pair, return as it is
189     return first;
190 }
191 
DebuggerConvertRegionUtf16ToUtf8(const uint16_t * utf16In,uint8_t * utf8Out,size_t utf16Len,size_t utf8Len,size_t start)192 size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
193     size_t start)
194 {
195     if (utf16In == nullptr || utf8Out == nullptr || utf8Len == 0) {
196         return 0;
197     }
198     size_t utf8Pos = 0;
199     size_t end = start + utf16Len;
200     for (size_t i = start; i < end; ++i) {
201         uint32_t codePoint = HandleAndDecodeInvalidUTF16(utf16In, end, &i);
202         if (codePoint == 0) {
203             continue;
204         }
205         utf8Pos += EncodeUTF8(codePoint, utf8Out, utf8Len, utf8Pos);
206     }
207     return utf8Pos;
208 }
209 
IsContinuationByte(const std::string & input,size_t startIndex,uint8_t continueCount)210 bool IsContinuationByte(const std::string& input, size_t startIndex, uint8_t continueCount)
211 {
212     uint8_t i = 0;
213     while (i < continueCount) {
214         unsigned char utfByte = input[startIndex + i];
215         if ((utfByte & MUTF8_2B_FIRST) != MUTF8_2B_SECOND) {
216             return false;
217         }
218         i++;
219     }
220     return true;
221 }
222 
IsUTF8(std::string & data)223 bool IsUTF8(std::string& data)
224 {
225     if (data.empty()) {
226         return false;
227     }
228 
229     size_t i = 0;
230     while (i < data.size()) {
231         unsigned char byte = data[i];
232         if (byte <= MUTF8_1B_MAX) {
233             i++;
234         } else if ((byte & MUTF8_3B_FIRST) == MUTF8_2B_FIRST) {
235             if (i + INDEX_ONE >= data.size()) {
236                 return false;
237             }
238             if (!IsContinuationByte(data, i + 1, 1)) {
239                 return false;
240             }
241             i += CONST_2;
242         } else if ((byte & MUTF8_4B_FIRST) == MUTF8_3B_FIRST) {
243             if (i + INDEX_TWO >= data.size()) {
244                 return false;
245             }
246             if (!IsContinuationByte(data, i + 1, CONST_2)) {
247                 return false;
248             }
249             i += CONST_3;
250         } else if ((byte & MUTF8_4B_FIRST_MASK) == MUTF8_4B_FIRST) {
251             if (i + INDEX_THREE >= data.size()) {
252                 return false;
253             }
254             if (!IsContinuationByte(data, i + 1, CONST_3)) {
255                 return false;
256             }
257             i += CONST_4;
258         } else {
259             return false;
260         }
261     }
262     return true;
263 }
264 
RemoveInvalidUft8Bytes(const std::string & input)265 std::string RemoveInvalidUft8Bytes(const std::string& input)
266 {
267     std::string result;
268     result.reserve(input.size());
269     size_t i = 0;
270 
271     while (i < input.size()) {
272         unsigned char byte = input[i];
273         if (byte <= MUTF8_1B_MAX) {
274             result += byte;
275             ++i;
276         } else if ((byte & MUTF8_3B_FIRST) == MUTF8_2B_FIRST) {
277             if (i + 1 < input.size() && IsContinuationByte(input, i + 1, 1)) {
278                 result += input.substr(i, CONST_2);
279                 i += CONST_2;
280             } else {
281                 ++i;
282             }
283         } else if ((byte & MUTF8_4B_FIRST) == MUTF8_3B_FIRST) {
284             if (i + CONST_2 < input.size() && IsContinuationByte(input, i + 1, CONST_2)) {
285                 result += input.substr(i, CONST_3);
286                 i += CONST_3;
287             } else {
288                 ++i;
289             }
290         } else if ((byte & MUTF8_4B_FIRST) == MUTF8_3B_FIRST) {
291             if (i + CONST_3 < input.size() && IsContinuationByte(input, i + 1, CONST_3)) {
292                 result += input.substr(i, CONST_4);
293                 i += CONST_4;
294             } else {
295                 ++i;
296             }
297         } else {
298             ++i;
299         }
300     }
301     return result;
302 }
303 
ConvertIllegalStr(std::string & str)304 void ConvertIllegalStr(std::string& str)
305 {
306     bool isRemove = false;
307     if (!IsUTF8(str)) {
308         TAG_LOGW(AceLogTag::ACE_LAYOUT_INSPECTOR, "the str is not valid utf-8 string");
309         str = RemoveInvalidUft8Bytes(str);
310         isRemove = true;
311     }
312     if (!isRemove || IsUTF8(str)) {
313         uint8_t* buf8 =  reinterpret_cast<uint8_t*>(const_cast<char*>(str.c_str()));
314         size_t utf8Len = str.size();
315         auto utf16Len = MUtf8ToUtf16Size(buf8, utf8Len);
316         std::unique_ptr<uint16_t[]> buf16 = std::make_unique<uint16_t[]>(utf16Len);
317         auto resultLen = ConvertRegionUtf8ToUtf16(buf8, buf16.get(), utf8Len, utf16Len, 0);
318         if (resultLen == utf16Len) {
319             DebuggerConvertRegionUtf16ToUtf8(buf16.get(), buf8, utf16Len, utf8Len, 0);
320         } else {
321             TAG_LOGW(AceLogTag::ACE_LAYOUT_INSPECTOR, "resultLen is %{public}d, utf16Len is %{public}d",
322                 static_cast<uint16_t>(resultLen), static_cast<uint16_t>(utf16Len));
323         }
324     } else {
325         TAG_LOGW(AceLogTag::ACE_LAYOUT_INSPECTOR, "the str is still not valid utf-8 string");
326     }
327 }
328 
329 } // namespace OHOS::Ace
330