1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "unicode_ex.h"
17
18 #include "utils_log.h"
19 using namespace std;
20 /***************************************UTF8 and UTF16 unicode**********************************************
21 UTF8
22 Unicode utf8
23 U + 0000~U + 007F 0???????
24 U + 0080~U + 07FF 110????? 10??????
25 U + 0800~U + FFFF 1110???? 10?????? 10??????
26 U + 10000~U + 10FFFF 11110??? 10?????? 10?????? 10??????
27
UTF16
Unicode                   utf16 code

U + 0000~U + FFFF         stored in 2 bytes (one code unit), same value as the Unicode code point
U + 10000~U + 10FFFF      stored in 4 bytes (a surrogate pair) encoding the Unicode code point minus 0x10000
33 **************************************UTF8 and UTF16 unicode**********************************************/
34 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges that encode to 1, 2 and 3 UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive bounds of the surrogate range, which is rejected as a UTF-32 code point.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;

// Largest code point accepted by the converters.
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;

// Number of bits consumed from the code point for every continuation byte.
constexpr unsigned int UTF8_OFFSET = 6;

// A continuation byte is built as (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK, i.e. 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;

// Prefix bits OR-ed into the first byte, indexed by the encoded length in bytes (0..4).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
} // namespace
52
// File-local integer constants. They are typed constexpr values rather than macros so
// that they obey namespace scoping; each keeps the int type of the literal it replaces.
namespace {
// Byte count of one UTF-8 encoded code point; UTF8_LENGTH_INVALID flags a code point
// that cannot be encoded.
constexpr int UTF8_LENGTH_INVALID = 0;
constexpr int UTF8_LENGTH_1 = 1;
constexpr int UTF8_LENGTH_2 = 2;
constexpr int UTF8_LENGTH_3 = 3;
constexpr int UTF8_LENGTH_4 = 4;

// Used by Utf8CodePointLen both as the shift applied to the lead byte and as the
// 2-bit mask applied to the lookup result.
constexpr int UTF8_LEN_MASK = 3;

// Positions of the individual bytes inside one UTF-8 sequence.
constexpr int UTF8_FIRST_BYTE_INDEX = 0;
constexpr int UTF8_SECOND_BYTE_INDEX = 1;
constexpr int UTF8_THIRD_BYTE_INDEX = 2;
constexpr int UTF8_FORTH_BYTE_INDEX = 3;

// Number of payload bits contributed by each UTF-8 continuation byte.
constexpr int UTF8_SHIFT_WIDTH = 6;

// Number of payload bits carried by each half of a UTF-16 surrogate pair.
constexpr int STR16_TO_STR8_SHIFT_WIDTH = 10;
constexpr int UTF16_SHIFT_WIDTH = 10;

// Encoded sizes, in bytes, checked while writing a code point as UTF-8.
constexpr int UTF32_BYTE_SIZE_1 = 1;
constexpr int UTF32_BYTE_SIZE_2 = 2;
constexpr int UTF32_BYTE_SIZE_3 = 3;
constexpr int UTF32_BYTE_SIZE_4 = 4;
} // namespace
70
71 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)72 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
73 {
74 dstP += bytes;
75 if (bytes >= UTF32_BYTE_SIZE_4) {
76 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
77 srcChar >>= UTF8_OFFSET;
78 }
79
80 if (bytes >= UTF32_BYTE_SIZE_3) {
81 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
82 srcChar >>= UTF8_OFFSET;
83 }
84
85 if (bytes >= UTF32_BYTE_SIZE_2) {
86 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
87 srcChar >>= UTF8_OFFSET;
88 }
89
90 if (bytes >= UTF32_BYTE_SIZE_1) {
91 *--dstP = static_cast<uint8_t>(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
92 }
93 }
94
Utf32CodePointUtf8Length(char32_t srcChar)95 size_t Utf32CodePointUtf8Length(char32_t srcChar)
96 {
97 if (srcChar < ONE_BYTE_UTF8) {
98 return UTF8_LENGTH_1;
99 } else if (srcChar < TWO_BYTES_UTF8) {
100 return UTF8_LENGTH_2;
101 } else if (srcChar < THREE_BYTES_UTF8) {
102 if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
103 return UTF8_LENGTH_3;
104 } else {
105 // Surrogates are invalid UTF-32 characters.
106 return UTF8_LENGTH_INVALID;
107 }
108 } else if (srcChar <= UNICODE_MAX_NUM) {
109 // Max code point for Unicode is 0x0010FFFF.
110 return UTF8_LENGTH_4;
111 } else {
112 // Invalid UTF-32 character.
113 return UTF8_LENGTH_INVALID;
114 }
115 }
116
117 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)118 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
119 {
120 if (str16 == nullptr || str16Len == 0) {
121 return -1;
122 }
123
124 const char16_t* const str16End = str16 + str16Len;
125 int utf8Len = 0;
126 while (str16 < str16End) {
127 int charLen = 0;
128 if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
129 && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
130 // surrogate pairs are always 4 bytes.
131 charLen = 4;
132 // str16 advance 2 bytes
133 str16 += 2;
134 } else {
135 charLen = Utf32CodePointUtf8Length(static_cast<char32_t>(*str16++));
136 }
137
138 if (utf8Len > (INT_MAX - charLen)) {
139 return -1;
140 }
141 utf8Len += charLen;
142 }
143 return utf8Len;
144 }
145
146 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)147 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
148 {
149 const char16_t* curUtf16 = utf16Str;
150 const char16_t* const endUtf16 = utf16Str + str16Len;
151 char* cur = utf8Str;
152 while (curUtf16 < endUtf16) {
153 char32_t utf32;
154 // surrogate pairs
155 if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
156 && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
157 utf32 = (*curUtf16++ - 0xD800) << STR16_TO_STR8_SHIFT_WIDTH;
158 utf32 |= *curUtf16++ - 0xDC00;
159 utf32 += 0x10000;
160 } else {
161 utf32 = static_cast<char32_t>(*curUtf16++);
162 }
163 const size_t len = Utf32CodePointUtf8Length(utf32);
164 if (str8Len <= len) {
165 break;
166 }
167
168 Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
169 cur += len;
170 str8Len -= len;
171 }
172 *cur = '\0';
173 }
174
175 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)176 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
177 {
178 char* str8 = nullptr;
179 int utf8Len = Utf16ToUtf8Length(str16, str16Len);
180 if (utf8Len < 0 || utf8Len >= INT_MAX) {
181 return nullptr;
182 }
183
184 // Allow for closing '\0'
185 utf8Len += 1;
186 str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
187 if (str8 == nullptr) {
188 return nullptr;
189 }
190
191 StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
192 return str8;
193 }
194
String16ToString8(const u16string & str16,string & str8)195 bool String16ToString8(const u16string& str16, string& str8)
196 {
197 size_t str16Len = str16.length();
198 if (str16Len < 1) {
199 return false;
200 }
201
202 char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
203 if (str8Temp == nullptr) {
204 UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!");
205 return false;
206 }
207
208 str8 = str8Temp;
209 free(str8Temp);
210 str8Temp = nullptr;
211 return true;
212 }
213
214 /**
215 * return 1-4 by first byte
216 * 1111xxxx : 4
217 * 1110xxxx : 3
218 * 110xxxxx : 2
219 * 10xxxxxx : 1
220 * 0xxxxxxx : 1
221 */
Utf8CodePointLen(uint8_t ch)222 static inline size_t Utf8CodePointLen(uint8_t ch)
223 {
224 return ((0xe5000000 >> ((ch >> UTF8_LEN_MASK) & 0x1e)) & UTF8_LEN_MASK) + 1;
225 }
226
Utf8ShiftAndMask(uint32_t * codePoint,const uint8_t byte)227 static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
228 {
229 *codePoint <<= UTF8_SHIFT_WIDTH;
230 *codePoint |= 0x3F & byte;
231 }
232
Utf8ToUtf32CodePoint(const char * src,size_t length)233 uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
234 {
235 uint32_t unicode = 0;
236
237 switch (length) {
238 case UTF8_LENGTH_1:
239 return src[UTF8_FIRST_BYTE_INDEX];
240 case UTF8_LENGTH_2:
241 unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x1f;
242 Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
243 return unicode;
244 case UTF8_LENGTH_3:
245 unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x0f;
246 Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
247 Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
248 return unicode;
249 case UTF8_LENGTH_4:
250 unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x07;
251 Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
252 Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
253 Utf8ShiftAndMask(&unicode, src[UTF8_FORTH_BYTE_INDEX]);
254 return unicode;
255 default:
256 return 0xffff;
257 }
258 }
259
Utf8ToUtf16Length(const char * str8,size_t str8Len)260 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
261 {
262 const char* const str8end = str8 + str8Len;
263 int utf16len = 0;
264 while (str8 < str8end) {
265 utf16len++;
266 size_t u8charlen = Utf8CodePointLen(*str8);
267 if (str8 + u8charlen - 1 >= str8end) {
268 UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
269 return -1;
270 }
271 uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
272 if (codepoint > 0xFFFF) {
273 utf16len++; // this will be a surrogate pair in utf16
274 }
275 str8 += u8charlen;
276 }
277 if (str8 != str8end) {
278 UTILS_LOGE("Get str16 length failed because str8length is illegal!");
279 return -1;
280 }
281 return utf16len;
282 }
283
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)284 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
285 {
286 if (u16len == 0) {
287 return u16str;
288 }
289 const char* const u8end = utf8Str + u8len;
290 const char* u8cur = utf8Str;
291 const char16_t* const u16end = u16str + u16len;
292 char16_t* u16cur = u16str;
293
294 while ((u8cur < u8end) && (u16cur < u16end)) {
295 size_t len = Utf8CodePointLen(*u8cur);
296 uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
297 // Convert the UTF32 codepoint to one or more UTF16 codepoints
298 if (codepoint <= 0xFFFF) {
299 // Single UTF16 character
300 *u16cur++ = static_cast<char16_t>(codepoint);
301 } else {
302 // Multiple UTF16 characters with surrogates
303 codepoint = codepoint - 0x10000;
304 *u16cur++ = static_cast<char16_t>((codepoint >> UTF16_SHIFT_WIDTH) + 0xD800);
305 if (u16cur >= u16end) {
306 // Ooops... not enough room for this surrogate pair.
307 return u16cur - 1;
308 }
309 *u16cur++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
310 }
311
312 u8cur += len;
313 }
314 return u16cur;
315 }
316
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)317 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
318 {
319 char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
320 *result = 0;
321 return;
322 }
323
324 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)325 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
326 {
327 char16_t* str16 = nullptr;
328 int utf16Len = Utf8ToUtf16Length(str8, str8Len);
329 if (utf16Len < 0 || utf16Len >= INT_MAX) {
330 UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
331 return nullptr;
332 }
333
334 // Allow for closing 0
335 utf16Len = utf16Len + 1;
336 str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
337 if (str16 == nullptr) {
338 UTILS_LOGE("Str16 malloc memory failed!");
339 return nullptr;
340 }
341
342 StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
343 return str16;
344 }
345
String8ToString16(const string & str8,u16string & str16)346 bool String8ToString16(const string& str8, u16string& str16)
347 {
348 size_t str8len = str8.length();
349 if (str8len < 1) {
350 return false;
351 }
352
353 char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
354 if (str16Temp == nullptr) {
355 UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!");
356 return false;
357 }
358
359 str16 = str16Temp;
360 free(str16Temp);
361 str16Temp = nullptr;
362 return true;
363 }
364 } // namespace OHOS
365