1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "unicode_ex.h"
17 #include "utils_log.h"
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21 using namespace std;
/***************************************UTF8 and UTF16 unicode**********************************************
UTF8
Unicode                     utf8
U + 0000~U + 007F           0???????
U + 0080~U + 07FF           110????? 10??????
U + 0800~U + FFFF           1110???? 10?????? 10??????
U + 10000~U + 10FFFF        11110??? 10?????? 10?????? 10??????

UTF16
Unicode                     utf16 code

U + 0000~U + FFFF           stored in 2 bytes, same value as the Unicode code point
U + 10000~U + 10FFFF        stored in 4 bytes (a surrogate pair) that encode the code point minus 0x10000
**************************************UTF8 and UTF16 unicode**********************************************/
36 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges encoded with 1, 2 and 3 UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive surrogate range; code points inside it are reported as invalid (length 0).
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;
// Largest code point accepted by the UTF-8 length computation.
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;

// Number of payload bits consumed by each continuation byte.
constexpr unsigned int UTF8_OFFSET = 6;

// (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK yields a continuation byte of the form 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;

// Prefix bits of the leading byte, indexed by the total number of bytes in the sequence (0..4).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
}
54
55 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)56 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
57 {
58 dstP += bytes;
59 if (bytes >= 4) {
60 *--dstP = (uint8_t)((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
61 srcChar >>= UTF8_OFFSET;
62 }
63
64 if (bytes >= 3) {
65 *--dstP = (uint8_t)((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
66 srcChar >>= UTF8_OFFSET;
67 }
68
69 if (bytes >= 2) {
70 *--dstP = (uint8_t)((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
71 srcChar >>= UTF8_OFFSET;
72 }
73
74 if (bytes >= 1) {
75 *--dstP = (uint8_t)(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
76 }
77 }
78
Utf32CodePointUtf8Length(char32_t srcChar)79 size_t Utf32CodePointUtf8Length(char32_t srcChar)
80 {
81 if (srcChar < ONE_BYTE_UTF8) {
82 return 1;
83 } else if (srcChar < TWO_BYTES_UTF8) {
84 return 2;
85 } else if (srcChar < THREE_BYTES_UTF8) {
86 if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
87 return 3;
88 } else {
89 // Surrogates are invalid UTF-32 characters.
90 return 0;
91 }
92 } else if (srcChar <= UNICODE_MAX_NUM) {
93 // Max code point for Unicode is 0x0010FFFF.
94 return 4;
95 } else {
96 // Invalid UTF-32 character.
97 return 0;
98 }
99 }
100
101 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)102 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
103 {
104 if (str16 == nullptr || str16Len == 0) {
105 return -1;
106 }
107
108 const char16_t* const str16End = str16 + str16Len;
109 int utf8Len = 0;
110 while (str16 < str16End) {
111 int charLen = 0;
112 if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
113 && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
114 // surrogate pairs are always 4 bytes.
115 charLen = 4;
116 str16 += 2;
117 } else {
118 charLen = Utf32CodePointUtf8Length((char32_t)* str16++);
119 }
120
121 if (utf8Len > (INT_MAX - charLen)) {
122 return -1;
123 }
124 utf8Len += charLen;
125 }
126 return utf8Len;
127 }
128
129 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)130 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
131 {
132 const char16_t* curUtf16 = utf16Str;
133 const char16_t* const endUtf16 = utf16Str + str16Len;
134 char* cur = utf8Str;
135 while (curUtf16 < endUtf16) {
136 char32_t utf32;
137 // surrogate pairs
138 if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
139 && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
140 utf32 = (*curUtf16++ - 0xD800) << 10;
141 utf32 |= *curUtf16++ - 0xDC00;
142 utf32 += 0x10000;
143 } else {
144 utf32 = (char32_t)* curUtf16++;
145 }
146 const size_t len = Utf32CodePointUtf8Length(utf32);
147 if (str8Len < len) {
148 break;
149 }
150
151 Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
152 cur += len;
153 str8Len -= len;
154 }
155 *cur = '\0';
156 }
157
158 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)159 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
160 {
161 char* str8 = nullptr;
162 int utf8Len = Utf16ToUtf8Length(str16, str16Len);
163 if (utf8Len < 0) {
164 return nullptr;
165 }
166
167 // Allow for closing '\0'
168 utf8Len += 1;
169 str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
170 if (str8 == nullptr) {
171 return nullptr;
172 }
173
174 StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
175 return str8;
176 }
177
String16ToString8(const u16string & str16,string & str8)178 bool String16ToString8(const u16string& str16, string& str8)
179 {
180 size_t str16Len = str16.length();
181 if (str16Len < 1) {
182 return false;
183 }
184
185 char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
186 if (str8Temp == nullptr) {
187 UTILS_LOGE("Str16 to str8 failed, because str8Temp is nullptr!");
188 return false;
189 }
190
191 str8 = str8Temp;
192 free(str8Temp);
193 str8Temp = nullptr;
194 return true;
195 }
196
/**
 * Returns the length (1-4) of a UTF-8 sequence, derived from the high nibble of its first byte:
 * 1111xxxx : 4
 * 1110xxxx : 3
 * 110xxxxx : 2
 * 10xxxxxx : 1
 * 0xxxxxxx : 1
 */
static inline size_t Utf8CodePointLen(uint8_t ch)
{
    // Lookup table packed into 32 bits: two bits per high nibble, holding (length - 1).
    constexpr uint32_t packedLengths = 0xe5000000;
    // (ch >> 3) & 0x1e equals twice the high nibble, i.e. the bit offset of its table entry.
    const uint32_t bitOffset = (ch >> 3) & 0x1e;
    return ((packedLengths >> bitOffset) & 3) + 1;
}
209
// Appends the low six bits of a UTF-8 continuation byte to the code point being
// assembled in *codePoint: the current value is shifted left by 6 and the payload
// bits of `byte` are ORed in.
static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
{
    const uint32_t payload = byte & 0x3F;
    *codePoint = (*codePoint << 6) | payload;
}
215
Utf8ToUtf32CodePoint(const char * src,size_t length)216 uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
217 {
218 uint32_t unicode = 0;
219
220 switch (length) {
221 case 1:
222 return src[0];
223 case 2:
224 unicode = src[0] & 0x1f;
225 Utf8ShiftAndMask(&unicode, src[1]);
226 return unicode;
227 case 3:
228 unicode = src[0] & 0x0f;
229 Utf8ShiftAndMask(&unicode, src[1]);
230 Utf8ShiftAndMask(&unicode, src[2]);
231 return unicode;
232 case 4:
233 unicode = src[0] & 0x07;
234 Utf8ShiftAndMask(&unicode, src[1]);
235 Utf8ShiftAndMask(&unicode, src[2]);
236 Utf8ShiftAndMask(&unicode, src[3]);
237 return unicode;
238 default:
239 return 0xffff;
240 }
241 }
242
Utf8ToUtf16Length(const char * str8,size_t str8Len)243 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
244 {
245 const char* const str8end = str8 + str8Len;
246 int utf16len = 0;
247 while (str8 < str8end) {
248 utf16len++;
249 int u8charlen = Utf8CodePointLen(*str8);
250 if (str8 + u8charlen - 1 >= str8end) {
251 UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
252 return -1;
253 }
254 uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
255 if (codepoint > 0xFFFF) {
256 utf16len++; // this will be a surrogate pair in utf16
257 }
258 str8 += u8charlen;
259 }
260 if (str8 != str8end) {
261 UTILS_LOGE("Get str16 length failed because str8length is illegal!");
262 return -1;
263 }
264 return utf16len;
265 }
266
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)267 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
268 {
269 if (u16len == 0) {
270 return u16str;
271 }
272 const char* const u8end = utf8Str + u8len;
273 const char* u8cur = utf8Str;
274 const char16_t* const u16end = u16str + u16len;
275 char16_t* u16cur = u16str;
276
277 while ((u8cur < u8end) && (u16cur < u16end)) {
278 size_t len = Utf8CodePointLen(*u8cur);
279 uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
280 // Convert the UTF32 codepoint to one or more UTF16 codepoints
281 if (codepoint <= 0xFFFF) {
282 // Single UTF16 character
283 *u16cur++ = (char16_t)codepoint;
284 } else {
285 // Multiple UTF16 characters with surrogates
286 codepoint = codepoint - 0x10000;
287 *u16cur++ = (char16_t)((codepoint >> 10) + 0xD800);
288 if (u16cur >= u16end) {
289 // Ooops... not enough room for this surrogate pair.
290 return u16cur - 1;
291 }
292 *u16cur++ = (char16_t)((codepoint & 0x3FF) + 0xDC00);
293 }
294
295 u8cur += len;
296 }
297 return u16cur;
298 }
299
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)300 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
301 {
302 char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
303 *result = 0;
304 return;
305 }
306
307 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)308 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
309 {
310 char16_t* str16 = nullptr;
311 int utf16Len = Utf8ToUtf16Length(str8, str8Len);
312 if (utf16Len < 0) {
313 UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
314 return nullptr;
315 }
316
317 // Allow for closing 0
318 utf16Len = utf16Len + 1;
319 str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
320 if (str16 == nullptr) {
321 UTILS_LOGE("Str16 malloc memory failed!");
322 return nullptr;
323 }
324
325 StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
326 return str16;
327 }
328
String8ToString16(const string & str8,u16string & str16)329 bool String8ToString16(const string& str8, u16string& str16)
330 {
331 size_t str8len = str8.length();
332 if (str8len < 1) {
333 UTILS_LOGE("str8 to str16 failed, str8 is: %{public}s, size is: %{public}zu",
334 str8.c_str(), str8len);
335 return false;
336 }
337
338 char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
339 if (str16Temp == nullptr) {
340 UTILS_LOGE("str8 to str16 failed, str16Temp is nullptr!");
341 return false;
342 }
343
344 str16 = str16Temp;
345 free(str16Temp);
346 str16Temp = nullptr;
347 return true;
348 }
349 } // namespace OHOS