1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "unicode_ex.h"
17
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21
22 #include "utils_log.h"
23 using namespace std;
24 /***************************************UTF8 and UTF16 unicode**********************************************
25 UTF8
26 Unicode utf8
27 U + 0000~U + 007F 0???????
28 U + 0080~U + 07FF 110????? 10??????
29 U + 0800~U + FFFF 1110???? 10?????? 10??????
30 U + 10000~U + 10FFFF 11110??? 10?????? 10?????? 10??????
31
UTF16
Unicode                  utf16 code

U + 0000~U + FFFF        stored in 2 bytes, same value as the Unicode code point
U + 10000~U + 10FFFF     stored in 4 bytes (a surrogate pair) encoding the code point minus 0x10000
37 **************************************UTF8 and UTF16 unicode**********************************************/
38 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges encoded with 1, 2 and 3 UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive bounds of the surrogate range, and the largest Unicode code point.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;

// Number of bits the source character is shifted by after each continuation byte is emitted.
constexpr unsigned int UTF8_OFFSET = 6;

// A continuation byte is built as (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK, i.e. 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;

// Lead-byte prefix indexed by the total sequence length in bytes (indices 0 and 1 add no prefix).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
} // namespace
56
57 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)58 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
59 {
60 dstP += bytes;
61 if (bytes >= 4) {
62 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
63 srcChar >>= UTF8_OFFSET;
64 }
65
66 if (bytes >= 3) {
67 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
68 srcChar >>= UTF8_OFFSET;
69 }
70
71 if (bytes >= 2) {
72 *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
73 srcChar >>= UTF8_OFFSET;
74 }
75
76 if (bytes >= 1) {
77 *--dstP = static_cast<uint8_t>(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
78 }
79 }
80
Utf32CodePointUtf8Length(char32_t srcChar)81 size_t Utf32CodePointUtf8Length(char32_t srcChar)
82 {
83 if (srcChar < ONE_BYTE_UTF8) {
84 return 1;
85 } else if (srcChar < TWO_BYTES_UTF8) {
86 return 2;
87 } else if (srcChar < THREE_BYTES_UTF8) {
88 if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
89 return 3;
90 } else {
91 // Surrogates are invalid UTF-32 characters.
92 return 0;
93 }
94 } else if (srcChar <= UNICODE_MAX_NUM) {
95 // Max code point for Unicode is 0x0010FFFF.
96 return 4;
97 } else {
98 // Invalid UTF-32 character.
99 return 0;
100 }
101 }
102
103 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)104 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
105 {
106 if (str16 == nullptr || str16Len == 0) {
107 return -1;
108 }
109
110 const char16_t* const str16End = str16 + str16Len;
111 int utf8Len = 0;
112 while (str16 < str16End) {
113 int charLen = 0;
114 if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
115 && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
116 // surrogate pairs are always 4 bytes.
117 charLen = 4;
118 str16 += 2;
119 } else {
120 charLen = Utf32CodePointUtf8Length(static_cast<char32_t>(*str16++));
121 }
122
123 if (utf8Len > (INT_MAX - charLen)) {
124 return -1;
125 }
126 utf8Len += charLen;
127 }
128 return utf8Len;
129 }
130
131 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)132 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
133 {
134 const char16_t* curUtf16 = utf16Str;
135 const char16_t* const endUtf16 = utf16Str + str16Len;
136 char* cur = utf8Str;
137 while (curUtf16 < endUtf16) {
138 char32_t utf32;
139 // surrogate pairs
140 if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
141 && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
142 utf32 = (*curUtf16++ - 0xD800) << 10;
143 utf32 |= *curUtf16++ - 0xDC00;
144 utf32 += 0x10000;
145 } else {
146 utf32 = static_cast<char32_t>(*curUtf16++);
147 }
148 const size_t len = Utf32CodePointUtf8Length(utf32);
149 if (str8Len < len) {
150 break;
151 }
152
153 Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
154 cur += len;
155 str8Len -= len;
156 }
157 *cur = '\0';
158 }
159
160 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)161 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
162 {
163 char* str8 = nullptr;
164 int utf8Len = Utf16ToUtf8Length(str16, str16Len);
165 if (utf8Len < 0) {
166 return nullptr;
167 }
168
169 // Allow for closing '\0'
170 utf8Len += 1;
171 str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
172 if (str8 == nullptr) {
173 return nullptr;
174 }
175
176 StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
177 return str8;
178 }
179
String16ToString8(const u16string & str16,string & str8)180 bool String16ToString8(const u16string& str16, string& str8)
181 {
182 size_t str16Len = str16.length();
183 if (str16Len < 1) {
184 return false;
185 }
186
187 char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
188 if (str8Temp == nullptr) {
189 UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!");
190 return false;
191 }
192
193 str8 = str8Temp;
194 free(str8Temp);
195 str8Temp = nullptr;
196 return true;
197 }
198
/**
 * Returns the UTF-8 sequence length (1-4) announced by the first byte:
 * 1111xxxx : 4
 * 1110xxxx : 3
 * 110xxxxx : 2
 * 10xxxxxx : 1
 * 0xxxxxxx : 1
 *
 * The lengths are stored minus one as 2-bit entries in a 32-bit constant that is indexed
 * by the high nibble of the byte.
 */
static inline size_t Utf8CodePointLen(uint8_t ch)
{
    constexpr uint32_t PACKED_LENGTHS = 0xe5000000;
    const uint32_t bitOffset = static_cast<uint32_t>(ch >> 4) * 2;
    return ((PACKED_LENGTHS >> bitOffset) & 0x3) + 1;
}
211
// Appends the low 6 bits of a UTF-8 continuation byte to the code point being assembled:
// *codePoint is shifted left by 6 and the payload bits of `byte` are OR-ed in.
static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
{
    const uint32_t payload = byte & 0x3F;
    *codePoint = (*codePoint << 6) | payload;
}
217
Utf8ToUtf32CodePoint(const char * src,size_t length)218 uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
219 {
220 uint32_t unicode = 0;
221
222 switch (length) {
223 case 1:
224 return src[0];
225 case 2:
226 unicode = src[0] & 0x1f;
227 Utf8ShiftAndMask(&unicode, src[1]);
228 return unicode;
229 case 3:
230 unicode = src[0] & 0x0f;
231 Utf8ShiftAndMask(&unicode, src[1]);
232 Utf8ShiftAndMask(&unicode, src[2]);
233 return unicode;
234 case 4:
235 unicode = src[0] & 0x07;
236 Utf8ShiftAndMask(&unicode, src[1]);
237 Utf8ShiftAndMask(&unicode, src[2]);
238 Utf8ShiftAndMask(&unicode, src[3]);
239 return unicode;
240 default:
241 return 0xffff;
242 }
243 }
244
Utf8ToUtf16Length(const char * str8,size_t str8Len)245 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
246 {
247 const char* const str8end = str8 + str8Len;
248 int utf16len = 0;
249 while (str8 < str8end) {
250 utf16len++;
251 size_t u8charlen = Utf8CodePointLen(*str8);
252 if (str8 + u8charlen - 1 >= str8end) {
253 UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
254 return -1;
255 }
256 uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
257 if (codepoint > 0xFFFF) {
258 utf16len++; // this will be a surrogate pair in utf16
259 }
260 str8 += u8charlen;
261 }
262 if (str8 != str8end) {
263 UTILS_LOGE("Get str16 length failed because str8length is illegal!");
264 return -1;
265 }
266 return utf16len;
267 }
268
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)269 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
270 {
271 if (u16len == 0) {
272 return u16str;
273 }
274 const char* const u8end = utf8Str + u8len;
275 const char* u8cur = utf8Str;
276 const char16_t* const u16end = u16str + u16len;
277 char16_t* u16cur = u16str;
278
279 while ((u8cur < u8end) && (u16cur < u16end)) {
280 size_t len = Utf8CodePointLen(*u8cur);
281 uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
282 // Convert the UTF32 codepoint to one or more UTF16 codepoints
283 if (codepoint <= 0xFFFF) {
284 // Single UTF16 character
285 *u16cur++ = static_cast<char16_t>(codepoint);
286 } else {
287 // Multiple UTF16 characters with surrogates
288 codepoint = codepoint - 0x10000;
289 *u16cur++ = static_cast<char16_t>((codepoint >> 10) + 0xD800);
290 if (u16cur >= u16end) {
291 // Ooops... not enough room for this surrogate pair.
292 return u16cur - 1;
293 }
294 *u16cur++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
295 }
296
297 u8cur += len;
298 }
299 return u16cur;
300 }
301
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)302 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
303 {
304 char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
305 *result = 0;
306 return;
307 }
308
309 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)310 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
311 {
312 char16_t* str16 = nullptr;
313 int utf16Len = Utf8ToUtf16Length(str8, str8Len);
314 if (utf16Len < 0) {
315 UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
316 return nullptr;
317 }
318
319 // Allow for closing 0
320 utf16Len = utf16Len + 1;
321 str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
322 if (str16 == nullptr) {
323 UTILS_LOGE("Str16 malloc memory failed!");
324 return nullptr;
325 }
326
327 StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
328 return str16;
329 }
330
String8ToString16(const string & str8,u16string & str16)331 bool String8ToString16(const string& str8, u16string& str16)
332 {
333 size_t str8len = str8.length();
334 if (str8len < 1) {
335 return false;
336 }
337
338 char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
339 if (str16Temp == nullptr) {
340 UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!");
341 return false;
342 }
343
344 str16 = str16Temp;
345 free(str16Temp);
346 str16Temp = nullptr;
347 return true;
348 }
349 } // namespace OHOS
350