• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "unicode_ex.h"
17 #include "utils_log.h"
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21 using namespace std;
/***************************************UTF8 and UTF16 unicode**********************************************
UTF8
Unicode                                 utf8
U+0000~U+007F                        0???????
U+0080~U+07FF                        110????? 10??????
U+0800~U+FFFF                        1110???? 10?????? 10??????
U+10000~U+10FFFF                     11110??? 10?????? 10?????? 10??????

UTF16
Unicode                                 utf16
U+0000~U+FFFF                        one 2-byte code unit, equal to the code point
U+10000~U+10FFFF                     two 2-byte code units (a surrogate pair) encoding (code point - 0x10000)
**************************************UTF8 and UTF16 unicode**********************************************/
36 namespace OHOS {
namespace {
// Exclusive upper bounds of the code-point ranges that encode to 1, 2 and 3 UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Surrogate range (inclusive); these values are rejected as UTF-32 code points.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;
// Largest valid Unicode code point.
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;
// Number of payload bits consumed per trailing UTF-8 byte.
constexpr unsigned int UTF8_OFFSET = 6;

// (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK yields a trailing byte of the form 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;
// Prefix bits of the first byte, indexed by the sequence length in bytes (0-4).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
} // namespace
54 
55 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)56 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
57 {
58     dstP += bytes;
59     if (bytes >= 4) {
60         *--dstP = (uint8_t)((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
61         srcChar >>= UTF8_OFFSET;
62     }
63 
64     if (bytes >= 3) {
65         *--dstP = (uint8_t)((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
66         srcChar >>= UTF8_OFFSET;
67     }
68 
69     if (bytes >= 2) {
70         *--dstP = (uint8_t)((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
71         srcChar >>= UTF8_OFFSET;
72     }
73 
74     if (bytes >= 1) {
75         *--dstP = (uint8_t)(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
76     }
77 }
78 
Utf32CodePointUtf8Length(char32_t srcChar)79 size_t Utf32CodePointUtf8Length(char32_t srcChar)
80 {
81     if (srcChar < ONE_BYTE_UTF8) {
82         return 1;
83     } else if (srcChar < TWO_BYTES_UTF8) {
84         return 2;
85     } else if (srcChar < THREE_BYTES_UTF8) {
86         if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
87             return 3;
88         } else {
89             // Surrogates are invalid UTF-32 characters.
90             return 0;
91         }
92     } else if (srcChar <= UNICODE_MAX_NUM) {
93         // Max code point for Unicode is 0x0010FFFF.
94         return 4;
95     } else {
96         // Invalid UTF-32 character.
97         return 0;
98     }
99 }
100 
101 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)102 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
103 {
104     if (str16 == nullptr || str16Len == 0) {
105         return -1;
106     }
107 
108     const char16_t* const str16End = str16 + str16Len;
109     int utf8Len = 0;
110     while (str16 < str16End) {
111         int charLen = 0;
112         if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
113             && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
114             // surrogate pairs are always 4 bytes.
115             charLen = 4;
116             str16 += 2;
117         } else {
118             charLen = Utf32CodePointUtf8Length((char32_t)* str16++);
119         }
120 
121         if (utf8Len > (INT_MAX - charLen)) {
122             return -1;
123         }
124         utf8Len += charLen;
125     }
126     return utf8Len;
127 }
128 
129 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)130 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
131 {
132     const char16_t* curUtf16 = utf16Str;
133     const char16_t* const endUtf16 = utf16Str + str16Len;
134     char* cur = utf8Str;
135     while (curUtf16 < endUtf16) {
136         char32_t utf32;
137         // surrogate pairs
138         if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
139             && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
140             utf32 = (*curUtf16++ - 0xD800) << 10;
141             utf32 |= *curUtf16++ - 0xDC00;
142             utf32 += 0x10000;
143         } else {
144             utf32 = (char32_t)* curUtf16++;
145         }
146         const size_t len = Utf32CodePointUtf8Length(utf32);
147         if (str8Len < len) {
148             break;
149         }
150 
151         Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
152         cur += len;
153         str8Len -= len;
154     }
155     *cur = '\0';
156 }
157 
158 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)159 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
160 {
161     char* str8 = nullptr;
162     int utf8Len = Utf16ToUtf8Length(str16, str16Len);
163     if (utf8Len < 0) {
164         return nullptr;
165     }
166 
167     // Allow for closing '\0'
168     utf8Len += 1;
169     str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
170     if (str8 == nullptr) {
171         return nullptr;
172     }
173 
174     StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
175     return str8;
176 }
177 
String16ToString8(const u16string & str16,string & str8)178 bool String16ToString8(const u16string& str16, string& str8)
179 {
180     size_t str16Len = str16.length();
181     if (str16Len < 1) {
182         return false;
183     }
184 
185     char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
186     if (str8Temp == nullptr) {
187         UTILS_LOGE("Str16 to str8 failed, because str8Temp is nullptr!");
188         return false;
189     }
190 
191     str8 = str8Temp;
192     free(str8Temp);
193     str8Temp = nullptr;
194     return true;
195 }
196 
/**
* Returns the sequence length (1-4) announced by the first byte of a UTF-8 sequence:
* 1111xxxx : 4
* 1110xxxx : 3
* 110xxxxx : 2
* 10xxxxxx : 1 (a stray trailing byte is treated as a single byte)
* 0xxxxxxx : 1
*/
static inline size_t Utf8CodePointLen(uint8_t ch)
{
    if (ch >= 0xF0) {
        return 4;
    }
    if (ch >= 0xE0) {
        return 3;
    }
    if (ch >= 0xC0) {
        return 2;
    }
    return 1;
}
209 
// Shifts *codePoint left by six bits and ORs in the low six bits of `byte`
// (the payload of a UTF-8 trailing byte).
static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
{
    *codePoint <<= 6;
    *codePoint |= 0x3F & byte;
}

// Decodes the UTF-8 sequence of `length` bytes starting at src into a code point.
// `length` is expected to be the value Utf8CodePointLen returned for src[0]; any length outside
// 1-4 yields 0xffff. No validation of the trailing bytes is performed.
// All bytes are read as unsigned values, so a single byte >= 0x80 decodes to 0x80-0xFF regardless
// of whether plain char is signed on the target platform.
uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
{
    // Payload bits carried by the first byte, indexed by sequence length
    // (a 1-byte sequence is returned whole).
    static constexpr uint8_t LEAD_PAYLOAD_MASK[] = { 0x00, 0xFF, 0x1F, 0x0F, 0x07 };

    if ((length < 1) || (length > 4)) {
        return 0xffff;
    }

    uint32_t unicode = static_cast<uint8_t>(src[0]) & LEAD_PAYLOAD_MASK[length];
    for (size_t i = 1; i < length; ++i) {
        Utf8ShiftAndMask(&unicode, static_cast<uint8_t>(src[i]));
    }
    return unicode;
}
242 
Utf8ToUtf16Length(const char * str8,size_t str8Len)243 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
244 {
245     const char* const str8end = str8 + str8Len;
246     int utf16len = 0;
247     while (str8 < str8end) {
248         utf16len++;
249         int u8charlen = Utf8CodePointLen(*str8);
250         if (str8 + u8charlen - 1 >= str8end) {
251             UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
252             return -1;
253         }
254         uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
255         if (codepoint > 0xFFFF) {
256             utf16len++; // this will be a surrogate pair in utf16
257         }
258         str8 += u8charlen;
259     }
260     if (str8 != str8end) {
261         UTILS_LOGE("Get str16 length failed because str8length is illegal!");
262         return -1;
263     }
264     return utf16len;
265 }
266 
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)267 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
268 {
269     if (u16len == 0) {
270         return u16str;
271     }
272     const char* const u8end = utf8Str + u8len;
273     const char* u8cur = utf8Str;
274     const char16_t* const u16end = u16str + u16len;
275     char16_t* u16cur = u16str;
276 
277     while ((u8cur < u8end) && (u16cur < u16end)) {
278         size_t len = Utf8CodePointLen(*u8cur);
279         uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
280         // Convert the UTF32 codepoint to one or more UTF16 codepoints
281         if (codepoint <= 0xFFFF) {
282             // Single UTF16 character
283             *u16cur++ = (char16_t)codepoint;
284         } else {
285             // Multiple UTF16 characters with surrogates
286             codepoint = codepoint - 0x10000;
287             *u16cur++ = (char16_t)((codepoint >> 10) + 0xD800);
288             if (u16cur >= u16end) {
289                 // Ooops...  not enough room for this surrogate pair.
290                 return u16cur - 1;
291             }
292             *u16cur++ = (char16_t)((codepoint & 0x3FF) + 0xDC00);
293         }
294 
295         u8cur += len;
296     }
297     return u16cur;
298 }
299 
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)300 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
301 {
302     char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
303     *result = 0;
304     return;
305 }
306 
307 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)308 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
309 {
310     char16_t* str16 = nullptr;
311     int utf16Len = Utf8ToUtf16Length(str8, str8Len);
312     if (utf16Len < 0) {
313         UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
314         return nullptr;
315     }
316 
317     // Allow for closing 0
318     utf16Len = utf16Len + 1;
319     str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
320     if (str16 == nullptr) {
321         UTILS_LOGE("Str16 malloc memory failed!");
322         return nullptr;
323     }
324 
325     StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
326     return str16;
327 }
328 
String8ToString16(const string & str8,u16string & str16)329 bool String8ToString16(const string& str8, u16string& str16)
330 {
331     size_t str8len = str8.length();
332     if (str8len < 1) {
333         UTILS_LOGE("str8 to str16 failed, str8 is: %{public}s, size is: %{public}zu",
334             str8.c_str(), str8len);
335         return false;
336     }
337 
338     char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
339     if (str16Temp == nullptr) {
340         UTILS_LOGE("str8 to str16 failed, str16Temp is nullptr!");
341         return false;
342     }
343 
344     str16 = str16Temp;
345     free(str16Temp);
346     str16Temp = nullptr;
347     return true;
348 }
349 } // namespace OHOS