• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "unicode_ex.h"
17 
18 #include "utils_log.h"
19 using namespace std;
20 /***************************************UTF8 and UTF16 unicode**********************************************
21 UTF8
22 Unicode                                 utf8
23 U + 0000~U + 007F                    0???????
24 U + 0080~U + 07FF                    110????? 10??????
25 U + 0800~U + FFFF                    1110???? 10?????? 10??????
26 U + 10000~U + 10FFFF                 11110??? 10?????? 10?????? 10??????
27 
28 UTF16
29 Unicode                           utf16 code
30 
U + 0000~U + FFFF                   stored in 2 bytes, same value as the Unicode code point
U + 10000~U + 10FFFF                stored in 4 bytes (surrogate pair) encoding the code point minus 0x10000
33 **************************************UTF8 and UTF16 unicode**********************************************/
34 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges that Utf32CodePointUtf8Length()
// maps to one, two and three UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive surrogate range; code points inside it are rejected as invalid UTF-32.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;

// Largest code point accepted by Utf32CodePointUtf8Length().
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;

// Number of bits shifted out of the code point after each continuation byte is written.
constexpr unsigned int UTF8_OFFSET = 6;

// (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK sets bit 7, clears bit 6 and keeps the
// low six bits, i.e. it produces a continuation byte of the form 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;

// Marker OR-ed into the first byte of a sequence, indexed by the sequence length (0..4).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0 };
} // namespace
52 
// Number of UTF-8 bytes needed for one code point; INVALID marks a code point
// that cannot be encoded.
#define UTF8_LENGTH_INVALID 0
#define UTF8_LENGTH_1 1
#define UTF8_LENGTH_2 2
#define UTF8_LENGTH_3 3
#define UTF8_LENGTH_4 4

// Encoded sizes that Utf32CodePointToUtf8() compares its byte count against.
#define UTF32_BYTE_SIZE_1 1
#define UTF32_BYTE_SIZE_2 2
#define UTF32_BYTE_SIZE_3 3
#define UTF32_BYTE_SIZE_4 4

// Used by Utf8CodePointLen() both as a shift count and as a two-bit mask.
#define UTF8_LEN_MASK 3

// Payload bits carried by each UTF-8 continuation byte.
#define UTF8_SHIFT_WIDTH 6

// Payload bits carried by each half of a UTF-16 surrogate pair.
#define STR16_TO_STR8_SHIFT_WIDTH 10
#define UTF16_SHIFT_WIDTH 10

// Byte positions inside one encoded UTF-8 sequence.
#define UTF8_FIRST_BYTE_INDEX 0
#define UTF8_SECOND_BYTE_INDEX 1
#define UTF8_THIRD_BYTE_INDEX 2
#define UTF8_FORTH_BYTE_INDEX 3
70 
71 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)72 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
73 {
74     dstP += bytes;
75     if (bytes >= UTF32_BYTE_SIZE_4) {
76         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
77         srcChar >>= UTF8_OFFSET;
78     }
79 
80     if (bytes >= UTF32_BYTE_SIZE_3) {
81         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
82         srcChar >>= UTF8_OFFSET;
83     }
84 
85     if (bytes >= UTF32_BYTE_SIZE_2) {
86         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
87         srcChar >>= UTF8_OFFSET;
88     }
89 
90     if (bytes >= UTF32_BYTE_SIZE_1) {
91         *--dstP = static_cast<uint8_t>(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
92     }
93 }
94 
Utf32CodePointUtf8Length(char32_t srcChar)95 size_t Utf32CodePointUtf8Length(char32_t srcChar)
96 {
97     if (srcChar < ONE_BYTE_UTF8) {
98         return UTF8_LENGTH_1;
99     } else if (srcChar < TWO_BYTES_UTF8) {
100         return UTF8_LENGTH_2;
101     } else if (srcChar < THREE_BYTES_UTF8) {
102         if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
103             return UTF8_LENGTH_3;
104         } else {
105             // Surrogates are invalid UTF-32 characters.
106             return UTF8_LENGTH_INVALID;
107         }
108     } else if (srcChar <= UNICODE_MAX_NUM) {
109         // Max code point for Unicode is 0x0010FFFF.
110         return UTF8_LENGTH_4;
111     } else {
112         // Invalid UTF-32 character.
113         return UTF8_LENGTH_INVALID;
114     }
115 }
116 
117 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)118 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
119 {
120     if (str16 == nullptr || str16Len == 0) {
121         return -1;
122     }
123 
124     const char16_t* const str16End = str16 + str16Len;
125     int utf8Len = 0;
126     while (str16 < str16End) {
127         int charLen = 0;
128         if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
129             && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
130             // surrogate pairs are always 4 bytes.
131             charLen = 4;
132             // str16 advance 2 bytes
133             str16 += 2;
134         } else {
135             charLen = Utf32CodePointUtf8Length(static_cast<char32_t>(*str16++));
136         }
137 
138         if (utf8Len > (INT_MAX - charLen)) {
139             return -1;
140         }
141         utf8Len += charLen;
142     }
143     return utf8Len;
144 }
145 
146 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)147 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
148 {
149     const char16_t* curUtf16 = utf16Str;
150     const char16_t* const endUtf16 = utf16Str + str16Len;
151     char* cur = utf8Str;
152     while (curUtf16 < endUtf16) {
153         char32_t utf32;
154         // surrogate pairs
155         if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
156             && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
157             utf32 = (*curUtf16++ - 0xD800) << STR16_TO_STR8_SHIFT_WIDTH;
158             utf32 |= *curUtf16++ - 0xDC00;
159             utf32 += 0x10000;
160         } else {
161             utf32 = static_cast<char32_t>(*curUtf16++);
162         }
163         const size_t len = Utf32CodePointUtf8Length(utf32);
164         if (str8Len <= len) {
165             break;
166         }
167 
168         Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
169         cur += len;
170         str8Len -= len;
171     }
172     *cur = '\0';
173 }
174 
175 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)176 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
177 {
178     char* str8 = nullptr;
179     int utf8Len = Utf16ToUtf8Length(str16, str16Len);
180     if (utf8Len < 0 || utf8Len >= INT_MAX) {
181         return nullptr;
182     }
183 
184     // Allow for closing '\0'
185     utf8Len += 1;
186     str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
187     if (str8 == nullptr) {
188         return nullptr;
189     }
190 
191     StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
192     return str8;
193 }
194 
String16ToString8(const u16string & str16,string & str8)195 bool String16ToString8(const u16string& str16, string& str8)
196 {
197     size_t str16Len = str16.length();
198     if (str16Len < 1) {
199         return false;
200     }
201 
202     char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
203     if (str8Temp == nullptr) {
204         UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!");
205         return false;
206     }
207 
208     str8 = str8Temp;
209     free(str8Temp);
210     str8Temp = nullptr;
211     return true;
212 }
213 
214 /**
215 * return 1-4 by first byte
216 * 1111xxxx : 4
217 * 1110xxxx : 3
218 * 110xxxxx : 2
219 * 10xxxxxx : 1
220 * 0xxxxxxx : 1
221 */
static inline size_t Utf8CodePointLen(uint8_t ch)
{
    // Only the top four bits of the first byte decide the sequence length.
    const unsigned int leadNibble = ch >> 4;
    if (leadNibble == 0xF) {
        return 4; // 1111xxxx
    }
    if (leadNibble == 0xE) {
        return 3; // 1110xxxx
    }
    if (leadNibble >= 0xC) {
        return 2; // 110xxxxx
    }
    return 1; // 0xxxxxxx and 10xxxxxx
}
226 
Utf8ShiftAndMask(uint32_t * codePoint,const uint8_t byte)227 static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
228 {
229     *codePoint <<= UTF8_SHIFT_WIDTH;
230     *codePoint |= 0x3F & byte;
231 }
232 
Utf8ToUtf32CodePoint(const char * src,size_t length)233 uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
234 {
235     uint32_t unicode = 0;
236 
237     switch (length) {
238         case UTF8_LENGTH_1:
239             return src[UTF8_FIRST_BYTE_INDEX];
240         case UTF8_LENGTH_2:
241             unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x1f;
242             Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
243             return unicode;
244         case UTF8_LENGTH_3:
245             unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x0f;
246             Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
247             Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
248             return unicode;
249         case UTF8_LENGTH_4:
250             unicode = src[UTF8_FIRST_BYTE_INDEX] & 0x07;
251             Utf8ShiftAndMask(&unicode, src[UTF8_SECOND_BYTE_INDEX]);
252             Utf8ShiftAndMask(&unicode, src[UTF8_THIRD_BYTE_INDEX]);
253             Utf8ShiftAndMask(&unicode, src[UTF8_FORTH_BYTE_INDEX]);
254             return unicode;
255         default:
256             return 0xffff;
257     }
258 }
259 
Utf8ToUtf16Length(const char * str8,size_t str8Len)260 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
261 {
262     const char* const str8end = str8 + str8Len;
263     int utf16len = 0;
264     while (str8 < str8end) {
265         utf16len++;
266         size_t u8charlen = Utf8CodePointLen(*str8);
267         if (str8 + u8charlen - 1 >= str8end) {
268             UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
269             return -1;
270         }
271         uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
272         if (codepoint > 0xFFFF) {
273             utf16len++; // this will be a surrogate pair in utf16
274         }
275         str8 += u8charlen;
276     }
277     if (str8 != str8end) {
278         UTILS_LOGE("Get str16 length failed because str8length is illegal!");
279         return -1;
280     }
281     return utf16len;
282 }
283 
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)284 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
285 {
286     if (u16len == 0) {
287         return u16str;
288     }
289     const char* const u8end = utf8Str + u8len;
290     const char* u8cur = utf8Str;
291     const char16_t* const u16end = u16str + u16len;
292     char16_t* u16cur = u16str;
293 
294     while ((u8cur < u8end) && (u16cur < u16end)) {
295         size_t len = Utf8CodePointLen(*u8cur);
296         uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
297         // Convert the UTF32 codepoint to one or more UTF16 codepoints
298         if (codepoint <= 0xFFFF) {
299             // Single UTF16 character
300             *u16cur++ = static_cast<char16_t>(codepoint);
301         } else {
302             // Multiple UTF16 characters with surrogates
303             codepoint = codepoint - 0x10000;
304             *u16cur++ = static_cast<char16_t>((codepoint >> UTF16_SHIFT_WIDTH) + 0xD800);
305             if (u16cur >= u16end) {
306                 // Ooops...  not enough room for this surrogate pair.
307                 return u16cur - 1;
308             }
309             *u16cur++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
310         }
311 
312         u8cur += len;
313     }
314     return u16cur;
315 }
316 
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)317 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
318 {
319     char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
320     *result = 0;
321     return;
322 }
323 
324 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)325 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
326 {
327     char16_t* str16 = nullptr;
328     int utf16Len = Utf8ToUtf16Length(str8, str8Len);
329     if (utf16Len < 0 || utf16Len >= INT_MAX) {
330         UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
331         return nullptr;
332     }
333 
334     // Allow for closing 0
335     utf16Len = utf16Len + 1;
336     str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
337     if (str16 == nullptr) {
338         UTILS_LOGE("Str16 malloc memory failed!");
339         return nullptr;
340     }
341 
342     StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
343     return str16;
344 }
345 
String8ToString16(const string & str8,u16string & str16)346 bool String8ToString16(const string& str8, u16string& str16)
347 {
348     size_t str8len = str8.length();
349     if (str8len < 1) {
350         return false;
351     }
352 
353     char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
354     if (str16Temp == nullptr) {
355         UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!");
356         return false;
357     }
358 
359     str16 = str16Temp;
360     free(str16Temp);
361     str16Temp = nullptr;
362     return true;
363 }
364 } // namespace OHOS
365