• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "unicode_ex.h"
17 
18 #include <climits>
19 #include <cstdio>
20 #include <cstdlib>
21 
22 #include "utils_log.h"
23 using namespace std;
24 /***************************************UTF8 and UTF16 unicode**********************************************
25 UTF8
26 Unicode                                 utf8
27 U + 0000~U + 007F                    0???????
28 U + 0080~U + 07FF                    110????? 10??????
29 U + 0800~U + FFFF                    1110???? 10?????? 10??????
30 U + 10000~U + 10FFFF                 11110??? 10?????? 10?????? 10??????
31 
32 UTF16
33 Unicode                           utf16 code
34 
35 U + 0000~U + FFFF                   stored in 2 bytes, same value as the Unicode code point
36 U + 10000~U + 10FFFF                stored in 4 bytes (surrogate pair), encoding (Unicode - 0x10000)
37 **************************************UTF8 and UTF16 unicode**********************************************/
38 namespace OHOS {
namespace {
// Exclusive upper bounds of the code point ranges that are encoded with 1, 2 and 3 UTF-8 bytes.
constexpr char32_t ONE_BYTE_UTF8 = 0x80;
constexpr char32_t TWO_BYTES_UTF8 = 0x800;
constexpr char32_t THREE_BYTES_UTF8 = 0x10000;

// Inclusive bounds of the surrogate range, which is rejected as a stand-alone code point.
constexpr char32_t UNICODE_RESERVED_START = 0xD800;
constexpr char32_t UNICODE_RESERVED_END = 0xDFFF;
// Largest code point accepted by the converters.
constexpr char32_t UNICODE_MAX_NUM = 0x10FFFF;
// Number of bits the code point is shifted by after each continuation byte is emitted.
constexpr unsigned int UTF8_OFFSET = 6;

// (value | UTF8_BYTE_MARK) & UTF8_BYTE_MASK produces a continuation byte of the form 10xxxxxx.
constexpr char32_t UTF8_BYTE_MASK = 0xBF;
constexpr char32_t UTF8_BYTE_MARK = 0x80;
// Prefix OR-ed into the first byte, indexed by the total byte count of the sequence (0..4).
constexpr char32_t UTF8_FIRST_BYTE_MARK[] = {
    0x00, 0x00, 0xC0, 0xE0, 0xF0
};
} // namespace
56 
57 // inner func and dstP is not nullptr
Utf32CodePointToUtf8(uint8_t * dstP,char32_t srcChar,size_t bytes)58 void Utf32CodePointToUtf8(uint8_t* dstP, char32_t srcChar, size_t bytes)
59 {
60     dstP += bytes;
61     if (bytes >= 4) {
62         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
63         srcChar >>= UTF8_OFFSET;
64     }
65 
66     if (bytes >= 3) {
67         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
68         srcChar >>= UTF8_OFFSET;
69     }
70 
71     if (bytes >= 2) {
72         *--dstP = static_cast<uint8_t>((srcChar | UTF8_BYTE_MARK) & UTF8_BYTE_MASK);
73         srcChar >>= UTF8_OFFSET;
74     }
75 
76     if (bytes >= 1) {
77         *--dstP = static_cast<uint8_t>(srcChar | UTF8_FIRST_BYTE_MARK[bytes]);
78     }
79 }
80 
Utf32CodePointUtf8Length(char32_t srcChar)81 size_t Utf32CodePointUtf8Length(char32_t srcChar)
82 {
83     if (srcChar < ONE_BYTE_UTF8) {
84         return 1;
85     } else if (srcChar < TWO_BYTES_UTF8) {
86         return 2;
87     } else if (srcChar < THREE_BYTES_UTF8) {
88         if ((srcChar < UNICODE_RESERVED_START) || (srcChar > UNICODE_RESERVED_END)) {
89             return 3;
90         } else {
91             // Surrogates are invalid UTF-32 characters.
92             return 0;
93         }
94     } else if (srcChar <= UNICODE_MAX_NUM) {
95         // Max code point for Unicode is 0x0010FFFF.
96         return 4;
97     } else {
98         // Invalid UTF-32 character.
99         return 0;
100     }
101 }
102 
103 // get the length of utf8 from utf16
Utf16ToUtf8Length(const char16_t * str16,size_t str16Len)104 int Utf16ToUtf8Length(const char16_t* str16, size_t str16Len)
105 {
106     if (str16 == nullptr || str16Len == 0) {
107         return -1;
108     }
109 
110     const char16_t* const str16End = str16 + str16Len;
111     int utf8Len = 0;
112     while (str16 < str16End) {
113         int charLen = 0;
114         if (((*str16 & 0xFC00) == 0xD800) && ((str16 + 1) < str16End)
115             && ((*(str16 + 1) & 0xFC00) == 0xDC00)) {
116             // surrogate pairs are always 4 bytes.
117             charLen = 4;
118             str16 += 2;
119         } else {
120             charLen = Utf32CodePointUtf8Length(static_cast<char32_t>(*str16++));
121         }
122 
123         if (utf8Len > (INT_MAX - charLen)) {
124             return -1;
125         }
126         utf8Len += charLen;
127     }
128     return utf8Len;
129 }
130 
131 // inner function, utf8Str and utf16Str is not nullptr
StrncpyStr16ToStr8(const char16_t * utf16Str,size_t str16Len,char * utf8Str,size_t str8Len)132 void StrncpyStr16ToStr8(const char16_t* utf16Str, size_t str16Len, char* utf8Str, size_t str8Len)
133 {
134     const char16_t* curUtf16 = utf16Str;
135     const char16_t* const endUtf16 = utf16Str + str16Len;
136     char* cur = utf8Str;
137     while (curUtf16 < endUtf16) {
138         char32_t utf32;
139         // surrogate pairs
140         if (((*curUtf16 & 0xFC00) == 0xD800) && ((curUtf16 + 1) < endUtf16)
141             && (((*(curUtf16 + 1) & 0xFC00)) == 0xDC00)) {
142             utf32 = (*curUtf16++ - 0xD800) << 10;
143             utf32 |= *curUtf16++ - 0xDC00;
144             utf32 += 0x10000;
145         } else {
146             utf32 = static_cast<char32_t>(*curUtf16++);
147         }
148         const size_t len = Utf32CodePointUtf8Length(utf32);
149         if (str8Len < len) {
150             break;
151         }
152 
153         Utf32CodePointToUtf8(reinterpret_cast<uint8_t*>(cur), utf32, len);
154         cur += len;
155         str8Len -= len;
156     }
157     *cur = '\0';
158 }
159 
160 // inner function and str16 is not null
Char16ToChar8(const char16_t * str16,size_t str16Len)161 char* Char16ToChar8(const char16_t* str16, size_t str16Len)
162 {
163     char* str8 = nullptr;
164     int utf8Len = Utf16ToUtf8Length(str16, str16Len);
165     if (utf8Len < 0) {
166         return nullptr;
167     }
168 
169     // Allow for closing '\0'
170     utf8Len += 1;
171     str8 = reinterpret_cast<char*>(calloc(utf8Len, sizeof(char)));
172     if (str8 == nullptr) {
173         return nullptr;
174     }
175 
176     StrncpyStr16ToStr8(str16, str16Len, str8, utf8Len);
177     return str8;
178 }
179 
String16ToString8(const u16string & str16,string & str8)180 bool String16ToString8(const u16string& str16, string& str8)
181 {
182     size_t str16Len = str16.length();
183     if (str16Len < 1) {
184         return false;
185     }
186 
187     char* str8Temp = Char16ToChar8(str16.c_str(), str16Len);
188     if (str8Temp == nullptr) {
189         UTILS_LOGD("Str16 to str8 failed, because str8Temp is nullptr!");
190         return false;
191     }
192 
193     str8 = str8Temp;
194     free(str8Temp);
195     str8Temp = nullptr;
196     return true;
197 }
198 
/**
* Returns the sequence length (1-4) announced by a UTF-8 byte, decided by its upper four bits:
* 1111xxxx : 4
* 1110xxxx : 3
* 110xxxxx : 2
* 10xxxxxx : 1
* 0xxxxxxx : 1
*/
static inline size_t Utf8CodePointLen(uint8_t ch)
{
    const unsigned int upperNibble = ch >> 4;
    if (upperNibble == 0xF) {
        return 4;
    }
    if (upperNibble == 0xE) {
        return 3;
    }
    if (upperNibble >= 0xC) {
        return 2;
    }
    // ASCII bytes and continuation bytes both count as a single byte.
    return 1;
}
211 
// Appends the low six bits of the continuation byte `byte` to *codePoint.
static inline void Utf8ShiftAndMask(uint32_t* codePoint, const uint8_t byte)
{
    *codePoint <<= 6;
    *codePoint |= 0x3F & byte;
}

/**
* Decodes one UTF-8 sequence into its code point value.
*
* src     first byte of the sequence; at least `length` bytes must be readable.
* length  number of bytes in the sequence (1-4), e.g. as reported by Utf8CodePointLen.
*
* Returns the decoded value, or 0xffff when length is not in 1..4.
* Continuation bytes are not validated; only their low six bits are used.
*/
uint32_t Utf8ToUtf32CodePoint(const char* src, size_t length)
{
    // Read through uint8_t: where plain char is signed, a byte >= 0x80 would otherwise be
    // sign-extended (0x80 -> 0xFFFFFF80) when widened to uint32_t.
    const uint8_t* bytes = reinterpret_cast<const uint8_t*>(src);
    uint32_t unicode = 0;

    switch (length) {
        case 1:
            return bytes[0];
        case 2:
            unicode = bytes[0] & 0x1f;
            Utf8ShiftAndMask(&unicode, bytes[1]);
            return unicode;
        case 3:
            unicode = bytes[0] & 0x0f;
            Utf8ShiftAndMask(&unicode, bytes[1]);
            Utf8ShiftAndMask(&unicode, bytes[2]);
            return unicode;
        case 4:
            unicode = bytes[0] & 0x07;
            Utf8ShiftAndMask(&unicode, bytes[1]);
            Utf8ShiftAndMask(&unicode, bytes[2]);
            Utf8ShiftAndMask(&unicode, bytes[3]);
            return unicode;
        default:
            return 0xffff;
    }
}
244 
Utf8ToUtf16Length(const char * str8,size_t str8Len)245 int Utf8ToUtf16Length(const char* str8, size_t str8Len)
246 {
247     const char* const str8end = str8 + str8Len;
248     int utf16len = 0;
249     while (str8 < str8end) {
250         utf16len++;
251         size_t u8charlen = Utf8CodePointLen(*str8);
252         if (str8 + u8charlen - 1 >= str8end) {
253             UTILS_LOGE("Get str16 length failed because str8 unicode is illegal!");
254             return -1;
255         }
256         uint32_t codepoint = Utf8ToUtf32CodePoint(str8, u8charlen);
257         if (codepoint > 0xFFFF) {
258             utf16len++; // this will be a surrogate pair in utf16
259         }
260         str8 += u8charlen;
261     }
262     if (str8 != str8end) {
263         UTILS_LOGE("Get str16 length failed because str8length is illegal!");
264         return -1;
265     }
266     return utf16len;
267 }
268 
Utf8ToUtf16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)269 char16_t* Utf8ToUtf16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
270 {
271     if (u16len == 0) {
272         return u16str;
273     }
274     const char* const u8end = utf8Str + u8len;
275     const char* u8cur = utf8Str;
276     const char16_t* const u16end = u16str + u16len;
277     char16_t* u16cur = u16str;
278 
279     while ((u8cur < u8end) && (u16cur < u16end)) {
280         size_t len = Utf8CodePointLen(*u8cur);
281         uint32_t codepoint = Utf8ToUtf32CodePoint(u8cur, len);
282         // Convert the UTF32 codepoint to one or more UTF16 codepoints
283         if (codepoint <= 0xFFFF) {
284             // Single UTF16 character
285             *u16cur++ = static_cast<char16_t>(codepoint);
286         } else {
287             // Multiple UTF16 characters with surrogates
288             codepoint = codepoint - 0x10000;
289             *u16cur++ = static_cast<char16_t>((codepoint >> 10) + 0xD800);
290             if (u16cur >= u16end) {
291                 // Ooops...  not enough room for this surrogate pair.
292                 return u16cur - 1;
293             }
294             *u16cur++ = static_cast<char16_t>((codepoint & 0x3FF) + 0xDC00);
295         }
296 
297         u8cur += len;
298     }
299     return u16cur;
300 }
301 
StrncpyStr8ToStr16(const char * utf8Str,size_t u8len,char16_t * u16str,size_t u16len)302 void StrncpyStr8ToStr16(const char* utf8Str, size_t u8len, char16_t* u16str, size_t u16len)
303 {
304     char16_t* result = Utf8ToUtf16(utf8Str, u8len, u16str, u16len - 1);
305     *result = 0;
306     return;
307 }
308 
309 // inner function and str8 is not null
Char8ToChar16(const char * str8,size_t str8Len)310 char16_t* Char8ToChar16(const char* str8, size_t str8Len)
311 {
312     char16_t* str16 = nullptr;
313     int utf16Len = Utf8ToUtf16Length(str8, str8Len);
314     if (utf16Len < 0) {
315         UTILS_LOGE("Get str16 length failed,length is: %{public}d", utf16Len);
316         return nullptr;
317     }
318 
319     // Allow for closing 0
320     utf16Len = utf16Len + 1;
321     str16 = reinterpret_cast<char16_t*>(calloc(utf16Len, sizeof(char16_t)));
322     if (str16 == nullptr) {
323         UTILS_LOGE("Str16 malloc memory failed!");
324         return nullptr;
325     }
326 
327     StrncpyStr8ToStr16(str8, str8Len, str16, utf16Len);
328     return str16;
329 }
330 
String8ToString16(const string & str8,u16string & str16)331 bool String8ToString16(const string& str8, u16string& str16)
332 {
333     size_t str8len = str8.length();
334     if (str8len < 1) {
335         return false;
336     }
337 
338     char16_t* str16Temp = Char8ToChar16(str8.c_str(), str8len);
339     if (str16Temp == nullptr) {
340         UTILS_LOGD("str8 to str16 failed, str16Temp is nullptr!");
341         return false;
342     }
343 
344     str16 = str16Temp;
345     free(str16Temp);
346     str16Temp = nullptr;
347     return true;
348 }
349 } // namespace OHOS
350