• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "converter.h"
17 
18 #include <codecvt>
19 #include <locale>
20 
21 using namespace std;
22 
23 namespace OHOS::buffer {
24 
// Tells whether u8Char has its most significant bit clear, i.e. whether the
// byte is a complete single-byte UTF-8 character.
bool IsOneByte(uint8_t u8Char)
{
    // 0x80 : the smallest byte value that has the top bit set
    return u8Char < 0x80;
}
29 
Utf8ToUtf16BE(const string & u8Str,bool * ok)30 u16string Utf8ToUtf16BE(const string &u8Str, bool *ok)
31 {
32     u16string u16Str = u"";
33     u16Str.reserve(u8Str.size());
34     string::size_type len = u8Str.length();
35 
36     const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data());
37 
38     bool isOk = true;
39     for (string::size_type i = 0; i < len; ++i) {
40         uint8_t c1 = data[i]; // The first byte
41         if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point
42             u16Str.push_back(static_cast<char16_t>(c1));
43             continue;
44         }
45         switch (c1 & HIGER_4_BITS_MASK) {
46             case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF
47                 uint8_t c2 = data[++i]; // The second byte
48                 uint8_t c3 = data[++i]; // The third byte
49                 uint8_t c4 = data[++i]; // The forth byte
50                 // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other)
51                 // 3 : shift left 3 times of UTF8_VALID_BITS
52                 uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) |
53                     // 2 : shift left 2 times of UTF8_VALID_BITS
54                     ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
55                     ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
56                     (c4 & LOWER_6_BITS_MASK);
57 
58                 // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units
59                 if (codePoint >= UTF16_SPECIAL_VALUE) {
60                     codePoint -= UTF16_SPECIAL_VALUE;
61                     // 10 : a half of 20 , shift right 10 bits
62                     u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK));
63                     u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK));
64                 } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values
65                     // U+D800 to U+DFFF are invalid characters, for simplicity,
66                     // assume it does not exist (if any, not encoded)
67                     u16Str.push_back(static_cast<char16_t>(codePoint));
68                 }
69                 break;
70             }
71             case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF
72                 uint8_t c2 = data[++i]; // The second byte
73                 uint8_t c3 = data[++i]; // The third byte
74                 // Calculates the UNICODE code point value
75                 // (4 bits lower for the first byte, 6 bits lower for the other)
76                 // 2 : shift left 2 times of UTF8_VALID_BITS
77                 uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
78                     ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
79                     (c3 & LOWER_6_BITS_MASK);
80                 u16Str.push_back(static_cast<char16_t>(codePoint));
81                 break;
82             }
83             case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF
84             case TWO_BYTES_STYLE2: {
85                 uint8_t c2 = data[++i]; // The second byte
86                 // Calculates the UNICODE code point value
87                 // (5 bits lower for the first byte, 6 bits lower for the other)
88                 uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) |
89                     (c2 & LOWER_6_BITS_MASK);
90                 u16Str.push_back(static_cast<char16_t>(codePoint));
91                 break;
92             }
93             default: {
94                 isOk = false;
95                 break;
96             }
97         }
98     }
99     if (ok != nullptr) {
100         *ok = isOk;
101     }
102 
103     return u16Str;
104 }
105 
// Builds a copy of wstr in which the two bytes of every 16-bit code unit
// have traded places.
std::u16string Utf16BEToLE(const std::u16string &wstr)
{
    std::u16string swapped = u"";
    for (const char16_t unit : wstr) {
        // 8 : number of bits in one byte of the code unit
        const char16_t upperByte = static_cast<char16_t>((unit >> 8) & 0x00FF);
        const char16_t lowerByte = static_cast<char16_t>(unit & 0x00FF);
        swapped.push_back(static_cast<char16_t>((lowerByte << 8) | upperByte));
    }
    return swapped;
}
119 
Utf16BEToANSI(const u16string & wstr)120 string Utf16BEToANSI(const u16string &wstr)
121 {
122     string ret = "";
123     for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) {
124         char16_t wc = (*it);
125         // get the lower bit from the UNICODE code point
126         char c = static_cast<char>(wc & LOWER_8_BITS_MASK);
127         ret.push_back(c);
128     }
129     return ret;
130 }
131 
Utf8ToUtf16BEToANSI(const string & str)132 string Utf8ToUtf16BEToANSI(const string &str)
133 {
134     u16string u16Str = Utf8ToUtf16BE(str);
135     string ret = Utf16BEToANSI(u16Str);
136     return ret;
137 }
138 
// Accepts alphanumeric characters plus '+', '/', '-' and '_'.
// The padding character '=' is not accepted.
bool IsBase64Char(unsigned char c)
{
    switch (c) {
        case '+':
        case '/':
        case '-':
        case '_':
            return true;
        default:
            return isalnum(c) != 0;
    }
}
143 
144 /**
145 * Base64Encode - Base64 encode
146 * @src: Data to be encoded
147 * @len: Length of the data to be encoded
148 * Returns: Allocated buffer of outLen bytes of encoded data,
149 * or empty string on failure
150 */
Base64Encode(const unsigned char * src,size_t len,EncodingType type)151 string Base64Encode(const unsigned char *src, size_t len, EncodingType type)
152 {
153     if (src == nullptr) {
154         return string();
155     }
156     unsigned char *out = nullptr;
157     unsigned char *pos = nullptr;
158     const unsigned char *pEnd = nullptr;
159     const unsigned char *pStart = nullptr;
160     size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte
161 
162     if (outLen < len) {
163         return string(); // integer overflow
164     }
165 
166     string outStr = "";
167     outStr.resize(outLen);
168     out = reinterpret_cast<unsigned char *>(&outStr[0]);
169 
170     pEnd = src + len;
171     pStart = src;
172     pos = out;
173 
174     string table = BASE64_TABLE;
175     if (type == BASE64URL) {
176         table = BASE64URL_TABLE;
177     }
178     // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits
179     while (pEnd - pStart >= 3) {
180         // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
181         *pos = table[pStart[0] >> 2];
182         // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits
183         *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
184         // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits
185         *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)];
186         // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits
187         *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK];
188         // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars
189         pos += 4;
190         // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars
191         pStart += 3;
192     }
193 
194     // process the last set of less than 3 bytes of data
195     if (pEnd - pStart > 0) {
196         // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
197         *pos = table[pStart[0] >> 2];
198         if (pEnd - pStart == 1) { // one byte remaining
199             // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it
200             *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4];
201             // 2 : fill in the missing bytes with '='
202             *(pos + 2) = '=';
203         } else { // two bytes remaining
204             // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits
205             *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
206             // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it
207             *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2];
208         }
209         // 3 : fill in the missing bytes with '='
210         *(pos + 3) = '=';
211     }
212 
213     if (type == BASE64URL) {
214         size_t poss = outStr.find_last_not_of('=');
215         if (poss != std::string::npos) {
216             outStr.erase(poss + 1);
217         }
218     }
219     return outStr;
220 }
221 
Base64Decode(string const & encodedStr,EncodingType type)222 string Base64Decode(string const& encodedStr, EncodingType type)
223 {
224     size_t len = encodedStr.size();
225     unsigned int index = 0;
226     unsigned int cursor = 0;
227     unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string
228     unsigned char charArray3[3] = {0}; // an array to stage a set of original string
229     string ret = "";
230     string table = BASE64_TABLE;
231 
232     if (type == BASE64URL) {
233         table = BASE64URL_TABLE;
234     }
235     while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) {
236         // stage a 4-byte string to charArray4
237         charArray4[index] = encodedStr[cursor];
238         index++;
239         cursor++;
240         if (index == 4) { // 4 : after 4 chars is assigned to charArray4
241             // 4 : fill data into charArray4
242             for (index = 0; index < 4; index++) {
243                 charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK;
244             }
245             // get the last six bits of the first byte of charArray4 and the first valid
246             // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
247             charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
248             // get the last four bits of the second byte of charArray4 and the first valid
249             // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
250             charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2);
251             // get the last two bits of the third byte of charArray4 and the forth byte,
252             // 2 : 3 : 6 : combine them to a new byte
253             charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3];
254             // 3 : assigns the decoded string to the return value
255             for (index = 0; index < 3; index++) {
256                 ret += charArray3[index];
257             }
258             index = 0;
259         }
260         if (cursor > len - 1) {
261             break;
262         }
263     }
264 
265     if (index != 0) {
266         // fill data into charArray4
267         for (unsigned int i = 0; i < index; i++) {
268             charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK;
269         }
270         // get the last six bits of the first byte of charArray4 and the first valid
271         // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
272         charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
273         // get the last four bits of the second byte of charArray4 and the first valid
274         // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
275         charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2);
276         // assigns the decoded string to the return value
277         for (unsigned int i = 0; i < index - 1; i++) {
278             ret += charArray3[i];
279         }
280     }
281 
282     return ret;
283 }
284 
// Returns true only when hex is non-empty and every character is one of
// 0 ~ 9, A ~ F, a ~ f.
bool IsValidHex(const std::string &hex)
{
    if (hex.empty()) {
        return false;
    }
    for (const char c : hex) {
        const bool isDecimal = (c >= '0') && (c <= '9');
        const bool isUpperHex = (c >= 'A') && (c <= 'F');
        const bool isLowerHex = (c >= 'a') && (c <= 'f');
        if (!(isDecimal || isUpperHex || isLowerHex)) {
            return false;
        }
    }
    return true;
}
300 
/**
* HexDecode - decode a string of hexadecimal digit pairs into bytes
* @hexStr: Text made of two hex digits (0 ~ 9, A ~ F, a ~ f) per output byte
* Returns: The decoded bytes. Decoding stops at the first pair that contains a
* character which is not a hex digit; an unpaired trailing character is ignored.
*/
std::string HexDecode(const std::string &hexStr)
{
    // Value of a single hex digit, or -1 when c is not a hex digit
    const auto nibbleValue = [](char c) -> int {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10; // 10 : value of the digit 'A'
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10; // 10 : value of the digit 'a'
        }
        return -1;
    };

    // 2 : two hex digits describe one byte
    const size_t byteCount = hexStr.size() / 2;
    std::string nums = "";
    nums.reserve(byteCount);

    for (size_t i = 0; i < byteCount; i++) {
        // 2 : the pair for byte i sits at offsets i * 2 and i * 2 + 1
        const int high = nibbleValue(hexStr[i * 2]);
        const int low = nibbleValue(hexStr[i * 2 + 1]);
        if (high < 0 || low < 0) {
            break;
        }
        // 4 : the first digit supplies the upper four bits of the byte
        nums.push_back(static_cast<char>((high << 4) | low));
    }

    return nums;
}
324 
// Scans pat backwards from patIndex for a byte equal to the last byte of pat
// (index patLen - 1) and returns the distance from that hit to the last index.
// When no such byte exists at or before patIndex the result is patLen.
int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen)
{
    const int lastIndex = patLen - 1;
    for (int i = patIndex; i >= 0; --i) {
        if (pat[i] == pat[lastIndex]) {
            return lastIndex - i;
        }
    }
    // no hit: same value as measuring the distance from index -1
    return lastIndex + 1;
}
// Scans pat forwards from patIndex for a byte equal to the first byte of pat
// and returns the index of that hit, or tarlen when there is none before tarlen.
int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen)
{
    int pos = patIndex;
    while (pos < tarlen && pat[pos] != pat[0]) {
        ++pos;
    }
    return (pos < tarlen) ? pos : tarlen;
}
352 
// Scans pat backwards from patIndex - 1 for singleChar and returns the distance
// from that hit to patIndex. When singleChar does not occur before patIndex the
// result is patIndex + 1.
int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex)
{
    // Compare as bytes: where char is signed, a value >= 0x80 arrives here as a
    // negative number and would never equal the promoted uint8_t pattern byte.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int index = -1;
    for (int i = patIndex - 1; i >= 0; --i) {
        if (pat[i] == wanted) {
            index = i;
            break;
        }
    }
    return patIndex - index;
}
366 
// Scans pat forwards from patIndex for singleChar and returns the index of the
// first hit, or tarlen when singleChar does not occur before tarlen.
int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen)
{
    // Compare as bytes: where char is signed, a value >= 0x80 arrives here as a
    // negative number and would never equal the promoted uint8_t pattern byte.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int resIndex = tarlen;
    for (int i = patIndex; i < tarlen; i++) {
        if (pat[i] == wanted) {
            resIndex = i;
            break;
        }
    }
    return resIndex;
}
379 
FindLastIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)380 int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen)
381 {
382     if (source == nullptr || target == nullptr) {
383         return -1;
384     }
385     if (soulen < tarlen || tarlen == 0) {
386         return -1;
387     }
388     int i = soulen - tarlen;
389     int j = 0;
390 
391     while (i >= 0) {
392         if (source[i] == target[j]) {
393             if (j == tarlen - 1) {
394                 return i - (tarlen - 1);
395             }
396             i++;
397             j++;
398         } else {
399             if (j == 0) {
400                 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
401                 i = i - badValue;
402                 j = 0;
403             } else {
404                 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
405                 int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen);
406                 int distance = badValue > goodSuffix ? badValue : goodSuffix;
407                 i = i - distance;
408                 j = 0;
409             }
410         }
411     }
412     return -1;
413 }
414 
FindIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)415 int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen)
416 {
417     if (source == nullptr || target == nullptr) {
418         return -1;
419     }
420     if (soulen < tarlen || tarlen == 0) {
421         return -1;
422     }
423     int i = tarlen - 1;
424     int j = tarlen - 1;
425     while (i < soulen) {
426         if (source[i] == target[j]) {
427             if (j == 0) {
428                 return i;
429             }
430             i--;
431             j--;
432         } else {
433             if (j == tarlen - 1) {
434                 int badValue = GetBadCharLengthInReverseOrder(target, source[i], j);
435                 i = i + badValue;
436                 j = tarlen - 1;
437             } else {
438                 int badValue = GetBadCharLengthInReverseOrder(target, source[i], j);
439                 int goodSuffix = GetGoodSuffixLengthByLastChar(target, j, tarlen);
440                 int distance = badValue > goodSuffix ? badValue : goodSuffix;
441                 i = i + tarlen - 1 - j + distance;
442                 j = tarlen - 1;
443             }
444         }
445     }
446     return -1;
447 }
448 }
449