• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #include "converter.h"
17 #include <climits>
18 using namespace std;
19 
20 namespace OHOS::buffer {
21 
/**
* IsOneByte - test whether a UTF-8 byte stands alone
* @u8Char: Byte to inspect
* Returns: true when the most significant bit is clear (value below 0x80),
* false otherwise
*/
bool IsOneByte(uint8_t u8Char)
{
    // 0x80 : the smallest value that has the top bit set
    return u8Char < 0x80;
}
26 
Utf8ToUtf16BEToData(const unsigned char * data,u16string & u16Str,string::size_type & index,uint8_t & c1)27 void Utf8ToUtf16BEToData(const unsigned char *data, u16string &u16Str, string::size_type &index, uint8_t &c1)
28 {
29     uint8_t c2 = data[++index]; // The second byte
30     uint8_t c3 = data[++index]; // The third byte
31     uint8_t c4 = data[++index]; // The forth byte
32     // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other)
33     // 3 : shift left 3 times of UTF8_VALID_BITS
34     uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) |
35         // 2 : shift left 2 times of UTF8_VALID_BITS
36         ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
37         ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
38         (c4 & LOWER_6_BITS_MASK);
39     // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units
40     if (codePoint >= UTF16_SPECIAL_VALUE) {
41         codePoint -= UTF16_SPECIAL_VALUE;
42         // 10 : a half of 20 , shift right 10 bits
43         u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK));
44         u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK));
45     } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values
46         // U+D800 to U+DFFF are invalid characters, for simplicity,
47         // assume it does not exist (if any, not encoded)
48         u16Str.push_back(static_cast<char16_t>(codePoint));
49     }
50 }
51 
Utf8ToUtf16BE(const string & u8Str,bool * ok)52 u16string Utf8ToUtf16BE(const string &u8Str, bool *ok)
53 {
54     u16string u16Str = u"";
55     u16Str.reserve(u8Str.size());
56     string::size_type len = u8Str.length();
57     const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data());
58     bool isOk = true;
59     for (string::size_type i = 0; i < len; ++i) {
60         uint8_t c1 = data[i]; // The first byte
61         if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point
62             u16Str.push_back(static_cast<char16_t>(c1));
63             continue;
64         }
65         switch (c1 & HIGER_4_BITS_MASK) {
66             case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF
67                 Utf8ToUtf16BEToData(data, u16Str, i, c1);
68                 break;
69             }
70             case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF
71                 uint8_t c2 = data[++i]; // The second byte
72                 uint8_t c3 = data[++i]; // The third byte
73                 // Calculates the UNICODE code point value
74                 // (4 bits lower for the first byte, 6 bits lower for the other)
75                 // 2 : shift left 2 times of UTF8_VALID_BITS
76                 uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
77                     ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
78                     (c3 & LOWER_6_BITS_MASK);
79                 u16Str.push_back(static_cast<char16_t>(codePoint));
80                 break;
81             }
82             case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF
83             case TWO_BYTES_STYLE2: {
84                 uint8_t c2 = data[++i]; // The second byte
85                 // Calculates the UNICODE code point value
86                 // (5 bits lower for the first byte, 6 bits lower for the other)
87                 uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) |
88                     (c2 & LOWER_6_BITS_MASK);
89                 u16Str.push_back(static_cast<char16_t>(codePoint));
90                 break;
91             }
92             default: {
93                 isOk = false;
94                 break;
95             }
96         }
97     }
98     if (ok != nullptr) {
99         *ok = isOk;
100     }
101     return u16Str;
102 }
103 
/**
* Utf16BEToLE - swap the two bytes of every UTF-16 code unit
* @wstr: Input code units
* Returns: A string of the same length in which each unit has its high and
* low byte exchanged
*/
std::u16string Utf16BEToLE(const std::u16string &wstr)
{
    std::u16string swapped;
    for (const char16_t unit : wstr) {
        // 8 : number of bits in one byte
        const char16_t upperByte = (unit >> 8) & 0x00FF;
        const char16_t lowerByte = unit & 0x00FF;
        swapped.push_back(static_cast<char16_t>((lowerByte << 8) | upperByte));
    }
    return swapped;
}
117 
Utf16BEToANSI(const u16string & wstr)118 string Utf16BEToANSI(const u16string &wstr)
119 {
120     string ret = "";
121     for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) {
122         char16_t wc = (*it);
123         // get the lower bit from the UNICODE code point
124         char c = static_cast<char>(wc & LOWER_8_BITS_MASK);
125         ret.push_back(c);
126     }
127     return ret;
128 }
129 
Utf8ToUtf16BEToANSI(const string & str)130 string Utf8ToUtf16BEToANSI(const string &str)
131 {
132     u16string u16Str = Utf8ToUtf16BE(str);
133     string ret = Utf16BEToANSI(u16Str);
134     return ret;
135 }
136 
/**
* IsBase64Char - test whether a character may appear in Base64 / Base64URL text
* @c: Character to test
* Returns: true for alphanumeric characters and for '+', '/', '-' and '_';
* false for everything else, including the padding character '='
*/
bool IsBase64Char(unsigned char c)
{
    switch (c) {
        case '+':
        case '/':
        case '-':
        case '_':
            return true;
        default:
            return isalnum(c) != 0;
    }
}
141 
142 /**
143 * Base64Encode - Base64 encode
144 * @src: Data to be encoded
145 * @len: Length of the data to be encoded
146 * Returns: Allocated buffer of outLen bytes of encoded data,
147 * or empty string on failure
148 */
Base64Encode(const unsigned char * src,size_t len,EncodingType type)149 string Base64Encode(const unsigned char *src, size_t len, EncodingType type)
150 {
151     if (src == nullptr) {
152         return string();
153     }
154     unsigned char *out = nullptr;
155     unsigned char *pos = nullptr;
156     const unsigned char *pEnd = nullptr;
157     const unsigned char *pStart = nullptr;
158     size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte
159 
160     if (outLen < len) {
161         return string(); // integer overflow
162     }
163 
164     string outStr = "";
165     outStr.resize(outLen);
166     out = reinterpret_cast<unsigned char *>(&outStr[0]);
167 
168     pEnd = src + len;
169     pStart = src;
170     pos = out;
171 
172     string table = BASE64_TABLE;
173     if (type == BASE64URL) {
174         table = BASE64URL_TABLE;
175     }
176     // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits
177     while (pEnd - pStart >= 3) {
178         // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
179         *pos = table[pStart[0] >> 2];
180         // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits
181         *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
182         // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits
183         *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)];
184         // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits
185         *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK];
186         // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars
187         pos += 4;
188         // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars
189         pStart += 3;
190     }
191 
192     // process the last set of less than 3 bytes of data
193     if (pEnd - pStart > 0) {
194         // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
195         *pos = table[pStart[0] >> 2];
196         if (pEnd - pStart == 1) { // one byte remaining
197             // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it
198             *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4];
199             // 2 : fill in the missing bytes with '='
200             *(pos + 2) = '=';
201         } else { // two bytes remaining
202             // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits
203             *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
204             // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it
205             *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2];
206         }
207         // 3 : fill in the missing bytes with '='
208         *(pos + 3) = '=';
209     }
210 
211     if (type == BASE64URL) {
212         size_t poss = outStr.find_last_not_of('=');
213         if (poss != std::string::npos) {
214             outStr.erase(poss + 1);
215         }
216     }
217     return outStr;
218 }
219 
Base64Decode(string const & encodedStr,EncodingType type)220 string Base64Decode(string const& encodedStr, EncodingType type)
221 {
222     size_t len = encodedStr.size();
223     unsigned int index = 0;
224     unsigned int cursor = 0;
225     unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string
226     unsigned char charArray3[3] = {0}; // an array to stage a set of original string
227     string ret = "";
228     string table = BASE64_TABLE;
229 
230     if (type == BASE64URL) {
231         table = BASE64URL_TABLE;
232     }
233     while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) {
234         // stage a 4-byte string to charArray4
235         charArray4[index] = encodedStr[cursor];
236         index++;
237         cursor++;
238         if (index == 4) { // 4 : after 4 chars is assigned to charArray4
239             // 4 : fill data into charArray4
240             for (index = 0; index < 4; index++) {
241                 charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK;
242             }
243             // get the last six bits of the first byte of charArray4 and the first valid
244             // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
245             charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
246             // get the last four bits of the second byte of charArray4 and the first valid
247             // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
248             charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2);
249             // get the last two bits of the third byte of charArray4 and the forth byte,
250             // 2 : 3 : 6 : combine them to a new byte
251             charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3];
252             // 3 : assigns the decoded string to the return value
253             for (index = 0; index < 3; index++) {
254                 ret += charArray3[index];
255             }
256             index = 0;
257         }
258         if (cursor > len - 1) {
259             break;
260         }
261     }
262 
263     if (index != 0) {
264         // fill data into charArray4
265         for (unsigned int i = 0; i < index; i++) {
266             charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK;
267         }
268         // get the last six bits of the first byte of charArray4 and the first valid
269         // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
270         charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
271         // get the last four bits of the second byte of charArray4 and the first valid
272         // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
273         charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2);
274         // assigns the decoded string to the return value
275         for (unsigned int i = 0; i < index - 1; i++) {
276             ret += charArray3[i];
277         }
278     }
279 
280     return ret;
281 }
282 
/**
* IsValidHex - check that a string consists solely of hexadecimal digits
* @hex: Text to check
* Returns: true when hex is non-empty and every character is one of
* 0 ~ 9, A ~ F, a ~ f; false otherwise (an empty string is not valid)
*/
bool IsValidHex(const std::string &hex)
{
    if (hex.empty()) {
        return false;
    }
    for (const char c : hex) {
        const bool isDigit = (c >= '0') && (c <= '9');
        const bool isUpper = (c >= 'A') && (c <= 'F');
        const bool isLower = (c >= 'a') && (c <= 'f');
        if (!(isDigit || isUpper || isLower)) {
            return false;
        }
    }
    return true;
}
298 
/**
* HexDecode - convert a string of hexadecimal digit pairs into raw bytes
* @hexStr: Text made of two-digit hexadecimal values (upper or lower case)
* Returns: One byte per leading valid pair. Decoding stops at the first pair
* that contains a non-hexadecimal character; a trailing unpaired character is
* ignored
*/
std::string HexDecode(const std::string &hexStr)
{
    // Value of one hexadecimal digit, or -1 when c is not a hexadecimal digit
    const auto nibble = [](char c) -> int {
        if (c >= '0' && c <= '9') {
            return c - '0';
        }
        if (c >= 'A' && c <= 'F') {
            return c - 'A' + 10; // 10 : value of the digit 'A'
        }
        if (c >= 'a' && c <= 'f') {
            return c - 'a' + 10; // 10 : value of the digit 'a'
        }
        return -1;
    };

    // 2 : two digits per byte. size_type keeps the full length; narrowing it to
    // unsigned int would truncate the size of very large inputs.
    const std::string::size_type pairCount = hexStr.size() / 2;
    std::string nums;
    nums.reserve(pairCount);

    for (std::string::size_type pairIdx = 0; pairIdx < pairCount; ++pairIdx) {
        // 2 : offsets of the pair are pairIdx * 2 and pairIdx * 2 + 1
        const int high = nibble(hexStr[pairIdx * 2]);
        const int low = nibble(hexStr[pairIdx * 2 + 1]);
        if (high < 0 || low < 0) {
            break;
        }
        // 4 : the first digit supplies the upper four bits of the byte
        nums.push_back(static_cast<char>((high << 4) | low));
    }

    return nums;
}
322 
/**
* GetGoodSuffixLengthByLastChar - distance from the end of pat to the nearest
* earlier occurrence of its last byte
* @pat: Pattern bytes
* @patIndex: Index at which the backward scan starts (inclusive)
* @patLen: Number of bytes in pat
* Returns: (patLen - 1) - k, where k is the largest index <= patIndex with
* pat[k] == pat[patLen - 1]; patLen when there is no such index
*/
int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen)
{
    const int tail = patLen - 1;
    for (int pos = patIndex; pos >= 0; --pos) {
        if (pat[pos] == pat[tail]) {
            return tail - pos;
        }
    }
    // nothing found: same as measuring the distance to index -1
    return tail + 1;
}
/**
* GetGoodSuffixLengthByFirstChar - locate the next occurrence of the first
* pattern byte
* @pat: Pattern bytes
* @patIndex: Index at which the forward scan starts (inclusive)
* @tarlen: Number of bytes in pat
* Returns: The smallest index k in [patIndex, tarlen) with pat[k] == pat[0],
* or tarlen when there is none
*/
int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen)
{
    int pos = patIndex;
    while (pos < tarlen && pat[pos] != pat[0]) {
        ++pos;
    }
    return (pos < tarlen) ? pos : tarlen;
}
350 
/**
* GetBadCharLengthInReverseOrder - distance from patIndex back to the nearest
* earlier occurrence of a byte in pat
* @pat: Pattern bytes
* @singleChar: Byte to look for; it is compared by its unsigned byte value
* @patIndex: The scan covers the indexes patIndex - 1 down to 0
* Returns: patIndex - k, where k is the largest index < patIndex with
* pat[k] equal to the byte; patIndex + 1 when the byte does not occur there
*/
int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex)
{
    // Convert once to the element type of pat. Comparing uint8_t with char directly
    // promotes both to int, so where char is signed a byte >= 0x80 would never
    // compare equal to itself.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int index = -1;
    for (int i = patIndex - 1; i >= 0; --i) {
        if (pat[i] == wanted) {
            index = i;
            break;
        }
    }
    return patIndex - index;
}
364 
/**
* GetBadCharLengthInSequence - locate the next occurrence of a byte in pat
* @pat: Pattern bytes
* @singleChar: Byte to look for; it is compared by its unsigned byte value
* @patIndex: Index at which the forward scan starts (inclusive)
* @tarlen: Number of bytes in pat
* Returns: The smallest index k in [patIndex, tarlen) with pat[k] equal to
* the byte, or tarlen when there is none
*/
int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen)
{
    // Convert once to the element type of pat. Comparing uint8_t with char directly
    // promotes both to int, so where char is signed a byte >= 0x80 would never
    // compare equal to itself.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int resIndex = tarlen;
    for (int i = patIndex; i < tarlen; i++) {
        if (pat[i] == wanted) {
            resIndex = i;
            break;
        }
    }
    return resIndex;
}
377 
FindLastIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)378 int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen)
379 {
380     if (source == nullptr || target == nullptr) {
381         return -1;
382     }
383     if (soulen < tarlen || tarlen == 0) {
384         return -1;
385     }
386     int i = soulen - tarlen;
387     int j = 0;
388 
389     while (i >= 0) {
390         if (source[i] == target[j]) {
391             if (j == tarlen - 1) {
392                 return i - (tarlen - 1);
393             }
394             i++;
395             j++;
396         } else {
397             if (j == 0) {
398                 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
399                 i = i - badValue;
400                 j = 0;
401             } else {
402                 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
403                 int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen);
404                 int distance = badValue > goodSuffix ? badValue : goodSuffix;
405                 i = i - distance;
406                 j = 0;
407             }
408         }
409     }
410     return -1;
411 }
412 
FindIndexInner(uint8_t * target,uint8_t * source,int tarlen,int & indexI,int & indexJ)413 bool FindIndexInner(uint8_t* target, uint8_t* source, int tarlen, int &indexI, int &indexJ)
414 {
415     if (indexJ == tarlen - 1) {
416         int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
417         indexI = indexI + badValue;
418     } else {
419         int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
420         int goodSuffix = GetGoodSuffixLengthByLastChar(target, indexJ, tarlen);
421         int distance = badValue > goodSuffix ? badValue : goodSuffix;
422         long addVal = static_cast<long>(indexI) + tarlen;
423         long addRst = addVal + distance;
424         if (abs(addVal) > INT_MAX || abs(addRst) > INT_MAX) {
425             return false;
426         }
427         indexI = indexI + tarlen - 1 - indexJ + distance;
428         indexJ = tarlen - 1;
429     }
430     return true;
431 }
432 
FindIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)433 int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen)
434 {
435     if (source == nullptr || target == nullptr) {
436         return -1;
437     }
438     if (soulen < tarlen || tarlen == 0) {
439         return -1;
440     }
441     int i = tarlen - 1;
442     int j = tarlen - 1;
443     while (i < soulen) {
444         if (source[i] == target[j]) {
445             if (j == 0) {
446                 return i;
447             }
448             i--;
449             j--;
450         } else {
451             bool flag = FindIndexInner(target, source, tarlen, i, j);
452             if (!flag) {
453                 return -1;
454             }
455         }
456     }
457     return -1;
458 }
459 }
460