1 /*
2 * Copyright (c) 2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "converter.h"
17 #include <climits>
18 using namespace std;
19
20 namespace OHOS::buffer {
21
// Report whether u8Char has its most significant bit clear, i.e. it stands alone as a
// one-byte UTF-8 character.
bool IsOneByte(uint8_t u8Char)
{
    constexpr uint8_t TOP_BIT = 0x80;
    const bool topBitSet = (u8Char & TOP_BIT) != 0;
    return !topBitSet;
}
26
Utf8ToUtf16BEToData(const unsigned char * data,u16string & u16Str,string::size_type & index,uint8_t & c1)27 void Utf8ToUtf16BEToData(const unsigned char *data, u16string &u16Str, string::size_type &index, uint8_t &c1)
28 {
29 uint8_t c2 = data[++index]; // The second byte
30 uint8_t c3 = data[++index]; // The third byte
31 uint8_t c4 = data[++index]; // The forth byte
32 // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other)
33 // 3 : shift left 3 times of UTF8_VALID_BITS
34 uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) |
35 // 2 : shift left 2 times of UTF8_VALID_BITS
36 ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
37 ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
38 (c4 & LOWER_6_BITS_MASK);
39 // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units
40 if (codePoint >= UTF16_SPECIAL_VALUE) {
41 codePoint -= UTF16_SPECIAL_VALUE;
42 // 10 : a half of 20 , shift right 10 bits
43 u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK));
44 u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK));
45 } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values
46 // U+D800 to U+DFFF are invalid characters, for simplicity,
47 // assume it does not exist (if any, not encoded)
48 u16Str.push_back(static_cast<char16_t>(codePoint));
49 }
50 }
51
Utf8ToUtf16BE(const string & u8Str,bool * ok)52 u16string Utf8ToUtf16BE(const string &u8Str, bool *ok)
53 {
54 u16string u16Str = u"";
55 u16Str.reserve(u8Str.size());
56 string::size_type len = u8Str.length();
57 const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data());
58 bool isOk = true;
59 for (string::size_type i = 0; i < len; ++i) {
60 uint8_t c1 = data[i]; // The first byte
61 if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point
62 u16Str.push_back(static_cast<char16_t>(c1));
63 continue;
64 }
65 switch (c1 & HIGER_4_BITS_MASK) {
66 case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF
67 Utf8ToUtf16BEToData(data, u16Str, i, c1);
68 break;
69 }
70 case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF
71 uint8_t c2 = data[++i]; // The second byte
72 uint8_t c3 = data[++i]; // The third byte
73 // Calculates the UNICODE code point value
74 // (4 bits lower for the first byte, 6 bits lower for the other)
75 // 2 : shift left 2 times of UTF8_VALID_BITS
76 uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
77 ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
78 (c3 & LOWER_6_BITS_MASK);
79 u16Str.push_back(static_cast<char16_t>(codePoint));
80 break;
81 }
82 case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF
83 case TWO_BYTES_STYLE2: {
84 uint8_t c2 = data[++i]; // The second byte
85 // Calculates the UNICODE code point value
86 // (5 bits lower for the first byte, 6 bits lower for the other)
87 uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) |
88 (c2 & LOWER_6_BITS_MASK);
89 u16Str.push_back(static_cast<char16_t>(codePoint));
90 break;
91 }
92 default: {
93 isOk = false;
94 break;
95 }
96 }
97 }
98 if (ok != nullptr) {
99 *ok = isOk;
100 }
101 return u16Str;
102 }
103
// Return a copy of wstr in which the two bytes of every 16-bit unit are exchanged.
u16string Utf16BEToLE(const u16string &wstr)
{
    u16string swapped;
    swapped.reserve(wstr.length());
    for (const char16_t unit : wstr) {
        // 8 : number of bits in one byte
        const char16_t upperByte = (unit >> 8) & 0x00FF;
        const char16_t lowerByte = unit & 0x00FF;
        swapped.push_back(static_cast<char16_t>((lowerByte << 8) | upperByte));
    }
    return swapped;
}
117
Utf16BEToANSI(const u16string & wstr)118 string Utf16BEToANSI(const u16string &wstr)
119 {
120 string ret = "";
121 for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) {
122 char16_t wc = (*it);
123 // get the lower bit from the UNICODE code point
124 char c = static_cast<char>(wc & LOWER_8_BITS_MASK);
125 ret.push_back(c);
126 }
127 return ret;
128 }
129
Utf8ToUtf16BEToANSI(const string & str)130 string Utf8ToUtf16BEToANSI(const string &str)
131 {
132 u16string u16Str = Utf8ToUtf16BE(str);
133 string ret = Utf16BEToANSI(u16Str);
134 return ret;
135 }
136
// Tell whether c is an alphanumeric character or one of '+', '/', '-' and '_'.
// The padding character '=' is not accepted.
bool IsBase64Char(unsigned char c)
{
    switch (c) {
        case '+':
        case '/':
        case '-':
        case '_':
            return true;
        default:
            return isalnum(c) != 0;
    }
}
141
142 /**
143 * Base64Encode - Base64 encode
144 * @src: Data to be encoded
145 * @len: Length of the data to be encoded
146 * Returns: Allocated buffer of outLen bytes of encoded data,
147 * or empty string on failure
148 */
Base64Encode(const unsigned char * src,size_t len,EncodingType type)149 string Base64Encode(const unsigned char *src, size_t len, EncodingType type)
150 {
151 if (src == nullptr) {
152 return string();
153 }
154 unsigned char *out = nullptr;
155 unsigned char *pos = nullptr;
156 const unsigned char *pEnd = nullptr;
157 const unsigned char *pStart = nullptr;
158 size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte
159
160 if (outLen < len) {
161 return string(); // integer overflow
162 }
163
164 string outStr = "";
165 outStr.resize(outLen);
166 out = reinterpret_cast<unsigned char *>(&outStr[0]);
167
168 pEnd = src + len;
169 pStart = src;
170 pos = out;
171
172 string table = BASE64_TABLE;
173 if (type == BASE64URL) {
174 table = BASE64URL_TABLE;
175 }
176 // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits
177 while (pEnd - pStart >= 3) {
178 // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
179 *pos = table[pStart[0] >> 2];
180 // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits
181 *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
182 // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits
183 *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)];
184 // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits
185 *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK];
186 // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars
187 pos += 4;
188 // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars
189 pStart += 3;
190 }
191
192 // process the last set of less than 3 bytes of data
193 if (pEnd - pStart > 0) {
194 // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
195 *pos = table[pStart[0] >> 2];
196 if (pEnd - pStart == 1) { // one byte remaining
197 // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it
198 *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4];
199 // 2 : fill in the missing bytes with '='
200 *(pos + 2) = '=';
201 } else { // two bytes remaining
202 // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits
203 *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
204 // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it
205 *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2];
206 }
207 // 3 : fill in the missing bytes with '='
208 *(pos + 3) = '=';
209 }
210
211 if (type == BASE64URL) {
212 size_t poss = outStr.find_last_not_of('=');
213 if (poss != std::string::npos) {
214 outStr.erase(poss + 1);
215 }
216 }
217 return outStr;
218 }
219
Base64Decode(string const & encodedStr,EncodingType type)220 string Base64Decode(string const& encodedStr, EncodingType type)
221 {
222 size_t len = encodedStr.size();
223 unsigned int index = 0;
224 unsigned int cursor = 0;
225 unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string
226 unsigned char charArray3[3] = {0}; // an array to stage a set of original string
227 string ret = "";
228 string table = BASE64_TABLE;
229
230 if (type == BASE64URL) {
231 table = BASE64URL_TABLE;
232 }
233 while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) {
234 // stage a 4-byte string to charArray4
235 charArray4[index] = encodedStr[cursor];
236 index++;
237 cursor++;
238 if (index == 4) { // 4 : after 4 chars is assigned to charArray4
239 // 4 : fill data into charArray4
240 for (index = 0; index < 4; index++) {
241 charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK;
242 }
243 // get the last six bits of the first byte of charArray4 and the first valid
244 // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
245 charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
246 // get the last four bits of the second byte of charArray4 and the first valid
247 // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
248 charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2);
249 // get the last two bits of the third byte of charArray4 and the forth byte,
250 // 2 : 3 : 6 : combine them to a new byte
251 charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3];
252 // 3 : assigns the decoded string to the return value
253 for (index = 0; index < 3; index++) {
254 ret += charArray3[index];
255 }
256 index = 0;
257 }
258 if (cursor > len - 1) {
259 break;
260 }
261 }
262
263 if (index != 0) {
264 // fill data into charArray4
265 for (unsigned int i = 0; i < index; i++) {
266 charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK;
267 }
268 // get the last six bits of the first byte of charArray4 and the first valid
269 // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
270 charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
271 // get the last four bits of the second byte of charArray4 and the first valid
272 // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
273 charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2);
274 // assigns the decoded string to the return value
275 for (unsigned int i = 0; i < index - 1; i++) {
276 ret += charArray3[i];
277 }
278 }
279
280 return ret;
281 }
282
// Tell whether hex is non-empty and made only of the characters 0-9, A-F and a-f.
bool IsValidHex(const string &hex)
{
    if (hex.empty()) {
        return false;
    }
    for (const char ch : hex) {
        const bool isDigit = (ch >= '0' && ch <= '9');
        const bool isUpper = (ch >= 'A' && ch <= 'F');
        const bool isLower = (ch >= 'a' && ch <= 'f');
        if (!isDigit && !isUpper && !isLower) {
            return false;
        }
    }
    return true;
}
298
HexDecode(const string & hexStr)299 string HexDecode(const string &hexStr)
300 {
301 string nums = "";
302 unsigned int arrSize = hexStr.size();
303
304 // 2 : means a half length of hex str's size
305 for (unsigned int i = 0; i < arrSize / 2; i++) {
306 string hexStrTmp = "";
307 int num = 0;
308 // 2 : offset is i * 2
309 hexStrTmp.push_back(hexStr[i * 2]);
310 // 2 : offset is i * 2 + 1
311 hexStrTmp.push_back(hexStr[i * 2 + 1]);
312 if (!IsValidHex(hexStrTmp)) {
313 break;
314 }
315 // 16 : the base is 16
316 num = stoi(hexStrTmp, nullptr, 16);
317 nums.push_back(static_cast<char>(num));
318 }
319
320 return nums;
321 }
322
// Scan pat from patIndex down to 0 for a byte equal to the last byte of pat (pat[patLen - 1]).
// Returns the distance from that position to the last index, or patLen when no such byte is found.
int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen)
{
    const int lastIndex = patLen - 1;
    for (int pos = patIndex; pos >= 0; --pos) {
        if (pat[pos] == pat[lastIndex]) {
            return lastIndex - pos;
        }
    }
    // nothing found: same as a hit at the virtual index -1
    return lastIndex + 1;
}
// Scan pat from patIndex up to tarlen - 1 for a byte equal to the first byte of pat (pat[0]).
// Returns the index of the first such byte, or tarlen when there is none.
int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen)
{
    for (int pos = patIndex; pos < tarlen; ++pos) {
        if (pat[pos] == pat[0]) {
            return pos;
        }
    }
    return tarlen;
}
350
// Look for singleChar in pat strictly before patIndex, scanning from patIndex - 1 down to 0.
// Returns patIndex minus the index of the first occurrence met, or patIndex + 1 when
// singleChar does not occur in that range.
int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex)
{
    // Compare as unsigned bytes. Where plain char is signed, a byte >= 0x80 received through
    // singleChar is negative and would never compare equal to the uint8_t bytes of pat.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    for (int i = patIndex - 1; i >= 0; --i) {
        if (pat[i] == wanted) {
            return patIndex - i;
        }
    }
    // no occurrence: same as a hit at the virtual index -1
    return patIndex + 1;
}
364
// Look for singleChar in pat from patIndex up to tarlen - 1.
// Returns the index of the first occurrence, or tarlen when singleChar does not occur in that range.
int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen)
{
    // Compare as unsigned bytes. Where plain char is signed, a byte >= 0x80 received through
    // singleChar is negative and would never compare equal to the uint8_t bytes of pat.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    for (int i = patIndex; i < tarlen; i++) {
        if (pat[i] == wanted) {
            return i;
        }
    }
    return tarlen;
}
377
FindLastIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)378 int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen)
379 {
380 if (source == nullptr || target == nullptr) {
381 return -1;
382 }
383 if (soulen < tarlen || tarlen == 0) {
384 return -1;
385 }
386 int i = soulen - tarlen;
387 int j = 0;
388
389 while (i >= 0) {
390 if (source[i] == target[j]) {
391 if (j == tarlen - 1) {
392 return i - (tarlen - 1);
393 }
394 i++;
395 j++;
396 } else {
397 if (j == 0) {
398 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
399 i = i - badValue;
400 j = 0;
401 } else {
402 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
403 int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen);
404 int distance = badValue > goodSuffix ? badValue : goodSuffix;
405 i = i - distance;
406 j = 0;
407 }
408 }
409 }
410 return -1;
411 }
412
FindIndexInner(uint8_t * target,uint8_t * source,int tarlen,int & indexI,int & indexJ)413 bool FindIndexInner(uint8_t* target, uint8_t* source, int tarlen, int &indexI, int &indexJ)
414 {
415 if (indexJ == tarlen - 1) {
416 int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
417 indexI = indexI + badValue;
418 } else {
419 int badValue = GetBadCharLengthInReverseOrder(target, source[indexI], indexJ);
420 int goodSuffix = GetGoodSuffixLengthByLastChar(target, indexJ, tarlen);
421 int distance = badValue > goodSuffix ? badValue : goodSuffix;
422 long addVal = static_cast<long>(indexI) + tarlen;
423 long addRst = addVal + distance;
424 if (abs(addVal) > INT_MAX || abs(addRst) > INT_MAX) {
425 return false;
426 }
427 indexI = indexI + tarlen - 1 - indexJ + distance;
428 indexJ = tarlen - 1;
429 }
430 return true;
431 }
432
FindIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)433 int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen)
434 {
435 if (source == nullptr || target == nullptr) {
436 return -1;
437 }
438 if (soulen < tarlen || tarlen == 0) {
439 return -1;
440 }
441 int i = tarlen - 1;
442 int j = tarlen - 1;
443 while (i < soulen) {
444 if (source[i] == target[j]) {
445 if (j == 0) {
446 return i;
447 }
448 i--;
449 j--;
450 } else {
451 bool flag = FindIndexInner(target, source, tarlen, i, j);
452 if (!flag) {
453 return -1;
454 }
455 }
456 }
457 return -1;
458 }
459 }
460