1 /*
2 * Copyright (c) 2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #include "converter.h"
17
18 #include <codecvt>
19 #include <locale>
20
21 using namespace std;
22
23 namespace OHOS::buffer {
24
// Tells whether u8Char has its most significant bit clear, i.e. whether the
// UTF-8 decoder may treat it as a complete single-byte character.
bool IsOneByte(uint8_t u8Char)
{
    constexpr uint8_t highBit = 0x80;
    const bool highBitSet = (u8Char & highBit) != 0;
    return !highBitSet;
}
29
Utf8ToUtf16BE(const string & u8Str,bool * ok)30 u16string Utf8ToUtf16BE(const string &u8Str, bool *ok)
31 {
32 u16string u16Str = u"";
33 u16Str.reserve(u8Str.size());
34 string::size_type len = u8Str.length();
35
36 const unsigned char *data = reinterpret_cast<const unsigned char *>(u8Str.data());
37
38 bool isOk = true;
39 for (string::size_type i = 0; i < len; ++i) {
40 uint8_t c1 = data[i]; // The first byte
41 if (IsOneByte(c1)) { // only 1 byte represents the UNICODE code point
42 u16Str.push_back(static_cast<char16_t>(c1));
43 continue;
44 }
45 switch (c1 & HIGER_4_BITS_MASK) {
46 case FOUR_BYTES_STYLE: { // 4 byte characters, from 0x10000 to 0x10FFFF
47 uint8_t c2 = data[++i]; // The second byte
48 uint8_t c3 = data[++i]; // The third byte
49 uint8_t c4 = data[++i]; // The forth byte
50 // Calculate the UNICODE code point value (3 bits lower for the first byte, 6 bits for the other)
51 // 3 : shift left 3 times of UTF8_VALID_BITS
52 uint32_t codePoint = ((c1 & LOWER_3_BITS_MASK) << (3 * UTF8_VALID_BITS)) |
53 // 2 : shift left 2 times of UTF8_VALID_BITS
54 ((c2 & LOWER_6_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
55 ((c3 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
56 (c4 & LOWER_6_BITS_MASK);
57
58 // In UTF-16, U+10000 to U+10FFFF represent surrogate pairs with two 16-bit units
59 if (codePoint >= UTF16_SPECIAL_VALUE) {
60 codePoint -= UTF16_SPECIAL_VALUE;
61 // 10 : a half of 20 , shift right 10 bits
62 u16Str.push_back(static_cast<char16_t>((codePoint >> 10) | HIGH_AGENT_MASK));
63 u16Str.push_back(static_cast<char16_t>((codePoint & LOWER_10_BITS_MASK) | LOW_AGENT_MASK));
64 } else { // In UTF-16, U+0000 to U+D7FF and U+E000 to U+FFFF are Unicode code point values
65 // U+D800 to U+DFFF are invalid characters, for simplicity,
66 // assume it does not exist (if any, not encoded)
67 u16Str.push_back(static_cast<char16_t>(codePoint));
68 }
69 break;
70 }
71 case THREE_BYTES_STYLE: { // 3 byte characters, from 0x800 to 0xFFFF
72 uint8_t c2 = data[++i]; // The second byte
73 uint8_t c3 = data[++i]; // The third byte
74 // Calculates the UNICODE code point value
75 // (4 bits lower for the first byte, 6 bits lower for the other)
76 // 2 : shift left 2 times of UTF8_VALID_BITS
77 uint32_t codePoint = ((c1 & LOWER_4_BITS_MASK) << (2 * UTF8_VALID_BITS)) |
78 ((c2 & LOWER_6_BITS_MASK) << UTF8_VALID_BITS) |
79 (c3 & LOWER_6_BITS_MASK);
80 u16Str.push_back(static_cast<char16_t>(codePoint));
81 break;
82 }
83 case TWO_BYTES_STYLE1: // 2 byte characters, from 0x80 to 0x7FF
84 case TWO_BYTES_STYLE2: {
85 uint8_t c2 = data[++i]; // The second byte
86 // Calculates the UNICODE code point value
87 // (5 bits lower for the first byte, 6 bits lower for the other)
88 uint32_t codePoint = ((c1 & LOWER_5_BITS_MASK) << UTF8_VALID_BITS) |
89 (c2 & LOWER_6_BITS_MASK);
90 u16Str.push_back(static_cast<char16_t>(codePoint));
91 break;
92 }
93 default: {
94 isOk = false;
95 break;
96 }
97 }
98 }
99 if (ok != nullptr) {
100 *ok = isOk;
101 }
102
103 return u16Str;
104 }
105
// Returns a copy of wstr in which the two bytes of every 16-bit unit are exchanged.
u16string Utf16BEToLE(const u16string &wstr)
{
    u16string swapped = u"";
    for (const char16_t unit : wstr) {
        // 8 : width of one byte; the upper byte moves down and the lower byte moves up
        const char16_t upperByte = (unit >> 8) & 0x00FF;
        const char16_t lowerByte = unit & 0x00FF;
        swapped.push_back(static_cast<char16_t>((lowerByte << 8) | upperByte));
    }
    return swapped;
}
119
Utf16BEToANSI(const u16string & wstr)120 string Utf16BEToANSI(const u16string &wstr)
121 {
122 string ret = "";
123 for (u16string::const_iterator it = wstr.begin(); it != wstr.end(); ++it) {
124 char16_t wc = (*it);
125 // get the lower bit from the UNICODE code point
126 char c = static_cast<char>(wc & LOWER_8_BITS_MASK);
127 ret.push_back(c);
128 }
129 return ret;
130 }
131
Utf8ToUtf16BEToANSI(const string & str)132 string Utf8ToUtf16BEToANSI(const string &str)
133 {
134 u16string u16Str = Utf8ToUtf16BE(str);
135 string ret = Utf16BEToANSI(u16Str);
136 return ret;
137 }
138
// Accepts alphanumeric characters plus the four symbols '+', '/', '-' and '_'.
// The padding character '=' is not accepted.
bool IsBase64Char(unsigned char c)
{
    switch (c) {
        case '+':
        case '/':
        case '-':
        case '_':
            return true;
        default:
            return isalnum(c) != 0;
    }
}
143
144 /**
145 * Base64Encode - Base64 encode
146 * @src: Data to be encoded
147 * @len: Length of the data to be encoded
148 * Returns: Allocated buffer of outLen bytes of encoded data,
149 * or empty string on failure
150 */
Base64Encode(const unsigned char * src,size_t len,EncodingType type)151 string Base64Encode(const unsigned char *src, size_t len, EncodingType type)
152 {
153 if (src == nullptr) {
154 return string();
155 }
156 unsigned char *out = nullptr;
157 unsigned char *pos = nullptr;
158 const unsigned char *pEnd = nullptr;
159 const unsigned char *pStart = nullptr;
160 size_t outLen = 4 * ((len + 2) / 3); // 3-byte blocks to 4-byte
161
162 if (outLen < len) {
163 return string(); // integer overflow
164 }
165
166 string outStr = "";
167 outStr.resize(outLen);
168 out = reinterpret_cast<unsigned char *>(&outStr[0]);
169
170 pEnd = src + len;
171 pStart = src;
172 pos = out;
173
174 string table = BASE64_TABLE;
175 if (type == BASE64URL) {
176 table = BASE64URL_TABLE;
177 }
178 // 3 : 3 bytes is just 24 bits which is 4 times of 6 bits
179 while (pEnd - pStart >= 3) {
180 // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
181 *pos = table[pStart[0] >> 2];
182 // 4 : add two zeros in front of the following second set of 6 bits to become the new 8 binary bits
183 *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
184 // 2 : 4 : 6 : add two zeros in front of the following third set of 6 bits to become the new 8 binary bits
185 *(pos + 2) = table[((pStart[1] & LOWER_4_BITS_MASK) << 2) | (pStart[2] >> 6)];
186 // 2 : 3 : add two zeros in front of the following forth set of 6 bits to become the new 8 binary bits
187 *(pos + 3) = table[pStart[2] & LOWER_6_BITS_MASK];
188 // 4 : the pointer of pos scrolls off 4 bytes to point the next 4 bytes of encoded chars
189 pos += 4;
190 // 3 : the pointer of pStart scrolls off 3 bytes to point the next 3 bytes of which will be encoded chars
191 pStart += 3;
192 }
193
194 // process the last set of less than 3 bytes of data
195 if (pEnd - pStart > 0) {
196 // 2 : add two zeros in front of the first set of 6 bits to become a new 8 binary bits
197 *pos = table[pStart[0] >> 2];
198 if (pEnd - pStart == 1) { // one byte remaining
199 // 4 : paddle the last two bits of the last byte with two zeros in front of it and four zeros after it
200 *(pos + 1) = table[(pStart[0] & LOWER_2_BITS_MASK) << 4];
201 // 2 : fill in the missing bytes with '='
202 *(pos + 2) = '=';
203 } else { // two bytes remaining
204 // 4 : add two zeros in front of the second set of 6 bits to become the new 8 binary bits
205 *(pos + 1) = table[((pStart[0] & LOWER_2_BITS_MASK) << 4) | (pStart[1] >> 4)];
206 // 2 : paddle the last four bits of the last byte with two zeros in front of it and two zeros after it
207 *(pos + 2) = table[(pStart[1] & LOWER_4_BITS_MASK) << 2];
208 }
209 // 3 : fill in the missing bytes with '='
210 *(pos + 3) = '=';
211 }
212
213 if (type == BASE64URL) {
214 size_t poss = outStr.find_last_not_of('=');
215 if (poss != std::string::npos) {
216 outStr.erase(poss + 1);
217 }
218 }
219 return outStr;
220 }
221
Base64Decode(string const & encodedStr,EncodingType type)222 string Base64Decode(string const& encodedStr, EncodingType type)
223 {
224 size_t len = encodedStr.size();
225 unsigned int index = 0;
226 unsigned int cursor = 0;
227 unsigned char charArray4[4] = {0}; // an array to stage a group of indexes for encoded string
228 unsigned char charArray3[3] = {0}; // an array to stage a set of original string
229 string ret = "";
230 string table = BASE64_TABLE;
231
232 if (type == BASE64URL) {
233 table = BASE64URL_TABLE;
234 }
235 while ((encodedStr[cursor] != '=') && IsBase64Char(encodedStr[cursor])) {
236 // stage a 4-byte string to charArray4
237 charArray4[index] = encodedStr[cursor];
238 index++;
239 cursor++;
240 if (index == 4) { // 4 : after 4 chars is assigned to charArray4
241 // 4 : fill data into charArray4
242 for (index = 0; index < 4; index++) {
243 charArray4[index] = table.find(charArray4[index]) & LOWER_8_BITS_MASK;
244 }
245 // get the last six bits of the first byte of charArray4 and the first valid
246 // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
247 charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
248 // get the last four bits of the second byte of charArray4 and the first valid
249 // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
250 charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & MIDDLE_4_BITS_MASK) >> 2);
251 // get the last two bits of the third byte of charArray4 and the forth byte,
252 // 2 : 3 : 6 : combine them to a new byte
253 charArray3[2] = ((charArray4[2] & LOWER_2_BITS_MASK) << 6) + charArray4[3];
254 // 3 : assigns the decoded string to the return value
255 for (index = 0; index < 3; index++) {
256 ret += charArray3[index];
257 }
258 index = 0;
259 }
260 if (cursor > len - 1) {
261 break;
262 }
263 }
264
265 if (index != 0) {
266 // fill data into charArray4
267 for (unsigned int i = 0; i < index; i++) {
268 charArray4[i] = table.find(charArray4[i]) & LOWER_8_BITS_MASK;
269 }
270 // get the last six bits of the first byte of charArray4 and the first valid
271 // 2 : 4 : two bits(except two higer bits) of the second byte, combine them to a new byte
272 charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4);
273 // get the last four bits of the second byte of charArray4 and the first valid
274 // 4 : 2 : four bits(except two higer bits) of the third byte, combine them to a new byte
275 charArray3[1] = ((charArray4[1] & LOWER_4_BITS_MASK) << 4) + ((charArray4[2] & LOWER_6_BITS_MASK) >> 2);
276 // assigns the decoded string to the return value
277 for (unsigned int i = 0; i < index - 1; i++) {
278 ret += charArray3[i];
279 }
280 }
281
282 return ret;
283 }
284
// Returns true only when hex is non-empty and consists solely of the
// characters 0-9, A-F and a-f.
bool IsValidHex(const string &hex)
{
    if (hex.empty()) {
        return false;
    }
    for (const char ch : hex) {
        const bool isDecimal = (ch >= '0') && (ch <= '9');
        const bool isUpperHex = (ch >= 'A') && (ch <= 'F');
        const bool isLowerHex = (ch >= 'a') && (ch <= 'f');
        if (!isDecimal && !isUpperHex && !isLowerHex) {
            return false;
        }
    }
    return true;
}
300
HexDecode(const string & hexStr)301 string HexDecode(const string &hexStr)
302 {
303 string nums = "";
304 unsigned int arrSize = hexStr.size();
305
306 // 2 : means a half length of hex str's size
307 for (unsigned int i = 0; i < arrSize / 2; i++) {
308 string hexStrTmp = "";
309 int num = 0;
310 // 2 : offset is i * 2
311 hexStrTmp.push_back(hexStr[i * 2]);
312 // 2 : offset is i * 2 + 1
313 hexStrTmp.push_back(hexStr[i * 2 + 1]);
314 if (!IsValidHex(hexStrTmp)) {
315 break;
316 }
317 // 16 : the base is 16
318 num = stoi(hexStrTmp, nullptr, 16);
319 nums.push_back(static_cast<char>(num));
320 }
321
322 return nums;
323 }
324
// Walks backwards from patIndex looking for a byte equal to the last byte of
// pat. Returns the distance from that position to the last index of pat, or
// patLen when no such byte is found at or before patIndex.
int GetGoodSuffixLengthByLastChar(uint8_t *pat, int patIndex, int patLen)
{
    const int tailPos = patLen - 1;
    for (int pos = patIndex; pos >= 0; --pos) {
        if (pat[pos] == pat[tailPos]) {
            return tailPos - pos;
        }
    }
    // Not found: equivalent to a match position of -1
    return tailPos + 1;
}
// Scans forward from patIndex for a byte equal to the first byte of pat and
// returns its index, or tarlen when none is found before tarlen.
int GetGoodSuffixLengthByFirstChar(uint8_t *pat, int patIndex, int tarlen)
{
    for (int pos = patIndex; pos < tarlen; ++pos) {
        if (pat[pos] == pat[0]) {
            return pos;
        }
    }
    return tarlen;
}
352
// Searches pat backwards, starting just before patIndex, for singleChar.
// Returns the distance from patIndex to the position found, or patIndex + 1
// when singleChar does not occur before patIndex.
int GetBadCharLengthInReverseOrder(uint8_t *pat, char singleChar, int patIndex)
{
    // Compare as unsigned bytes: where char is signed, a value >= 0x80 would be
    // promoted to a negative int and could never equal a uint8_t pattern byte.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int index = -1;
    for (int i = patIndex - 1; i >= 0; --i) {
        if (pat[i] == wanted) {
            index = i;
            break;
        }
    }
    return patIndex - index;
}
366
// Searches pat forwards from patIndex for singleChar and returns the index of
// the first occurrence, or tarlen when it does not occur before tarlen.
int GetBadCharLengthInSequence(uint8_t *pat, char singleChar, int patIndex, int tarlen)
{
    // Compare as unsigned bytes: where char is signed, a value >= 0x80 would be
    // promoted to a negative int and could never equal a uint8_t pattern byte.
    const uint8_t wanted = static_cast<uint8_t>(singleChar);
    int resIndex = tarlen;
    for (int i = patIndex; i < tarlen; i++) {
        if (pat[i] == wanted) {
            resIndex = i;
            break;
        }
    }
    return resIndex;
}
379
FindLastIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)380 int FindLastIndex(uint8_t *source, uint8_t *target, int soulen, int tarlen)
381 {
382 if (source == nullptr || target == nullptr) {
383 return -1;
384 }
385 if (soulen < tarlen || tarlen == 0) {
386 return -1;
387 }
388 int i = soulen - tarlen;
389 int j = 0;
390
391 while (i >= 0) {
392 if (source[i] == target[j]) {
393 if (j == tarlen - 1) {
394 return i - (tarlen - 1);
395 }
396 i++;
397 j++;
398 } else {
399 if (j == 0) {
400 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
401 i = i - badValue;
402 j = 0;
403 } else {
404 int badValue = GetBadCharLengthInSequence(target, source[i], j, tarlen);
405 int goodSuffix = GetGoodSuffixLengthByFirstChar(target, j, tarlen);
406 int distance = badValue > goodSuffix ? badValue : goodSuffix;
407 i = i - distance;
408 j = 0;
409 }
410 }
411 }
412 return -1;
413 }
414
FindIndex(uint8_t * source,uint8_t * target,int soulen,int tarlen)415 int FindIndex(uint8_t* source, uint8_t* target, int soulen, int tarlen)
416 {
417 if (source == nullptr || target == nullptr) {
418 return -1;
419 }
420 if (soulen < tarlen || tarlen == 0) {
421 return -1;
422 }
423 int i = tarlen - 1;
424 int j = tarlen - 1;
425 while (i < soulen) {
426 if (source[i] == target[j]) {
427 if (j == 0) {
428 return i;
429 }
430 i--;
431 j--;
432 } else {
433 if (j == tarlen - 1) {
434 int badValue = GetBadCharLengthInReverseOrder(target, source[i], j);
435 i = i + badValue;
436 j = tarlen - 1;
437 } else {
438 int badValue = GetBadCharLengthInReverseOrder(target, source[i], j);
439 int goodSuffix = GetGoodSuffixLengthByLastChar(target, j, tarlen);
440 int distance = badValue > goodSuffix ? badValue : goodSuffix;
441 i = i + tarlen - 1 - j + distance;
442 j = tarlen - 1;
443 }
444 }
445 }
446 return -1;
447 }
448 }
449