• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2023 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef STRINGHELPER_H
17 #define STRINGHELPER_H
18 
19 #pragma once
20 #include <string>
21 
/**
 * Stateless helpers for guessing the text encoding of a raw byte buffer and
 * for converting strings to and from UTF-8.
 */
class StringHelper {
public:
    /** Encodings that DetectEncode() / IsUtf8Data() can report. */
    enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

    /**
     * Heuristically decides whether a BOM-less buffer holds UTF-8 text.
     *
     * The buffer is walked byte by byte as a sequence of UTF-8 style
     * characters. The legacy 5- and 6-byte forms (lead bytes 0xF8..0xFD) are
     * still accepted, but 0xFE and 0xFF, which can never appear in UTF-8,
     * are rejected.
     *
     * @param data Bytes to inspect; may be nullptr.
     * @param size Number of bytes readable at data.
     * @return Encode::UTF8 when every multi-byte sequence is well formed,
     *         complete, and at least one non-ASCII byte is present;
     *         Encode::ANSI otherwise (including empty, null, or pure 7-bit
     *         input).
     */
    static Encode IsUtf8Data(const uint8_t* data, size_t size)
    {
        if (data == nullptr) {
            return Encode::ANSI;
        }

        bool onlyAscii = true;
        int32_t pending = 0; // continuation bytes still owed to the current character
        for (size_t i = 0; i < size; ++i) {
            const uint8_t ch = data[i];
            if (pending > 0) {
                // Inside a multi-byte character: every byte must look like 10XX_XXXX.
                if ((ch & 0xC0) != 0x80) {
                    return Encode::ANSI;
                }
                --pending;
                continue;
            }
            if (ch < 0x80) {
                continue; // Plain 7-bit character: 0XXX_XXXX
            }

            onlyAscii = false;
            if (ch >= 0xFE) {
                // 0xFE / 0xFF are never valid in UTF-8 (they are the UTF-16 BOM bytes).
                return Encode::ANSI;
            } else if (ch >= 0xFC) {
                // 6 bytes: 1111_110X 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
                pending = 5;
            } else if (ch >= 0xF8) {
                // 5 bytes: 1111_10XX 10XX_XXXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
                pending = 4;
            } else if (ch >= 0xF0) {
                // 4 bytes: 1111_0XXX 10XX_XXXX 10XX_XXXX 10XX_XXXX
                pending = 3;
            } else if (ch >= 0xE0) {
                // 3 bytes: 1110_XXXX 10XX_XXXX 10XX_XXXX
                pending = 2;
            } else if (ch >= 0xC0) {
                // 2 bytes: 110X_XXXX 10XX_XXXX
                pending = 1;
            } else {
                // A continuation byte (10XX_XXXX) where a lead byte was expected.
                return Encode::ANSI;
            }
        }

        // A character cut off at the end of the buffer, or pure 7-bit data, is not UTF-8.
        if (pending > 0 || onlyAscii) {
            return Encode::ANSI;
        }
        return Encode::UTF8;
    }

    /**
     * Detects the encoding of a buffer, preferring a byte-order mark when one
     * is present and falling back to IsUtf8Data() otherwise.
     *
     * A buffer that consists of nothing but a BOM is still reported as the
     * encoding that BOM announces.
     *
     * @param data Bytes to inspect; may be nullptr.
     * @param size Number of bytes readable at data.
     * @return UTF16_LE for FF FE, UTF16_BE for FE FF, UTF8_BOM for EF BB BF,
     *         otherwise the result of IsUtf8Data(data, size).
     */
    static Encode DetectEncode(const uint8_t* data, size_t size)
    {
        constexpr size_t utf16BomLen = 2;
        constexpr size_t utf8BomLen = 3;

        if (data == nullptr) {
            return Encode::ANSI;
        }
        if (size >= utf16BomLen) {
            if (data[0] == 0xFF && data[1] == 0xFE) {
                return Encode::UTF16_LE;
            }
            if (data[0] == 0xFE && data[1] == 0xFF) {
                return Encode::UTF16_BE;
            }
        }
        if (size >= utf8BomLen && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
            return Encode::UTF8_BOM;
        }
        return IsUtf8Data(data, size);
    }

    // Declared here only; the definitions are not part of this header.
    static std::string StringToUtf8(const std::string& str);
    static std::string Utf8ToString(const std::string& str);
};
91 
92 #endif // STRINGHELPER_H