/*
 * Copyright (c) 2023 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#ifndef STRINGHELPER_H
#define STRINGHELPER_H

#pragma once
#include <cstddef>
#include <cstdint>
#include <string>

/**
 * Static helpers for guessing the text encoding of a raw byte buffer and for
 * converting strings to and from UTF-8.
 */
class StringHelper {
public:
    /**
     * Encodings reported by the detection helpers.
     * ANSI is the fallback: it is returned for pure 7-bit input and for any
     * input that is not a well-formed multi-byte sequence.
     */
    enum Encode { ANSI = 1, UTF16_LE, UTF16_BE, UTF8_BOM, UTF8 };

    /**
     * Checks whether a buffer (without BOM) is structured like UTF-8.
     *
     * Only the lead-byte / continuation-byte layout is validated; sequences of
     * up to 6 bytes (the legacy UTF-8 form) are accepted.
     *
     * @param data Pointer to the first byte of the buffer; may be null.
     * @param size Number of bytes available at data.
     * @return Encode::UTF8 when the buffer contains at least one multi-byte
     *         sequence and every sequence is complete and well-formed;
     *         Encode::ANSI otherwise (including null, empty and pure 7-bit input).
     */
    static Encode IsUtf8Data(const uint8_t* data, size_t size)
    {
        if (data == nullptr) {
            return Encode::ANSI;
        }

        bool sawMultiByte = false; // stays false while every byte is 0XXX_XXXX
        int32_t pending = 0;       // continuation bytes still expected for the current char

        for (size_t i = 0; i < size; ++i) {
            const uint8_t ch = data[i];

            if (pending > 0) {
                // Inside a sequence: every following byte must look like 10XX_XXXX.
                if ((ch & 0xC0) != 0x80) {
                    return Encode::ANSI;
                }
                --pending;
                continue;
            }

            if (ch < 0x80) {
                continue; // single-byte char: 0XXX_XXXX
            }
            sawMultiByte = true;

            if (ch >= 0xFE) {
                // 0xFE and 0xFF match no lead-byte pattern and never occur in UTF-8.
                return Encode::ANSI;
            } else if (ch >= 0xFC) {
                pending = 5; // 6-byte char: 1111_110X + 5 continuation bytes
            } else if (ch >= 0xF8) {
                pending = 4; // 5-byte char: 1111_10XX + 4 continuation bytes
            } else if (ch >= 0xF0) {
                pending = 3; // 4-byte char: 1111_0XXX + 3 continuation bytes
            } else if (ch >= 0xE0) {
                pending = 2; // 3-byte char: 1110_XXXX + 2 continuation bytes
            } else if (ch >= 0xC0) {
                pending = 1; // 2-byte char: 110X_XXXX + 1 continuation byte
            } else {
                // 10XX_XXXX where a lead byte was expected.
                return Encode::ANSI;
            }
        }

        // A sequence cut off at the end of the buffer, or no multi-byte char at
        // all, is reported as ANSI.
        if (pending > 0 || !sawMultiByte) {
            return Encode::ANSI;
        }
        return Encode::UTF8;
    }

    /**
     * Guesses the encoding of a buffer, looking for a byte-order mark first.
     *
     * @param data Pointer to the first byte of the buffer; may be null.
     * @param size Number of bytes available at data.
     * @return Encode::UTF16_LE for a leading FF FE, Encode::UTF16_BE for a
     *         leading FE FF, Encode::UTF8_BOM for a leading EF BB BF; otherwise
     *         the result of IsUtf8Data(). A buffer that consists of the BOM
     *         alone is still reported by its BOM.
     */
    static Encode DetectEncode(const uint8_t* data, size_t size)
    {
        constexpr size_t utf16BomLen = 2;
        constexpr size_t utf8BomLen = 3;

        if (data == nullptr) {
            return Encode::ANSI;
        }

        if (size >= utf16BomLen) {
            if (data[0] == 0xFF && data[1] == 0xFE) {
                return Encode::UTF16_LE;
            }
            if (data[0] == 0xFE && data[1] == 0xFF) {
                return Encode::UTF16_BE;
            }
        }
        if (size >= utf8BomLen && data[0] == 0xEF && data[1] == 0xBB && data[2] == 0xBF) {
            return Encode::UTF8_BOM;
        }
        return IsUtf8Data(data, size);
    }

    // Declared here, defined elsewhere in the project.
    static std::string StringToUtf8(const std::string& str);
    static std::string Utf8ToString(const std::string& str);
};

#endif // STRINGHELPER_H