/**
 * Copyright (c) 2024 Huawei Device Co., Ltd.
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
15 
16 #ifndef FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H
17 #define FOUNDATION_ACE_FRAMEWORKS_BASE_UTILS_UTF_H
18 
19 #include <cstddef>
20 #include <cstdint>
21 #include <utility>
22 #include <string>
23 
24 namespace OHOS::Ace {
25 
/*
 * UTF-8 byte layout, see https://en.wikipedia.org/wiki/UTF-8
 *
 * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4
 *    code point   code point   code point
 * 1  7            U+0000       U+007F      0xxxxxxx
 * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
 * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
 * 4  21           U+10000      U+10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
 */

// Single-bit probes. Per the table above, bit 0x80 is clear in a 1-byte sequence,
// bit 0x20 is clear in a 2-byte lead (110xxxxx) and bit 0x10 is clear in a 3-byte
// lead (1110xxxx). Presumably tested in that order against the lead byte - TODO confirm.
constexpr size_t MASK1 = 0x80;
constexpr size_t MASK2 = 0x20;
constexpr size_t MASK3 = 0x10;

// Low-bit masks. 4, 5 and 6 bits are the payload sizes ("x" positions in the table) of a
// 3-byte lead, a 2-byte lead and a continuation byte respectively.
constexpr size_t MASK_4BIT = 0x0f;
constexpr size_t MASK_5BIT = 0x1f;
constexpr size_t MASK_6BIT = 0x3f;
// Low 10 bits and low 16 bits.
constexpr size_t MASK_10BIT = 0x03ff;
constexpr size_t MASK_16BIT = 0xffff;

// 6 matches the number of payload bits in a continuation byte (10xxxxxx).
constexpr size_t DATA_WIDTH = 6;
// 16 matches the bit width of one uint16_t code unit.
constexpr size_t PAIR_ELEMENT_WIDTH = 16;
48 
// UTF-16 surrogate ranges: high (lead) surrogates are 0xd800..0xdbff,
// low (trail) surrogates are 0xdc00..0xdfff.
constexpr size_t HI_SURROGATE_MIN = 0xd800;
constexpr size_t HI_SURROGATE_MAX = 0xdbff;
constexpr size_t LO_SURROGATE_MIN = 0xdc00;
constexpr size_t LO_SURROGATE_MAX = 0xdfff;

// First code point of the supplementary planes (U+10000, see the 4-byte row of the UTF-8 table).
constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;

// Surrogate bias values. U16_LEAD equals HI_SURROGATE_MIN - (LO_SUPPLEMENTS_MIN >> 10), so
// (codePoint >> 10) + U16_LEAD yields the lead surrogate; U16_TAIL equals LO_SURROGATE_MIN.
// NOTE(review): the encoder using these lives outside this header - confirm the formula there.
constexpr size_t U16_LEAD = 0xd7c0;
constexpr size_t U16_TAIL = 0xdc00;
58 
// MUTF8_* constants (presumably "modified UTF-8" - TODO confirm against the implementation).
// Largest value that fits a 1-byte sequence (U+007F).
constexpr uint8_t MUTF8_1B_MAX = 0x7f;

// Largest value that fits a 2-byte sequence (U+07FF), plus the fixed marker bits of its
// lead byte (110xxxxx -> 0xc0) and continuation byte (10xxxxxx -> 0x80).
constexpr uint16_t MUTF8_2B_MAX = 0x7ff;
constexpr uint8_t MUTF8_2B_FIRST = 0xc0;
constexpr uint8_t MUTF8_2B_SECOND = 0x80;

// Marker bits of a 3-byte sequence: lead byte 1110xxxx -> 0xe0, continuation bytes -> 0x80.
constexpr uint8_t MUTF8_3B_FIRST = 0xe0;
constexpr uint8_t MUTF8_3B_SECOND = 0x80;
constexpr uint8_t MUTF8_3B_THIRD = 0x80;

// Marker bits of a 4-byte lead (11110xxx -> 0xf0) and the mask selecting its five fixed bits.
constexpr uint8_t MUTF8_4B_FIRST = 0xf0;
constexpr uint8_t MUTF8_4B_FIRST_MASK = 0xf8;

// Largest value representable in one 16-bit code unit.
constexpr size_t MAX_U16 = 0xffff;
// Named small integers; their meaning depends on the call site (not visible in this header).
constexpr size_t CONST_2 = 2;
constexpr size_t CONST_3 = 3;
constexpr size_t CONST_4 = 4;
constexpr size_t CONST_6 = 6;
constexpr size_t CONST_12 = 12;
78 
// Inclusive bounds of the UTF-16 lead (high) and trail (low) surrogate ranges.
constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
// 0x400 == 1 << 10 and 0x10000 is the first supplementary code point. Presumably combined as
// (lead - DECODE_LEAD_LOW) * DECODE_FIRST_FACTOR + (trail - DECODE_TRAIL_LOW) + DECODE_SECOND_FACTOR
// when decoding a surrogate pair - TODO confirm against the implementation.
constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
// Shift widths: 6 payload bits per UTF-8 continuation byte, 10 payload bits per surrogate.
constexpr uint32_t UTF8_OFFSET = 6;
constexpr uint32_t UTF16_OFFSET = 10;
// (unit & SURROGATE_MASK) == 0xD800 holds exactly for code units in 0xD800..0xDFFF.
constexpr uint16_t SURROGATE_MASK = 0xF800;
// U+FFFD REPLACEMENT CHARACTER.
constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD;
89 
// Largest value that fits a 1-byte UTF-8 sequence (U+007F).
constexpr uint8_t UTF8_1B_MAX = 0x7f;

// 2-byte sequences: largest value (U+07FF), lead marker 110xxxxx (0xc0), continuation marker
// 10xxxxxx (0x80).
constexpr uint16_t UTF8_2B_MAX = 0x7ff;
constexpr uint8_t UTF8_2B_FIRST = 0xc0;
constexpr uint8_t UTF8_2B_SECOND = 0x80;
// NOTE(review): despite the name, 0x3f is the 6-bit continuation payload mask, not a third
// byte marker - confirm intended use at the call sites.
constexpr uint8_t UTF8_2B_THIRD = 0x3f;
constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2; // the minimum for 2 bytes is 128, which is 0xc280

// 3-byte sequences: largest value (U+FFFF), lead marker 1110xxxx (0xe0), continuation
// markers (0x80).
constexpr uint16_t UTF8_3B_MAX = 0xffff;
constexpr uint8_t UTF8_3B_FIRST = 0xe0;
constexpr uint8_t UTF8_3B_SECOND = 0x80;
constexpr uint8_t UTF8_3B_THIRD = 0x80;
constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0; // the minimum for 3 bytes is 2048, which is 0xe0a080

// 4-byte sequences: lead marker 11110xxx (0xf0).
constexpr uint8_t UTF8_4B_FIRST = 0xf0;
constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90; // the minimum for 4 bytes is 65536, which is 0xf0908080

// 0xbf == 10111111 and 0x80 == 10000000. Presumably used together as
// (value | BYTE_MARK) & BYTE_MASK to form a continuation byte - TODO confirm.
constexpr uint8_t BYTE_MASK = 0xbf;
constexpr uint8_t BYTE_MARK = 0x80;
109 
/*
 * Mask/pattern pairs for classifying a UTF-8 byte. Each *_MASK selects the fixed high bits
 * of a lead byte and the matching *_PATTERN is their expected value, e.g.
 * (byte & UTF8_TWO_BYTE_MASK) == UTF8_TWO_BYTE_PATTERN holds for 110xxxxx.
 */
enum UTF8BytePatterns {
    UTF8_TWO_BYTE_MASK = 0xE0,      // 11100000
    UTF8_TWO_BYTE_PATTERN = 0xC0,   // 110xxxxx
    UTF8_THREE_BYTE_MASK = 0xF0,    // 11110000
    UTF8_THREE_BYTE_PATTERN = 0xE0, // 1110xxxx
    UTF8_FOUR_BYTE_MASK = 0xF8,     // 11111000
    UTF8_FOUR_BYTE_PATTERN = 0xF0,  // 11110xxx
    UTF8_MULTIBYTE_FOLLOWER = 0x80, // 10xxxxxx marker bits of a continuation byte
    UTF8_HIGH_BIT = 0x80            // set in every byte of a multi-byte sequence
};
120 
/*
 * Byte values relevant to little-endian UTF-16 detection: FF FE is the byte order mark as it
 * appears in a little-endian stream, and 0x00 is the zero byte value.
 * NOTE(review): how these are combined is decided by the implementation, not visible here.
 */
enum UTF16LEPatterns {
    UTF16LE_BOM_FF = 0xFF,
    UTF16LE_BOM_FE = 0xFE,
    UTF16LE_ZERO_BYTE = 0x00
};
126 
/*
 * Named offsets 1..3. Presumably used to address the 2nd, 3rd and 4th byte of a multi-byte
 * sequence relative to its lead byte - TODO confirm at the call sites.
 */
enum INDEX {
    INDEX_ONE = 1,
    INDEX_TWO = 2,
    INDEX_THREE = 3
};
132 
/*
 * Sequence lengths 1..4, stored in a single byte. The values match the "N" column of the
 * UTF-8 layout table (number of bytes per encoded code point).
 */
enum UtfLength : uint8_t {
    ONE = 1,
    TWO = 2,
    THREE = 3,
    FOUR = 4
};
134 
/*
 * Lead-byte marker bits. Entries 2, 3 and 4 are the 110xxxxx / 1110xxxx / 11110xxx markers
 * of 2-, 3- and 4-byte UTF-8 sequences; entries 5 and 6 continue the same pattern (0xF8, 0xFC).
 * Presumably indexed by the number of bytes in the encoded sequence and OR-ed into the lead
 * byte - TODO confirm against the encoder. constexpr so it can be used in constant expressions.
 */
constexpr unsigned char FIRST_BYTE_MARK[7] = { 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC };
136 
// The functions below are only declared here; their definitions are not part of this header.
// The notes describe what the signatures show. Anything marked "presumably" is an assumption
// to verify against the implementation.

// Takes a byte pointer and a byte limit (default 4) and returns a (uint32_t, size_t) pair.
// Presumably the decoded UTF-16 unit(s) packed into 32 bits plus the number of input bytes
// consumed; the packed value can be unpacked with SplitUtf16Pair - TODO confirm.
std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t* data, size_t maxBytes = 4);

// Takes a byte buffer and its length and returns a size. Presumably the number of UTF-16
// code units needed to hold the converted text - TODO confirm.
size_t MUtf8ToUtf16Size(const uint8_t* mutf8, size_t mutf8Len);

// Reads from utf8In (utf8Len bytes) and writes to utf16Out (capacity utf16Len units).
// Presumably `start` is an offset into the input and the result is the number of code units
// written - TODO confirm.
size_t ConvertRegionUtf8ToUtf16(
    const uint8_t* utf8In, uint16_t* utf16Out, size_t utf8Len, size_t utf16Len, size_t start);

// Reads from utf16In (utf16Len units) and writes to utf8Out (capacity utf8Len bytes).
// Presumably `start` is an offset into the input and the result is the number of bytes
// written - TODO confirm.
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t* utf16In, uint8_t* utf8Out, size_t utf16Len, size_t utf8Len,
    size_t start);

// Receives the string by mutable reference, so it may modify `str` in place.
// Presumably replaces byte sequences that are not valid UTF-8 - TODO confirm.
void ConvertIllegalStr(std::string& str);

// Returns a yes/no answer for `data`; by its name, whether the bytes are valid UTF-8.
// NOTE(review): takes a non-const reference; confirm whether the implementation mutates `data`.
bool IsUTF8(std::string& data);
150 
/**
 * Splits a packed 32-bit value into its two 16-bit halves.
 *
 * @param pair packed value: bits 31..16 become the first element, bits 15..0 the second.
 *             Presumably the packed result of ConvertMUtf8ToUtf16Pair - TODO confirm.
 * @return { upper 16 bits, lower 16 bits }.
 */
inline constexpr std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t LOW_HALF_MASK = 0xffff;
    constexpr uint32_t HIGH_HALF_SHIFT = 16;
    // Both values already fit in 16 bits; the casts make the narrowing explicit.
    return {
        static_cast<uint16_t>(pair >> HIGH_HALF_SHIFT),
        static_cast<uint16_t>(pair & LOW_HALF_MASK)
    };
}
157 
158 } // namespace OHOS::Ace
159 
160 #endif