• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2021 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef ECMASCRIPT_BASE_UTF_HELPER_H
17 #define ECMASCRIPT_BASE_UTF_HELPER_H
18 
#include <array>
#include <cstddef>
#include <cstdint>
#include <utility>
#include <vector>

#include "libpandabase/utils/utf.h"
#include "ecmascript/common.h"
24 
25 namespace panda::ecmascript::base::utf_helper {
26 
// Named small integers (2, 3 and 4).
static constexpr size_t CONST_2 = 2;
static constexpr size_t CONST_3 = 3;
static constexpr size_t CONST_4 = 4;
// Single-bit masks: 0x80 is bit 7, 0x20 is bit 5, 0x10 is bit 4.
static constexpr size_t MASK1 = 0x80;
static constexpr size_t MASK2 = 0x20;
static constexpr size_t MASK3 = 0x10;
// Masks that keep the low 3, 4, 5 and 6 bits of a value.
static constexpr size_t LOW_3BITS = 0x7;
static constexpr size_t LOW_4BITS = 0xF;
static constexpr size_t LOW_5BITS = 0x1F;
static constexpr size_t LOW_6BITS = 0x3F;
// UTF-16 landmarks: 0xDC00 is the first low (trail) surrogate, 0xD800 the first high (lead) surrogate,
// and 0x10000 the first code point that needs a surrogate pair.
// NOTE(review): "RAIR" looks like a typo for "PAIR"; the name is left alone because callers may use it.
static constexpr size_t L_SURROGATE_START = 0xDC00;
static constexpr size_t H_SURROGATE_START = 0xD800;
static constexpr size_t SURROGATE_RAIR_START = 0x10000;
// Shift distances of 18, 12, 10 and 6 bits.
static constexpr size_t OFFSET_18POS = 18;
static constexpr size_t OFFSET_12POS = 12;
static constexpr size_t OFFSET_10POS = 10;
static constexpr size_t OFFSET_6POS = 6;
// Inclusive bounds of the UTF-16 lead surrogate range (0xD800..0xDBFF)
// and trail surrogate range (0xDC00..0xDFFF).
static constexpr uint16_t DECODE_LEAD_LOW = 0xD800;
static constexpr uint16_t DECODE_LEAD_HIGH = 0xDBFF;
static constexpr uint16_t DECODE_TRAIL_LOW = 0xDC00;
static constexpr uint16_t DECODE_TRAIL_HIGH = 0xDFFF;
// 0x400 (1 << 10) and 0x10000 match the multiplier and bias of the standard surrogate-pair formula
//     cp = (lead - 0xD800) * 0x400 + (trail - 0xDC00) + 0x10000
// NOTE(review): the use site is not visible in this header -- confirm against the implementation.
static constexpr uint32_t DECODE_FIRST_FACTOR = 0x400;
static constexpr uint32_t DECODE_SECOND_FACTOR = 0x10000;
// Bit counts 6 and 10.
// NOTE(review): presumably payload bits per UTF-8 continuation byte and per UTF-16 surrogate -- confirm.
static constexpr uint32_t UTF8_OFFSET = 6;
static constexpr uint32_t UTF16_OFFSET = 10;
// 0xF800 keeps the top five bits of a 16-bit unit; (unit & 0xF800) == 0xD800 holds exactly for 0xD800..0xDFFF.
// NOTE(review): presumably used for that "is any surrogate" test -- confirm.
static constexpr uint16_t SURROGATE_MASK = 0xF800;
// U+FFFD, the Unicode replacement character.
static constexpr uint16_t UTF16_REPLACEMENT_CHARACTER = 0xFFFD;
54 
// Byte masks with the top 1, 2, 3, 4 and 5 bits set (0x80, 0xC0, 0xE0, 0xF0, 0xF8).
// NOTE(review): these match the UTF-8 lead-byte class prefixes; presumably used to classify lead bytes -- confirm.
static constexpr uint8_t BIT_MASK_1 = 0x80;
static constexpr uint8_t BIT_MASK_2 = 0xC0;
static constexpr uint8_t BIT_MASK_3 = 0xE0;
static constexpr uint8_t BIT_MASK_4 = 0xF0;
static constexpr uint8_t BIT_MASK_5 = 0xF8;
// All eight bits set.
static constexpr uint8_t BIT_MASK_FF = 0xFF;
// Low ten bits set (0x3FF), the payload width of one UTF-16 surrogate.
static constexpr uint16_t BIT16_MASK = 0x3FF;
62 
// Largest code point that fits in a one-byte UTF-8 sequence (U+007F).
static constexpr uint8_t UTF8_1B_MAX = 0x7f;

// Two-byte sequences: largest code point is U+07FF; 0xc0 is the 110xxxxx lead-byte prefix and
// 0x80 the 10xxxxxx continuation-byte prefix.
// UTF8_2B_THIRD (0x3f) is a low-six-bit mask.
// NOTE(review): presumably the continuation payload mask despite the "THIRD" name -- confirm.
static constexpr uint16_t UTF8_2B_MAX = 0x7ff;
static constexpr uint8_t UTF8_2B_FIRST = 0xc0;
static constexpr uint8_t UTF8_2B_SECOND = 0x80;
static constexpr uint8_t UTF8_2B_THIRD = 0x3f;
static constexpr uint8_t UTF8_2B_FIRST_MIN = 0xc2;  // the minimum for 2 bytes is 128, which is 0xc280

// Three-byte sequences: largest code point is U+FFFF; 0xe0 is the 1110xxxx lead-byte prefix and
// 0x80 the continuation-byte prefix for the second and third bytes.
static constexpr uint16_t UTF8_3B_MAX = 0xffff;
static constexpr uint8_t UTF8_3B_FIRST = 0xe0;
static constexpr uint8_t UTF8_3B_SECOND = 0x80;
static constexpr uint8_t UTF8_3B_THIRD = 0x80;
static constexpr uint8_t UTF8_3B_SECOND_MIN = 0xa0;  // the minimum for 3 bytes is 2048, which is 0xe0a080
// Lead byte 0xED followed by a second byte in 0xA0..0xBF would encode U+D800..U+DFFF.
static constexpr uint8_t UTF8_3B_RESERVED_FIRST = 0xED;
static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MIN = 0xA0;
static constexpr uint8_t UTF8_3B_RESERVED_SECOND_MAX = 0xBF; // U+D800~U+DFFF is reserved for UTF-16 surrogate pairs

// Four-byte sequences: 0xf0 is the 11110xxx lead-byte prefix.
static constexpr uint8_t UTF8_4B_FIRST = 0xf0;
static constexpr uint8_t UTF8_4B_SECOND_MIN = 0x90;  // the minimum for 4 bytes is 65536, which is 0xf0908080
static constexpr uint8_t UTF8_4B_FIRST_MAX = 0xF4; // the maximum for 4 bytes is 1114111, which is 0x10FFFF
// With lead byte 0xF4 the second byte may be at most 0x8F, otherwise the value exceeds U+10FFFF.
static constexpr uint8_t UTF8_4B_SECOND_MAX = 0x8F;
84 
// 0xbf is 0b10111111 and 0x80 is 0b10000000.
// NOTE(review): presumably combined as (x | byteMark) & byteMask to form a 10xxxxxx continuation byte -- confirm.
static constexpr uint8_t byteMask = 0xbf;
static constexpr uint8_t byteMark = 0x80;

// Largest value representable in Latin-1 (0xFF).
static constexpr uint8_t latin1Limit = 0xFF;

// Negative sentinel (-1).
// NOTE(review): presumably returned in place of a code point for malformed UTF-8 -- confirm against the definitions.
static constexpr int32_t INVALID_UTF8 = -1;
91 
// Named byte counts 1..4.
// NOTE(review): presumably the length of an encoded UTF-8 sequence -- confirm.
enum UtfLength : uint8_t { ONE = 1, TWO = 2, THREE = 3, FOUR = 4 };
// Named shift distances (6, 10, 12 and 18 bits); TEN is the shift used by CombineTwoU16 in this header.
enum UtfOffset : uint8_t { SIX = 6, TEN = 10, TWELVE = 12, EIGHTEEN = 18 };
94 
// Capacity, in bytes, of the buffer held by Utf8Char.
static constexpr size_t MAX_BYTES = 4;

/*
 * Fixed-capacity byte buffer plus a count, returned by value from ConvertUtf16ToUtf8.
 *   n  - a byte count; presumably how many leading bytes of `ch` are meaningful (confirm against the producer).
 *   ch - storage for up to MAX_BYTES bytes.
 *
 * Both members carry default initialisers so that a default-constructed object is well defined
 * (n == 0, all bytes zero) instead of holding indeterminate values. The struct is still an
 * aggregate, so existing brace initialisation such as Utf8Char{2, {0xC3, 0xA9}} keeps working.
 */
struct Utf8Char {
    size_t n = 0;
    std::array<uint8_t, MAX_BYTES> ch {};
};
100 
// Lookup table of seven bytes. Entries 2..6 (0xC0, 0xE0, 0xF0, 0xF8, 0xFC) are the lead-byte prefixes of
// 2- to 6-byte sequences in the original UTF-8 scheme; entries 0 and 1 are 0x00.
// NOTE(review): presumably indexed by the encoded byte count -- confirm against the use site.
static const unsigned char firstByteMark[7] = {0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC};
102 
// The functions below are only declared here; their definitions are not part of this header, so the
// notes describe what the signatures show and mark everything else as an assumption to confirm.

// Takes a UTF-16 buffer of `len` code units and a position passed by pointer; returns a 32-bit value.
// NOTE(review): presumably decodes the code point at *index and advances *index over a consumed trail
// surrogate; `cesu8` (default false) presumably disables pair combining -- confirm.
uint32_t DecodeUTF16(uint16_t const *utf16, size_t len, size_t *index, bool cesu8 = false);

// Takes a code point, an output byte buffer, a position `index` and a `size`; returns a size_t.
// NOTE(review): presumably writes the UTF-8 encoding at utf8[index], bounded by size, and returns the
// new position or byte count -- confirm.
size_t EncodeUTF8(uint32_t codepoint, uint8_t* utf8, size_t index, size_t size);

// Takes a lead and a trail UTF-16 unit and returns a 32-bit value.
// NOTE(review): presumably the combined code point -- confirm.
uint32_t UTF16Decode(uint16_t lead, uint16_t trail);

// Takes a byte vector by const reference and returns a bool.
// NOTE(review): presumably true when `data` is well-formed UTF-8 -- confirm.
bool IsValidUTF8(const std::vector<uint8_t> &data);

// Takes two UTF-16 units and returns a Utf8Char (byte count plus up to MAX_BYTES bytes).
// NOTE(review): the meaning of `modify` and `isWriteBuffer` is not visible here -- confirm.
Utf8Char ConvertUtf16ToUtf8(uint16_t d0, uint16_t d1, bool modify, bool isWriteBuffer = false);

// Takes a UTF-16 buffer and its length; returns a size_t.
// NOTE(review): presumably the number of UTF-8 bytes the input needs; the flag semantics are not visible -- confirm.
size_t Utf16ToUtf8Size(const uint16_t *utf16, uint32_t length, bool modify = true,
                       bool isGetBufferSize = false, bool cesu8 = false);

// Exported via PUBLIC_API. Takes an input UTF-16 buffer and an output UTF-8 buffer with their lengths,
// plus a `start` position; returns a size_t.
// NOTE(review): presumably the number of bytes written; which buffer `start` applies to is not visible -- confirm.
size_t PUBLIC_API ConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len,
                                           size_t utf8Len, size_t start, bool modify = true,
                                           bool isWriteBuffer = false, bool cesu = false);

// Same parameter list as ConvertRegionUtf16ToUtf8 minus the trailing `cesu` flag.
// NOTE(review): how the debugger variant differs is not visible here -- confirm.
size_t DebuggerConvertRegionUtf16ToUtf8(const uint16_t *utf16In, uint8_t *utf8Out, size_t utf16Len, size_t utf8Len,
                                        size_t start, bool modify = true, bool isWriteBuffer = false);

// Same parameter list as DecodeUTF16 without the `cesu8` flag.
// NOTE(review): presumably handles unpaired surrogates differently (the header defines
// UTF16_REPLACEMENT_CHARACTER) -- confirm.
uint32_t HandleAndDecodeInvalidUTF16(uint16_t const *utf16, size_t len, size_t *index);

// Takes a pointer to UTF-8 bytes with no length parameter; returns a (uint32_t, size_t) pair.
// NOTE(review): presumably (decoded UTF-16 unit(s), bytes consumed), with the caller guaranteeing enough
// readable bytes -- confirm.
std::pair<uint32_t, size_t> ConvertUtf8ToUtf16Pair(const uint8_t *data, bool combine = false);

// Takes a UTF-8 buffer and its length; returns a size_t.
// NOTE(review): presumably the number of UTF-16 units needed -- confirm.
size_t Utf8ToUtf16Size(const uint8_t *utf8, size_t utf8Len);

// Takes an input UTF-8 buffer and an output UTF-16 buffer with their lengths; returns a size_t.
// NOTE(review): presumably the number of UTF-16 units written -- confirm.
size_t ConvertRegionUtf8ToUtf16(const uint8_t *utf8In, uint16_t *utf16Out, size_t utf8Len, size_t utf16Len);

// Takes an input UTF-16 buffer and an output Latin-1 byte buffer with their lengths; returns a size_t.
// NOTE(review): presumably the number of bytes written; handling of units above latin1Limit is not visible -- confirm.
size_t ConvertRegionUtf16ToLatin1(const uint16_t *utf16In, uint8_t *latin1Out, size_t utf16Len, size_t latin1Len);
132 
CombineTwoU16(uint16_t d0,uint16_t d1)133 static inline uint32_t CombineTwoU16(uint16_t d0, uint16_t d1)
134 {
135     uint32_t codePoint = d0 - utf::HI_SURROGATE_MIN;
136     codePoint <<= UtfOffset::TEN;
137     codePoint |= d1 - utf::LO_SURROGATE_MIN;
138     codePoint += utf::LO_SUPPLEMENTS_MIN;
139     return codePoint;
140 }
141 
// Declaration only. Takes a UTF-8 byte pointer and `maxLen`; returns an (int32_t, size_t) pair.
// NOTE(review): presumably (code point, or INVALID_UTF8 on malformed input; bytes consumed), reading at
// most maxLen bytes -- confirm against the definition.
std::pair<int32_t, size_t> ConvertUtf8ToUnicodeChar(const uint8_t *utf8, size_t maxLen);
143 
/*
 * Returns true when `ch` is an ASCII hexadecimal digit: '0'-'9', 'A'-'F' or 'a'-'f'.
 * Every other 16-bit value, including non-ASCII digit look-alikes, yields false.
 * Declared constexpr so it can also be evaluated at compile time.
 */
static inline constexpr bool IsHexDigits(uint16_t ch)
{
    return (ch >= '0' && ch <= '9') || (ch >= 'A' && ch <= 'F') || (ch >= 'a' && ch <= 'f');
}
148 
149 }  // namespace panda::ecmascript::base::utf_helper
150 
151 #endif  // ECMASCRIPT_BASE_UTF_HELPER_H