1 /*
2 * Copyright (c) 2021 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef PANDA_LIBPANDABASE_UTILS_UTF_H_
17 #define PANDA_LIBPANDABASE_UTILS_UTF_H_
18
19 #include <cstdint>
20 #include <cstddef>
21
22 #include "utils/hash.h"
23 #include "utils/span.h"
24
25 namespace panda::utf {
26
/*
 * UTF-8 byte layout by encoded length (x marks a payload bit):
 *
 *  N  Bits for    First       Last        Byte 1    Byte 2    Byte 3    Byte 4
 *     code point  code point  code point
 *  1   7          U+0000      U+007F      0xxxxxxx
 *  2  11          U+0080      U+07FF      110xxxxx  10xxxxxx
 *  3  16          U+0800      U+FFFF      1110xxxx  10xxxxxx  10xxxxxx
 *  4  21          U+10000     U+10FFFF    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx
 */
// Single-bit probes within a byte: bit 7 (0x80), bit 5 (0x20) and bit 4 (0x10).
constexpr size_t MASK1 = 1U << 7U;
constexpr size_t MASK2 = 1U << 5U;
constexpr size_t MASK3 = 1U << 4U;

// Low-N-bit masks, i.e. (2^N - 1): 0x0f, 0x1f, 0x3f, 0x03ff and 0xffff respectively.
constexpr size_t MASK_4BIT = (1U << 4U) - 1U;
constexpr size_t MASK_5BIT = (1U << 5U) - 1U;
constexpr size_t MASK_6BIT = (1U << 6U) - 1U;
constexpr size_t MASK_10BIT = (1U << 10U) - 1U;
constexpr size_t MASK_16BIT = (1U << 16U) - 1U;

// Bit widths: 6 matches the payload of a 10xxxxxx continuation byte, 16 is the size of one
// uint16_t element.
constexpr size_t DATA_WIDTH = 6U;
constexpr size_t PAIR_ELEMENT_WIDTH = 16U;

// UTF-16 surrogate ranges: high (leading) units first, then low (trailing) units.
constexpr size_t HI_SURROGATE_MIN = 0xD800U;
constexpr size_t HI_SURROGATE_MAX = 0xDBFFU;
constexpr size_t LO_SURROGATE_MIN = 0xDC00U;
constexpr size_t LO_SURROGATE_MAX = 0xDFFFU;

// First code point that does not fit in 16 bits.
constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000U;

// Surrogate composition offsets. Numerically, U16_LEAD == HI_SURROGATE_MIN - (LO_SUPPLEMENTS_MIN >> 10)
// and U16_TAIL == LO_SURROGATE_MIN.
constexpr size_t U16_LEAD = 0xD7C0U;
constexpr size_t U16_TAIL = 0xDC00U;

// Largest code point representable in a single byte (0xxxxxxx).
constexpr uint8_t MUTF8_1B_MAX = 0x7FU;

// Two-byte form: largest code point, lead-byte prefix 110xxxxx and continuation prefix 10xxxxxx.
constexpr uint16_t MUTF8_2B_MAX = 0x7FFU;
constexpr uint8_t MUTF8_2B_FIRST = 0xC0U;
constexpr uint8_t MUTF8_2B_SECOND = 0x80U;

// Three-byte form: lead-byte prefix 1110xxxx followed by two 10xxxxxx continuation prefixes.
constexpr uint8_t MUTF8_3B_FIRST = 0xE0U;
constexpr uint8_t MUTF8_3B_SECOND = 0x80U;
constexpr uint8_t MUTF8_3B_THIRD = 0x80U;

// Four-byte form: lead-byte prefix 11110xxx.
constexpr uint8_t MUTF8_4B_FIRST = 0xF0U;
69
/*
 * Conversion, comparison and size helpers. Only the declarations live in this header.
 * NOTE(review): the remarks on each function are inferred from its name and signature, because the
 * definitions are not visible here — confirm against the implementation before relying on them.
 */

// Presumably decodes one MUTF-8 sequence starting at `data`, examining at most `max_bytes` bytes
// (default 4). The result pairs a 32-bit value with a size_t — likely {UTF-16 unit(s), bytes consumed};
// TODO confirm.
std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4);

// Presumably true when `mutf8_in` contains only single-byte sequences. No length is passed, so the
// input is presumably terminator-delimited — TODO confirm.
bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in);

// Presumably converts `mutf8_len` bytes of `mutf8_in` into UTF-16 written to `utf16_out`.
// No output capacity is passed; the caller presumably sizes the buffer beforehand — TODO confirm.
void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out);

// Bounded MUTF-8 -> UTF-16 conversion taking both buffer lengths and a `start` position.
// NOTE(review): the units of `start` and the meaning of the returned size_t are not visible here.
size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
                                 size_t start);

// Bounded UTF-16 -> MUTF-8 conversion taking both buffer lengths and a `start` position.
// NOTE(review): the units of `start` and the meaning of the returned size_t are not visible here.
size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
                                 size_t start);

// Three-way comparison of two MUTF-8 strings passed without lengths. Mutf8Less in this header treats
// a negative result as "first orders before second".
int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2);

// Three-way comparison of two UTF-8 buffers with explicit lengths.
// NOTE(review): presumably follows the same sign convention as CompareMUtf8ToMUtf8 — confirm.
int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length);

// Equality test for two byte spans.
bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2);

// Equality test for two MUTF-8 strings passed as bare pointers (no lengths).
bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2);

// Presumably the number of UTF-16 code units needed to represent `mutf8` — TODO confirm.
size_t MUtf8ToUtf16Size(const uint8_t *mutf8);

// Same as above, but for an input with an explicit byte length `mutf8_len`.
size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len);

// Presumably the number of MUTF-8 bytes needed for `length` UTF-16 code units at `mutf16`.
// NOTE(review): whether a terminator is counted is not visible here.
size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length);

// Presumably the byte length of the MUTF-8 string at `mutf8` — TODO confirm.
size_t Mutf8Size(const uint8_t *mutf8);
97
/*
 * Views a C string as MUTF-8 bytes.
 *
 * This is a pure pointer conversion: nothing is copied or validated, and the returned pointer
 * addresses the same memory as `str` (a null input yields a null result).
 */
inline const uint8_t *CStringAsMutf8(const char *str)
{
    // Going through `const void *` is the two-step spelling of the same object-pointer conversion.
    const void *untyped = str;
    return static_cast<const uint8_t *>(untyped);
}
102
/*
 * Views MUTF-8 bytes as a C string.
 *
 * This is a pure pointer conversion: nothing is copied or validated, and the returned pointer
 * addresses the same memory as `mutf8` (a null input yields a null result).
 */
inline const char *Mutf8AsCString(const uint8_t *mutf8)
{
    // Going through `const void *` is the two-step spelling of the same object-pointer conversion.
    const void *untyped = mutf8;
    return static_cast<const char *>(untyped);
}
107
IsAvailableNextUtf16Code(uint16_t val)108 inline constexpr bool IsAvailableNextUtf16Code(uint16_t val)
109 {
110 return val >= HI_SURROGATE_MIN && val <= LO_SURROGATE_MAX;
111 }
112
113 struct Mutf8Hash {
operatorMutf8Hash114 uint32_t operator()(const uint8_t *data) const
115 {
116 return GetHash32String(data);
117 }
118 };
119
120 struct Mutf8Equal {
operatorMutf8Equal121 bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const
122 {
123 return IsEqual(mutf8_1, mutf8_2);
124 }
125 };
126
127 struct Mutf8Less {
operatorMutf8Less128 bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const
129 {
130 return CompareMUtf8ToMUtf8(mutf8_1, mutf8_2) < 0;
131 }
132 };
133
/*
 * Splits a 32-bit value into its two 16-bit halves.
 *
 * @param pair  packed value; bits 31..16 hold the first element, bits 15..0 the second
 * @return      {upper 16 bits, lower 16 bits}
 */
static inline std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t HALF_WIDTH = 16U;
    constexpr uint32_t LOW_HALF_MASK = 0xffffU;

    // Both intermediate values already fit in 16 bits, so the casts lose no information.
    const auto upper_half = static_cast<uint16_t>(pair >> HALF_WIDTH);
    const auto lower_half = static_cast<uint16_t>(pair & LOW_HALF_MASK);
    return {upper_half, lower_half};
}
140
141 } // namespace panda::utf
142
143 #endif // PANDA_LIBPANDABASE_UTILS_UTF_H_
144