• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /**
2  * Copyright (c) 2021-2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  * http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef LIBPANDABASE_UTILS_UTF_H
17 #define LIBPANDABASE_UTILS_UTF_H
18 
19 #include <cstdint>
20 #include <cstddef>
21 
22 #include "utils/hash.h"
23 #include "utils/span.h"
24 
25 namespace panda::utf {
26 
27 /*
28  * https://en.wikipedia.org/wiki/UTF-8
29  *
30  * N  Bits for     First        Last        Byte 1      Byte 2      Byte 3      Byte 4
31  *    code point   code point   code point
32  * 1  7            U+0000       U+007F      0xxxxxxx
33  * 2  11           U+0080       U+07FF      110xxxxx    10xxxxxx
34  * 3  16           U+0800       U+FFFF      1110xxxx    10xxxxxx    10xxxxxx
35  * 4  21           U+10000      U+10FFFF    11110xxx    10xxxxxx    10xxxxxx    10xxxxxx
36  */
// Lead-byte classification bits, read against the UTF-8 table above:
//  - MASK1 (bit 7) is clear only for single-byte sequences (0xxxxxxx);
//  - MASK2 (bit 5) is clear in a 2-byte lead (110xxxxx) and set in 3/4-byte leads;
//  - MASK3 (bit 4) is clear in a 3-byte lead (1110xxxx) and set in a 4-byte lead (11110xxx).
// NOTE(review): the decoder that tests these bits is not in this header - confirm the usage there.
37 constexpr size_t MASK1 = 0x80;
38 constexpr size_t MASK2 = 0x20;
39 constexpr size_t MASK3 = 0x10;
40 

// Payload masks named after the number of low bits they keep. Per the table above:
// 4 bits = payload of a 3-byte lead, 5 bits = payload of a 2-byte lead,
// 6 bits = payload of a continuation byte. 10 bits is the payload carried by a
// single UTF-16 surrogate; 16 bits is one whole UTF-16 code unit.
41 constexpr size_t MASK_4BIT = 0x0f;
42 constexpr size_t MASK_5BIT = 0x1f;
43 constexpr size_t MASK_6BIT = 0x3f;
44 constexpr size_t MASK_10BIT = 0x03ff;
45 constexpr size_t MASK_16BIT = 0xffff;
46 

// DATA_WIDTH: number of payload bits in a continuation byte (10xxxxxx).
// PAIR_ELEMENT_WIDTH: bit width of one UTF-16 code unit; same value as the shift
// SplitUtf16Pair() uses to separate the two halves of a packed uint32_t.
47 constexpr size_t DATA_WIDTH = 6;
48 constexpr size_t PAIR_ELEMENT_WIDTH = 16;
49 

// Inclusive UTF-16 surrogate ranges: high (leading) U+D800..U+DBFF, low (trailing) U+DC00..U+DFFF.
50 constexpr size_t HI_SURROGATE_MIN = 0xd800;
51 constexpr size_t HI_SURROGATE_MAX = 0xdbff;
52 constexpr size_t LO_SURROGATE_MIN = 0xdc00;
53 constexpr size_t LO_SURROGATE_MAX = 0xdfff;
54 

// First code point outside the Basic Multilingual Plane (start of the 4-byte row in the table above).
55 constexpr size_t LO_SUPPLEMENTS_MIN = 0x10000;
56 

// U16_LEAD equals HI_SURROGATE_MIN - (LO_SUPPLEMENTS_MIN >> 10); U16_TAIL equals LO_SURROGATE_MIN.
// NOTE(review): presumably the bases added to (code_point >> 10) and (code_point & MASK_10BIT)
// when building a surrogate pair - confirm against the implementation file.
57 constexpr size_t U16_LEAD = 0xd7c0;
58 constexpr size_t U16_TAIL = 0xdc00;
59 

// Largest value that fits a single-byte sequence (7 payload bits).
60 constexpr uint8_t MUTF8_1B_MAX = 0x7f;
61 

// Two-byte sequences: largest encodable value (11 payload bits), then the fixed
// prefix bits of the lead byte (110xxxxx) and of the continuation byte (10xxxxxx).
62 constexpr uint16_t MUTF8_2B_MAX = 0x7ff;
63 constexpr uint8_t MUTF8_2B_FIRST = 0xc0;
64 constexpr uint8_t MUTF8_2B_SECOND = 0x80;
65 

// Three-byte sequences: prefix bits of the lead byte (1110xxxx) and of both continuation bytes.
66 constexpr uint8_t MUTF8_3B_FIRST = 0xe0;
67 constexpr uint8_t MUTF8_3B_SECOND = 0x80;
68 constexpr uint8_t MUTF8_3B_THIRD = 0x80;
69 

// Prefix bits of a four-byte lead byte (11110xxx).
70 constexpr uint8_t MUTF8_4B_FIRST = 0xf0;
71 
// Declarations of the out-of-line conversion / comparison routines.
// NOTE(review): WEAK_FOR_LTO_START / WEAK_FOR_LTO_END are not defined in this header; they
// presumably come from one of the included project headers - confirm. None of the
// implementations are visible here, so the notes below are read from the signatures only.
72 WEAK_FOR_LTO_START
73 

// Decodes from `data`, reading at most `max_bytes` bytes (default 4).
// NOTE(review): the result is presumably {UTF-16 unit(s) packed into a uint32_t, bytes consumed} - confirm.
74 std::pair<uint32_t, size_t> ConvertMUtf8ToUtf16Pair(const uint8_t *data, size_t max_bytes = 4);
75 

// Takes no length argument, so the input is presumably NUL-terminated - TODO confirm.
76 bool IsMUtf8OnlySingleBytes(const uint8_t *mutf8_in);
77 

// Converts `mutf8_len` bytes at `mutf8_in` into `utf16_out`.
// NOTE(review): no output capacity is passed; presumably the caller sizes the buffer
// with MUtf8ToUtf16Size() - confirm.
78 void ConvertMUtf8ToUtf16(const uint8_t *mutf8_in, size_t mutf8_len, uint16_t *utf16_out);
79 

// Region conversions with explicit input and output lengths plus a `start` offset.
// NOTE(review): units of `start` and the meaning of the returned size_t (presumably the
// number of output units written) are not visible here - confirm.
80 size_t ConvertRegionMUtf8ToUtf16(const uint8_t *mutf8_in, uint16_t *utf16_out, size_t mutf8_len, size_t utf16_len,
81                                  size_t start);
82 
83 size_t ConvertRegionUtf16ToMUtf8(const uint16_t *utf16_in, uint8_t *mutf8_out, size_t utf16_len, size_t mutf8_len,
84                                  size_t start);
85 

// Three-way comparisons returning int; Mutf8Less below treats a negative result as "less than".
// The MUTF-8 overload takes no lengths (presumably NUL-terminated input - confirm);
// the UTF-8 overload takes explicit lengths.
86 int CompareMUtf8ToMUtf8(const uint8_t *mutf8_1, const uint8_t *mutf8_2);
87 
88 int CompareUtf8ToUtf8(const uint8_t *utf8_1, size_t utf8_1_length, const uint8_t *utf8_2, size_t utf8_2_length);
89 

// Equality checks: one over explicit spans, one over bare pointers
// (the latter presumably NUL-terminated - confirm).
90 bool IsEqual(Span<const uint8_t> utf8_1, Span<const uint8_t> utf8_2);
91 
92 bool IsEqual(const uint8_t *mutf8_1, const uint8_t *mutf8_2);
93 

// Size queries. NOTE(review): presumably return a count of output code units / bytes and do
// not convert anything; whether a terminator is counted is not visible here - confirm.
94 size_t MUtf8ToUtf16Size(const uint8_t *mutf8);
95 
96 size_t MUtf8ToUtf16Size(const uint8_t *mutf8, size_t mutf8_len);
97 
98 size_t Utf16ToMUtf8Size(const uint16_t *mutf16, uint32_t length);
99 
100 size_t Mutf8Size(const uint8_t *mutf8);
101 

// Validity predicate over `elems` (no length argument; presumably NUL-terminated - confirm).
102 bool IsValidModifiedUTF8(const uint8_t *elems);
103 
104 WEAK_FOR_LTO_END
105 
/*
 * Views a C string pointer as a pointer to MUTF-8 bytes.
 * This is a pure pointer cast: nothing is copied, validated or dereferenced,
 * so the returned pointer refers to exactly the same memory as `str`.
 */
inline const uint8_t *CStringAsMutf8(const char *str)
{
    using BytePtr = const uint8_t *;
    return reinterpret_cast<BytePtr>(str);
}
110 
/*
 * Views a pointer to MUTF-8 bytes as a C string pointer.
 * This is a pure pointer cast: nothing is copied, validated or dereferenced,
 * so the returned pointer refers to exactly the same memory as `mutf8`.
 */
inline const char *Mutf8AsCString(const uint8_t *mutf8)
{
    using CharPtr = const char *;
    return reinterpret_cast<CharPtr>(mutf8);
}
115 
IsAvailableNextUtf16Code(uint16_t val)116 inline constexpr bool IsAvailableNextUtf16Code(uint16_t val)
117 {
118     return val >= HI_SURROGATE_MIN && val <= LO_SURROGATE_MAX;
119 }
120 
121 struct Mutf8Hash {
operatorMutf8Hash122     uint32_t operator()(const uint8_t *data) const
123     {
124         return GetHash32String(data);
125     }
126 };
127 
128 struct Mutf8Equal {
operatorMutf8Equal129     bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const
130     {
131         return IsEqual(mutf8_1, mutf8_2);
132     }
133 };
134 
135 struct Mutf8Less {
operatorMutf8Less136     bool operator()(const uint8_t *mutf8_1, const uint8_t *mutf8_2) const
137     {
138         return CompareMUtf8ToMUtf8(mutf8_1, mutf8_2) < 0;
139     }
140 };
141 
/*
 * Splits a 32-bit value holding two packed UTF-16 code units into its halves.
 * Returns {upper 16 bits, lower 16 bits} of `pair`.
 */
static inline std::pair<uint16_t, uint16_t> SplitUtf16Pair(uint32_t pair)
{
    constexpr uint32_t UNIT_BITS = 16;
    constexpr uint32_t LOW_UNIT_MASK = 0xffff;

    const auto upper = static_cast<uint16_t>(pair >> UNIT_BITS);
    const auto lower = static_cast<uint16_t>(pair & LOW_UNIT_MASK);
    return std::pair<uint16_t, uint16_t>(upper, lower);
}
148 
149 }  // namespace panda::utf
150 
151 #endif  // LIBPANDABASE_UTILS_UTF_H
152