1 /*
2 * Copyright (c) 2022 Huawei Device Co., Ltd.
3 * Licensed under the Apache License, Version 2.0 (the "License");
4 * you may not use this file except in compliance with the License.
5 * You may obtain a copy of the License at
6 *
7 * http://www.apache.org/licenses/LICENSE-2.0
8 *
9 * Unless required by applicable law or agreed to in writing, software
10 * distributed under the License is distributed on an "AS IS" BASIS,
11 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 * See the License for the specific language governing permissions and
13 * limitations under the License.
14 */
15
16 #ifndef API_BASE_UTIL_UTF8_DECODE_H
17 #define API_BASE_UTIL_UTF8_DECODE_H
18
19 #include <cstdint>
20
21 #include <base/containers/string_view.h>
22 #include <base/namespace.h>
23 #include <base/util/log.h>
24
BASE_BEGIN_NAMESPACE()25 BASE_BEGIN_NAMESPACE()
namespace {

// Decoder states. A state value is the offset of that state's row in STATE, so it is always a multiple of 12.
constexpr uint32_t UTF8_ACCEPT = 0U;
constexpr uint32_t UTF8_REJECT = 12U;

// First table: maps every possible byte value to one of 12 character classes. Working with classes instead of
// raw bytes keeps the transition table small, and the class value doubles as the shift that builds the bitmask
// applied to the first byte of a sequence.
constexpr uint8_t CHAR_MAP[] = {
    // 0x00 - 0x7f
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,
    // 0x80 - 0x8f
    1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U,
    // 0x90 - 0x9f
    9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U,
    // 0xa0 - 0xbf
    7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U,
    7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U,
    // 0xc0 - 0xdf
    8U, 8U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U,
    2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U,
    // 0xe0 - 0xef
    10U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 4U, 3U, 3U,
    // 0xf0 - 0xff
    11U, 6U, 6U, 6U, 5U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U
};
static_assert(sizeof(CHAR_MAP) == 256U, "CHAR_MAP needs one entry per byte value");

// Second table: the transition table of the automaton. The entry at index (state + character class) is the next
// state. States are stored premultiplied by 12 (the number of character classes) so that no multiplication is
// needed at runtime.
constexpr uint8_t STATE[] = {
    // state 0 (UTF8_ACCEPT)
    0U, 12U, 24U, 36U, 60U, 96U, 84U, 12U, 12U, 12U, 48U, 72U,
    // state 12 (UTF8_REJECT)
    12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,
    // state 24
    12U, 0U, 12U, 12U, 12U, 12U, 12U, 0U, 12U, 0U, 12U, 12U,
    // state 36
    12U, 24U, 12U, 12U, 12U, 12U, 12U, 24U, 12U, 24U, 12U, 12U,
    // state 48
    12U, 12U, 12U, 12U, 12U, 12U, 12U, 24U, 12U, 12U, 12U, 12U,
    // state 60
    12U, 24U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 24U, 12U, 12U,
    // state 72
    12U, 12U, 12U, 12U, 12U, 12U, 12U, 36U, 12U, 36U, 12U, 12U,
    // state 84
    12U, 36U, 12U, 12U, 12U, 12U, 12U, 36U, 12U, 36U, 12U, 12U,
    // state 96
    12U, 36U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U
};
static_assert(sizeof(STATE) == 108U, "STATE needs 9 rows of 12 character classes");

/** Feed one byte to the UTF-8 decoding automaton.
 * @param state In/out automaton state; start a new string with UTF8_ACCEPT.
 * @param codep In/out codepoint accumulator; holds a complete codepoint whenever UTF8_ACCEPT is returned.
 * @param byte Next byte of the encoded input.
 * @return The new state: UTF8_ACCEPT when a codepoint was completed, UTF8_REJECT when the input is invalid,
 * any other value when more bytes are needed.
 */
constexpr uint32_t decode(uint32_t* state, uint32_t* codep, unsigned char byte)
{
    const uint32_t charClass = CHAR_MAP[byte];

    if (*state > UTF8_REJECT) {
        // In the middle of a sequence: shift in the low six bits of this byte.
        *codep = (byte & 0x3fU) | (*codep << 6U);
    } else {
        // First byte of a sequence: keep only the bits selected by the class-dependent mask.
        *codep = (0xffU >> charClass) & (byte);
    }

    const uint32_t next = STATE[*state + charClass];
    *state = next;
    return next;
}
} // namespace
63
64 /** Decode utf8 encoded string.
65 * @param buf Utf8 encoded string pointer, moved to next codepoint on success.
66 * @return Next unicode codepoint on success, 0 otherwise.
67 */
GetCharUtf8(const char ** buf)68 constexpr uint32_t GetCharUtf8(const char** buf)
69 {
70 uint32_t state = UTF8_ACCEPT;
71 uint32_t codepoint = 0U;
72
73 while (**buf) {
74 decode(&state, &codepoint, static_cast<unsigned char>(**buf));
75 (*buf)++;
76 switch (state) {
77 case UTF8_ACCEPT:
78 return codepoint;
79 case UTF8_REJECT:
80 BASE_LOG_E("invalid utf8 sequence\n");
81 return 0;
82 }
83 }
84 return 0;
85 }
86
87 /** Count valid character in provided utf8 encoded string.
88 * @param string Utf8 encoded string.
89 * @return Valid unicode codepoint count in provided utf8 string.
90 */
CountGlyphsUtf8(const BASE_NS::string_view string)91 constexpr uint32_t CountGlyphsUtf8(const BASE_NS::string_view string)
92 {
93 uint32_t state = UTF8_ACCEPT;
94 uint32_t codepoint = 0U;
95 uint32_t count = 0U;
96 const char* s = string.data();
97 const char* sEnd = string.data() + string.length();
98
99 for (; (s < sEnd) && *s; ++s) {
100 if (!decode(&state, &codepoint, static_cast<unsigned char>(*s))) {
101 count += 1U;
102 }
103 }
104 if (state != UTF8_ACCEPT) {
105 BASE_LOG_E("malformed utf8 string\n");
106 }
107 return count;
108 }
109 BASE_END_NAMESPACE()
110 #endif // API_BASE_UTIL_UTF8_DECODE_H
111