• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Copyright (c) 2022 Huawei Device Co., Ltd.
3  * Licensed under the Apache License, Version 2.0 (the "License");
4  * you may not use this file except in compliance with the License.
5  * You may obtain a copy of the License at
6  *
7  *     http://www.apache.org/licenses/LICENSE-2.0
8  *
9  * Unless required by applicable law or agreed to in writing, software
10  * distributed under the License is distributed on an "AS IS" BASIS,
11  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12  * See the License for the specific language governing permissions and
13  * limitations under the License.
14  */
15 
16 #ifndef API_BASE_UTIL_UTF8_DECODE_H
17 #define API_BASE_UTIL_UTF8_DECODE_H
18 
19 #include <cstdint>
20 
21 #include <base/containers/string_view.h>
22 #include <base/namespace.h>
23 #include <base/util/log.h>
24 
BASE_BEGIN_NAMESPACE()25 BASE_BEGIN_NAMESPACE()
namespace {
// NOTE(review): this anonymous namespace lives in a header, so every translation unit that includes it gets its own
// copy of the tables and of decode(). Kept as is to avoid changing name visibility; consider revisiting.
// NOTE(review): the tables look identical to those of Bjoern Hoehrmann's DFA based UTF-8 decoder - confirm and add
// attribution if so.

/** Automaton state in which a complete, valid code point has just been produced (also the initial state). */
constexpr uint32_t UTF8_ACCEPT = 0U;
/** Automaton state entered on an invalid byte. Every transition out of this state leads back to it. */
constexpr uint32_t UTF8_REJECT = 12U;

// First table: maps each of the 256 possible byte values to one of 12 character classes (0..11). Using classes
// instead of raw bytes keeps the transition table small, and for lead bytes the class also serves as the shift
// that masks off the length marker bits (see decode()).
constexpr uint8_t CHAR_MAP[] = {
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x00 - 0x0F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x10 - 0x1F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x20 - 0x2F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x30 - 0x3F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x40 - 0x4F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x50 - 0x5F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x60 - 0x6F
    0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U, 0U,     // 0x70 - 0x7F
    1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U, 1U,     // 0x80 - 0x8F
    9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U, 9U,     // 0x90 - 0x9F
    7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U,     // 0xA0 - 0xAF
    7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U, 7U,     // 0xB0 - 0xBF
    8U, 8U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U,     // 0xC0 - 0xCF
    2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U, 2U,     // 0xD0 - 0xDF
    10U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 3U, 4U, 3U, 3U,    // 0xE0 - 0xEF
    11U, 6U, 6U, 6U, 5U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U, 8U,    // 0xF0 - 0xFF
};

// Second table: maps (state, character class) to the next state. One row of 12 entries per state. State values
// are premultiplied by 12 so that 'state + class' indexes the table directly without a runtime multiplication.
constexpr uint8_t STATE[] = {
    0U, 12U, 24U, 36U, 60U, 96U, 84U, 12U, 12U, 12U, 48U, 72U,      // 0 (UTF8_ACCEPT): expecting a lead byte
    12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,     // 12 (UTF8_REJECT): stays rejected
    12U, 0U, 12U, 12U, 12U, 12U, 12U, 0U, 12U, 0U, 12U, 12U,        // 24: one byte in 0x80-0xBF completes the sequence
    12U, 24U, 12U, 12U, 12U, 12U, 12U, 24U, 12U, 24U, 12U, 12U,     // 36: a byte in 0x80-0xBF leads to state 24
    12U, 12U, 12U, 12U, 12U, 12U, 12U, 24U, 12U, 12U, 12U, 12U,     // 48: after 0xE0, only 0xA0-0xBF continues
    12U, 24U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 24U, 12U, 12U,     // 60: after 0xED, only 0x80-0x9F continues
    12U, 12U, 12U, 12U, 12U, 12U, 12U, 36U, 12U, 36U, 12U, 12U,     // 72: after 0xF0, only 0x90-0xBF continues
    12U, 36U, 12U, 12U, 12U, 12U, 12U, 36U, 12U, 36U, 12U, 12U,     // 84: after 0xF1-0xF3, 0x80-0xBF continues
    12U, 36U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U, 12U,     // 96: after 0xF4, only 0x80-0x8F continues
};

// decode() indexes both tables without bounds checks, so pin their dimensions at compile time.
static_assert(sizeof(CHAR_MAP) / sizeof(CHAR_MAP[0]) == 256U, "CHAR_MAP must have one entry per byte value");
static_assert(sizeof(STATE) / sizeof(STATE[0]) == 9U * 12U, "STATE must have 9 rows of 12 character classes");

/** Feed one byte to the UTF-8 decoding automaton.
 * @param state In/out automaton state. Must hold UTF8_ACCEPT before the first byte of a string and otherwise the
 * value left by the previous call. Must not be null.
 * @param codep In/out code point accumulator. Holds a complete code point only when the returned state is
 * UTF8_ACCEPT. Must not be null.
 * @param byte Next byte of the input.
 * @return The new state (same value as stored to *state): UTF8_ACCEPT when a code point was completed,
 * UTF8_REJECT when the input is invalid, any other value when more bytes are needed.
 */
constexpr uint32_t decode(uint32_t* state, uint32_t* codep, unsigned char byte)
{
    const uint32_t type = CHAR_MAP[byte];
    // States above UTF8_REJECT are the "inside a multi-byte sequence" states: append the low six payload bits.
    // Otherwise a new code point starts: shifting the 0xff mask by the class strips the lead byte's marker bits.
    *codep = (*state > UTF8_REJECT) ? ((byte & 0x3fU) | (*codep << 6U)) : ((0xffU >> type) & byte);

    *state = STATE[*state + type];
    return *state;
}
} // namespace
63 
64 /** Decode utf8 encoded string.
65  * @param buf Utf8 encoded string pointer, moved to next codepoint on success.
66  * @return Next unicode codepoint on success, 0 otherwise.
67  */
GetCharUtf8(const char ** buf)68 constexpr uint32_t GetCharUtf8(const char** buf)
69 {
70     uint32_t state = UTF8_ACCEPT;
71     uint32_t codepoint = 0U;
72 
73     while (**buf) {
74         decode(&state, &codepoint, static_cast<unsigned char>(**buf));
75         (*buf)++;
76         switch (state) {
77             case UTF8_ACCEPT:
78                 return codepoint;
79             case UTF8_REJECT:
80                 BASE_LOG_E("invalid utf8 sequence\n");
81                 return 0;
82         }
83     }
84     return 0;
85 }
86 
87 /** Count valid character in provided utf8 encoded string.
88  * @param string Utf8 encoded string.
89  * @return Valid unicode codepoint count in provided utf8 string.
90  */
CountGlyphsUtf8(const BASE_NS::string_view string)91 constexpr uint32_t CountGlyphsUtf8(const BASE_NS::string_view string)
92 {
93     uint32_t state = UTF8_ACCEPT;
94     uint32_t codepoint = 0U;
95     uint32_t count = 0U;
96     const char* s = string.data();
97     const char* sEnd = string.data() + string.length();
98 
99     for (; (s < sEnd) && *s; ++s) {
100         if (!decode(&state, &codepoint, static_cast<unsigned char>(*s))) {
101             count += 1U;
102         }
103     }
104     if (state != UTF8_ACCEPT) {
105         BASE_LOG_E("malformed utf8 string\n");
106     }
107     return count;
108 }
109 BASE_END_NAMESPACE()
110 #endif // API_BASE_UTIL_UTF8_DECODE_H
111