1 // See http://bjoern.hoehrmann.de/utf-8/decoder/dfa/ for details.
2 // The remapped transition table is justified at
3 // https://docs.google.com/spreadsheets/d/1AZcQwuEL93HmNCljJWUwFMGqf7JAQ0puawZaUgP0E14
4
5 #include <stdint.h>
6
7 #ifndef __UTF8_DFA_DECODER_H
8 #define __UTF8_DFA_DECODER_H
9
10 namespace Utf8DfaDecoder {
11
12 enum State : uint8_t {
13 kReject = 0,
14 kAccept = 12,
15 kTwoByte = 24,
16 kThreeByte = 36,
17 kThreeByteLowMid = 48,
18 kFourByte = 60,
19 kFourByteLow = 72,
20 kThreeByteHigh = 84,
21 kFourByteMidHigh = 96,
22 };
23
Decode(uint8_t byte,State * state,uint32_t * buffer)24 static inline void Decode(uint8_t byte, State* state, uint32_t* buffer) {
25 // This first table maps bytes to character to a transition.
26 static constexpr uint8_t transitions[] = {
27 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 00-0F
28 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 10-1F
29 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 20-2F
30 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 30-3F
31 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 40-4F
32 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 50-5F
33 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 60-6F
34 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 70-7F
35 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, // 80-8F
36 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, // 90-9F
37 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // A0-AF
38 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, // B0-BF
39 9, 9, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // C0-CF
40 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, // D0-DF
41 10, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, // E0-EF
42 11, 7, 7, 7, 8, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, // F0-FF
43 };
44
45 // This second table maps a state to a new state when adding a transition.
46 // 00-7F
47 // | 80-8F
48 // | | 90-9F
49 // | | | A0-BF
50 // | | | | C2-DF
51 // | | | | | E1-EC, EE, EF
52 // | | | | | | ED
53 // | | | | | | | F1-F3
54 // | | | | | | | | F4
55 // | | | | | | | | | C0, C1, F5-FF
56 // | | | | | | | | | | E0
57 // | | | | | | | | | | | F0
58 static constexpr uint8_t states[] = {
59 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // REJECT = 0
60 12, 0, 0, 0, 24, 36, 48, 60, 72, 0, 84, 96, // ACCEPT = 12
61 0, 12, 12, 12, 0, 0, 0, 0, 0, 0, 0, 0, // 2-byte = 24
62 0, 24, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte = 36
63 0, 24, 24, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte low/mid = 48
64 0, 36, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte = 60
65 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte low = 72
66 0, 0, 0, 24, 0, 0, 0, 0, 0, 0, 0, 0, // 3-byte high = 84
67 0, 0, 36, 36, 0, 0, 0, 0, 0, 0, 0, 0, // 4-byte mid/high = 96
68 };
69
70 uint8_t type = transitions[byte];
71 *state = static_cast<State>(states[*state + type]);
72 *buffer = (*buffer << 6) | (byte & (0x7F >> (type >> 1)));
73 }
74
75 } // namespace Utf8DfaDecoder
76
77 #endif /* __UTF8_DFA_DECODER_H */
78