1 // Copyright 2005-2008 Google Inc. All Rights Reserved.
2 // Author: jrm@google.com (Jim Meehan)
3
4 #include <google/protobuf/stubs/common.h>
5
6 #include <google/protobuf/stubs/stringpiece.h>
7
8 namespace google {
9 namespace protobuf {
10 namespace internal {
11
12 // These four-byte entries compactly encode how many bytes 0..255 to delete
13 // in making a string replacement, how many bytes to add 0..255, and the offset
14 // 0..64k-1 of the replacement string in remap_string.
15 struct RemapEntry {
16 uint8 delete_bytes;
17 uint8 add_bytes;
18 uint16 bytes_offset;
19 };
20
21 // Exit type codes for state tables. All but the first get stuffed into
22 // signed one-byte entries. The first is only generated by executable code.
23 // To distinguish from next-state entries, these must be contiguous and
24 // all <= kExitNone
25 typedef enum {
26 kExitDstSpaceFull = 239,
27 kExitIllegalStructure, // 240
28 kExitOK, // 241
29 kExitReject, // ...
30 kExitReplace1,
31 kExitReplace2,
32 kExitReplace3,
33 kExitReplace21,
34 kExitReplace31,
35 kExitReplace32,
36 kExitReplaceOffset1,
37 kExitReplaceOffset2,
38 kExitReplace1S0,
39 kExitSpecial,
40 kExitDoAgain,
41 kExitRejectAlt,
42 kExitNone // 255
43 } ExitReason;
44
45
46 // This struct represents one entire state table. The three initialized byte
47 // areas are state_table, remap_base, and remap_string. state0 and state0_size
48 // give the byte offset and length within state_table of the initial state --
49 // table lookups are expected to start and end in this state, but for
50 // truncated UTF-8 strings, may end in a different state. These allow a quick
51 // test for that condition. entry_shift is 8 for tables subscripted by a full
52 // byte value and 6 for space-optimized tables subscripted by only six
53 // significant bits in UTF-8 continuation bytes.
54 typedef struct {
55 const uint32 state0;
56 const uint32 state0_size;
57 const uint32 total_size;
58 const int max_expand;
59 const int entry_shift;
60 const int bytes_per_entry;
61 const uint32 losub;
62 const uint32 hiadd;
63 const uint8* state_table;
64 const RemapEntry* remap_base;
65 const uint8* remap_string;
66 const uint8* fast_state;
67 } UTF8StateMachineObj;
68
69 typedef UTF8StateMachineObj UTF8ScanObj;
70
71 #define X__ (kExitIllegalStructure)
72 #define RJ_ (kExitReject)
73 #define S1_ (kExitReplace1)
74 #define S2_ (kExitReplace2)
75 #define S3_ (kExitReplace3)
76 #define S21 (kExitReplace21)
77 #define S31 (kExitReplace31)
78 #define S32 (kExitReplace32)
79 #define T1_ (kExitReplaceOffset1)
80 #define T2_ (kExitReplaceOffset2)
81 #define S11 (kExitReplace1S0)
82 #define SP_ (kExitSpecial)
83 #define D__ (kExitDoAgain)
84 #define RJA (kExitRejectAlt)
85
86 // Entire table has 9 state blocks of 256 entries each
87 static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0]
88 static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1]
89 static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
90 static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
91 static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
92 static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
93 static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
94 static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
95
96 static const uint8 utf8acceptnonsurrogates[] = {
97 // state[0] 0x000000 Byte 1
98 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
99 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
100 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
101 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
102
103 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
104 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
105 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107
108 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
109 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
110 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
111 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
112
113 X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
114 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
115 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
116 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
117
118 // state[1] 0x000080 Byte 2 of 2
119 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
120 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
121 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
122 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
123
124 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
125 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
126 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
127 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
128
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133
134 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
135 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
136 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
137 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
138
139 // state[2] 0x000000 Byte 2 of 3
140 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
141 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
142 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
143 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
144
145 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
146 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
147 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
148 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
149
150 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
151 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
152 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
153 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
154
155 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
156 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
157 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
158 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
159
160 // state[3] 0x001000 Byte 2 of 3
161 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
162 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
163 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
164 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
165
166 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
167 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
168 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
169 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
170
171 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
172 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
173 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
174 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
175
176 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
177 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
178 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
179 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
180
181 // state[4] 0x000000 Byte 2 of 4
182 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
183 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
184 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
185 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
186
187 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
188 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
189 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
190 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
191
192 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
193 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
194 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
195 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
196
197 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
198 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
199 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
200 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
201
202 // state[5] 0x040000 Byte 2 of 4
203 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
204 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
205 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
206 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
207
208 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
209 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
210 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
211 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
212
213 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
214 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
215 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
216 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
217
218 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
219 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
220 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
221 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
222
223 // state[6] 0x100000 Byte 2 of 4
224 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
225 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
226 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
227 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
228
229 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
230 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
231 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
232 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
233
234 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
235 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
236 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
237 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
238
239 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
240 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
241 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
242 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
243
244 // state[7] 0x00d000 Byte 2 of 3
245 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
246 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
247 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
248 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
249
250 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
251 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
252 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
253 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
254
255 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
256 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
257 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
258 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
259
260 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
261 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
262 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
263 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
264
265 // state[8] 0x00d800 Byte 3 of 3
266 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
267 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
268 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
269 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
270
271 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
272 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
273 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
274 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
275
276 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
277 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
278 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
279 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
280
281 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
282 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
283 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
284 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
285 };
286
287 // Remap base[0] = (del, add, string_offset)
288 static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
289 {0, 0, 0} };
290
291 // Remap string[0]
292 static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
293 0 };
294
295 static const unsigned char utf8acceptnonsurrogates_fast[256] = {
296 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
298 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
299 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
300
301 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
302 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
303 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
304 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
305
306 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
307 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
308 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
309 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
310
311 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
312 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
313 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
314 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
315 };
316
317 static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
318 utf8acceptnonsurrogates_STATE0,
319 utf8acceptnonsurrogates_STATE0_SIZE,
320 utf8acceptnonsurrogates_TOTAL_SIZE,
321 utf8acceptnonsurrogates_MAX_EXPAND_X4,
322 utf8acceptnonsurrogates_SHIFT,
323 utf8acceptnonsurrogates_BYTES,
324 utf8acceptnonsurrogates_LOSUB,
325 utf8acceptnonsurrogates_HIADD,
326 utf8acceptnonsurrogates,
327 utf8acceptnonsurrogates_remap_base,
328 utf8acceptnonsurrogates_remap_string,
329 utf8acceptnonsurrogates_fast
330 };
331
332
333 #undef X__
334 #undef RJ_
335 #undef S1_
336 #undef S2_
337 #undef S3_
338 #undef S21
339 #undef S31
340 #undef S32
341 #undef T1_
342 #undef T2_
343 #undef S11
344 #undef SP_
345 #undef D__
346 #undef RJA
347
348 // Return true if current Tbl pointer is within state0 range
349 // Note that unsigned compare checks both ends of range simultaneously
InStateZero(const UTF8ScanObj * st,const uint8 * Tbl)350 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
351 const uint8* Tbl0 = &st->state_table[st->state0];
352 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
353 }
354
355 // Scan a UTF-8 string based on state table.
356 // Always scan complete UTF-8 characters
357 // Set number of bytes scanned. Return reason for exiting
UTF8GenericScan(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)358 int UTF8GenericScan(const UTF8ScanObj* st,
359 const char * str,
360 int str_length,
361 int* bytes_consumed) {
362 *bytes_consumed = 0;
363 if (str_length == 0) return kExitOK;
364
365 int eshift = st->entry_shift;
366 const uint8* isrc = reinterpret_cast<const uint8*>(str);
367 const uint8* src = isrc;
368 const uint8* srclimit = isrc + str_length;
369 const uint8* srclimit8 = srclimit - 7;
370 const uint8* Tbl_0 = &st->state_table[st->state0];
371
372 DoAgain:
373 // Do state-table scan
374 int e = 0;
375 uint8 c;
376 const uint8* Tbl2 = &st->fast_state[0];
377 const uint32 losub = st->losub;
378 const uint32 hiadd = st->hiadd;
379 // Check initial few bytes one at a time until 8-byte aligned
380 //----------------------------
381 while ((((uintptr_t)src & 0x07) != 0) &&
382 (src < srclimit) &&
383 Tbl2[src[0]] == 0) {
384 src++;
385 }
386 if (((uintptr_t)src & 0x07) == 0) {
387 // Do fast for groups of 8 identity bytes.
388 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
389 // including slowing slightly on cr/lf/ht
390 //----------------------------
391 while (src < srclimit8) {
392 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
393 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
394 src += 8;
395 // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
396 uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
397 (s4567 - losub) | (s4567 + hiadd);
398 if ((temp & 0x80808080) != 0) {
399 // We typically end up here on cr/lf/ht; src was incremented
400 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
401 (Tbl2[src[-6]] | Tbl2[src[-5]]);
402 if (e0123 != 0) {
403 src -= 8;
404 break;
405 } // Exit on Non-interchange
406 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
407 (Tbl2[src[-2]] | Tbl2[src[-1]]);
408 if (e0123 != 0) {
409 src -= 4;
410 break;
411 } // Exit on Non-interchange
412 // Else OK, go around again
413 }
414 }
415 }
416 //----------------------------
417
418 // Byte-at-a-time scan
419 //----------------------------
420 const uint8* Tbl = Tbl_0;
421 while (src < srclimit) {
422 c = *src;
423 e = Tbl[c];
424 src++;
425 if (e >= kExitIllegalStructure) {break;}
426 Tbl = &Tbl_0[e << eshift];
427 }
428 //----------------------------
429
430
431 // Exit posibilities:
432 // Some exit code, !state0, back up over last char
433 // Some exit code, state0, back up one byte exactly
434 // source consumed, !state0, back up over partial char
435 // source consumed, state0, exit OK
436 // For illegal byte in state0, avoid backup up over PREVIOUS char
437 // For truncated last char, back up to beginning of it
438
439 if (e >= kExitIllegalStructure) {
440 // Back up over exactly one byte of rejected/illegal UTF-8 character
441 src--;
442 // Back up more if needed
443 if (!InStateZero(st, Tbl)) {
444 do {
445 src--;
446 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
447 }
448 } else if (!InStateZero(st, Tbl)) {
449 // Back up over truncated UTF-8 character
450 e = kExitIllegalStructure;
451 do {
452 src--;
453 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
454 } else {
455 // Normal termination, source fully consumed
456 e = kExitOK;
457 }
458
459 if (e == kExitDoAgain) {
460 // Loop back up to the fast scan
461 goto DoAgain;
462 }
463
464 *bytes_consumed = src - isrc;
465 return e;
466 }
467
UTF8GenericScanFastAscii(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)468 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
469 const char * str,
470 int str_length,
471 int* bytes_consumed) {
472 *bytes_consumed = 0;
473 if (str_length == 0) return kExitOK;
474
475 const uint8* isrc = reinterpret_cast<const uint8*>(str);
476 const uint8* src = isrc;
477 const uint8* srclimit = isrc + str_length;
478 const uint8* srclimit8 = srclimit - 7;
479 int n;
480 int rest_consumed;
481 int exit_reason;
482 do {
483 // Check initial few bytes one at a time until 8-byte aligned
484 while ((((uintptr_t)src & 0x07) != 0) &&
485 (src < srclimit) && (src[0] < 0x80)) {
486 src++;
487 }
488 if (((uintptr_t)src & 0x07) == 0) {
489 while ((src < srclimit8) &&
490 (((reinterpret_cast<const uint32*>(src)[0] |
491 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
492 src += 8;
493 }
494 }
495 while ((src < srclimit) && (src[0] < 0x80)) {
496 src++;
497 }
498 // Run state table on the rest
499 n = src - isrc;
500 exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
501 src += rest_consumed;
502 } while ( exit_reason == kExitDoAgain );
503
504 *bytes_consumed = src - isrc;
505 return exit_reason;
506 }
507
508 // Hack: On some compilers the static tables are initialized at startup.
509 // We can't use them until they are initialized. However, some Protocol
510 // Buffer parsing happens at static init time and may try to validate
511 // UTF-8 strings. Since UTF-8 validation is only used for debugging
512 // anyway, we simply always return success if initialization hasn't
513 // occurred yet.
514 namespace {
515
516 bool module_initialized_ = false;
517
518 struct InitDetector {
InitDetectorgoogle::protobuf::internal::__anon6d501fde0311::InitDetector519 InitDetector() {
520 module_initialized_ = true;
521 }
522 };
523 InitDetector init_detector;
524
525 } // namespace
526
IsStructurallyValidUTF8(const char * buf,int len)527 bool IsStructurallyValidUTF8(const char* buf, int len) {
528 if (!module_initialized_) return true;
529
530 int bytes_consumed = 0;
531 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
532 buf, len, &bytes_consumed);
533 return (bytes_consumed == len);
534 }
535
UTF8SpnStructurallyValid(const StringPiece & str)536 int UTF8SpnStructurallyValid(const StringPiece& str) {
537 if (!module_initialized_) return str.size();
538
539 int bytes_consumed = 0;
540 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
541 str.data(), str.size(), &bytes_consumed);
542 return bytes_consumed;
543 }
544
545 // Coerce UTF-8 byte string in src_str to be
546 // a structurally-valid equal-length string by selectively
547 // overwriting illegal bytes with replace_char (typically blank).
548 // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
549 // src_str is read-only. If any overwriting is needed, a modified byte string
550 // is created in idst, length isrclen.
551 //
552 // Returns pointer to output buffer, isrc if no changes were made,
553 // or idst if some bytes were changed.
554 //
555 // Fast case: all is structurally valid and no byte copying is done.
556 //
UTF8CoerceToStructurallyValid(const StringPiece & src_str,char * idst,const char replace_char)557 char* UTF8CoerceToStructurallyValid(const StringPiece& src_str,
558 char* idst,
559 const char replace_char) {
560 const char* isrc = src_str.data();
561 const int len = src_str.length();
562 int n = UTF8SpnStructurallyValid(src_str);
563 if (n == len) { // Normal case -- all is cool, return
564 return const_cast<char*>(isrc);
565 } else { // Unusual case -- copy w/o bad bytes
566 const char* src = isrc;
567 const char* srclimit = isrc + len;
568 char* dst = idst;
569 memmove(dst, src, n); // Copy initial good chunk
570 src += n;
571 dst += n;
572 while (src < srclimit) { // src points to bogus byte or is off the end
573 dst[0] = replace_char; // replace one bad byte
574 src++;
575 dst++;
576 StringPiece str2(src, srclimit - src);
577 n = UTF8SpnStructurallyValid(str2); // scan the remainder
578 memmove(dst, src, n); // copy next good chunk
579 src += n;
580 dst += n;
581 }
582 }
583 return idst;
584 }
585
586 } // namespace internal
587 } // namespace protobuf
588 } // namespace google
589