1 // Protocol Buffers - Google's data interchange format
2 // Copyright 2008 Google Inc. All rights reserved.
3 // https://developers.google.com/protocol-buffers/
4 //
5 // Redistribution and use in source and binary forms, with or without
6 // modification, are permitted provided that the following conditions are
7 // met:
8 //
9 // * Redistributions of source code must retain the above copyright
10 // notice, this list of conditions and the following disclaimer.
11 // * Redistributions in binary form must reproduce the above
12 // copyright notice, this list of conditions and the following disclaimer
13 // in the documentation and/or other materials provided with the
14 // distribution.
15 // * Neither the name of Google Inc. nor the names of its
16 // contributors may be used to endorse or promote products derived from
17 // this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
20 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
21 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
22 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
23 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
24 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
25 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
26 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
27 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
28 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
29 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30
31 // Author: jrm@google.com (Jim Meehan)
32
33 #include <google/protobuf/stubs/common.h>
34
35 #include <google/protobuf/stubs/stringpiece.h>
36
37 namespace google {
38 namespace protobuf {
39 namespace internal {
40
41 // These four-byte entries compactly encode how many bytes 0..255 to delete
42 // in making a string replacement, how many bytes to add 0..255, and the offset
43 // 0..64k-1 of the replacement string in remap_string.
44 struct RemapEntry {
45 uint8 delete_bytes;
46 uint8 add_bytes;
47 uint16 bytes_offset;
48 };
49
50 // Exit type codes for state tables. All but the first get stuffed into
51 // signed one-byte entries. The first is only generated by executable code.
52 // To distinguish from next-state entries, these must be contiguous and
53 // all <= kExitNone
54 typedef enum {
55 kExitDstSpaceFull = 239,
56 kExitIllegalStructure, // 240
57 kExitOK, // 241
58 kExitReject, // ...
59 kExitReplace1,
60 kExitReplace2,
61 kExitReplace3,
62 kExitReplace21,
63 kExitReplace31,
64 kExitReplace32,
65 kExitReplaceOffset1,
66 kExitReplaceOffset2,
67 kExitReplace1S0,
68 kExitSpecial,
69 kExitDoAgain,
70 kExitRejectAlt,
71 kExitNone // 255
72 } ExitReason;
73
74
75 // This struct represents one entire state table. The three initialized byte
76 // areas are state_table, remap_base, and remap_string. state0 and state0_size
77 // give the byte offset and length within state_table of the initial state --
78 // table lookups are expected to start and end in this state, but for
79 // truncated UTF-8 strings, may end in a different state. These allow a quick
80 // test for that condition. entry_shift is 8 for tables subscripted by a full
81 // byte value and 6 for space-optimized tables subscripted by only six
82 // significant bits in UTF-8 continuation bytes.
83 typedef struct {
84 const uint32 state0;
85 const uint32 state0_size;
86 const uint32 total_size;
87 const int max_expand;
88 const int entry_shift;
89 const int bytes_per_entry;
90 const uint32 losub;
91 const uint32 hiadd;
92 const uint8* state_table;
93 const RemapEntry* remap_base;
94 const uint8* remap_string;
95 const uint8* fast_state;
96 } UTF8StateMachineObj;
97
98 typedef UTF8StateMachineObj UTF8ScanObj;
99
100 #define X__ (kExitIllegalStructure)
101 #define RJ_ (kExitReject)
102 #define S1_ (kExitReplace1)
103 #define S2_ (kExitReplace2)
104 #define S3_ (kExitReplace3)
105 #define S21 (kExitReplace21)
106 #define S31 (kExitReplace31)
107 #define S32 (kExitReplace32)
108 #define T1_ (kExitReplaceOffset1)
109 #define T2_ (kExitReplaceOffset2)
110 #define S11 (kExitReplace1S0)
111 #define SP_ (kExitSpecial)
112 #define D__ (kExitDoAgain)
113 #define RJA (kExitRejectAlt)
114
115 // Entire table has 9 state blocks of 256 entries each
116 static const unsigned int utf8acceptnonsurrogates_STATE0 = 0; // state[0]
117 static const unsigned int utf8acceptnonsurrogates_STATE0_SIZE = 256; // =[1]
118 static const unsigned int utf8acceptnonsurrogates_TOTAL_SIZE = 2304;
119 static const unsigned int utf8acceptnonsurrogates_MAX_EXPAND_X4 = 0;
120 static const unsigned int utf8acceptnonsurrogates_SHIFT = 8;
121 static const unsigned int utf8acceptnonsurrogates_BYTES = 1;
122 static const unsigned int utf8acceptnonsurrogates_LOSUB = 0x20202020;
123 static const unsigned int utf8acceptnonsurrogates_HIADD = 0x00000000;
124
125 static const uint8 utf8acceptnonsurrogates[] = {
126 // state[0] 0x000000 Byte 1
127 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
128 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
129 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
131
132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
133 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
134 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
135 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
136
137 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
138 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
139 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
140 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
141
142 X__, X__, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
143 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
144 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 7, 3, 3,
145 4, 5, 5, 5, 6, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
146
147 // state[1] 0x000080 Byte 2 of 2
148 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
149 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
150 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
151 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
152
153 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
154 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
155 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
156 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
157
158 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
159 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
161 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
162
163 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
164 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
165 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
166 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
167
168 // state[2] 0x000000 Byte 2 of 3
169 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
170 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
171 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
172 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
173
174 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
175 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
176 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
177 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
178
179 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
180 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
181 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
182 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
183
184 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
185 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
186 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
187 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
188
189 // state[3] 0x001000 Byte 2 of 3
190 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
191 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
192 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
193 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
194
195 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
196 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
197 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
198 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
199
200 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
201 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
202 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
203 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
204
205 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
206 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
207 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
208 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
209
210 // state[4] 0x000000 Byte 2 of 4
211 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
212 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
213 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
214 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
215
216 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
217 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
218 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
219 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
220
221 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
222 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
223 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
224 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
225
226 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
227 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
228 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
229 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
230
231 // state[5] 0x040000 Byte 2 of 4
232 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
233 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
234 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
235 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
236
237 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
238 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
239 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
240 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
241
242 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
243 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
244 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
245 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
246
247 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
248 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
249 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
250 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
251
252 // state[6] 0x100000 Byte 2 of 4
253 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
254 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
255 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
256 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
257
258 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
259 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
260 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
261 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
262
263 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
264 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
265 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
266 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
267
268 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
269 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
270 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
271 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
272
273 // state[7] 0x00d000 Byte 2 of 3
274 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
275 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
276 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
277 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
278
279 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
280 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
281 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
282 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
283
284 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
285 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
286 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
287 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
288
289 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
290 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
291 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
292 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
293
294 // state[8] 0x00d800 Byte 3 of 3
295 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
296 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
297 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
298 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
299
300 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
301 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
302 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
303 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
304
305 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
306 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
307 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
308 RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_, RJ_,
309
310 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
311 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
312 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
313 X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__, X__,
314 };
315
316 // Remap base[0] = (del, add, string_offset)
317 static const RemapEntry utf8acceptnonsurrogates_remap_base[] = {
318 {0, 0, 0} };
319
320 // Remap string[0]
321 static const unsigned char utf8acceptnonsurrogates_remap_string[] = {
322 0 };
323
324 static const unsigned char utf8acceptnonsurrogates_fast[256] = {
325 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
326 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
327 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
328 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
329
330 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
331 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
332 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
333 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
334
335 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
336 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
337 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
338 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
339
340 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
341 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
342 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
343 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
344 };
345
346 static const UTF8ScanObj utf8acceptnonsurrogates_obj = {
347 utf8acceptnonsurrogates_STATE0,
348 utf8acceptnonsurrogates_STATE0_SIZE,
349 utf8acceptnonsurrogates_TOTAL_SIZE,
350 utf8acceptnonsurrogates_MAX_EXPAND_X4,
351 utf8acceptnonsurrogates_SHIFT,
352 utf8acceptnonsurrogates_BYTES,
353 utf8acceptnonsurrogates_LOSUB,
354 utf8acceptnonsurrogates_HIADD,
355 utf8acceptnonsurrogates,
356 utf8acceptnonsurrogates_remap_base,
357 utf8acceptnonsurrogates_remap_string,
358 utf8acceptnonsurrogates_fast
359 };
360
361
362 #undef X__
363 #undef RJ_
364 #undef S1_
365 #undef S2_
366 #undef S3_
367 #undef S21
368 #undef S31
369 #undef S32
370 #undef T1_
371 #undef T2_
372 #undef S11
373 #undef SP_
374 #undef D__
375 #undef RJA
376
377 // Return true if current Tbl pointer is within state0 range
378 // Note that unsigned compare checks both ends of range simultaneously
InStateZero(const UTF8ScanObj * st,const uint8 * Tbl)379 static inline bool InStateZero(const UTF8ScanObj* st, const uint8* Tbl) {
380 const uint8* Tbl0 = &st->state_table[st->state0];
381 return (static_cast<uint32>(Tbl - Tbl0) < st->state0_size);
382 }
383
384 // Scan a UTF-8 string based on state table.
385 // Always scan complete UTF-8 characters
386 // Set number of bytes scanned. Return reason for exiting
UTF8GenericScan(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)387 int UTF8GenericScan(const UTF8ScanObj* st,
388 const char * str,
389 int str_length,
390 int* bytes_consumed) {
391 *bytes_consumed = 0;
392 if (str_length == 0) return kExitOK;
393
394 int eshift = st->entry_shift;
395 const uint8* isrc = reinterpret_cast<const uint8*>(str);
396 const uint8* src = isrc;
397 const uint8* srclimit = isrc + str_length;
398 const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7;
399 const uint8* Tbl_0 = &st->state_table[st->state0];
400
401 DoAgain:
402 // Do state-table scan
403 int e = 0;
404 uint8 c;
405 const uint8* Tbl2 = &st->fast_state[0];
406 const uint32 losub = st->losub;
407 const uint32 hiadd = st->hiadd;
408 // Check initial few bytes one at a time until 8-byte aligned
409 //----------------------------
410 while ((((uintptr_t)src & 0x07) != 0) &&
411 (src < srclimit) &&
412 Tbl2[src[0]] == 0) {
413 src++;
414 }
415 if (((uintptr_t)src & 0x07) == 0) {
416 // Do fast for groups of 8 identity bytes.
417 // This covers a lot of 7-bit ASCII ~8x faster then the 1-byte loop,
418 // including slowing slightly on cr/lf/ht
419 //----------------------------
420 while (src < srclimit8) {
421 uint32 s0123 = (reinterpret_cast<const uint32 *>(src))[0];
422 uint32 s4567 = (reinterpret_cast<const uint32 *>(src))[1];
423 src += 8;
424 // This is a fast range check for all bytes in [lowsub..0x80-hiadd)
425 uint32 temp = (s0123 - losub) | (s0123 + hiadd) |
426 (s4567 - losub) | (s4567 + hiadd);
427 if ((temp & 0x80808080) != 0) {
428 // We typically end up here on cr/lf/ht; src was incremented
429 int e0123 = (Tbl2[src[-8]] | Tbl2[src[-7]]) |
430 (Tbl2[src[-6]] | Tbl2[src[-5]]);
431 if (e0123 != 0) {
432 src -= 8;
433 break;
434 } // Exit on Non-interchange
435 e0123 = (Tbl2[src[-4]] | Tbl2[src[-3]]) |
436 (Tbl2[src[-2]] | Tbl2[src[-1]]);
437 if (e0123 != 0) {
438 src -= 4;
439 break;
440 } // Exit on Non-interchange
441 // Else OK, go around again
442 }
443 }
444 }
445 //----------------------------
446
447 // Byte-at-a-time scan
448 //----------------------------
449 const uint8* Tbl = Tbl_0;
450 while (src < srclimit) {
451 c = *src;
452 e = Tbl[c];
453 src++;
454 if (e >= kExitIllegalStructure) {break;}
455 Tbl = &Tbl_0[e << eshift];
456 }
457 //----------------------------
458
459 // Exit possibilities:
460 // Some exit code, !state0, back up over last char
461 // Some exit code, state0, back up one byte exactly
462 // source consumed, !state0, back up over partial char
463 // source consumed, state0, exit OK
464 // For illegal byte in state0, avoid backup up over PREVIOUS char
465 // For truncated last char, back up to beginning of it
466
467 if (e >= kExitIllegalStructure) {
468 // Back up over exactly one byte of rejected/illegal UTF-8 character
469 src--;
470 // Back up more if needed
471 if (!InStateZero(st, Tbl)) {
472 do {
473 src--;
474 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
475 }
476 } else if (!InStateZero(st, Tbl)) {
477 // Back up over truncated UTF-8 character
478 e = kExitIllegalStructure;
479 do {
480 src--;
481 } while ((src > isrc) && ((src[0] & 0xc0) == 0x80));
482 } else {
483 // Normal termination, source fully consumed
484 e = kExitOK;
485 }
486
487 if (e == kExitDoAgain) {
488 // Loop back up to the fast scan
489 goto DoAgain;
490 }
491
492 *bytes_consumed = src - isrc;
493 return e;
494 }
495
UTF8GenericScanFastAscii(const UTF8ScanObj * st,const char * str,int str_length,int * bytes_consumed)496 int UTF8GenericScanFastAscii(const UTF8ScanObj* st,
497 const char * str,
498 int str_length,
499 int* bytes_consumed) {
500 *bytes_consumed = 0;
501 if (str_length == 0) return kExitOK;
502
503 const uint8* isrc = reinterpret_cast<const uint8*>(str);
504 const uint8* src = isrc;
505 const uint8* srclimit = isrc + str_length;
506 const uint8* srclimit8 = str_length < 7 ? isrc : srclimit - 7;
507 int n;
508 int rest_consumed;
509 int exit_reason;
510 do {
511 // Check initial few bytes one at a time until 8-byte aligned
512 while ((((uintptr_t)src & 0x07) != 0) &&
513 (src < srclimit) && (src[0] < 0x80)) {
514 src++;
515 }
516 if (((uintptr_t)src & 0x07) == 0) {
517 while ((src < srclimit8) &&
518 (((reinterpret_cast<const uint32*>(src)[0] |
519 reinterpret_cast<const uint32*>(src)[1]) & 0x80808080) == 0)) {
520 src += 8;
521 }
522 }
523 while ((src < srclimit) && (src[0] < 0x80)) {
524 src++;
525 }
526 // Run state table on the rest
527 n = src - isrc;
528 exit_reason = UTF8GenericScan(st, str + n, str_length - n, &rest_consumed);
529 src += rest_consumed;
530 } while ( exit_reason == kExitDoAgain );
531
532 *bytes_consumed = src - isrc;
533 return exit_reason;
534 }
535
536 // Hack: On some compilers the static tables are initialized at startup.
537 // We can't use them until they are initialized. However, some Protocol
538 // Buffer parsing happens at static init time and may try to validate
539 // UTF-8 strings. Since UTF-8 validation is only used for debugging
540 // anyway, we simply always return success if initialization hasn't
541 // occurred yet.
542 namespace {
543
544 bool module_initialized_ = false;
545
546 struct InitDetector {
InitDetectorgoogle::protobuf::internal::__anon6beaf6f30311::InitDetector547 InitDetector() {
548 module_initialized_ = true;
549 }
550 };
551 InitDetector init_detector;
552
553 } // namespace
554
IsStructurallyValidUTF8(const char * buf,int len)555 bool IsStructurallyValidUTF8(const char* buf, int len) {
556 if (!module_initialized_) return true;
557
558 int bytes_consumed = 0;
559 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
560 buf, len, &bytes_consumed);
561 return (bytes_consumed == len);
562 }
563
UTF8SpnStructurallyValid(StringPiece str)564 int UTF8SpnStructurallyValid(StringPiece str) {
565 if (!module_initialized_) return str.size();
566
567 int bytes_consumed = 0;
568 UTF8GenericScanFastAscii(&utf8acceptnonsurrogates_obj,
569 str.data(), str.size(), &bytes_consumed);
570 return bytes_consumed;
571 }
572
573 // Coerce UTF-8 byte string in src_str to be
574 // a structurally-valid equal-length string by selectively
575 // overwriting illegal bytes with replace_char (typically blank).
576 // replace_char must be legal printable 7-bit Ascii 0x20..0x7e.
577 // src_str is read-only. If any overwriting is needed, a modified byte string
578 // is created in idst, length isrclen.
579 //
580 // Returns pointer to output buffer, isrc if no changes were made,
581 // or idst if some bytes were changed.
582 //
583 // Fast case: all is structurally valid and no byte copying is done.
584 //
UTF8CoerceToStructurallyValid(StringPiece src_str,char * idst,const char replace_char)585 char* UTF8CoerceToStructurallyValid(StringPiece src_str, char* idst,
586 const char replace_char) {
587 const char* isrc = src_str.data();
588 const int len = src_str.length();
589 int n = UTF8SpnStructurallyValid(src_str);
590 if (n == len) { // Normal case -- all is cool, return
591 return const_cast<char*>(isrc);
592 } else { // Unusual case -- copy w/o bad bytes
593 const char* src = isrc;
594 const char* srclimit = isrc + len;
595 char* dst = idst;
596 memmove(dst, src, n); // Copy initial good chunk
597 src += n;
598 dst += n;
599 while (src < srclimit) { // src points to bogus byte or is off the end
600 dst[0] = replace_char; // replace one bad byte
601 src++;
602 dst++;
603 StringPiece str2(src, srclimit - src);
604 n = UTF8SpnStructurallyValid(str2); // scan the remainder
605 memmove(dst, src, n); // copy next good chunk
606 src += n;
607 dst += n;
608 }
609 }
610 return idst;
611 }
612
613 } // namespace internal
614 } // namespace protobuf
615 } // namespace google
616