1 /*
2 ** upb::Decoder (Bytecode Decoder VM)
3 **
4 ** Bytecode must previously have been generated using the bytecode compiler in
5 ** compile_decoder.c. This decoder then walks through the bytecode op-by-op to
6 ** parse the input.
7 **
8 ** Decoding is fully resumable; we just keep a pointer to the current bytecode
9 ** instruction and resume from there. A fair amount of the logic here is to
10 ** handle the fact that values can span buffer seams and we have to be
11 ** capable of suspending/resuming from any byte in the stream. This
12 ** sometimes requires keeping a few trailing bytes from the last buffer around
13 ** in the "residual" buffer.
14 */
15
16 #include <inttypes.h>
17 #include <stddef.h>
18 #include "upb/pb/decoder.int.h"
19 #include "upb/pb/varint.int.h"
20
21 #ifdef UPB_DUMP_BYTECODE
22 #include <stdio.h>
23 #endif
24
25 #include "upb/port_def.inc"
26
/* Suspends the decoder and returns the suspend byte count from the enclosing
 * function when "x" evaluates to false.  Expects a upb_pbdecoder pointer
 * named "d" in scope.
 *
 * The body is wrapped in do/while(0) so that the expansion is exactly one
 * statement: a bare "if" here could silently capture an "else" that follows
 * the macro invocation.  Invoke it with a trailing semicolon. */
#define CHECK_SUSPEND(x)                       \
  do {                                         \
    if (!(x)) return upb_pbdecoder_suspend(d); \
  } while (0)
28
29 /* Error messages that are shared between the bytecode and JIT decoders. */
/* Both are reported by decoder_push(): the first when the frame stack is
 * already at d->limit, the second when a new frame would end after its
 * enclosing frame. */
30 const char *kPbDecoderStackOverflow = "Nesting too deep.";
31 const char *kPbDecoderSubmessageTooLong =
32 "Submessage end extends past enclosing submessage.";
33
34 /* Error messages shared within this file. */
/* Used by both the fast and the slow varint decoding paths. */
35 static const char *kUnterminatedVarint = "Unterminated varint.";
36
37 /* upb_pbdecoder **************************************************************/
38
/* A lone OP_HALT instruction.  upb_pbdecoder_startbc() stores its address in
 * the first call-stack slot, so the outermost OP_RET lands on OP_HALT. */
39 static opcode halt = OP_HALT;
40
41 /* A dummy character we can point to when the user passes us a NULL buffer.
42 * We need this because in C (NULL + 0) and (NULL - NULL) are undefined
43 * behavior, which would invalidate functions like curbufleft(). */
44 static const char dummy_char;
45
46 /* Whether an op consumes any of the input buffer. */
consumes_input(opcode op)47 static bool consumes_input(opcode op) {
48 switch (op) {
49 case OP_SETDISPATCH:
50 case OP_STARTMSG:
51 case OP_ENDMSG:
52 case OP_STARTSEQ:
53 case OP_ENDSEQ:
54 case OP_STARTSUBMSG:
55 case OP_ENDSUBMSG:
56 case OP_STARTSTR:
57 case OP_ENDSTR:
58 case OP_PUSHTAGDELIM:
59 case OP_POP:
60 case OP_SETDELIM:
61 case OP_SETBIGGROUPNUM:
62 case OP_CHECKDELIM:
63 case OP_CALL:
64 case OP_RET:
65 case OP_BRANCH:
66 return false;
67 default:
68 return true;
69 }
70 }
71
/* Number of bytes occupied by "entries" decoder frames.  The decoder argument
 * is not consulted. */
static size_t stacksize(upb_pbdecoder *d, size_t entries) {
  const size_t frame_bytes = sizeof(upb_pbdecoder_frame);
  UPB_UNUSED(d);
  return frame_bytes * entries;
}
76
/* Number of bytes occupied by "entries" call-stack slots, each of which holds
 * a pointer to a uint32_t.  The decoder argument is not consulted. */
static size_t callstacksize(upb_pbdecoder *d, size_t entries) {
  const size_t slot_bytes = sizeof(uint32_t *);
  UPB_UNUSED(d);
  return slot_bytes * entries;
}
82
83
84 static bool in_residual_buf(const upb_pbdecoder *d, const char *p);
85
86 /* It's unfortunate that we have to micro-manage the compiler with
87 * UPB_FORCEINLINE and UPB_NOINLINE, especially since this tuning is necessarily
88 * specific to one hardware configuration. But empirically on a Core i7,
89 * performance increases 30-50% with these annotations. Every instance where
90 * these appear, gcc 4.2.1 made the wrong decision and degraded performance in
91 * benchmarks. */
92
/* Records "msg" as the error message on the decoder's status object. */
static void seterr(upb_pbdecoder *d, const char *msg)
{
  upb_status_seterrmsg(d->status, msg);
}
96
/* Non-static entry point for recording an error message on the decoder's
 * status object. */
void upb_pbdecoder_seterr(upb_pbdecoder *d, const char *msg) {
  upb_status_seterrmsg(d->status, msg);
}
100
101
102 /* Buffering ******************************************************************/
103
104 /* We operate on one buffer at a time, which is either the user's buffer passed
105 * to our "decode" callback or some residual bytes from the previous buffer. */
106
107 /* How many bytes can be safely read from d->ptr without reading past end-of-buf
108 * or past the current delimited end. */
curbufleft(const upb_pbdecoder * d)109 static size_t curbufleft(const upb_pbdecoder *d) {
110 UPB_ASSERT(d->data_end >= d->ptr);
111 return d->data_end - d->ptr;
112 }
113
114 /* How many bytes are available before end-of-buffer. */
bufleft(const upb_pbdecoder * d)115 static size_t bufleft(const upb_pbdecoder *d) {
116 return d->end - d->ptr;
117 }
118
119 /* Overall stream offset of d->ptr. */
offset(const upb_pbdecoder * d)120 uint64_t offset(const upb_pbdecoder *d) {
121 return d->bufstart_ofs + (d->ptr - d->buf);
122 }
123
124 /* How many bytes are available before the end of this delimited region. */
delim_remaining(const upb_pbdecoder * d)125 size_t delim_remaining(const upb_pbdecoder *d) {
126 return d->top->end_ofs - offset(d);
127 }
128
129 /* Advances d->ptr. */
advance(upb_pbdecoder * d,size_t len)130 static void advance(upb_pbdecoder *d, size_t len) {
131 UPB_ASSERT(curbufleft(d) >= len);
132 d->ptr += len;
133 }
134
/* Returns true if "p" lies within [buf, end], with both ends inclusive (a
 * pointer equal to "end" counts as inside).
 *
 * "p" may point into an object unrelated to [buf, end] (for example the
 * user's buffer when the range is the residual buffer).  Relational
 * comparison of pointers into different objects is undefined behavior in C,
 * so the comparison is performed on the integer values of the addresses. */
static bool in_buf(const char *p, const char *buf, const char *end) {
  const uintptr_t addr = (uintptr_t)p;
  return addr >= (uintptr_t)buf && addr <= (uintptr_t)end;
}
138
in_residual_buf(const upb_pbdecoder * d,const char * p)139 static bool in_residual_buf(const upb_pbdecoder *d, const char *p) {
140 return in_buf(p, d->residual, d->residual_end);
141 }
142
143 /* Calculates the delim_end value, which is affected by both the current buffer
144 * and the parsing stack, so must be called whenever either is updated. */
set_delim_end(upb_pbdecoder * d)145 static void set_delim_end(upb_pbdecoder *d) {
146 size_t delim_ofs = d->top->end_ofs - d->bufstart_ofs;
147 if (delim_ofs <= (size_t)(d->end - d->buf)) {
148 d->delim_end = d->buf + delim_ofs;
149 d->data_end = d->delim_end;
150 } else {
151 d->data_end = d->end;
152 d->delim_end = NULL;
153 }
154 }
155
/* Makes [buf, end) the current buffer, places the read pointer at its start
 * and recomputes the delimited end for the new buffer. */
static void switchtobuf(upb_pbdecoder *d, const char *buf, const char *end) {
  d->buf = buf;
  d->end = end;
  d->ptr = buf;
  set_delim_end(d);
}
162
/* Moves from a fully consumed current buffer to the buffer [buf, buf + len),
 * adding the old buffer's length to the stream offset of the buffer start. */
static void advancetobuf(upb_pbdecoder *d, const char *buf, size_t len) {
  const char *next_end = buf + len;
  UPB_ASSERT(curbufleft(d) == 0);
  d->bufstart_ofs += (d->end - d->buf);
  switchtobuf(d, buf, next_end);
}
168
/* Records the current read position as the point to fall back to on suspend.
 *
 * The assertion guards efficiency rather than correctness: it catches callers
 * that checkpoint again without having made progress. */
static void checkpoint(upb_pbdecoder *d) {
  const char *cur = d->ptr;
  UPB_ASSERT(d->checkpoint != cur);
  d->checkpoint = cur;
}
176
177 /* Skips "bytes" bytes in the stream, which may be more than available. If we
178 * skip more bytes than are available, we return a long read count to the caller
179 * indicating how many bytes can be skipped over before passing actual data
180 * again. Skipped bytes can pass a NULL buffer and the decoder guarantees they
181 * won't actually be read.
182 */
skip(upb_pbdecoder * d,size_t bytes)183 static int32_t skip(upb_pbdecoder *d, size_t bytes) {
184 UPB_ASSERT(!in_residual_buf(d, d->ptr) || d->size_param == 0);
185 UPB_ASSERT(d->skip == 0);
186 if (bytes > delim_remaining(d)) {
187 seterr(d, "Skipped value extended beyond enclosing submessage.");
188 return (int32_t)upb_pbdecoder_suspend(d);
189 } else if (bufleft(d) >= bytes) {
190 /* Skipped data is all in current buffer, and more is still available. */
191 advance(d, bytes);
192 d->skip = 0;
193 return DECODE_OK;
194 } else {
195 /* Skipped data extends beyond currently available buffers. */
196 d->pc = d->last;
197 d->skip = bytes - curbufleft(d);
198 d->bufstart_ofs += (d->end - d->buf);
199 d->residual_end = d->residual;
200 switchtobuf(d, d->residual, d->residual_end);
201 return (int32_t)(d->size_param + d->skip);
202 }
203 }
204
205
206 /* Resumes the decoder from an initial state or from a previous suspend. */
/* Parameters:
 *   d      - decoder to resume.
 *   p      - not used by this function.
 *   buf    - the caller's buffer; may be NULL only when all "size" bytes are
 *            covered by a pending skip.
 *   size   - number of bytes in buf; recorded in d->size_param.
 *   handle - recorded in d->handle.
 *
 * Returns DECODE_OK once the buffer is installed and any pending skip or
 * unknown-group parsing has completed.  Otherwise returns whatever
 * upb_pbdecoder_suspend(), skip() or upb_pbdecoder_skipunknown() produced. */
upb_pbdecoder_resume(upb_pbdecoder * d,void * p,const char * buf,size_t size,const upb_bufhandle * handle)207 int32_t upb_pbdecoder_resume(upb_pbdecoder *d, void *p, const char *buf,
208 size_t size, const upb_bufhandle *handle) {
209 UPB_UNUSED(p); /* Useless; just for the benefit of the JIT. */
210
211 /* d->skip and d->residual_end could probably elegantly be represented
212 * as a single variable, to more easily represent this invariant. */
213 UPB_ASSERT(!(d->skip && d->residual_end > d->residual));
214
215 /* We need to remember the original size_param, so that the value we return
216 * is relative to it, even if we do some skipping first. */
217 d->size_param = size;
218 d->handle = handle;
219
220 /* Have to handle this case specially (ie. not with skip()) because the user
221 * is allowed to pass a NULL buffer here, which won't allow us to safely
222 * calculate a d->end or use our normal functions like curbufleft(). */
223 if (d->skip && d->skip >= size) {
224 d->skip -= size;
225 d->bufstart_ofs += size;
226 buf = &dummy_char;
227 size = 0;
228
229 /* We can't just return now, because we might need to execute some ops
230 * like CHECKDELIM, which could call some callbacks and pop the stack. */
231 }
232
233 /* We need to pretend that this was the actual buffer param, since some of the
234 * calculations assume that d->ptr/d->buf is relative to this. */
235 d->buf_param = buf;
236
237 if (!buf) {
238 /* NULL buf is ok if its entire span is covered by the "skip" above, but
239 * by this point we know that "skip" doesn't cover the buffer. */
240 seterr(d, "Passed NULL buffer over non-skippable region.");
241 return (int32_t)upb_pbdecoder_suspend(d);
242 }
243
244 if (d->residual_end > d->residual) {
245 /* We have residual bytes from the last buffer. */
246 UPB_ASSERT(d->ptr == d->residual);
247 } else {
248 switchtobuf(d, buf, buf + size);
249 }
250
/* Nothing from this call has been consumed yet: start the checkpoint at the
 * current read position. */
251 d->checkpoint = d->ptr;
252
253 /* Handle skips that don't cover the whole buffer (as above). */
254 if (d->skip) {
255 size_t skip_bytes = d->skip;
/* skip() asserts that d->skip is zero on entry, so clear it first. */
256 d->skip = 0;
257 CHECK_RETURN(skip(d, skip_bytes));
258 checkpoint(d);
259 }
260
261 /* If we're inside an unknown group, continue to parse unknown values. */
262 if (d->top->groupnum < 0) {
263 CHECK_RETURN(upb_pbdecoder_skipunknown(d, -1, 0));
264 checkpoint(d);
265 }
266
267 return DECODE_OK;
268 }
269
270 /* Suspends the decoder at the last checkpoint, without saving any residual
271 * bytes. If there are any unconsumed bytes, returns a short byte count. */
upb_pbdecoder_suspend(upb_pbdecoder * d)272 size_t upb_pbdecoder_suspend(upb_pbdecoder *d) {
273 d->pc = d->last;
274 if (d->checkpoint == d->residual) {
275 /* Checkpoint was in residual buf; no user bytes were consumed. */
276 d->ptr = d->residual;
277 return 0;
278 } else {
279 size_t ret = d->size_param - (d->end - d->checkpoint);
280 UPB_ASSERT(!in_residual_buf(d, d->checkpoint));
281 UPB_ASSERT(d->buf == d->buf_param || d->buf == &dummy_char);
282
283 d->bufstart_ofs += (d->checkpoint - d->buf);
284 d->residual_end = d->residual;
285 switchtobuf(d, d->residual, d->residual_end);
286 return ret;
287 }
288 }
289
290 /* Suspends the decoder at the last checkpoint, and saves any unconsumed
291 * bytes in our residual buffer. This is necessary if we need more user
292 * bytes to form a complete value, which might not be contiguous in the
293 * user's buffers. Always consumes all user bytes. */
/* Returns d->size_param, i.e. reports the caller's whole buffer as consumed.
 * On return the residual buffer is the current buffer. */
suspend_save(upb_pbdecoder * d)294 static size_t suspend_save(upb_pbdecoder *d) {
295 /* We hit end-of-buffer before we could parse a full value.
296 * Save any unconsumed bytes (if any) to the residual buffer. */
/* Resume at the instruction that was in progress. */
297 d->pc = d->last;
298
299 if (d->checkpoint == d->residual) {
300 /* Checkpoint was in residual buf; append user byte(s) to residual buf. */
301 UPB_ASSERT((d->residual_end - d->residual) + d->size_param <=
302 sizeof(d->residual));
303 if (!in_residual_buf(d, d->ptr)) {
/* The read pointer is outside the residual buffer here, so move the
 * buffer-start offset back by the number of residual bytes being kept. */
304 d->bufstart_ofs -= (d->residual_end - d->residual);
305 }
306 memcpy(d->residual_end, d->buf_param, d->size_param);
307 d->residual_end += d->size_param;
308 } else {
309 /* Checkpoint was in user buf; old residual bytes not needed. */
310 size_t save;
311 UPB_ASSERT(!in_residual_buf(d, d->checkpoint));
312
/* Rewind to the checkpoint first: both curbufleft() and offset() below are
 * computed from d->ptr. */
313 d->ptr = d->checkpoint;
314 save = curbufleft(d);
315 UPB_ASSERT(save <= sizeof(d->residual));
316 memcpy(d->residual, d->ptr, save);
317 d->residual_end = d->residual + save;
318 d->bufstart_ofs = offset(d);
319 }
320
321 switchtobuf(d, d->residual, d->residual_end);
322 return d->size_param;
323 }
324
325 /* Copies the next "bytes" bytes into "buf" and advances the stream.
326 * Requires that this many bytes are available in the current buffer. */
consumebytes(upb_pbdecoder * d,void * buf,size_t bytes)327 UPB_FORCEINLINE static void consumebytes(upb_pbdecoder *d, void *buf,
328 size_t bytes) {
329 UPB_ASSERT(bytes <= curbufleft(d));
330 memcpy(buf, d->ptr, bytes);
331 advance(d, bytes);
332 }
333
334 /* Slow path for getting the next "bytes" bytes, regardless of whether they are
335 * available in the current buffer or not. Returns a status code as described
336 * in decoder.int.h. */
getbytes_slow(upb_pbdecoder * d,void * buf,size_t bytes)337 UPB_NOINLINE static int32_t getbytes_slow(upb_pbdecoder *d, void *buf,
338 size_t bytes) {
339 const size_t avail = curbufleft(d);
340 consumebytes(d, buf, avail);
341 bytes -= avail;
342 UPB_ASSERT(bytes > 0);
343 if (in_residual_buf(d, d->ptr)) {
344 advancetobuf(d, d->buf_param, d->size_param);
345 }
346 if (curbufleft(d) >= bytes) {
347 consumebytes(d, (char *)buf + avail, bytes);
348 return DECODE_OK;
349 } else if (d->data_end == d->delim_end) {
350 seterr(d, "Submessage ended in the middle of a value or group");
351 return (int32_t)upb_pbdecoder_suspend(d);
352 } else {
353 return (int32_t)suspend_save(d);
354 }
355 }
356
357 /* Gets the next "bytes" bytes, regardless of whether they are available in the
358 * current buffer or not. Returns a status code as described in decoder.int.h.
359 */
getbytes(upb_pbdecoder * d,void * buf,size_t bytes)360 UPB_FORCEINLINE static int32_t getbytes(upb_pbdecoder *d, void *buf,
361 size_t bytes) {
362 if (curbufleft(d) >= bytes) {
363 /* Buffer has enough data to satisfy. */
364 consumebytes(d, buf, bytes);
365 return DECODE_OK;
366 } else {
367 return getbytes_slow(d, buf, bytes);
368 }
369 }
370
peekbytes_slow(upb_pbdecoder * d,void * buf,size_t bytes)371 UPB_NOINLINE static size_t peekbytes_slow(upb_pbdecoder *d, void *buf,
372 size_t bytes) {
373 size_t ret = curbufleft(d);
374 memcpy(buf, d->ptr, ret);
375 if (in_residual_buf(d, d->ptr)) {
376 size_t copy = UPB_MIN(bytes - ret, d->size_param);
377 memcpy((char *)buf + ret, d->buf_param, copy);
378 ret += copy;
379 }
380 return ret;
381 }
382
peekbytes(upb_pbdecoder * d,void * buf,size_t bytes)383 UPB_FORCEINLINE static size_t peekbytes(upb_pbdecoder *d, void *buf,
384 size_t bytes) {
385 if (curbufleft(d) >= bytes) {
386 memcpy(buf, d->ptr, bytes);
387 return bytes;
388 } else {
389 return peekbytes_slow(d, buf, bytes);
390 }
391 }
392
393
394 /* Decoding of wire types *****************************************************/
395
396 /* Slow path for decoding a varint from the current buffer position.
397 * Returns a status code as described in decoder.int.h. */
upb_pbdecoder_decode_varint_slow(upb_pbdecoder * d,uint64_t * u64)398 UPB_NOINLINE int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d,
399 uint64_t *u64) {
400 uint8_t byte = 0x80;
401 int bitpos;
402 *u64 = 0;
403 for(bitpos = 0; bitpos < 70 && (byte & 0x80); bitpos += 7) {
404 CHECK_RETURN(getbytes(d, &byte, 1));
405 *u64 |= (uint64_t)(byte & 0x7F) << bitpos;
406 }
407 if(bitpos == 70 && (byte & 0x80)) {
408 seterr(d, kUnterminatedVarint);
409 return (int32_t)upb_pbdecoder_suspend(d);
410 }
411 return DECODE_OK;
412 }
413
414 /* Decodes a varint from the current buffer position.
415 * Returns a status code as described in decoder.int.h. */
decode_varint(upb_pbdecoder * d,uint64_t * u64)416 UPB_FORCEINLINE static int32_t decode_varint(upb_pbdecoder *d, uint64_t *u64) {
417 if (curbufleft(d) > 0 && !(*d->ptr & 0x80)) {
418 *u64 = *d->ptr;
419 advance(d, 1);
420 return DECODE_OK;
421 } else if (curbufleft(d) >= 10) {
422 /* Fast case. */
423 upb_decoderet r = upb_vdecode_fast(d->ptr);
424 if (r.p == NULL) {
425 seterr(d, kUnterminatedVarint);
426 return (int32_t)upb_pbdecoder_suspend(d);
427 }
428 advance(d, r.p - d->ptr);
429 *u64 = r.val;
430 return DECODE_OK;
431 } else {
432 /* Slow case -- varint spans buffer seam. */
433 return upb_pbdecoder_decode_varint_slow(d, u64);
434 }
435 }
436
437 /* Decodes a 32-bit varint from the current buffer position.
438 * Returns a status code as described in decoder.int.h. */
decode_v32(upb_pbdecoder * d,uint32_t * u32)439 UPB_FORCEINLINE static int32_t decode_v32(upb_pbdecoder *d, uint32_t *u32) {
440 uint64_t u64;
441 int32_t ret = decode_varint(d, &u64);
442 if (ret >= 0) return ret;
443 if (u64 > UINT32_MAX) {
444 seterr(d, "Unterminated 32-bit varint");
445 /* TODO(haberman) guarantee that this function return is >= 0 somehow,
446 * so we know this path will always be treated as error by our caller.
447 * Right now the size_t -> int32_t can overflow and produce negative values.
448 */
449 *u32 = 0;
450 return (int32_t)upb_pbdecoder_suspend(d);
451 }
452 *u32 = (uint32_t)u64;
453 return DECODE_OK;
454 }
455
456 /* Decodes a fixed32 from the current buffer position.
457 * Returns a status code as described in decoder.int.h.
458 * TODO: proper byte swapping for big-endian machines. */
decode_fixed32(upb_pbdecoder * d,uint32_t * u32)459 UPB_FORCEINLINE static int32_t decode_fixed32(upb_pbdecoder *d, uint32_t *u32) {
460 return getbytes(d, u32, 4);
461 }
462
463 /* Decodes a fixed64 from the current buffer position.
464 * Returns a status code as described in decoder.int.h.
465 * TODO: proper byte swapping for big-endian machines. */
decode_fixed64(upb_pbdecoder * d,uint64_t * u64)466 UPB_FORCEINLINE static int32_t decode_fixed64(upb_pbdecoder *d, uint64_t *u64) {
467 return getbytes(d, u64, 8);
468 }
469
470 /* Non-static versions of the above functions.
471 * These are called by the JIT for fallback paths. */
upb_pbdecoder_decode_f32(upb_pbdecoder * d,uint32_t * u32)472 int32_t upb_pbdecoder_decode_f32(upb_pbdecoder *d, uint32_t *u32) {
473 return decode_fixed32(d, u32);
474 }
475
/* Non-static wrapper around decode_fixed64().  Returns the wrapped function's
 * status code unchanged. */
int32_t upb_pbdecoder_decode_f64(upb_pbdecoder *d, uint64_t *u64) {
  const int32_t status = decode_fixed64(d, u64);
  return status;
}
479
/* Reinterprets the bit pattern of "n" as a double.  The copy length is taken
 * from the destination object rather than a hard-coded byte count, so the
 * destination can never be overrun.  Assumes double is 64 bits wide. */
static double as_double(uint64_t n) {
  double d;
  memcpy(&d, &n, sizeof(d));
  return d;
}
/* Reinterprets the bit pattern of "n" as a float.  The copy length is taken
 * from the destination object rather than a hard-coded byte count, so the
 * destination can never be overrun.  Assumes float is 32 bits wide. */
static float as_float(uint32_t n) {
  float f;
  memcpy(&f, &n, sizeof(f));
  return f;
}
482
483 /* Pushes a frame onto the decoder stack. */
decoder_push(upb_pbdecoder * d,uint64_t end)484 static bool decoder_push(upb_pbdecoder *d, uint64_t end) {
485 upb_pbdecoder_frame *fr = d->top;
486
487 if (end > fr->end_ofs) {
488 seterr(d, kPbDecoderSubmessageTooLong);
489 return false;
490 } else if (fr == d->limit) {
491 seterr(d, kPbDecoderStackOverflow);
492 return false;
493 }
494
495 fr++;
496 fr->end_ofs = end;
497 fr->dispatch = NULL;
498 fr->groupnum = 0;
499 d->top = fr;
500 return true;
501 }
502
/* Pushes a frame for a tag-delimited region and stores "arg" as its group
 * number.  Returns false (with an error already set by decoder_push()) if the
 * push fails.
 *
 * While we expect to see an "end" tag (either ENDGROUP or a non-sequence
 * field number) before hitting any enclosing submessage end, reusing the
 * existing delimited end for the new frame stops us from continuing to parse
 * values from a corrupt proto that never supplies the END tag. */
static bool pushtagdelim(upb_pbdecoder *d, uint32_t arg) {
  const bool pushed = decoder_push(d, d->top->end_ofs);
  if (pushed) {
    d->top->groupnum = arg;
  }
  return pushed;
}
513
514 /* Pops a frame from the decoder stack. */
decoder_pop(upb_pbdecoder * d)515 static void decoder_pop(upb_pbdecoder *d) { d->top--; }
516
upb_pbdecoder_checktag_slow(upb_pbdecoder * d,uint64_t expected)517 UPB_NOINLINE int32_t upb_pbdecoder_checktag_slow(upb_pbdecoder *d,
518 uint64_t expected) {
519 uint64_t data = 0;
520 size_t bytes = upb_value_size(expected);
521 size_t read = peekbytes(d, &data, bytes);
522 if (read == bytes && data == expected) {
523 /* Advance past matched bytes. */
524 int32_t ok = getbytes(d, &data, read);
525 UPB_ASSERT(ok < 0);
526 return DECODE_OK;
527 } else if (read < bytes && memcmp(&data, &expected, read) == 0) {
528 return (int32_t)suspend_save(d);
529 } else {
530 return DECODE_MISMATCH;
531 }
532 }
533
/* Skips over the value of an unknown field.
 *
 * If "fieldnum" is non-negative, the caller has already decoded the tag and
 * passes its field number and wire type.  If it is negative, the tag is
 * decoded from the input first.
 *
 * After each value: when the top frame's group number is non-negative, the
 * bytes between the checkpoint and the read pointer are passed to
 * upb_sink_putunknown() and DECODE_OK is returned.  Otherwise (inside an
 * unknown group, marked by a negative group number) the function checkpoints
 * and loops to the next tag.
 *
 * Returns DECODE_ENDGROUP when an END_GROUP tag matches the top frame's group
 * number.  Error and suspend paths return the value of
 * upb_pbdecoder_suspend() or of the failing helper. */
upb_pbdecoder_skipunknown(upb_pbdecoder * d,int32_t fieldnum,uint8_t wire_type)534 int32_t upb_pbdecoder_skipunknown(upb_pbdecoder *d, int32_t fieldnum,
535 uint8_t wire_type) {
/* The caller already has a tag: jump into the loop body past the decode. */
536 if (fieldnum >= 0)
537 goto have_tag;
538
539 while (true) {
540 uint32_t tag;
541 CHECK_RETURN(decode_v32(d, &tag));
/* Low three bits are the wire type; the remaining bits the field number. */
542 wire_type = tag & 0x7;
543 fieldnum = tag >> 3;
544
545 have_tag:
546 if (fieldnum == 0) {
547 seterr(d, "Saw invalid field number (0)");
548 return (int32_t)upb_pbdecoder_suspend(d);
549 }
550
551 switch (wire_type) {
552 case UPB_WIRE_TYPE_32BIT:
553 CHECK_RETURN(skip(d, 4));
554 break;
555 case UPB_WIRE_TYPE_64BIT:
556 CHECK_RETURN(skip(d, 8));
557 break;
558 case UPB_WIRE_TYPE_VARINT: {
559 uint64_t u64;
560 CHECK_RETURN(decode_varint(d, &u64));
561 break;
562 }
563 case UPB_WIRE_TYPE_DELIMITED: {
564 uint32_t len;
565 CHECK_RETURN(decode_v32(d, &len));
566 CHECK_RETURN(skip(d, len));
567 break;
568 }
569 case UPB_WIRE_TYPE_START_GROUP:
/* Unknown groups are pushed with a negated field number; the sign is what
 * the checks on groupnum below use to tell them apart. */
570 if (!pushtagdelim(d, -fieldnum)) {
571 return (int32_t)upb_pbdecoder_suspend(d);
572 }
573 break;
574 case UPB_WIRE_TYPE_END_GROUP:
575 if (fieldnum == -d->top->groupnum) {
/* Closes the unknown group on top of the stack. */
576 decoder_pop(d);
577 } else if (fieldnum == d->top->groupnum) {
578 return DECODE_ENDGROUP;
579 } else {
580 seterr(d, "Unmatched ENDGROUP tag.");
581 return (int32_t)upb_pbdecoder_suspend(d);
582 }
583 break;
584 default:
585 seterr(d, "Invalid wire type");
586 return (int32_t)upb_pbdecoder_suspend(d);
587 }
588
589 if (d->top->groupnum >= 0) {
590 /* TODO: More code needed for handling unknown groups. */
591 upb_sink_putunknown(d->top->sink, d->checkpoint, d->ptr - d->checkpoint);
592 return DECODE_OK;
593 }
594
595 /* Unknown group -- continue looping over unknown fields. */
596 checkpoint(d);
597 }
598 }
599
/* Points d->pc at the end-of-message bytecode of the top frame, which is
 * looked up under DISPATCH_ENDMSG in that frame's dispatch table.  The entry
 * must exist (checked by assertion only). */
static void goto_endmsg(upb_pbdecoder *d) {
  upb_value entry;
  bool found =
      upb_inttable_lookup32(d->top->dispatch, DISPATCH_ENDMSG, &entry);
  UPB_ASSERT(found);
  d->pc = d->top->base + upb_value_getuint64(entry);
}
606
607 /* Parses a tag and jumps to the corresponding bytecode instruction for this
608 * field.
609 *
610 * If the tag is unknown (or the wire type doesn't match), parses the field as
611 * unknown. If the tag is a valid ENDGROUP tag, jumps to the bytecode
612 * instruction for the end of message. */
dispatch(upb_pbdecoder * d)613 static int32_t dispatch(upb_pbdecoder *d) {
614 upb_inttable *dispatch = d->top->dispatch;
615 uint32_t tag;
616 uint8_t wire_type;
617 uint32_t fieldnum;
618 upb_value val;
619 int32_t retval;
620
621 /* Decode tag. */
622 CHECK_RETURN(decode_v32(d, &tag));
623 wire_type = tag & 0x7;
624 fieldnum = tag >> 3;
625
626 /* Lookup tag. Because of packed/non-packed compatibility, we have to
627 * check the wire type against two possibilities. */
628 if (fieldnum != DISPATCH_ENDMSG &&
629 upb_inttable_lookup32(dispatch, fieldnum, &val)) {
630 uint64_t v = upb_value_getuint64(val);
631 if (wire_type == (v & 0xff)) {
632 d->pc = d->top->base + (v >> 16);
633 return DECODE_OK;
634 } else if (wire_type == ((v >> 8) & 0xff)) {
635 bool found =
636 upb_inttable_lookup(dispatch, fieldnum + UPB_MAX_FIELDNUMBER, &val);
637 UPB_ASSERT(found);
638 d->pc = d->top->base + upb_value_getuint64(val);
639 return DECODE_OK;
640 }
641 }
642
643 /* We have some unknown fields (or ENDGROUP) to parse. The DISPATCH or TAG
644 * bytecode that triggered this is preceded by a CHECKDELIM bytecode which
645 * we need to back up to, so that when we're done skipping unknown data we
646 * can re-check the delimited end. */
647 d->last--; /* Necessary if we get suspended */
648 d->pc = d->last;
649 UPB_ASSERT(getop(*d->last) == OP_CHECKDELIM);
650
651 /* Unknown field or ENDGROUP. */
652 retval = upb_pbdecoder_skipunknown(d, fieldnum, wire_type);
653
654 CHECK_RETURN(retval);
655
656 if (retval == DECODE_ENDGROUP) {
657 goto_endmsg(d);
658 return DECODE_OK;
659 }
660
661 return DECODE_OK;
662 }
663
664 /* Callers know that the stack is more than one deep because the opcodes that
665 * call this only occur after PUSH operations. */
outer_frame(upb_pbdecoder * d)666 upb_pbdecoder_frame *outer_frame(upb_pbdecoder *d) {
667 UPB_ASSERT(d->top != d->stack);
668 return d->top - 1;
669 }
670
671
672 /* The main decoding loop *****************************************************/
673
674 /* The main decoder VM function. Uses traditional bytecode dispatch loop with a
675 * switch() statement. */
/* Starting at d->pc, executes one bytecode instruction per loop iteration
 * until an instruction returns.
 *
 *   d      - decoder state; d->pc must point at the next instruction.
 *   group  - only read by the UPB_DUMP_BYTECODE trace output.
 *   handle - forwarded to upb_sink_putstring() by OP_STRING.
 *
 * OP_HALT returns d->size_param.  Every other return hands back the value
 * produced by a suspend, skip or nested decode helper. */
run_decoder_vm(upb_pbdecoder * d,const mgroup * group,const upb_bufhandle * handle)676 size_t run_decoder_vm(upb_pbdecoder *d, const mgroup *group,
677 const upb_bufhandle* handle) {
678
/* VMCASE wraps the code for one opcode in a case label and calls checkpoint()
 * afterwards when consumes_input() says the op reads input.  PRIMITIVE_OP
 * builds the case for a scalar field: decode with decode_<wt>(), convert with
 * convfunc and deliver through upb_sink_put<name>(). */
679 #define VMCASE(op, code) \
680 case op: { code; if (consumes_input(op)) checkpoint(d); break; }
681 #define PRIMITIVE_OP(type, wt, name, convfunc, ctype) \
682 VMCASE(OP_PARSE_ ## type, { \
683 ctype val; \
684 CHECK_RETURN(decode_ ## wt(d, &val)); \
685 upb_sink_put ## name(d->top->sink, arg, (convfunc)(val)); \
686 })
687
688 while(1) {
689 int32_t instruction;
690 opcode op;
691 uint32_t arg;
692 int32_t longofs;
693
/* Remember the instruction about to run: the suspend functions reset d->pc
 * to d->last so that it is executed again on resume. */
694 d->last = d->pc;
695 instruction = *d->pc++;
696 op = getop(instruction);
697 arg = instruction >> 8;
698 longofs = arg;
699 UPB_ASSERT(d->ptr != d->residual_end);
700 UPB_UNUSED(group);
701 #ifdef UPB_DUMP_BYTECODE
702 fprintf(stderr, "s_ofs=%d buf_ofs=%d data_rem=%d buf_rem=%d delim_rem=%d "
703 "%x %s (%d)\n",
704 (int)offset(d),
705 (int)(d->ptr - d->buf),
706 (int)(d->data_end - d->ptr),
707 (int)(d->end - d->ptr),
708 (int)((d->top->end_ofs - d->bufstart_ofs) - (d->ptr - d->buf)),
709 (int)(d->pc - 1 - group->bytecode),
710 upb_pbdecoder_getopname(op),
711 arg);
712 #endif
713 switch (op) {
714 /* Technically, we are losing data if we see a 32-bit varint that is not
715 * properly sign-extended. We could detect this and error about the data
716 * loss, but proto2 does not do this, so we pass. */
717 PRIMITIVE_OP(INT32, varint, int32, int32_t, uint64_t)
718 PRIMITIVE_OP(INT64, varint, int64, int64_t, uint64_t)
719 PRIMITIVE_OP(UINT32, varint, uint32, uint32_t, uint64_t)
720 PRIMITIVE_OP(UINT64, varint, uint64, uint64_t, uint64_t)
721 PRIMITIVE_OP(FIXED32, fixed32, uint32, uint32_t, uint32_t)
722 PRIMITIVE_OP(FIXED64, fixed64, uint64, uint64_t, uint64_t)
723 PRIMITIVE_OP(SFIXED32, fixed32, int32, int32_t, uint32_t)
724 PRIMITIVE_OP(SFIXED64, fixed64, int64, int64_t, uint64_t)
725 PRIMITIVE_OP(BOOL, varint, bool, bool, uint64_t)
726 PRIMITIVE_OP(DOUBLE, fixed64, double, as_double, uint64_t)
727 PRIMITIVE_OP(FLOAT, fixed32, float, as_float, uint32_t)
728 PRIMITIVE_OP(SINT32, varint, int32, upb_zzdec_32, uint64_t)
729 PRIMITIVE_OP(SINT64, varint, int64, upb_zzdec_64, uint64_t)
730
731 VMCASE(OP_SETDISPATCH,
/* The pointer-sized dispatch table address follows the opcode word in the
 * bytecode; copy it out and step d->pc past it. */
732 d->top->base = d->pc - 1;
733 memcpy(&d->top->dispatch, d->pc, sizeof(void*));
734 d->pc += sizeof(void*) / sizeof(uint32_t);
735 )
736 VMCASE(OP_STARTMSG,
737 CHECK_SUSPEND(upb_sink_startmsg(d->top->sink));
738 )
739 VMCASE(OP_ENDMSG,
740 CHECK_SUSPEND(upb_sink_endmsg(d->top->sink, d->status));
741 )
742 VMCASE(OP_STARTSEQ,
743 upb_pbdecoder_frame *outer = outer_frame(d);
744 CHECK_SUSPEND(upb_sink_startseq(outer->sink, arg, &d->top->sink));
745 )
746 VMCASE(OP_ENDSEQ,
747 CHECK_SUSPEND(upb_sink_endseq(d->top->sink, arg));
748 )
749 VMCASE(OP_STARTSUBMSG,
750 upb_pbdecoder_frame *outer = outer_frame(d);
751 CHECK_SUSPEND(upb_sink_startsubmsg(outer->sink, arg, &d->top->sink));
752 )
753 VMCASE(OP_ENDSUBMSG,
/* The sub-message's sink is read from the frame slot just above d->top. */
754 upb_sink subsink = (d->top + 1)->sink;
755 CHECK_SUSPEND(upb_sink_endsubmsg(d->top->sink, subsink, arg));
756 )
757 VMCASE(OP_STARTSTR,
758 uint32_t len = (uint32_t)delim_remaining(d);
759 upb_pbdecoder_frame *outer = outer_frame(d);
760 CHECK_SUSPEND(upb_sink_startstr(outer->sink, arg, len, &d->top->sink));
761 if (len == 0) {
762 d->pc++; /* Skip OP_STRING. */
763 }
764 )
765 VMCASE(OP_STRING,
766 uint32_t len = (uint32_t)curbufleft(d);
767 size_t n = upb_sink_putstring(d->top->sink, arg, d->ptr, len, handle);
/* n > len: the sink reported more bytes than it was offered, which is
 * handled as a request to skip ahead. */
768 if (n > len) {
769 if (n > delim_remaining(d)) {
770 seterr(d, "Tried to skip past end of string.");
771 return upb_pbdecoder_suspend(d);
772 } else {
773 int32_t ret = skip(d, n);
774 /* This shouldn't return DECODE_OK, because n > len. */
775 UPB_ASSERT(ret >= 0);
776 return ret;
777 }
778 }
779 advance(d, n);
780 if (n < len || d->delim_end == NULL) {
781 /* We aren't finished with this string yet. */
782 d->pc--; /* Repeat OP_STRING. */
783 if (n > 0) checkpoint(d);
784 return upb_pbdecoder_suspend(d);
785 }
786 )
787 VMCASE(OP_ENDSTR,
788 CHECK_SUSPEND(upb_sink_endstr(d->top->sink, arg));
789 )
790 VMCASE(OP_PUSHTAGDELIM,
791 CHECK_SUSPEND(pushtagdelim(d, arg));
792 )
793 VMCASE(OP_SETBIGGROUPNUM,
/* The group number is stored in the bytecode word after the opcode. */
794 d->top->groupnum = *d->pc++;
795 )
796 VMCASE(OP_POP,
797 UPB_ASSERT(d->top > d->stack);
798 decoder_pop(d);
799 )
800 VMCASE(OP_PUSHLENDELIM,
801 uint32_t len;
802 CHECK_RETURN(decode_v32(d, &len));
803 CHECK_SUSPEND(decoder_push(d, offset(d) + len));
804 set_delim_end(d);
805 )
806 VMCASE(OP_SETDELIM,
807 set_delim_end(d);
808 )
809 VMCASE(OP_CHECKDELIM,
810 /* We are guaranteed of this assert because we never allow ourselves to
811 * consume bytes beyond data_end, which covers delim_end when non-NULL.
812 */
813 UPB_ASSERT(!(d->delim_end && d->ptr > d->delim_end));
814 if (d->ptr == d->delim_end)
815 d->pc += longofs;
816 )
817 VMCASE(OP_CALL,
/* Save the return address (the instruction after the call), then jump. */
818 d->callstack[d->call_len++] = d->pc;
819 d->pc += longofs;
820 )
821 VMCASE(OP_RET,
822 UPB_ASSERT(d->call_len > 0);
823 d->pc = d->callstack[--d->call_len];
824 )
825 VMCASE(OP_BRANCH,
826 d->pc += longofs;
827 )
828 VMCASE(OP_TAG1,
829 uint8_t expected;
830 CHECK_SUSPEND(curbufleft(d) > 0);
831 expected = (arg >> 8) & 0xff;
832 if (*d->ptr == expected) {
833 advance(d, 1);
834 } else {
835 int8_t shortofs;
/* Shared mismatch handler; OP_TAG2 and OP_TAGN jump here as well. */
836 badtag:
837 shortofs = arg;
838 if (shortofs == LABEL_DISPATCH) {
839 CHECK_RETURN(dispatch(d));
840 } else {
841 d->pc += shortofs;
842 break; /* Avoid checkpoint(). */
843 }
844 }
845 )
846 VMCASE(OP_TAG2,
847 uint16_t expected;
848 CHECK_SUSPEND(curbufleft(d) > 0);
849 expected = (arg >> 8) & 0xffff;
850 if (curbufleft(d) >= 2) {
851 uint16_t actual;
852 memcpy(&actual, d->ptr, 2);
853 if (expected == actual) {
854 advance(d, 2);
855 } else {
856 goto badtag;
857 }
858 } else {
859 int32_t result = upb_pbdecoder_checktag_slow(d, expected);
860 if (result == DECODE_MISMATCH) goto badtag;
861 if (result >= 0) return result;
862 }
863 )
864 VMCASE(OP_TAGN, {
865 uint64_t expected;
866 int32_t result;
/* The expected tag bytes occupy the two bytecode words after the opcode. */
867 memcpy(&expected, d->pc, 8);
868 d->pc += 2;
869 result = upb_pbdecoder_checktag_slow(d, expected);
870 if (result == DECODE_MISMATCH) goto badtag;
871 if (result >= 0) return result;
872 })
873 VMCASE(OP_DISPATCH, {
874 CHECK_RETURN(dispatch(d));
875 })
876 VMCASE(OP_HALT, {
877 return d->size_param;
878 })
879 }
880 }
881 }
882
883
884 /* BytesHandler handlers ******************************************************/
885
upb_pbdecoder_startbc(void * closure,const void * pc,size_t size_hint)886 void *upb_pbdecoder_startbc(void *closure, const void *pc, size_t size_hint) {
887 upb_pbdecoder *d = closure;
888 UPB_UNUSED(size_hint);
889 d->top->end_ofs = UINT64_MAX;
890 d->bufstart_ofs = 0;
891 d->call_len = 1;
892 d->callstack[0] = &halt;
893 d->pc = pc;
894 d->skip = 0;
895 return d;
896 }
897
upb_pbdecoder_end(void * closure,const void * handler_data)898 bool upb_pbdecoder_end(void *closure, const void *handler_data) {
899 upb_pbdecoder *d = closure;
900 const upb_pbdecodermethod *method = handler_data;
901 uint64_t end;
902 char dummy;
903
904 if (d->residual_end > d->residual) {
905 seterr(d, "Unexpected EOF: decoder still has buffered unparsed data");
906 return false;
907 }
908
909 if (d->skip) {
910 seterr(d, "Unexpected EOF inside skipped data");
911 return false;
912 }
913
914 if (d->top->end_ofs != UINT64_MAX) {
915 seterr(d, "Unexpected EOF inside delimited string");
916 return false;
917 }
918
919 /* The user's end() call indicates that the message ends here. */
920 end = offset(d);
921 d->top->end_ofs = end;
922
923 {
924 const uint32_t *p = d->pc;
925 d->stack->end_ofs = end;
926 /* Check the previous bytecode, but guard against beginning. */
927 if (p != method->code_base.ptr) p--;
928 if (getop(*p) == OP_CHECKDELIM) {
929 /* Rewind from OP_TAG* to OP_CHECKDELIM. */
930 UPB_ASSERT(getop(*d->pc) == OP_TAG1 ||
931 getop(*d->pc) == OP_TAG2 ||
932 getop(*d->pc) == OP_TAGN ||
933 getop(*d->pc) == OP_DISPATCH);
934 d->pc = p;
935 }
936 upb_pbdecoder_decode(closure, handler_data, &dummy, 0, NULL);
937 }
938
939 if (d->call_len != 0) {
940 seterr(d, "Unexpected EOF inside submessage or group");
941 return false;
942 }
943
944 return true;
945 }
946
upb_pbdecoder_decode(void * decoder,const void * group,const char * buf,size_t size,const upb_bufhandle * handle)947 size_t upb_pbdecoder_decode(void *decoder, const void *group, const char *buf,
948 size_t size, const upb_bufhandle *handle) {
949 int32_t result = upb_pbdecoder_resume(decoder, NULL, buf, size, handle);
950
951 if (result == DECODE_ENDGROUP) goto_endmsg(decoder);
952 CHECK_RETURN(result);
953
954 return run_decoder_vm(decoder, group, handle);
955 }
956
957
958 /* Public API *****************************************************************/
959
/* Returns the decoder to its initial parsing position: the frame stack is
 * rewound to its bottom frame (with group number 0) and every buffer cursor
 * is pointed at the start of the residual buffer, leaving it empty. */
void upb_pbdecoder_reset(upb_pbdecoder *d) {
  /* Empty residual buffer; current buffer, read position and end all
   * coincide with its start. */
  d->residual_end = d->residual;
  d->end = d->residual;
  d->buf = d->residual;
  d->ptr = d->residual;

  /* Back to the bottom frame. */
  d->top = d->stack;
  d->top->groupnum = 0;
}
968
upb_pbdecoder_create(upb_arena * a,const upb_pbdecodermethod * m,upb_sink sink,upb_status * status)969 upb_pbdecoder *upb_pbdecoder_create(upb_arena *a, const upb_pbdecodermethod *m,
970 upb_sink sink, upb_status *status) {
971 const size_t default_max_nesting = 64;
972
973 upb_pbdecoder *d = upb_arena_malloc(a, sizeof(upb_pbdecoder));
974 if (!d) return NULL;
975
976 d->method_ = m;
977 d->callstack = upb_arena_malloc(a, callstacksize(d, default_max_nesting));
978 d->stack = upb_arena_malloc(a, stacksize(d, default_max_nesting));
979 if (!d->stack || !d->callstack) {
980 return NULL;
981 }
982
983 d->arena = a;
984 d->limit = d->stack + default_max_nesting - 1;
985 d->stack_size = default_max_nesting;
986 d->status = status;
987
988 upb_pbdecoder_reset(d);
989 upb_bytessink_reset(&d->input_, &m->input_handler_, d);
990
991 if (d->method_->dest_handlers_) {
992 if (sink.handlers != d->method_->dest_handlers_)
993 return NULL;
994 }
995 d->top->sink = sink;
996
997 return d;
998 }
999
/* Reports the decoder's current offset, as computed by offset(). */
uint64_t upb_pbdecoder_bytesparsed(const upb_pbdecoder *d) {
  const uint64_t parsed = offset(d);
  return parsed;
}
1003
upb_pbdecoder_method(const upb_pbdecoder * d)1004 const upb_pbdecodermethod *upb_pbdecoder_method(const upb_pbdecoder *d) {
1005 return d->method_;
1006 }
1007
upb_pbdecoder_input(upb_pbdecoder * d)1008 upb_bytessink upb_pbdecoder_input(upb_pbdecoder *d) {
1009 return d->input_;
1010 }
1011
upb_pbdecoder_maxnesting(const upb_pbdecoder * d)1012 size_t upb_pbdecoder_maxnesting(const upb_pbdecoder *d) {
1013 return d->stack_size;
1014 }
1015
/* Changes the maximum nesting depth of the decoder.
 *
 *   d:   the decoder to adjust.
 *   max: the new maximum nesting depth.
 *
 * Returns false, leaving the limit unchanged, when `max` is smaller than the
 * depth the decoder is currently at or when growing the stacks fails.
 * Returns true once the new limit is in effect.
 *
 * When `max` exceeds the allocated stack size, both the frame stack and the
 * call stack are reallocated from the decoder's arena.  Reallocation may move
 * the frame stack, so every pointer into it (d->top and d->limit) is rebased
 * onto the new block; otherwise they would keep referring to the old
 * storage. */
bool upb_pbdecoder_setmaxnesting(upb_pbdecoder *d, size_t max) {
  size_t depth;

  UPB_ASSERT(d->top >= d->stack);
  depth = (size_t)(d->top - d->stack);

  if (max < depth) {
    /* Can't set a limit smaller than what we are currently at. */
    return false;
  }

  if (max > d->stack_size) {
    /* Need to reallocate stack and callstack to accommodate. */
    const ptrdiff_t limit_ofs = d->limit - d->stack;
    size_t old_size = stacksize(d, d->stack_size);
    size_t new_size = stacksize(d, max);
    void *p = upb_arena_realloc(d->arena, d->stack, old_size, new_size);
    if (!p) {
      return false;
    }
    d->stack = p;

    /* Re-point the interior pointers at the (possibly relocated) stack right
     * away, so the decoder stays consistent even if the call stack growth
     * below fails and we return early. */
    d->top = d->stack + depth;
    d->limit = d->stack + limit_ofs;

    old_size = callstacksize(d, d->stack_size);
    new_size = callstacksize(d, max);
    p = upb_arena_realloc(d->arena, d->callstack, old_size, new_size);
    if (!p) {
      return false;
    }
    d->callstack = p;

    d->stack_size = max;
  }

  d->limit = d->stack + max - 1;
  return true;
}
1048