• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2 ** upb::Decoder (Bytecode Decoder VM)
3 **
4 ** Bytecode must previously have been generated using the bytecode compiler in
5 ** compile_decoder.c.  This decoder then walks through the bytecode op-by-op to
6 ** parse the input.
7 **
8 ** Decoding is fully resumable; we just keep a pointer to the current bytecode
9 ** instruction and resume from there.  A fair amount of the logic here is to
10 ** handle the fact that values can span buffer seams and we have to be
11 ** capable of suspending/resuming from any byte in the stream.  This
12 ** sometimes requires keeping a few trailing bytes from the last buffer around
13 ** in the "residual" buffer.
14 */
15 
16 #include <inttypes.h>
17 #include <stddef.h>
18 #include "upb/pb/decoder.int.h"
19 #include "upb/pb/varint.int.h"
20 
21 #ifdef UPB_DUMP_BYTECODE
22 #include <stdio.h>
23 #endif
24 
25 #include "upb/port_def.inc"
26 
/* Evaluates x; if it is false, returns upb_pbdecoder_suspend(d) from the
 * enclosing function.  Requires a variable named "d" in scope.
 *
 * The expansion is wrapped in do/while(0) so that it behaves as exactly one
 * statement: it is safe inside an unbraced if/else and the call site's
 * trailing semicolon no longer produces a stray empty statement. */
#define CHECK_SUSPEND(x) \
  do { \
    if (!(x)) return upb_pbdecoder_suspend(d); \
  } while (0)
28 
29 /* Error messages that are shared between the bytecode and JIT decoders. */
30 const char *kPbDecoderStackOverflow = "Nesting too deep.";
31 const char *kPbDecoderSubmessageTooLong =
32     "Submessage end extends past enclosing submessage.";
33 
34 /* Error messages shared within this file. */
35 static const char *kUnterminatedVarint = "Unterminated varint.";
36 
37 /* upb_pbdecoder **************************************************************/
38 
39 static opcode halt = OP_HALT;
40 
41 /* A dummy character we can point to when the user passes us a NULL buffer.
42  * We need this because in C (NULL + 0) and (NULL - NULL) are undefined
43  * behavior, which would invalidate functions like curbufleft(). */
44 static const char dummy_char;
45 
46 /* Whether an op consumes any of the input buffer. */
consumes_input(opcode op)47 static bool consumes_input(opcode op) {
48   switch (op) {
49     case OP_SETDISPATCH:
50     case OP_STARTMSG:
51     case OP_ENDMSG:
52     case OP_STARTSEQ:
53     case OP_ENDSEQ:
54     case OP_STARTSUBMSG:
55     case OP_ENDSUBMSG:
56     case OP_STARTSTR:
57     case OP_ENDSTR:
58     case OP_PUSHTAGDELIM:
59     case OP_POP:
60     case OP_SETDELIM:
61     case OP_SETBIGGROUPNUM:
62     case OP_CHECKDELIM:
63     case OP_CALL:
64     case OP_RET:
65     case OP_BRANCH:
66       return false;
67     default:
68       return true;
69   }
70 }
71 
/* Returns the size in bytes of "entries" decoder stack frames.  The decoder
 * argument is not used. */
static size_t stacksize(upb_pbdecoder *d, size_t entries) {
  const size_t frame_bytes = sizeof(upb_pbdecoder_frame);
  UPB_UNUSED(d);
  return entries * frame_bytes;
}
76 
/* Returns the size in bytes of "entries" call-stack slots, each of which is a
 * uint32_t pointer.  The decoder argument is not used. */
static size_t callstacksize(upb_pbdecoder *d, size_t entries) {
  const size_t slot_bytes = sizeof(uint32_t*);
  UPB_UNUSED(d);
  return entries * slot_bytes;
}
82 
83 
84 static bool in_residual_buf(const upb_pbdecoder *d, const char *p);
85 
86 /* It's unfortunate that we have to micro-manage the compiler with
87  * UPB_FORCEINLINE and UPB_NOINLINE, especially since this tuning is necessarily
88  * specific to one hardware configuration.  But empirically on a Core i7,
89  * performance increases 30-50% with these annotations.  Every instance where
90  * these appear, gcc 4.2.1 made the wrong decision and degraded performance in
91  * benchmarks. */
92 
/* Stores "msg" as the error message on the decoder's status object. */
static void seterr(upb_pbdecoder *d, const char *msg) {
  upb_status_seterrmsg(d->status, msg);
}
96 
/* Externally visible way to store "msg" as the error message on the decoder's
 * status object. */
void upb_pbdecoder_seterr(upb_pbdecoder *d, const char *msg) {
  upb_status_seterrmsg(d->status, msg);
}
100 
101 
102 /* Buffering ******************************************************************/
103 
104 /* We operate on one buffer at a time, which is either the user's buffer passed
105  * to our "decode" callback or some residual bytes from the previous buffer. */
106 
107 /* How many bytes can be safely read from d->ptr without reading past end-of-buf
108  * or past the current delimited end. */
curbufleft(const upb_pbdecoder * d)109 static size_t curbufleft(const upb_pbdecoder *d) {
110   UPB_ASSERT(d->data_end >= d->ptr);
111   return d->data_end - d->ptr;
112 }
113 
114 /* How many bytes are available before end-of-buffer. */
bufleft(const upb_pbdecoder * d)115 static size_t bufleft(const upb_pbdecoder *d) {
116   return d->end - d->ptr;
117 }
118 
119 /* Overall stream offset of d->ptr. */
offset(const upb_pbdecoder * d)120 uint64_t offset(const upb_pbdecoder *d) {
121   return d->bufstart_ofs + (d->ptr - d->buf);
122 }
123 
124 /* How many bytes are available before the end of this delimited region. */
delim_remaining(const upb_pbdecoder * d)125 size_t delim_remaining(const upb_pbdecoder *d) {
126   return d->top->end_ofs - offset(d);
127 }
128 
129 /* Advances d->ptr. */
advance(upb_pbdecoder * d,size_t len)130 static void advance(upb_pbdecoder *d, size_t len) {
131   UPB_ASSERT(curbufleft(d) >= len);
132   d->ptr += len;
133 }
134 
/* Returns true if p lies within [buf, end].  Note that the range is inclusive
 * of "end" itself. */
static bool in_buf(const char *p, const char *buf, const char *end) {
  if (p < buf) {
    return false;
  }
  return p <= end;
}
138 
in_residual_buf(const upb_pbdecoder * d,const char * p)139 static bool in_residual_buf(const upb_pbdecoder *d, const char *p) {
140   return in_buf(p, d->residual, d->residual_end);
141 }
142 
143 /* Calculates the delim_end value, which is affected by both the current buffer
144  * and the parsing stack, so must be called whenever either is updated. */
set_delim_end(upb_pbdecoder * d)145 static void set_delim_end(upb_pbdecoder *d) {
146   size_t delim_ofs = d->top->end_ofs - d->bufstart_ofs;
147   if (delim_ofs <= (size_t)(d->end - d->buf)) {
148     d->delim_end = d->buf + delim_ofs;
149     d->data_end = d->delim_end;
150   } else {
151     d->data_end = d->end;
152     d->delim_end = NULL;
153   }
154 }
155 
/* Makes [buf, end) the current buffer, positions d->ptr at its start, and
 * recomputes the delimited end for the new buffer. */
static void switchtobuf(upb_pbdecoder *d, const char *buf, const char *end) {
  d->buf = buf;
  d->end = end;
  d->ptr = buf;
  set_delim_end(d);
}
162 
/* Moves from the (fully consumed) current buffer to the buffer [buf, buf+len),
 * advancing the stream offset of the buffer start by the length of the buffer
 * being left behind. */
static void advancetobuf(upb_pbdecoder *d, const char *buf, size_t len) {
  const char *buf_end = buf + len;
  UPB_ASSERT(curbufleft(d) == 0);
  d->bufstart_ofs += (d->end - d->buf);
  switchtobuf(d, buf, buf_end);
}
168 
/* Records the current read position as the decoder's checkpoint. */
static void checkpoint(upb_pbdecoder *d) {
  /* This assertion is about efficiency rather than correctness: it catches
   * callers that checkpoint() again without having moved d->ptr. */
  UPB_ASSERT(d->ptr != d->checkpoint);
  d->checkpoint = d->ptr;
}
176 
177 /* Skips "bytes" bytes in the stream, which may be more than available.  If we
178  * skip more bytes than are available, we return a long read count to the caller
179  * indicating how many bytes can be skipped over before passing actual data
180  * again.  Skipped bytes can pass a NULL buffer and the decoder guarantees they
181  * won't actually be read.
182  */
skip(upb_pbdecoder * d,size_t bytes)183 static int32_t skip(upb_pbdecoder *d, size_t bytes) {
184   UPB_ASSERT(!in_residual_buf(d, d->ptr) || d->size_param == 0);
185   UPB_ASSERT(d->skip == 0);
186   if (bytes > delim_remaining(d)) {
187     seterr(d, "Skipped value extended beyond enclosing submessage.");
188     return (int32_t)upb_pbdecoder_suspend(d);
189   } else if (bufleft(d) >= bytes) {
190     /* Skipped data is all in current buffer, and more is still available. */
191     advance(d, bytes);
192     d->skip = 0;
193     return DECODE_OK;
194   } else {
195     /* Skipped data extends beyond currently available buffers. */
196     d->pc = d->last;
197     d->skip = bytes - curbufleft(d);
198     d->bufstart_ofs += (d->end - d->buf);
199     d->residual_end = d->residual;
200     switchtobuf(d, d->residual, d->residual_end);
201     return (int32_t)(d->size_param + d->skip);
202   }
203 }
204 
205 
206 /* Resumes the decoder from an initial state or from a previous suspend. */
upb_pbdecoder_resume(upb_pbdecoder * d,void * p,const char * buf,size_t size,const upb_bufhandle * handle)207 int32_t upb_pbdecoder_resume(upb_pbdecoder *d, void *p, const char *buf,
208                              size_t size, const upb_bufhandle *handle) {
209   UPB_UNUSED(p);  /* Useless; just for the benefit of the JIT. */
210 
211   /* d->skip and d->residual_end could probably elegantly be represented
212    * as a single variable, to more easily represent this invariant. */
213   UPB_ASSERT(!(d->skip && d->residual_end > d->residual));
214 
215   /* We need to remember the original size_param, so that the value we return
216    * is relative to it, even if we do some skipping first. */
217   d->size_param = size;
218   d->handle = handle;
219 
220   /* Have to handle this case specially (ie. not with skip()) because the user
221    * is allowed to pass a NULL buffer here, which won't allow us to safely
222    * calculate a d->end or use our normal functions like curbufleft(). */
223   if (d->skip && d->skip >= size) {
224     d->skip -= size;
225     d->bufstart_ofs += size;
226     buf = &dummy_char;
227     size = 0;
228 
229     /* We can't just return now, because we might need to execute some ops
230      * like CHECKDELIM, which could call some callbacks and pop the stack. */
231   }
232 
233   /* We need to pretend that this was the actual buffer param, since some of the
234    * calculations assume that d->ptr/d->buf is relative to this. */
235   d->buf_param = buf;
236 
237   if (!buf) {
238     /* NULL buf is ok if its entire span is covered by the "skip" above, but
239      * by this point we know that "skip" doesn't cover the buffer. */
240     seterr(d, "Passed NULL buffer over non-skippable region.");
241     return (int32_t)upb_pbdecoder_suspend(d);
242   }
243 
244   if (d->residual_end > d->residual) {
245     /* We have residual bytes from the last buffer. */
246     UPB_ASSERT(d->ptr == d->residual);
247   } else {
248     switchtobuf(d, buf, buf + size);
249   }
250 
251   d->checkpoint = d->ptr;
252 
253   /* Handle skips that don't cover the whole buffer (as above). */
254   if (d->skip) {
255     size_t skip_bytes = d->skip;
256     d->skip = 0;
257     CHECK_RETURN(skip(d, skip_bytes));
258     checkpoint(d);
259   }
260 
261   /* If we're inside an unknown group, continue to parse unknown values. */
262   if (d->top->groupnum < 0) {
263     CHECK_RETURN(upb_pbdecoder_skipunknown(d, -1, 0));
264     checkpoint(d);
265   }
266 
267   return DECODE_OK;
268 }
269 
270 /* Suspends the decoder at the last checkpoint, without saving any residual
271  * bytes.  If there are any unconsumed bytes, returns a short byte count. */
upb_pbdecoder_suspend(upb_pbdecoder * d)272 size_t upb_pbdecoder_suspend(upb_pbdecoder *d) {
273   d->pc = d->last;
274   if (d->checkpoint == d->residual) {
275     /* Checkpoint was in residual buf; no user bytes were consumed. */
276     d->ptr = d->residual;
277     return 0;
278   } else {
279     size_t ret = d->size_param - (d->end - d->checkpoint);
280     UPB_ASSERT(!in_residual_buf(d, d->checkpoint));
281     UPB_ASSERT(d->buf == d->buf_param || d->buf == &dummy_char);
282 
283     d->bufstart_ofs += (d->checkpoint - d->buf);
284     d->residual_end = d->residual;
285     switchtobuf(d, d->residual, d->residual_end);
286     return ret;
287   }
288 }
289 
290 /* Suspends the decoder at the last checkpoint, and saves any unconsumed
291  * bytes in our residual buffer.  This is necessary if we need more user
292  * bytes to form a complete value, which might not be contiguous in the
293  * user's buffers.  Always consumes all user bytes. */
suspend_save(upb_pbdecoder * d)294 static size_t suspend_save(upb_pbdecoder *d) {
295   /* We hit end-of-buffer before we could parse a full value.
296    * Save any unconsumed bytes (if any) to the residual buffer. */
297   d->pc = d->last;
298 
299   if (d->checkpoint == d->residual) {
300     /* Checkpoint was in residual buf; append user byte(s) to residual buf. */
301     UPB_ASSERT((d->residual_end - d->residual) + d->size_param <=
302            sizeof(d->residual));
303     if (!in_residual_buf(d, d->ptr)) {
304       d->bufstart_ofs -= (d->residual_end - d->residual);
305     }
306     memcpy(d->residual_end, d->buf_param, d->size_param);
307     d->residual_end += d->size_param;
308   } else {
309     /* Checkpoint was in user buf; old residual bytes not needed. */
310     size_t save;
311     UPB_ASSERT(!in_residual_buf(d, d->checkpoint));
312 
313     d->ptr = d->checkpoint;
314     save = curbufleft(d);
315     UPB_ASSERT(save <= sizeof(d->residual));
316     memcpy(d->residual, d->ptr, save);
317     d->residual_end = d->residual + save;
318     d->bufstart_ofs = offset(d);
319   }
320 
321   switchtobuf(d, d->residual, d->residual_end);
322   return d->size_param;
323 }
324 
325 /* Copies the next "bytes" bytes into "buf" and advances the stream.
326  * Requires that this many bytes are available in the current buffer. */
consumebytes(upb_pbdecoder * d,void * buf,size_t bytes)327 UPB_FORCEINLINE static void consumebytes(upb_pbdecoder *d, void *buf,
328                                          size_t bytes) {
329   UPB_ASSERT(bytes <= curbufleft(d));
330   memcpy(buf, d->ptr, bytes);
331   advance(d, bytes);
332 }
333 
334 /* Slow path for getting the next "bytes" bytes, regardless of whether they are
335  * available in the current buffer or not.  Returns a status code as described
336  * in decoder.int.h. */
getbytes_slow(upb_pbdecoder * d,void * buf,size_t bytes)337 UPB_NOINLINE static int32_t getbytes_slow(upb_pbdecoder *d, void *buf,
338                                           size_t bytes) {
339   const size_t avail = curbufleft(d);
340   consumebytes(d, buf, avail);
341   bytes -= avail;
342   UPB_ASSERT(bytes > 0);
343   if (in_residual_buf(d, d->ptr)) {
344     advancetobuf(d, d->buf_param, d->size_param);
345   }
346   if (curbufleft(d) >= bytes) {
347     consumebytes(d, (char *)buf + avail, bytes);
348     return DECODE_OK;
349   } else if (d->data_end == d->delim_end) {
350     seterr(d, "Submessage ended in the middle of a value or group");
351     return (int32_t)upb_pbdecoder_suspend(d);
352   } else {
353     return (int32_t)suspend_save(d);
354   }
355 }
356 
357 /* Gets the next "bytes" bytes, regardless of whether they are available in the
358  * current buffer or not.  Returns a status code as described in decoder.int.h.
359  */
getbytes(upb_pbdecoder * d,void * buf,size_t bytes)360 UPB_FORCEINLINE static int32_t getbytes(upb_pbdecoder *d, void *buf,
361                                         size_t bytes) {
362   if (curbufleft(d) >= bytes) {
363     /* Buffer has enough data to satisfy. */
364     consumebytes(d, buf, bytes);
365     return DECODE_OK;
366   } else {
367     return getbytes_slow(d, buf, bytes);
368   }
369 }
370 
peekbytes_slow(upb_pbdecoder * d,void * buf,size_t bytes)371 UPB_NOINLINE static size_t peekbytes_slow(upb_pbdecoder *d, void *buf,
372                                           size_t bytes) {
373   size_t ret = curbufleft(d);
374   memcpy(buf, d->ptr, ret);
375   if (in_residual_buf(d, d->ptr)) {
376     size_t copy = UPB_MIN(bytes - ret, d->size_param);
377     memcpy((char *)buf + ret, d->buf_param, copy);
378     ret += copy;
379   }
380   return ret;
381 }
382 
peekbytes(upb_pbdecoder * d,void * buf,size_t bytes)383 UPB_FORCEINLINE static size_t peekbytes(upb_pbdecoder *d, void *buf,
384                                         size_t bytes) {
385   if (curbufleft(d) >= bytes) {
386     memcpy(buf, d->ptr, bytes);
387     return bytes;
388   } else {
389     return peekbytes_slow(d, buf, bytes);
390   }
391 }
392 
393 
394 /* Decoding of wire types *****************************************************/
395 
396 /* Slow path for decoding a varint from the current buffer position.
397  * Returns a status code as described in decoder.int.h. */
upb_pbdecoder_decode_varint_slow(upb_pbdecoder * d,uint64_t * u64)398 UPB_NOINLINE int32_t upb_pbdecoder_decode_varint_slow(upb_pbdecoder *d,
399                                                       uint64_t *u64) {
400   uint8_t byte = 0x80;
401   int bitpos;
402   *u64 = 0;
403   for(bitpos = 0; bitpos < 70 && (byte & 0x80); bitpos += 7) {
404     CHECK_RETURN(getbytes(d, &byte, 1));
405     *u64 |= (uint64_t)(byte & 0x7F) << bitpos;
406   }
407   if(bitpos == 70 && (byte & 0x80)) {
408     seterr(d, kUnterminatedVarint);
409     return (int32_t)upb_pbdecoder_suspend(d);
410   }
411   return DECODE_OK;
412 }
413 
414 /* Decodes a varint from the current buffer position.
415  * Returns a status code as described in decoder.int.h. */
decode_varint(upb_pbdecoder * d,uint64_t * u64)416 UPB_FORCEINLINE static int32_t decode_varint(upb_pbdecoder *d, uint64_t *u64) {
417   if (curbufleft(d) > 0 && !(*d->ptr & 0x80)) {
418     *u64 = *d->ptr;
419     advance(d, 1);
420     return DECODE_OK;
421   } else if (curbufleft(d) >= 10) {
422     /* Fast case. */
423     upb_decoderet r = upb_vdecode_fast(d->ptr);
424     if (r.p == NULL) {
425       seterr(d, kUnterminatedVarint);
426       return (int32_t)upb_pbdecoder_suspend(d);
427     }
428     advance(d, r.p - d->ptr);
429     *u64 = r.val;
430     return DECODE_OK;
431   } else {
432     /* Slow case -- varint spans buffer seam. */
433     return upb_pbdecoder_decode_varint_slow(d, u64);
434   }
435 }
436 
437 /* Decodes a 32-bit varint from the current buffer position.
438  * Returns a status code as described in decoder.int.h. */
decode_v32(upb_pbdecoder * d,uint32_t * u32)439 UPB_FORCEINLINE static int32_t decode_v32(upb_pbdecoder *d, uint32_t *u32) {
440   uint64_t u64;
441   int32_t ret = decode_varint(d, &u64);
442   if (ret >= 0) return ret;
443   if (u64 > UINT32_MAX) {
444     seterr(d, "Unterminated 32-bit varint");
445     /* TODO(haberman) guarantee that this function return is >= 0 somehow,
446      * so we know this path will always be treated as error by our caller.
447      * Right now the size_t -> int32_t can overflow and produce negative values.
448      */
449     *u32 = 0;
450     return (int32_t)upb_pbdecoder_suspend(d);
451   }
452   *u32 = (uint32_t)u64;
453   return DECODE_OK;
454 }
455 
456 /* Decodes a fixed32 from the current buffer position.
457  * Returns a status code as described in decoder.int.h.
458  * TODO: proper byte swapping for big-endian machines. */
decode_fixed32(upb_pbdecoder * d,uint32_t * u32)459 UPB_FORCEINLINE static int32_t decode_fixed32(upb_pbdecoder *d, uint32_t *u32) {
460   return getbytes(d, u32, 4);
461 }
462 
463 /* Decodes a fixed64 from the current buffer position.
464  * Returns a status code as described in decoder.int.h.
465  * TODO: proper byte swapping for big-endian machines. */
decode_fixed64(upb_pbdecoder * d,uint64_t * u64)466 UPB_FORCEINLINE static int32_t decode_fixed64(upb_pbdecoder *d, uint64_t *u64) {
467   return getbytes(d, u64, 8);
468 }
469 
470 /* Non-static versions of the above functions.
471  * These are called by the JIT for fallback paths. */
/* Externally visible fixed32 decode; forwards to decode_fixed32(). */
int32_t upb_pbdecoder_decode_f32(upb_pbdecoder *d, uint32_t *u32) {
  const int32_t status = decode_fixed32(d, u32);
  return status;
}
475 
/* Externally visible fixed64 decode; forwards to decode_fixed64(). */
int32_t upb_pbdecoder_decode_f64(upb_pbdecoder *d, uint64_t *u64) {
  const int32_t status = decode_fixed64(d, u64);
  return status;
}
479 
/* Reinterprets the bits of n as a double (bytewise copy, no numeric
 * conversion). */
static double as_double(uint64_t n) {
  double result;
  memcpy(&result, &n, sizeof n);
  return result;
}
/* Reinterprets the bits of n as a float (bytewise copy, no numeric
 * conversion). */
static float as_float(uint32_t n) {
  float result;
  memcpy(&result, &n, sizeof n);
  return result;
}
482 
483 /* Pushes a frame onto the decoder stack. */
decoder_push(upb_pbdecoder * d,uint64_t end)484 static bool decoder_push(upb_pbdecoder *d, uint64_t end) {
485   upb_pbdecoder_frame *fr = d->top;
486 
487   if (end > fr->end_ofs) {
488     seterr(d, kPbDecoderSubmessageTooLong);
489     return false;
490   } else if (fr == d->limit) {
491     seterr(d, kPbDecoderStackOverflow);
492     return false;
493   }
494 
495   fr++;
496   fr->end_ofs = end;
497   fr->dispatch = NULL;
498   fr->groupnum = 0;
499   d->top = fr;
500   return true;
501 }
502 
/* Pushes a frame for a tag-delimited region and sets its group number to
 * "arg".  Returns false if the push fails. */
static bool pushtagdelim(upb_pbdecoder *d, uint32_t arg) {
  /* We expect an "end" tag (ENDGROUP or a non-sequence field number) before
   * reaching the end of any enclosing submessage.  Reusing the existing delim
   * end for the new frame still stops us from parsing on through a corrupt
   * proto that never supplies that END tag in time. */
  const bool pushed = decoder_push(d, d->top->end_ofs);
  if (pushed) {
    d->top->groupnum = arg;
  }
  return pushed;
}
513 
514 /* Pops a frame from the decoder stack. */
decoder_pop(upb_pbdecoder * d)515 static void decoder_pop(upb_pbdecoder *d) { d->top--; }
516 
upb_pbdecoder_checktag_slow(upb_pbdecoder * d,uint64_t expected)517 UPB_NOINLINE int32_t upb_pbdecoder_checktag_slow(upb_pbdecoder *d,
518                                                  uint64_t expected) {
519   uint64_t data = 0;
520   size_t bytes = upb_value_size(expected);
521   size_t read = peekbytes(d, &data, bytes);
522   if (read == bytes && data == expected) {
523     /* Advance past matched bytes. */
524     int32_t ok = getbytes(d, &data, read);
525     UPB_ASSERT(ok < 0);
526     return DECODE_OK;
527   } else if (read < bytes && memcmp(&data, &expected, read) == 0) {
528     return (int32_t)suspend_save(d);
529   } else {
530     return DECODE_MISMATCH;
531   }
532 }
533 
/* Skips over an unknown field.
 *
 * If "fieldnum" is non-negative, the tag has already been parsed and is given
 * by fieldnum/wire_type; otherwise a tag is decoded from the stream first.
 * While the top frame is an unknown group (negative groupnum), keeps looping
 * over the fields inside it.  Returns DECODE_OK once a value has been skipped
 * outside of any unknown group, DECODE_ENDGROUP when an ENDGROUP tag matching
 * the top frame's group number is seen, or the status of whichever decode,
 * skip or suspend step stopped us. */
int32_t upb_pbdecoder_skipunknown(upb_pbdecoder *d, int32_t fieldnum,
                                  uint8_t wire_type) {
  /* The caller supplied the tag: jump straight into the loop body. */
  if (fieldnum >= 0) {
    goto have_tag;
  }

  for (;;) {
    uint32_t tag;
    CHECK_RETURN(decode_v32(d, &tag));
    wire_type = tag & 0x7;
    fieldnum = tag >> 3;

have_tag:
    if (fieldnum == 0) {
      seterr(d, "Saw invalid field number (0)");
      return (int32_t)upb_pbdecoder_suspend(d);
    }

    switch (wire_type) {
      case UPB_WIRE_TYPE_32BIT:
        CHECK_RETURN(skip(d, 4));
        break;
      case UPB_WIRE_TYPE_64BIT:
        CHECK_RETURN(skip(d, 8));
        break;
      case UPB_WIRE_TYPE_VARINT: {
        uint64_t ignored;
        CHECK_RETURN(decode_varint(d, &ignored));
        break;
      }
      case UPB_WIRE_TYPE_DELIMITED: {
        uint32_t payload_len;
        CHECK_RETURN(decode_v32(d, &payload_len));
        CHECK_RETURN(skip(d, payload_len));
        break;
      }
      case UPB_WIRE_TYPE_START_GROUP:
        /* Unknown groups are tracked with a negated group number. */
        if (!pushtagdelim(d, -fieldnum)) {
          return (int32_t)upb_pbdecoder_suspend(d);
        }
        break;
      case UPB_WIRE_TYPE_END_GROUP:
        if (fieldnum == -d->top->groupnum) {
          /* Closes the unknown group on top of the stack. */
          decoder_pop(d);
        } else if (fieldnum == d->top->groupnum) {
          return DECODE_ENDGROUP;
        } else {
          seterr(d, "Unmatched ENDGROUP tag.");
          return (int32_t)upb_pbdecoder_suspend(d);
        }
        break;
      default:
        seterr(d, "Invalid wire type");
        return (int32_t)upb_pbdecoder_suspend(d);
    }

    if (d->top->groupnum >= 0) {
      /* TODO: More code needed for handling unknown groups. */
      upb_sink_putunknown(d->top->sink, d->checkpoint, d->ptr - d->checkpoint);
      return DECODE_OK;
    }

    /* Still inside an unknown group: keep looping over its fields. */
    checkpoint(d);
  }
}
599 
/* Points d->pc at the bytecode location stored under DISPATCH_ENDMSG in the
 * top frame's dispatch table (an offset from the frame's base).  The entry is
 * expected to exist (asserted). */
static void goto_endmsg(upb_pbdecoder *d) {
  upb_value entry;
  bool found = upb_inttable_lookup32(d->top->dispatch, DISPATCH_ENDMSG, &entry);
  UPB_ASSERT(found);
  d->pc = d->top->base + upb_value_getuint64(entry);
}
606 
607 /* Parses a tag and jumps to the corresponding bytecode instruction for this
608  * field.
609  *
610  * If the tag is unknown (or the wire type doesn't match), parses the field as
611  * unknown.  If the tag is a valid ENDGROUP tag, jumps to the bytecode
612  * instruction for the end of message. */
dispatch(upb_pbdecoder * d)613 static int32_t dispatch(upb_pbdecoder *d) {
614   upb_inttable *dispatch = d->top->dispatch;
615   uint32_t tag;
616   uint8_t wire_type;
617   uint32_t fieldnum;
618   upb_value val;
619   int32_t retval;
620 
621   /* Decode tag. */
622   CHECK_RETURN(decode_v32(d, &tag));
623   wire_type = tag & 0x7;
624   fieldnum = tag >> 3;
625 
626   /* Lookup tag.  Because of packed/non-packed compatibility, we have to
627    * check the wire type against two possibilities. */
628   if (fieldnum != DISPATCH_ENDMSG &&
629       upb_inttable_lookup32(dispatch, fieldnum, &val)) {
630     uint64_t v = upb_value_getuint64(val);
631     if (wire_type == (v & 0xff)) {
632       d->pc = d->top->base + (v >> 16);
633       return DECODE_OK;
634     } else if (wire_type == ((v >> 8) & 0xff)) {
635       bool found =
636           upb_inttable_lookup(dispatch, fieldnum + UPB_MAX_FIELDNUMBER, &val);
637       UPB_ASSERT(found);
638       d->pc = d->top->base + upb_value_getuint64(val);
639       return DECODE_OK;
640     }
641   }
642 
643   /* We have some unknown fields (or ENDGROUP) to parse.  The DISPATCH or TAG
644    * bytecode that triggered this is preceded by a CHECKDELIM bytecode which
645    * we need to back up to, so that when we're done skipping unknown data we
646    * can re-check the delimited end. */
647   d->last--;  /* Necessary if we get suspended */
648   d->pc = d->last;
649   UPB_ASSERT(getop(*d->last) == OP_CHECKDELIM);
650 
651   /* Unknown field or ENDGROUP. */
652   retval = upb_pbdecoder_skipunknown(d, fieldnum, wire_type);
653 
654   CHECK_RETURN(retval);
655 
656   if (retval == DECODE_ENDGROUP) {
657     goto_endmsg(d);
658     return DECODE_OK;
659   }
660 
661   return DECODE_OK;
662 }
663 
664 /* Callers know that the stack is more than one deep because the opcodes that
665  * call this only occur after PUSH operations. */
outer_frame(upb_pbdecoder * d)666 upb_pbdecoder_frame *outer_frame(upb_pbdecoder *d) {
667   UPB_ASSERT(d->top != d->stack);
668   return d->top - 1;
669 }
670 
671 
672 /* The main decoding loop *****************************************************/
673 
674 /* The main decoder VM function.  Uses traditional bytecode dispatch loop with a
675  * switch() statement. */
run_decoder_vm(upb_pbdecoder * d,const mgroup * group,const upb_bufhandle * handle)676 size_t run_decoder_vm(upb_pbdecoder *d, const mgroup *group,
677                       const upb_bufhandle* handle) {
678 
679 #define VMCASE(op, code) \
680   case op: { code; if (consumes_input(op)) checkpoint(d); break; }
681 #define PRIMITIVE_OP(type, wt, name, convfunc, ctype) \
682   VMCASE(OP_PARSE_ ## type, { \
683     ctype val; \
684     CHECK_RETURN(decode_ ## wt(d, &val)); \
685     upb_sink_put ## name(d->top->sink, arg, (convfunc)(val)); \
686   })
687 
688   while(1) {
689     int32_t instruction;
690     opcode op;
691     uint32_t arg;
692     int32_t longofs;
693 
694     d->last = d->pc;
695     instruction = *d->pc++;
696     op = getop(instruction);
697     arg = instruction >> 8;
698     longofs = arg;
699     UPB_ASSERT(d->ptr != d->residual_end);
700     UPB_UNUSED(group);
701 #ifdef UPB_DUMP_BYTECODE
702     fprintf(stderr, "s_ofs=%d buf_ofs=%d data_rem=%d buf_rem=%d delim_rem=%d "
703                     "%x %s (%d)\n",
704             (int)offset(d),
705             (int)(d->ptr - d->buf),
706             (int)(d->data_end - d->ptr),
707             (int)(d->end - d->ptr),
708             (int)((d->top->end_ofs - d->bufstart_ofs) - (d->ptr - d->buf)),
709             (int)(d->pc - 1 - group->bytecode),
710             upb_pbdecoder_getopname(op),
711             arg);
712 #endif
713     switch (op) {
714       /* Technically, we are losing data if we see a 32-bit varint that is not
715        * properly sign-extended.  We could detect this and error about the data
716        * loss, but proto2 does not do this, so we pass. */
717       PRIMITIVE_OP(INT32,    varint,  int32,  int32_t,      uint64_t)
718       PRIMITIVE_OP(INT64,    varint,  int64,  int64_t,      uint64_t)
719       PRIMITIVE_OP(UINT32,   varint,  uint32, uint32_t,     uint64_t)
720       PRIMITIVE_OP(UINT64,   varint,  uint64, uint64_t,     uint64_t)
721       PRIMITIVE_OP(FIXED32,  fixed32, uint32, uint32_t,     uint32_t)
722       PRIMITIVE_OP(FIXED64,  fixed64, uint64, uint64_t,     uint64_t)
723       PRIMITIVE_OP(SFIXED32, fixed32, int32,  int32_t,      uint32_t)
724       PRIMITIVE_OP(SFIXED64, fixed64, int64,  int64_t,      uint64_t)
725       PRIMITIVE_OP(BOOL,     varint,  bool,   bool,         uint64_t)
726       PRIMITIVE_OP(DOUBLE,   fixed64, double, as_double,    uint64_t)
727       PRIMITIVE_OP(FLOAT,    fixed32, float,  as_float,     uint32_t)
728       PRIMITIVE_OP(SINT32,   varint,  int32,  upb_zzdec_32, uint64_t)
729       PRIMITIVE_OP(SINT64,   varint,  int64,  upb_zzdec_64, uint64_t)
730 
731       VMCASE(OP_SETDISPATCH,
732         d->top->base = d->pc - 1;
733         memcpy(&d->top->dispatch, d->pc, sizeof(void*));
734         d->pc += sizeof(void*) / sizeof(uint32_t);
735       )
736       VMCASE(OP_STARTMSG,
737         CHECK_SUSPEND(upb_sink_startmsg(d->top->sink));
738       )
739       VMCASE(OP_ENDMSG,
740         CHECK_SUSPEND(upb_sink_endmsg(d->top->sink, d->status));
741       )
742       VMCASE(OP_STARTSEQ,
743         upb_pbdecoder_frame *outer = outer_frame(d);
744         CHECK_SUSPEND(upb_sink_startseq(outer->sink, arg, &d->top->sink));
745       )
746       VMCASE(OP_ENDSEQ,
747         CHECK_SUSPEND(upb_sink_endseq(d->top->sink, arg));
748       )
749       VMCASE(OP_STARTSUBMSG,
750         upb_pbdecoder_frame *outer = outer_frame(d);
751         CHECK_SUSPEND(upb_sink_startsubmsg(outer->sink, arg, &d->top->sink));
752       )
753       VMCASE(OP_ENDSUBMSG,
754         upb_sink subsink = (d->top + 1)->sink;
755         CHECK_SUSPEND(upb_sink_endsubmsg(d->top->sink, subsink, arg));
756       )
757       VMCASE(OP_STARTSTR,
758         uint32_t len = (uint32_t)delim_remaining(d);
759         upb_pbdecoder_frame *outer = outer_frame(d);
760         CHECK_SUSPEND(upb_sink_startstr(outer->sink, arg, len, &d->top->sink));
761         if (len == 0) {
762           d->pc++;  /* Skip OP_STRING. */
763         }
764       )
765       VMCASE(OP_STRING,
766         uint32_t len = (uint32_t)curbufleft(d);
767         size_t n = upb_sink_putstring(d->top->sink, arg, d->ptr, len, handle);
768         if (n > len) {
769           if (n > delim_remaining(d)) {
770             seterr(d, "Tried to skip past end of string.");
771             return upb_pbdecoder_suspend(d);
772           } else {
773             int32_t ret = skip(d, n);
774             /* This shouldn't return DECODE_OK, because n > len. */
775             UPB_ASSERT(ret >= 0);
776             return ret;
777           }
778         }
779         advance(d, n);
780         if (n < len || d->delim_end == NULL) {
781           /* We aren't finished with this string yet. */
782           d->pc--;  /* Repeat OP_STRING. */
783           if (n > 0) checkpoint(d);
784           return upb_pbdecoder_suspend(d);
785         }
786       )
787       VMCASE(OP_ENDSTR,
788         CHECK_SUSPEND(upb_sink_endstr(d->top->sink, arg));
789       )
790       VMCASE(OP_PUSHTAGDELIM,
791         CHECK_SUSPEND(pushtagdelim(d, arg));
792       )
793       VMCASE(OP_SETBIGGROUPNUM,
794         d->top->groupnum = *d->pc++;
795       )
796       VMCASE(OP_POP,
797         UPB_ASSERT(d->top > d->stack);
798         decoder_pop(d);
799       )
800       VMCASE(OP_PUSHLENDELIM,
801         uint32_t len;
802         CHECK_RETURN(decode_v32(d, &len));
803         CHECK_SUSPEND(decoder_push(d, offset(d) + len));
804         set_delim_end(d);
805       )
806       VMCASE(OP_SETDELIM,
807         set_delim_end(d);
808       )
809       VMCASE(OP_CHECKDELIM,
810         /* We are guaranteed of this assert because we never allow ourselves to
811          * consume bytes beyond data_end, which covers delim_end when non-NULL.
812          */
813         UPB_ASSERT(!(d->delim_end && d->ptr > d->delim_end));
814         if (d->ptr == d->delim_end)
815           d->pc += longofs;
816       )
817       VMCASE(OP_CALL,
818         d->callstack[d->call_len++] = d->pc;
819         d->pc += longofs;
820       )
821       VMCASE(OP_RET,
822         UPB_ASSERT(d->call_len > 0);
823         d->pc = d->callstack[--d->call_len];
824       )
825       VMCASE(OP_BRANCH,
826         d->pc += longofs;
827       )
828       VMCASE(OP_TAG1,
829         uint8_t expected;
830         CHECK_SUSPEND(curbufleft(d) > 0);
831         expected = (arg >> 8) & 0xff;
832         if (*d->ptr == expected) {
833           advance(d, 1);
834         } else {
835           int8_t shortofs;
836          badtag:
837           shortofs = arg;
838           if (shortofs == LABEL_DISPATCH) {
839             CHECK_RETURN(dispatch(d));
840           } else {
841             d->pc += shortofs;
842             break; /* Avoid checkpoint(). */
843           }
844         }
845       )
846       VMCASE(OP_TAG2,
847         uint16_t expected;
848         CHECK_SUSPEND(curbufleft(d) > 0);
849         expected = (arg >> 8) & 0xffff;
850         if (curbufleft(d) >= 2) {
851           uint16_t actual;
852           memcpy(&actual, d->ptr, 2);
853           if (expected == actual) {
854             advance(d, 2);
855           } else {
856             goto badtag;
857           }
858         } else {
859           int32_t result = upb_pbdecoder_checktag_slow(d, expected);
860           if (result == DECODE_MISMATCH) goto badtag;
861           if (result >= 0) return result;
862         }
863       )
864       VMCASE(OP_TAGN, {
865         uint64_t expected;
866         int32_t result;
867         memcpy(&expected, d->pc, 8);
868         d->pc += 2;
869         result = upb_pbdecoder_checktag_slow(d, expected);
870         if (result == DECODE_MISMATCH) goto badtag;
871         if (result >= 0) return result;
872       })
873       VMCASE(OP_DISPATCH, {
874         CHECK_RETURN(dispatch(d));
875       })
876       VMCASE(OP_HALT, {
877         return d->size_param;
878       })
879     }
880   }
881 }
882 
883 
884 /* BytesHandler handlers ******************************************************/
885 
upb_pbdecoder_startbc(void * closure,const void * pc,size_t size_hint)886 void *upb_pbdecoder_startbc(void *closure, const void *pc, size_t size_hint) {
887   upb_pbdecoder *d = closure;
888   UPB_UNUSED(size_hint);
889   d->top->end_ofs = UINT64_MAX;
890   d->bufstart_ofs = 0;
891   d->call_len = 1;
892   d->callstack[0] = &halt;
893   d->pc = pc;
894   d->skip = 0;
895   return d;
896 }
897 
upb_pbdecoder_end(void * closure,const void * handler_data)898 bool upb_pbdecoder_end(void *closure, const void *handler_data) {
899   upb_pbdecoder *d = closure;
900   const upb_pbdecodermethod *method = handler_data;
901   uint64_t end;
902   char dummy;
903 
904   if (d->residual_end > d->residual) {
905     seterr(d, "Unexpected EOF: decoder still has buffered unparsed data");
906     return false;
907   }
908 
909   if (d->skip) {
910     seterr(d, "Unexpected EOF inside skipped data");
911     return false;
912   }
913 
914   if (d->top->end_ofs != UINT64_MAX) {
915     seterr(d, "Unexpected EOF inside delimited string");
916     return false;
917   }
918 
919   /* The user's end() call indicates that the message ends here. */
920   end = offset(d);
921   d->top->end_ofs = end;
922 
923   {
924     const uint32_t *p = d->pc;
925     d->stack->end_ofs = end;
926     /* Check the previous bytecode, but guard against beginning. */
927     if (p != method->code_base.ptr) p--;
928     if (getop(*p) == OP_CHECKDELIM) {
929       /* Rewind from OP_TAG* to OP_CHECKDELIM. */
930       UPB_ASSERT(getop(*d->pc) == OP_TAG1 ||
931              getop(*d->pc) == OP_TAG2 ||
932              getop(*d->pc) == OP_TAGN ||
933              getop(*d->pc) == OP_DISPATCH);
934       d->pc = p;
935     }
936     upb_pbdecoder_decode(closure, handler_data, &dummy, 0, NULL);
937   }
938 
939   if (d->call_len != 0) {
940     seterr(d, "Unexpected EOF inside submessage or group");
941     return false;
942   }
943 
944   return true;
945 }
946 
upb_pbdecoder_decode(void * decoder,const void * group,const char * buf,size_t size,const upb_bufhandle * handle)947 size_t upb_pbdecoder_decode(void *decoder, const void *group, const char *buf,
948                             size_t size, const upb_bufhandle *handle) {
949   int32_t result = upb_pbdecoder_resume(decoder, NULL, buf, size, handle);
950 
951   if (result == DECODE_ENDGROUP) goto_endmsg(decoder);
952   CHECK_RETURN(result);
953 
954   return run_decoder_vm(decoder, group, handle);
955 }
956 
957 
958 /* Public API *****************************************************************/
959 
/* Returns the decoder to its initial buffer and frame-stack state. */
void upb_pbdecoder_reset(upb_pbdecoder *d) {
  /* Empty the residual buffer and aim every buffer cursor at its start. */
  d->residual_end = d->residual;
  d->end = d->residual;
  d->buf = d->residual;
  d->ptr = d->residual;

  /* Drop back to the bottom frame and clear its group number. */
  d->top = d->stack;
  d->top->groupnum = 0;
}
968 
upb_pbdecoder_create(upb_arena * a,const upb_pbdecodermethod * m,upb_sink sink,upb_status * status)969 upb_pbdecoder *upb_pbdecoder_create(upb_arena *a, const upb_pbdecodermethod *m,
970                                     upb_sink sink, upb_status *status) {
971   const size_t default_max_nesting = 64;
972 
973   upb_pbdecoder *d = upb_arena_malloc(a, sizeof(upb_pbdecoder));
974   if (!d) return NULL;
975 
976   d->method_ = m;
977   d->callstack = upb_arena_malloc(a, callstacksize(d, default_max_nesting));
978   d->stack = upb_arena_malloc(a, stacksize(d, default_max_nesting));
979   if (!d->stack || !d->callstack) {
980     return NULL;
981   }
982 
983   d->arena = a;
984   d->limit = d->stack + default_max_nesting - 1;
985   d->stack_size = default_max_nesting;
986   d->status = status;
987 
988   upb_pbdecoder_reset(d);
989   upb_bytessink_reset(&d->input_, &m->input_handler_, d);
990 
991   if (d->method_->dest_handlers_) {
992     if (sink.handlers != d->method_->dest_handlers_)
993       return NULL;
994   }
995   d->top->sink = sink;
996 
997   return d;
998 }
999 
/* Reports the decoder's current offset, as computed by offset(). */
uint64_t upb_pbdecoder_bytesparsed(const upb_pbdecoder *d) {
  const uint64_t parsed = offset(d);
  return parsed;
}
1003 
upb_pbdecoder_method(const upb_pbdecoder * d)1004 const upb_pbdecodermethod *upb_pbdecoder_method(const upb_pbdecoder *d) {
1005   return d->method_;
1006 }
1007 
upb_pbdecoder_input(upb_pbdecoder * d)1008 upb_bytessink upb_pbdecoder_input(upb_pbdecoder *d) {
1009   return d->input_;
1010 }
1011 
upb_pbdecoder_maxnesting(const upb_pbdecoder * d)1012 size_t upb_pbdecoder_maxnesting(const upb_pbdecoder *d) {
1013   return d->stack_size;
1014 }
1015 
/* Sets the maximum nesting depth to |max| frames, growing the frame stack and
 * the call stack when |max| exceeds the current capacity.
 *
 * Returns true on success.  Returns false, leaving the effective limit as it
 * was, if |max| is smaller than the current depth or if an allocation
 * fails. */
bool upb_pbdecoder_setmaxnesting(upb_pbdecoder *d, size_t max) {
  size_t depth;

  UPB_ASSERT(d->top >= d->stack);
  depth = (size_t)(d->top - d->stack);

  if (max < depth) {
    /* Can't set a limit smaller than what we are currently at.
     * NOTE(review): max == depth is accepted, which leaves d->limit one frame
     * below d->top; confirm the push path tolerates that. */
    return false;
  }

  if (max > d->stack_size) {
    /* Need to reallocate stack and callstack to accommodate. */
    ptrdiff_t limit_ofs = d->limit - d->stack;
    size_t old_size = stacksize(d, d->stack_size);
    size_t new_size = stacksize(d, max);
    void *p = upb_arena_realloc(d->arena, d->stack, old_size, new_size);
    if (!p) {
      return false;
    }
    d->stack = p;

    /* The reallocation may have moved the frames.  Rebase the pointers into
     * the stack right away so they stay valid even if the callstack
     * reallocation below fails and we return early. */
    d->top = d->stack + depth;
    d->limit = d->stack + limit_ofs;

    old_size = callstacksize(d, d->stack_size);
    new_size = callstacksize(d, max);
    p = upb_arena_realloc(d->arena, d->callstack, old_size, new_size);
    if (!p) {
      return false;
    }
    d->callstack = p;

    d->stack_size = max;
  }

  /* |limit| addresses the last frame permitted by the new maximum. */
  d->limit = d->stack + max - 1;
  return true;
}
1048