• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 /*
2  * Regular Expression Engine
3  *
4  * Copyright (c) 2017-2018 Fabrice Bellard
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to deal
8  * in the Software without restriction, including without limitation the rights
9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10  * copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22  * THE SOFTWARE.
23  */
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <inttypes.h>
28 #include <string.h>
29 #include <assert.h>
30 
31 #include "cutils.h"
32 #include "libregexp.h"
33 
34 /*
35   TODO:
36 
37   - Add full unicode canonicalize rules for character ranges (not
38     really useful but needed for exact "ignorecase" compatibility).
39 
40   - Add a lock step execution mode (=linear time execution guaranteed)
41     when the regular expression is "simple" i.e. no backreference nor
42     complicated lookahead. The opcodes are designed for this execution
43     model.
44 */
45 
46 #if defined(TEST)
47 #define DUMP_REOP
48 #endif
49 
50 typedef enum {
51 #define DEF(id, size) REOP_ ## id,
52 #include "libregexp-opcode.h"
53 #undef DEF
54     REOP_COUNT,
55 } REOPCodeEnum;
56 
57 #define CAPTURE_COUNT_MAX 255
58 #define STACK_SIZE_MAX 255
59 
60 /* unicode code points */
61 #define CP_LS   0x2028
62 #define CP_PS   0x2029
63 
64 #define TMP_BUF_SIZE 128
65 
66 typedef struct {
67     DynBuf byte_code;
68     const uint8_t *buf_ptr;
69     const uint8_t *buf_end;
70     const uint8_t *buf_start;
71     int re_flags;
72     BOOL is_utf16;
73     BOOL ignore_case;
74     BOOL dotall;
75     int capture_count;
76     int total_capture_count; /* -1 = not computed yet */
77     int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
78     void *opaque;
79     DynBuf group_names;
80     union {
81         char error_msg[TMP_BUF_SIZE];
82         char tmp_buf[TMP_BUF_SIZE];
83     } u;
84 } REParseState;
85 
86 typedef struct {
87 #ifdef DUMP_REOP
88     const char *name;
89 #endif
90     uint8_t size;
91 } REOpCode;
92 
93 static const REOpCode reopcode_info[REOP_COUNT] = {
94 #ifdef DUMP_REOP
95 #define DEF(id, size) { #id, size },
96 #else
97 #define DEF(id, size) { size },
98 #endif
99 #include "libregexp-opcode.h"
100 #undef DEF
101 };
102 
103 #define RE_HEADER_FLAGS         0
104 #define RE_HEADER_CAPTURE_COUNT 1
105 #define RE_HEADER_STACK_SIZE    2
106 
107 #define RE_HEADER_LEN 7
108 
is_digit(int c)109 static inline int is_digit(int c) {
110     return c >= '0' && c <= '9';
111 }
112 
113 /* insert 'len' bytes at position 'pos'. Return < 0 if error. */
dbuf_insert(DynBuf * s,int pos,int len)114 static int dbuf_insert(DynBuf *s, int pos, int len)
115 {
116     if (dbuf_realloc(s, s->size + len))
117         return -1;
118     memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
119     s->size += len;
120     return 0;
121 }
122 
123 /* canonicalize with the specific JS regexp rules */
lre_canonicalize(uint32_t c,BOOL is_utf16)124 static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
125 {
126     uint32_t res[LRE_CC_RES_LEN_MAX];
127     int len;
128     if (is_utf16) {
129         if (likely(c < 128)) {
130             if (c >= 'A' && c <= 'Z')
131                 c = c - 'A' + 'a';
132         } else {
133             lre_case_conv(res, c, 2);
134             c = res[0];
135         }
136     } else {
137         if (likely(c < 128)) {
138             if (c >= 'a' && c <= 'z')
139                 c = c - 'a' + 'A';
140         } else {
141             /* legacy regexp: to upper case if single char >= 128 */
142             len = lre_case_conv(res, c, FALSE);
143             if (len == 1 && res[0] >= 128)
144                 c = res[0];
145         }
146     }
147     return c;
148 }
149 
150 static const uint16_t char_range_d[] = {
151     1,
152     0x0030, 0x0039 + 1,
153 };
154 
155 /* code point ranges for Zs,Zl or Zp property */
156 static const uint16_t char_range_s[] = {
157     10,
158     0x0009, 0x000D + 1,
159     0x0020, 0x0020 + 1,
160     0x00A0, 0x00A0 + 1,
161     0x1680, 0x1680 + 1,
162     0x2000, 0x200A + 1,
163     /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
164     /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
165     0x2028, 0x2029 + 1,
166     0x202F, 0x202F + 1,
167     0x205F, 0x205F + 1,
168     0x3000, 0x3000 + 1,
169     /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
170     0xFEFF, 0xFEFF + 1,
171 };
172 
lre_is_space(int c)173 BOOL lre_is_space(int c)
174 {
175     int i, n, low, high;
176     n = (countof(char_range_s) - 1) / 2;
177     for(i = 0; i < n; i++) {
178         low = char_range_s[2 * i + 1];
179         if (c < low)
180             return FALSE;
181         high = char_range_s[2 * i + 2];
182         if (c < high)
183             return TRUE;
184     }
185     return FALSE;
186 }
187 
188 uint32_t const lre_id_start_table_ascii[4] = {
189     /* $ A-Z _ a-z */
190     0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
191 };
192 
193 uint32_t const lre_id_continue_table_ascii[4] = {
194     /* $ 0-9 A-Z _ a-z */
195     0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
196 };
197 
198 
199 static const uint16_t char_range_w[] = {
200     4,
201     0x0030, 0x0039 + 1,
202     0x0041, 0x005A + 1,
203     0x005F, 0x005F + 1,
204     0x0061, 0x007A + 1,
205 };
206 
207 #define CLASS_RANGE_BASE 0x40000000
208 
209 typedef enum {
210     CHAR_RANGE_d,
211     CHAR_RANGE_D,
212     CHAR_RANGE_s,
213     CHAR_RANGE_S,
214     CHAR_RANGE_w,
215     CHAR_RANGE_W,
216 } CharRangeEnum;
217 
218 static const uint16_t *char_range_table[] = {
219     char_range_d,
220     char_range_s,
221     char_range_w,
222 };
223 
cr_init_char_range(REParseState * s,CharRange * cr,uint32_t c)224 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
225 {
226     BOOL invert;
227     const uint16_t *c_pt;
228     int len, i;
229 
230     invert = c & 1;
231     c_pt = char_range_table[c >> 1];
232     len = *c_pt++;
233     cr_init(cr, s->opaque, lre_realloc);
234     for(i = 0; i < len * 2; i++) {
235         if (cr_add_point(cr, c_pt[i]))
236             goto fail;
237     }
238     if (invert) {
239         if (cr_invert(cr))
240             goto fail;
241     }
242     return 0;
243  fail:
244     cr_free(cr);
245     return -1;
246 }
247 
cr_canonicalize(CharRange * cr)248 static int cr_canonicalize(CharRange *cr)
249 {
250     CharRange a;
251     uint32_t pt[2];
252     int i, ret;
253 
254     cr_init(&a, cr->mem_opaque, lre_realloc);
255     pt[0] = 'a';
256     pt[1] = 'z' + 1;
257     ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
258     if (ret)
259         goto fail;
260     /* convert to upper case */
261     /* XXX: the generic unicode case would be much more complicated
262        and not really useful */
263     for(i = 0; i < a.len; i++) {
264         a.points[i] += 'A' - 'a';
265     }
266     /* Note: for simplicity we keep the lower case ranges */
267     ret = cr_union1(cr, a.points, a.len);
268  fail:
269     cr_free(&a);
270     return ret;
271 }
272 
273 #ifdef DUMP_REOP
lre_dump_bytecode(const uint8_t * buf,int buf_len)274 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
275                                                      int buf_len)
276 {
277     int pos, len, opcode, bc_len, re_flags, i;
278     uint32_t val;
279 
280     assert(buf_len >= RE_HEADER_LEN);
281 
282     re_flags=  buf[0];
283     bc_len = get_u32(buf + 3);
284     assert(bc_len + RE_HEADER_LEN <= buf_len);
285     printf("flags: 0x%x capture_count=%d stack_size=%d\n",
286            re_flags, buf[1], buf[2]);
287     if (re_flags & LRE_FLAG_NAMED_GROUPS) {
288         const char *p;
289         p = (char *)buf + RE_HEADER_LEN + bc_len;
290         printf("named groups: ");
291         for(i = 1; i < buf[1]; i++) {
292             if (i != 1)
293                 printf(",");
294             printf("<%s>", p);
295             p += strlen(p) + 1;
296         }
297         printf("\n");
298         assert(p == (char *)(buf + buf_len));
299     }
300     printf("bytecode_len=%d\n", bc_len);
301 
302     buf += RE_HEADER_LEN;
303     pos = 0;
304     while (pos < bc_len) {
305         printf("%5u: ", pos);
306         opcode = buf[pos];
307         len = reopcode_info[opcode].size;
308         if (opcode >= REOP_COUNT) {
309             printf(" invalid opcode=0x%02x\n", opcode);
310             break;
311         }
312         if ((pos + len) > bc_len) {
313             printf(" buffer overflow (opcode=0x%02x)\n", opcode);
314             break;
315         }
316         printf("%s", reopcode_info[opcode].name);
317         switch(opcode) {
318         case REOP_char:
319             val = get_u16(buf + pos + 1);
320             if (val >= ' ' && val <= 126)
321                 printf(" '%c'", val);
322             else
323                 printf(" 0x%04x", val);
324             break;
325         case REOP_char32:
326             val = get_u32(buf + pos + 1);
327             if (val >= ' ' && val <= 126)
328                 printf(" '%c'", val);
329             else
330                 printf(" 0x%08x", val);
331             break;
332         case REOP_goto:
333         case REOP_split_goto_first:
334         case REOP_split_next_first:
335         case REOP_loop:
336         case REOP_lookahead:
337         case REOP_negative_lookahead:
338         case REOP_bne_char_pos:
339             val = get_u32(buf + pos + 1);
340             val += (pos + 5);
341             printf(" %u", val);
342             break;
343         case REOP_simple_greedy_quant:
344             printf(" %u %u %u %u",
345                    get_u32(buf + pos + 1) + (pos + 17),
346                    get_u32(buf + pos + 1 + 4),
347                    get_u32(buf + pos + 1 + 8),
348                    get_u32(buf + pos + 1 + 12));
349             break;
350         case REOP_save_start:
351         case REOP_save_end:
352         case REOP_back_reference:
353         case REOP_backward_back_reference:
354             printf(" %u", buf[pos + 1]);
355             break;
356         case REOP_save_reset:
357             printf(" %u %u", buf[pos + 1], buf[pos + 2]);
358             break;
359         case REOP_push_i32:
360             val = get_u32(buf + pos + 1);
361             printf(" %d", val);
362             break;
363         case REOP_range:
364             {
365                 int n, i;
366                 n = get_u16(buf + pos + 1);
367                 len += n * 4;
368                 for(i = 0; i < n * 2; i++) {
369                     val = get_u16(buf + pos + 3 + i * 2);
370                     printf(" 0x%04x", val);
371                 }
372             }
373             break;
374         case REOP_range32:
375             {
376                 int n, i;
377                 n = get_u16(buf + pos + 1);
378                 len += n * 8;
379                 for(i = 0; i < n * 2; i++) {
380                     val = get_u32(buf + pos + 3 + i * 4);
381                     printf(" 0x%08x", val);
382                 }
383             }
384             break;
385         default:
386             break;
387         }
388         printf("\n");
389         pos += len;
390     }
391 }
392 #endif
393 
re_emit_op(REParseState * s,int op)394 static void re_emit_op(REParseState *s, int op)
395 {
396     dbuf_putc(&s->byte_code, op);
397 }
398 
399 /* return the offset of the u32 value */
re_emit_op_u32(REParseState * s,int op,uint32_t val)400 static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
401 {
402     int pos;
403     dbuf_putc(&s->byte_code, op);
404     pos = s->byte_code.size;
405     dbuf_put_u32(&s->byte_code, val);
406     return pos;
407 }
408 
re_emit_goto(REParseState * s,int op,uint32_t val)409 static int re_emit_goto(REParseState *s, int op, uint32_t val)
410 {
411     int pos;
412     dbuf_putc(&s->byte_code, op);
413     pos = s->byte_code.size;
414     dbuf_put_u32(&s->byte_code, val - (pos + 4));
415     return pos;
416 }
417 
re_emit_op_u8(REParseState * s,int op,uint32_t val)418 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
419 {
420     dbuf_putc(&s->byte_code, op);
421     dbuf_putc(&s->byte_code, val);
422 }
423 
re_emit_op_u16(REParseState * s,int op,uint32_t val)424 static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
425 {
426     dbuf_putc(&s->byte_code, op);
427     dbuf_put_u16(&s->byte_code, val);
428 }
429 
re_parse_error(REParseState * s,const char * fmt,...)430 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
431 {
432     va_list ap;
433     va_start(ap, fmt);
434     vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
435     va_end(ap);
436     return -1;
437 }
438 
re_parse_out_of_memory(REParseState * s)439 static int re_parse_out_of_memory(REParseState *s)
440 {
441     return re_parse_error(s, "out of memory");
442 }
443 
444 /* If allow_overflow is false, return -1 in case of
445    overflow. Otherwise return INT32_MAX. */
parse_digits(const uint8_t ** pp,BOOL allow_overflow)446 static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
447 {
448     const uint8_t *p;
449     uint64_t v;
450     int c;
451 
452     p = *pp;
453     v = 0;
454     for(;;) {
455         c = *p;
456         if (c < '0' || c > '9')
457             break;
458         v = v * 10 + c - '0';
459         if (v >= INT32_MAX) {
460             if (allow_overflow)
461                 v = INT32_MAX;
462             else
463                 return -1;
464         }
465         p++;
466     }
467     *pp = p;
468     return v;
469 }
470 
re_parse_expect(REParseState * s,const uint8_t ** pp,int c)471 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
472 {
473     const uint8_t *p;
474     p = *pp;
475     if (*p != c)
476         return re_parse_error(s, "expecting '%c'", c);
477     p++;
478     *pp = p;
479     return 0;
480 }
481 
482 /* Parse an escape sequence, *pp points after the '\':
483    allow_utf16 value:
484    0 : no UTF-16 escapes allowed
485    1 : UTF-16 escapes allowed
486    2 : UTF-16 escapes allowed and escapes of surrogate pairs are
487    converted to a unicode character (unicode regexp case).
488 
489    Return the unicode char and update *pp if recognized,
490    return -1 if malformed escape,
491    return -2 otherwise. */
lre_parse_escape(const uint8_t ** pp,int allow_utf16)492 int lre_parse_escape(const uint8_t **pp, int allow_utf16)
493 {
494     const uint8_t *p;
495     uint32_t c;
496 
497     p = *pp;
498     c = *p++;
499     switch(c) {
500     case 'b':
501         c = '\b';
502         break;
503     case 'f':
504         c = '\f';
505         break;
506     case 'n':
507         c = '\n';
508         break;
509     case 'r':
510         c = '\r';
511         break;
512     case 't':
513         c = '\t';
514         break;
515     case 'v':
516         c = '\v';
517         break;
518     case 'x':
519     case 'u':
520         {
521             int h, n, i;
522             uint32_t c1;
523 
524             if (*p == '{' && allow_utf16) {
525                 p++;
526                 c = 0;
527                 for(;;) {
528                     h = from_hex(*p++);
529                     if (h < 0)
530                         return -1;
531                     c = (c << 4) | h;
532                     if (c > 0x10FFFF)
533                         return -1;
534                     if (*p == '}')
535                         break;
536                 }
537                 p++;
538             } else {
539                 if (c == 'x') {
540                     n = 2;
541                 } else {
542                     n = 4;
543                 }
544 
545                 c = 0;
546                 for(i = 0; i < n; i++) {
547                     h = from_hex(*p++);
548                     if (h < 0) {
549                         return -1;
550                     }
551                     c = (c << 4) | h;
552                 }
553                 if (c >= 0xd800 && c < 0xdc00 &&
554                     allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
555                     /* convert an escaped surrogate pair into a
556                        unicode char */
557                     c1 = 0;
558                     for(i = 0; i < 4; i++) {
559                         h = from_hex(p[2 + i]);
560                         if (h < 0)
561                             break;
562                         c1 = (c1 << 4) | h;
563                     }
564                     if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
565                         p += 6;
566                         c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
567                     }
568                 }
569             }
570         }
571         break;
572     case '0': case '1': case '2': case '3':
573     case '4': case '5': case '6': case '7':
574         c -= '0';
575         if (allow_utf16 == 2) {
576             /* only accept \0 not followed by digit */
577             if (c != 0 || is_digit(*p))
578                 return -1;
579         } else {
580             /* parse a legacy octal sequence */
581             uint32_t v;
582             v = *p - '0';
583             if (v > 7)
584                 break;
585             c = (c << 3) | v;
586             p++;
587             if (c >= 32)
588                 break;
589             v = *p - '0';
590             if (v > 7)
591                 break;
592             c = (c << 3) | v;
593             p++;
594         }
595         break;
596     default:
597         return -2;
598     }
599     *pp = p;
600     return c;
601 }
602 
603 #ifdef CONFIG_ALL_UNICODE
604 /* XXX: we use the same chars for name and value */
is_unicode_char(int c)605 static BOOL is_unicode_char(int c)
606 {
607     return ((c >= '0' && c <= '9') ||
608             (c >= 'A' && c <= 'Z') ||
609             (c >= 'a' && c <= 'z') ||
610             (c == '_'));
611 }
612 
parse_unicode_property(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL is_inv)613 static int parse_unicode_property(REParseState *s, CharRange *cr,
614                                   const uint8_t **pp, BOOL is_inv)
615 {
616     const uint8_t *p;
617     char name[64], value[64];
618     char *q;
619     BOOL script_ext;
620     int ret;
621 
622     p = *pp;
623     if (*p != '{')
624         return re_parse_error(s, "expecting '{' after \\p");
625     p++;
626     q = name;
627     while (is_unicode_char(*p)) {
628         if ((q - name) >= sizeof(name) - 1)
629             goto unknown_property_name;
630         *q++ = *p++;
631     }
632     *q = '\0';
633     q = value;
634     if (*p == '=') {
635         p++;
636         while (is_unicode_char(*p)) {
637             if ((q - value) >= sizeof(value) - 1)
638                 return re_parse_error(s, "unknown unicode property value");
639             *q++ = *p++;
640         }
641     }
642     *q = '\0';
643     if (*p != '}')
644         return re_parse_error(s, "expecting '}'");
645     p++;
646     //    printf("name=%s value=%s\n", name, value);
647 
648     if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
649         script_ext = FALSE;
650         goto do_script;
651     } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
652         script_ext = TRUE;
653     do_script:
654         cr_init(cr, s->opaque, lre_realloc);
655         ret = unicode_script(cr, value, script_ext);
656         if (ret) {
657             cr_free(cr);
658             if (ret == -2)
659                 return re_parse_error(s, "unknown unicode script");
660             else
661                 goto out_of_memory;
662         }
663     } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
664         cr_init(cr, s->opaque, lre_realloc);
665         ret = unicode_general_category(cr, value);
666         if (ret) {
667             cr_free(cr);
668             if (ret == -2)
669                 return re_parse_error(s, "unknown unicode general category");
670             else
671                 goto out_of_memory;
672         }
673     } else if (value[0] == '\0') {
674         cr_init(cr, s->opaque, lre_realloc);
675         ret = unicode_general_category(cr, name);
676         if (ret == -1) {
677             cr_free(cr);
678             goto out_of_memory;
679         }
680         if (ret < 0) {
681             ret = unicode_prop(cr, name);
682             if (ret) {
683                 cr_free(cr);
684                 if (ret == -2)
685                     goto unknown_property_name;
686                 else
687                     goto out_of_memory;
688             }
689         }
690     } else {
691     unknown_property_name:
692         return re_parse_error(s, "unknown unicode property name");
693     }
694 
695     if (is_inv) {
696         if (cr_invert(cr)) {
697             cr_free(cr);
698             return -1;
699         }
700     }
701     *pp = p;
702     return 0;
703  out_of_memory:
704     return re_parse_out_of_memory(s);
705 }
706 #endif /* CONFIG_ALL_UNICODE */
707 
708 /* return -1 if error otherwise the character or a class range
709    (CLASS_RANGE_BASE). In case of class range, 'cr' is
710    initialized. Otherwise, it is ignored. */
get_class_atom(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL inclass)711 static int get_class_atom(REParseState *s, CharRange *cr,
712                           const uint8_t **pp, BOOL inclass)
713 {
714     const uint8_t *p;
715     uint32_t c;
716     int ret;
717 
718     p = *pp;
719 
720     c = *p;
721     switch(c) {
722     case '\\':
723         p++;
724         if (p >= s->buf_end)
725             goto unexpected_end;
726         c = *p++;
727         switch(c) {
728         case 'd':
729             c = CHAR_RANGE_d;
730             goto class_range;
731         case 'D':
732             c = CHAR_RANGE_D;
733             goto class_range;
734         case 's':
735             c = CHAR_RANGE_s;
736             goto class_range;
737         case 'S':
738             c = CHAR_RANGE_S;
739             goto class_range;
740         case 'w':
741             c = CHAR_RANGE_w;
742             goto class_range;
743         case 'W':
744             c = CHAR_RANGE_W;
745         class_range:
746             if (cr_init_char_range(s, cr, c))
747                 return -1;
748             c = CLASS_RANGE_BASE;
749             break;
750         case 'c':
751             c = *p;
752             if ((c >= 'a' && c <= 'z') ||
753                 (c >= 'A' && c <= 'Z') ||
754                 (((c >= '0' && c <= '9') || c == '_') &&
755                  inclass && !s->is_utf16)) {   /* Annex B.1.4 */
756                 c &= 0x1f;
757                 p++;
758             } else if (s->is_utf16) {
759                 goto invalid_escape;
760             } else {
761                 /* otherwise return '\' and 'c' */
762                 p--;
763                 c = '\\';
764             }
765             break;
766 #ifdef CONFIG_ALL_UNICODE
767         case 'p':
768         case 'P':
769             if (s->is_utf16) {
770                 if (parse_unicode_property(s, cr, &p, (c == 'P')))
771                     return -1;
772                 c = CLASS_RANGE_BASE;
773                 break;
774             }
775             /* fall thru */
776 #endif
777         default:
778             p--;
779             ret = lre_parse_escape(&p, s->is_utf16 * 2);
780             if (ret >= 0) {
781                 c = ret;
782             } else {
783                 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
784                     /* always valid to escape these characters */
785                     goto normal_char;
786                 } else if (s->is_utf16) {
787                 invalid_escape:
788                     return re_parse_error(s, "invalid escape sequence in regular expression");
789                 } else {
790                     /* just ignore the '\' */
791                     goto normal_char;
792                 }
793             }
794             break;
795         }
796         break;
797     case '\0':
798         if (p >= s->buf_end) {
799         unexpected_end:
800             return re_parse_error(s, "unexpected end");
801         }
802         /* fall thru */
803     default:
804     normal_char:
805         /* normal char */
806         if (c >= 128) {
807             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
808             if ((unsigned)c > 0xffff && !s->is_utf16) {
809                 /* XXX: should handle non BMP-1 code points */
810                 return re_parse_error(s, "malformed unicode char");
811             }
812         } else {
813             p++;
814         }
815         break;
816     }
817     *pp = p;
818     return c;
819 }
820 
re_emit_range(REParseState * s,const CharRange * cr)821 static int re_emit_range(REParseState *s, const CharRange *cr)
822 {
823     int len, i;
824     uint32_t high;
825 
826     len = (unsigned)cr->len / 2;
827     if (len >= 65535)
828         return re_parse_error(s, "too many ranges");
829     if (len == 0) {
830         /* not sure it can really happen. Emit a match that is always
831            false */
832         re_emit_op_u32(s, REOP_char32, -1);
833     } else {
834         high = cr->points[cr->len - 1];
835         if (high == UINT32_MAX)
836             high = cr->points[cr->len - 2];
837         if (high <= 0xffff) {
838             /* can use 16 bit ranges with the conversion that 0xffff =
839                infinity */
840             re_emit_op_u16(s, REOP_range, len);
841             for(i = 0; i < cr->len; i += 2) {
842                 dbuf_put_u16(&s->byte_code, cr->points[i]);
843                 high = cr->points[i + 1] - 1;
844                 if (high == UINT32_MAX - 1)
845                     high = 0xffff;
846                 dbuf_put_u16(&s->byte_code, high);
847             }
848         } else {
849             re_emit_op_u16(s, REOP_range32, len);
850             for(i = 0; i < cr->len; i += 2) {
851                 dbuf_put_u32(&s->byte_code, cr->points[i]);
852                 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
853             }
854         }
855     }
856     return 0;
857 }
858 
re_parse_char_class(REParseState * s,const uint8_t ** pp)859 static int re_parse_char_class(REParseState *s, const uint8_t **pp)
860 {
861     const uint8_t *p;
862     uint32_t c1, c2;
863     CharRange cr_s, *cr = &cr_s;
864     CharRange cr1_s, *cr1 = &cr1_s;
865     BOOL invert;
866 
867     cr_init(cr, s->opaque, lre_realloc);
868     p = *pp;
869     p++;    /* skip '[' */
870     invert = FALSE;
871     if (*p == '^') {
872         p++;
873         invert = TRUE;
874     }
875     for(;;) {
876         if (*p == ']')
877             break;
878         c1 = get_class_atom(s, cr1, &p, TRUE);
879         if ((int)c1 < 0)
880             goto fail;
881         if (*p == '-' && p[1] != ']') {
882             const uint8_t *p0 = p + 1;
883             if (c1 >= CLASS_RANGE_BASE) {
884                 if (s->is_utf16) {
885                     cr_free(cr1);
886                     goto invalid_class_range;
887                 }
888                 /* Annex B: match '-' character */
889                 goto class_atom;
890             }
891             c2 = get_class_atom(s, cr1, &p0, TRUE);
892             if ((int)c2 < 0)
893                 goto fail;
894             if (c2 >= CLASS_RANGE_BASE) {
895                 cr_free(cr1);
896                 if (s->is_utf16) {
897                     goto invalid_class_range;
898                 }
899                 /* Annex B: match '-' character */
900                 goto class_atom;
901             }
902             p = p0;
903             if (c2 < c1) {
904             invalid_class_range:
905                 re_parse_error(s, "invalid class range");
906                 goto fail;
907             }
908             if (cr_union_interval(cr, c1, c2))
909                 goto memory_error;
910         } else {
911         class_atom:
912             if (c1 >= CLASS_RANGE_BASE) {
913                 int ret;
914                 ret = cr_union1(cr, cr1->points, cr1->len);
915                 cr_free(cr1);
916                 if (ret)
917                     goto memory_error;
918             } else {
919                 if (cr_union_interval(cr, c1, c1))
920                     goto memory_error;
921             }
922         }
923     }
924     if (s->ignore_case) {
925         if (cr_canonicalize(cr))
926             goto memory_error;
927     }
928     if (invert) {
929         if (cr_invert(cr))
930             goto memory_error;
931     }
932     if (re_emit_range(s, cr))
933         goto fail;
934     cr_free(cr);
935     p++;    /* skip ']' */
936     *pp = p;
937     return 0;
938  memory_error:
939     re_parse_out_of_memory(s);
940  fail:
941     cr_free(cr);
942     return -1;
943 }
944 
945 /* Return:
946    1 if the opcodes in bc_buf[] always advance the character pointer.
947    0 if the character pointer may not be advanced.
948    -1 if the code may depend on side effects of its previous execution (backreference)
949 */
re_check_advance(const uint8_t * bc_buf,int bc_buf_len)950 static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
951 {
952     int pos, opcode, ret, len, i;
953     uint32_t val, last;
954     BOOL has_back_reference;
955     uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
956 
957     ret = -2; /* not known yet */
958     pos = 0;
959     has_back_reference = FALSE;
960     memset(capture_bitmap, 0, sizeof(capture_bitmap));
961 
962     while (pos < bc_buf_len) {
963         opcode = bc_buf[pos];
964         len = reopcode_info[opcode].size;
965         switch(opcode) {
966         case REOP_range:
967             val = get_u16(bc_buf + pos + 1);
968             len += val * 4;
969             goto simple_char;
970         case REOP_range32:
971             val = get_u16(bc_buf + pos + 1);
972             len += val * 8;
973             goto simple_char;
974         case REOP_char:
975         case REOP_char32:
976         case REOP_dot:
977         case REOP_any:
978         simple_char:
979             if (ret == -2)
980                 ret = 1;
981             break;
982         case REOP_line_start:
983         case REOP_line_end:
984         case REOP_push_i32:
985         case REOP_push_char_pos:
986         case REOP_drop:
987         case REOP_word_boundary:
988         case REOP_not_word_boundary:
989         case REOP_prev:
990             /* no effect */
991             break;
992         case REOP_save_start:
993         case REOP_save_end:
994             val = bc_buf[pos + 1];
995             capture_bitmap[val] |= 1;
996             break;
997         case REOP_save_reset:
998             {
999                 val = bc_buf[pos + 1];
1000                 last = bc_buf[pos + 2];
1001                 while (val < last)
1002                     capture_bitmap[val++] |= 1;
1003             }
1004             break;
1005         case REOP_back_reference:
1006         case REOP_backward_back_reference:
1007             val = bc_buf[pos + 1];
1008             capture_bitmap[val] |= 2;
1009             has_back_reference = TRUE;
1010             break;
1011         default:
1012             /* safe behvior: we cannot predict the outcome */
1013             if (ret == -2)
1014                 ret = 0;
1015             break;
1016         }
1017         pos += len;
1018     }
1019     if (has_back_reference) {
1020         /* check if there is back reference which references a capture
1021            made in the some code */
1022         for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
1023             if (capture_bitmap[i] == 3)
1024                 return -1;
1025         }
1026     }
1027     if (ret == -2)
1028         ret = 0;
1029     return ret;
1030 }
1031 
1032 /* return -1 if a simple quantifier cannot be used. Otherwise return
1033    the number of characters in the atom. */
re_is_simple_quantifier(const uint8_t * bc_buf,int bc_buf_len)1034 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
1035 {
1036     int pos, opcode, len, count;
1037     uint32_t val;
1038 
1039     count = 0;
1040     pos = 0;
1041     while (pos < bc_buf_len) {
1042         opcode = bc_buf[pos];
1043         len = reopcode_info[opcode].size;
1044         switch(opcode) {
1045         case REOP_range:
1046             val = get_u16(bc_buf + pos + 1);
1047             len += val * 4;
1048             goto simple_char;
1049         case REOP_range32:
1050             val = get_u16(bc_buf + pos + 1);
1051             len += val * 8;
1052             goto simple_char;
1053         case REOP_char:
1054         case REOP_char32:
1055         case REOP_dot:
1056         case REOP_any:
1057         simple_char:
1058             count++;
1059             break;
1060         case REOP_line_start:
1061         case REOP_line_end:
1062         case REOP_word_boundary:
1063         case REOP_not_word_boundary:
1064             break;
1065         default:
1066             return -1;
1067         }
1068         pos += len;
1069     }
1070     return count;
1071 }
1072 
1073 /* '*pp' is the first char after '<' */
re_parse_group_name(char * buf,int buf_size,const uint8_t ** pp,BOOL is_utf16)1074 static int re_parse_group_name(char *buf, int buf_size,
1075                                const uint8_t **pp, BOOL is_utf16)
1076 {
1077     const uint8_t *p;
1078     uint32_t c;
1079     char *q;
1080 
1081     p = *pp;
1082     q = buf;
1083     for(;;) {
1084         c = *p;
1085         if (c == '\\') {
1086             p++;
1087             if (*p != 'u')
1088                 return -1;
1089             c = lre_parse_escape(&p, is_utf16 * 2);
1090         } else if (c == '>') {
1091             break;
1092         } else if (c >= 128) {
1093             c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1094         } else {
1095             p++;
1096         }
1097         if (c > 0x10FFFF)
1098             return -1;
1099         if (q == buf) {
1100             if (!lre_js_is_ident_first(c))
1101                 return -1;
1102         } else {
1103             if (!lre_js_is_ident_next(c))
1104                 return -1;
1105         }
1106         if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1107             return -1;
1108         if (c < 128) {
1109             *q++ = c;
1110         } else {
1111             q += unicode_to_utf8((uint8_t*)q, c);
1112         }
1113     }
1114     if (q == buf)
1115         return -1;
1116     *q = '\0';
1117     p++;
1118     *pp = p;
1119     return 0;
1120 }
1121 
1122 /* if capture_name = NULL: return the number of captures + 1.
1123    Otherwise, return the capture index corresponding to capture_name
1124    or -1 if none */
re_parse_captures(REParseState * s,int * phas_named_captures,const char * capture_name)1125 static int re_parse_captures(REParseState *s, int *phas_named_captures,
1126                              const char *capture_name)
1127 {
1128     const uint8_t *p;
1129     int capture_index;
1130     char name[TMP_BUF_SIZE];
1131 
1132     capture_index = 1;
1133     *phas_named_captures = 0;
1134     for (p = s->buf_start; p < s->buf_end; p++) {
1135         switch (*p) {
1136         case '(':
1137             if (p[1] == '?') {
1138                 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1139                     *phas_named_captures = 1;
1140                     /* potential named capture */
1141                     if (capture_name) {
1142                         p += 3;
1143                         if (re_parse_group_name(name, sizeof(name), &p,
1144                                                 s->is_utf16) == 0) {
1145                             if (!strcmp(name, capture_name))
1146                                 return capture_index;
1147                         }
1148                     }
1149                     capture_index++;
1150                     if (capture_index >= CAPTURE_COUNT_MAX)
1151                         goto done;
1152                 }
1153             } else {
1154                 capture_index++;
1155                 if (capture_index >= CAPTURE_COUNT_MAX)
1156                     goto done;
1157             }
1158             break;
1159         case '\\':
1160             p++;
1161             break;
1162         case '[':
1163             for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1164                 if (*p == '\\')
1165                     p++;
1166             }
1167             break;
1168         }
1169     }
1170  done:
1171     if (capture_name)
1172         return -1;
1173     else
1174         return capture_index;
1175 }
1176 
re_count_captures(REParseState * s)1177 static int re_count_captures(REParseState *s)
1178 {
1179     if (s->total_capture_count < 0) {
1180         s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1181                                                    NULL);
1182     }
1183     return s->total_capture_count;
1184 }
1185 
re_has_named_captures(REParseState * s)1186 static BOOL re_has_named_captures(REParseState *s)
1187 {
1188     if (s->has_named_captures < 0)
1189         re_count_captures(s);
1190     return s->has_named_captures;
1191 }
1192 
find_group_name(REParseState * s,const char * name)1193 static int find_group_name(REParseState *s, const char *name)
1194 {
1195     const char *p, *buf_end;
1196     size_t len, name_len;
1197     int capture_index;
1198 
1199     name_len = strlen(name);
1200     p = (char *)s->group_names.buf;
1201     buf_end = (char *)s->group_names.buf + s->group_names.size;
1202     capture_index = 1;
1203     while (p < buf_end) {
1204         len = strlen(p);
1205         if (len == name_len && memcmp(name, p, name_len) == 0)
1206             return capture_index;
1207         p += len + 1;
1208         capture_index++;
1209     }
1210     return -1;
1211 }
1212 
1213 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1214 
re_parse_term(REParseState * s,BOOL is_backward_dir)1215 static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1216 {
1217     const uint8_t *p;
1218     int c, last_atom_start, quant_min, quant_max, last_capture_count;
1219     BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1220     CharRange cr_s, *cr = &cr_s;
1221 
1222     last_atom_start = -1;
1223     last_capture_count = 0;
1224     p = s->buf_ptr;
1225     c = *p;
1226     switch(c) {
1227     case '^':
1228         p++;
1229         re_emit_op(s, REOP_line_start);
1230         break;
1231     case '$':
1232         p++;
1233         re_emit_op(s, REOP_line_end);
1234         break;
1235     case '.':
1236         p++;
1237         last_atom_start = s->byte_code.size;
1238         last_capture_count = s->capture_count;
1239         if (is_backward_dir)
1240             re_emit_op(s, REOP_prev);
1241         re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1242         if (is_backward_dir)
1243             re_emit_op(s, REOP_prev);
1244         break;
1245     case '{':
1246         if (s->is_utf16) {
1247             return re_parse_error(s, "syntax error");
1248         } else if (!is_digit(p[1])) {
1249             /* Annex B: we accept '{' not followed by digits as a
1250                normal atom */
1251             goto parse_class_atom;
1252         } else {
1253             const uint8_t *p1 = p + 1;
1254             /* Annex B: error if it is like a repetition count */
1255             parse_digits(&p1, TRUE);
1256             if (*p1 == ',') {
1257                 p1++;
1258                 if (is_digit(*p1)) {
1259                     parse_digits(&p1, TRUE);
1260                 }
1261             }
1262             if (*p1 != '}') {
1263                 goto parse_class_atom;
1264             }
1265         }
1266         /* fall thru */
1267     case '*':
1268     case '+':
1269     case '?':
1270         return re_parse_error(s, "nothing to repeat");
1271     case '(':
1272         if (p[1] == '?') {
1273             if (p[2] == ':') {
1274                 p += 3;
1275                 last_atom_start = s->byte_code.size;
1276                 last_capture_count = s->capture_count;
1277                 s->buf_ptr = p;
1278                 if (re_parse_disjunction(s, is_backward_dir))
1279                     return -1;
1280                 p = s->buf_ptr;
1281                 if (re_parse_expect(s, &p, ')'))
1282                     return -1;
1283             } else if ((p[2] == '=' || p[2] == '!')) {
1284                 is_neg = (p[2] == '!');
1285                 is_backward_lookahead = FALSE;
1286                 p += 3;
1287                 goto lookahead;
1288             } else if (p[2] == '<' &&
1289                        (p[3] == '=' || p[3] == '!')) {
1290                 int pos;
1291                 is_neg = (p[3] == '!');
1292                 is_backward_lookahead = TRUE;
1293                 p += 4;
1294                 /* lookahead */
1295             lookahead:
1296                 /* Annex B allows lookahead to be used as an atom for
1297                    the quantifiers */
1298                 if (!s->is_utf16 && !is_backward_lookahead)  {
1299                     last_atom_start = s->byte_code.size;
1300                     last_capture_count = s->capture_count;
1301                 }
1302                 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1303                 s->buf_ptr = p;
1304                 if (re_parse_disjunction(s, is_backward_lookahead))
1305                     return -1;
1306                 p = s->buf_ptr;
1307                 if (re_parse_expect(s, &p, ')'))
1308                     return -1;
1309                 re_emit_op(s, REOP_match);
1310                 /* jump after the 'match' after the lookahead is successful */
1311                 if (dbuf_error(&s->byte_code))
1312                     return -1;
1313                 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1314             } else if (p[2] == '<') {
1315                 p += 3;
1316                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1317                                         &p, s->is_utf16)) {
1318                     return re_parse_error(s, "invalid group name");
1319                 }
1320                 if (find_group_name(s, s->u.tmp_buf) > 0) {
1321                     return re_parse_error(s, "duplicate group name");
1322                 }
1323                 /* group name with a trailing zero */
1324                 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1325                          strlen(s->u.tmp_buf) + 1);
1326                 s->has_named_captures = 1;
1327                 goto parse_capture;
1328             } else {
1329                 return re_parse_error(s, "invalid group");
1330             }
1331         } else {
1332             int capture_index;
1333             p++;
1334             /* capture without group name */
1335             dbuf_putc(&s->group_names, 0);
1336         parse_capture:
1337             if (s->capture_count >= CAPTURE_COUNT_MAX)
1338                 return re_parse_error(s, "too many captures");
1339             last_atom_start = s->byte_code.size;
1340             last_capture_count = s->capture_count;
1341             capture_index = s->capture_count++;
1342             re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1343                           capture_index);
1344 
1345             s->buf_ptr = p;
1346             if (re_parse_disjunction(s, is_backward_dir))
1347                 return -1;
1348             p = s->buf_ptr;
1349 
1350             re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
1351                           capture_index);
1352 
1353             if (re_parse_expect(s, &p, ')'))
1354                 return -1;
1355         }
1356         break;
1357     case '\\':
1358         switch(p[1]) {
1359         case 'b':
1360         case 'B':
1361             re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
1362             p += 2;
1363             break;
1364         case 'k':
1365             {
1366                 const uint8_t *p1;
1367                 int dummy_res;
1368 
1369                 p1 = p;
1370                 if (p1[2] != '<') {
1371                     /* annex B: we tolerate invalid group names in non
1372                        unicode mode if there is no named capture
1373                        definition */
1374                     if (s->is_utf16 || re_has_named_captures(s))
1375                         return re_parse_error(s, "expecting group name");
1376                     else
1377                         goto parse_class_atom;
1378                 }
1379                 p1 += 3;
1380                 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1381                                         &p1, s->is_utf16)) {
1382                     if (s->is_utf16 || re_has_named_captures(s))
1383                         return re_parse_error(s, "invalid group name");
1384                     else
1385                         goto parse_class_atom;
1386                 }
1387                 c = find_group_name(s, s->u.tmp_buf);
1388                 if (c < 0) {
1389                     /* no capture name parsed before, try to look
1390                        after (inefficient, but hopefully not common */
1391                     c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
1392                     if (c < 0) {
1393                         if (s->is_utf16 || re_has_named_captures(s))
1394                             return re_parse_error(s, "group name not defined");
1395                         else
1396                             goto parse_class_atom;
1397                     }
1398                 }
1399                 p = p1;
1400             }
1401             goto emit_back_reference;
1402         case '0':
1403             p += 2;
1404             c = 0;
1405             if (s->is_utf16) {
1406                 if (is_digit(*p)) {
1407                     return re_parse_error(s, "invalid decimal escape in regular expression");
1408                 }
1409             } else {
1410                 /* Annex B.1.4: accept legacy octal */
1411                 if (*p >= '0' && *p <= '7') {
1412                     c = *p++ - '0';
1413                     if (*p >= '0' && *p <= '7') {
1414                         c = (c << 3) + *p++ - '0';
1415                     }
1416                 }
1417             }
1418             goto normal_char;
1419         case '1': case '2': case '3': case '4':
1420         case '5': case '6': case '7': case '8':
1421         case '9':
1422             {
1423                 const uint8_t *q = ++p;
1424 
1425                 c = parse_digits(&p, FALSE);
1426                 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1427                     if (!s->is_utf16) {
1428                         /* Annex B.1.4: accept legacy octal */
1429                         p = q;
1430                         if (*p <= '7') {
1431                             c = 0;
1432                             if (*p <= '3')
1433                                 c = *p++ - '0';
1434                             if (*p >= '0' && *p <= '7') {
1435                                 c = (c << 3) + *p++ - '0';
1436                                 if (*p >= '0' && *p <= '7') {
1437                                     c = (c << 3) + *p++ - '0';
1438                                 }
1439                             }
1440                         } else {
1441                             c = *p++;
1442                         }
1443                         goto normal_char;
1444                     }
1445                     return re_parse_error(s, "back reference out of range in regular expression");
1446                 }
1447             emit_back_reference:
1448                 last_atom_start = s->byte_code.size;
1449                 last_capture_count = s->capture_count;
1450                 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
1451             }
1452             break;
1453         default:
1454             goto parse_class_atom;
1455         }
1456         break;
1457     case '[':
1458         last_atom_start = s->byte_code.size;
1459         last_capture_count = s->capture_count;
1460         if (is_backward_dir)
1461             re_emit_op(s, REOP_prev);
1462         if (re_parse_char_class(s, &p))
1463             return -1;
1464         if (is_backward_dir)
1465             re_emit_op(s, REOP_prev);
1466         break;
1467     case ']':
1468     case '}':
1469         if (s->is_utf16)
1470             return re_parse_error(s, "syntax error");
1471         goto parse_class_atom;
1472     default:
1473     parse_class_atom:
1474         c = get_class_atom(s, cr, &p, FALSE);
1475         if ((int)c < 0)
1476             return -1;
1477     normal_char:
1478         last_atom_start = s->byte_code.size;
1479         last_capture_count = s->capture_count;
1480         if (is_backward_dir)
1481             re_emit_op(s, REOP_prev);
1482         if (c >= CLASS_RANGE_BASE) {
1483             int ret;
1484             /* Note: canonicalization is not needed */
1485             ret = re_emit_range(s, cr);
1486             cr_free(cr);
1487             if (ret)
1488                 return -1;
1489         } else {
1490             if (s->ignore_case)
1491                 c = lre_canonicalize(c, s->is_utf16);
1492             if (c <= 0xffff)
1493                 re_emit_op_u16(s, REOP_char, c);
1494             else
1495                 re_emit_op_u32(s, REOP_char32, c);
1496         }
1497         if (is_backward_dir)
1498             re_emit_op(s, REOP_prev);
1499         break;
1500     }
1501 
1502     /* quantifier */
1503     if (last_atom_start >= 0) {
1504         c = *p;
1505         switch(c) {
1506         case '*':
1507             p++;
1508             quant_min = 0;
1509             quant_max = INT32_MAX;
1510             goto quantifier;
1511         case '+':
1512             p++;
1513             quant_min = 1;
1514             quant_max = INT32_MAX;
1515             goto quantifier;
1516         case '?':
1517             p++;
1518             quant_min = 0;
1519             quant_max = 1;
1520             goto quantifier;
1521         case '{':
1522             {
1523                 const uint8_t *p1 = p;
1524                 /* As an extension (see ES6 annex B), we accept '{' not
1525                    followed by digits as a normal atom */
1526                 if (!is_digit(p[1])) {
1527                     if (s->is_utf16)
1528                         goto invalid_quant_count;
1529                     break;
1530                 }
1531                 p++;
1532                 quant_min = parse_digits(&p, TRUE);
1533                 quant_max = quant_min;
1534                 if (*p == ',') {
1535                     p++;
1536                     if (is_digit(*p)) {
1537                         quant_max = parse_digits(&p, TRUE);
1538                         if (quant_max < quant_min) {
1539                         invalid_quant_count:
1540                             return re_parse_error(s, "invalid repetition count");
1541                         }
1542                     } else {
1543                         quant_max = INT32_MAX; /* infinity */
1544                     }
1545                 }
1546                 if (*p != '}' && !s->is_utf16) {
1547                     /* Annex B: normal atom if invalid '{' syntax */
1548                     p = p1;
1549                     break;
1550                 }
1551                 if (re_parse_expect(s, &p, '}'))
1552                     return -1;
1553             }
1554         quantifier:
1555             greedy = TRUE;
1556             if (*p == '?') {
1557                 p++;
1558                 greedy = FALSE;
1559             }
1560             if (last_atom_start < 0) {
1561                 return re_parse_error(s, "nothing to repeat");
1562             }
1563             if (greedy) {
1564                 int len, pos;
1565 
1566                 if (quant_max > 0) {
1567                     /* specific optimization for simple quantifiers */
1568                     if (dbuf_error(&s->byte_code))
1569                         goto out_of_memory;
1570                     len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
1571                                                  s->byte_code.size - last_atom_start);
1572                     if (len > 0) {
1573                         re_emit_op(s, REOP_match);
1574 
1575                         if (dbuf_insert(&s->byte_code, last_atom_start, 17))
1576                             goto out_of_memory;
1577                         pos = last_atom_start;
1578                         s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
1579                         put_u32(&s->byte_code.buf[pos],
1580                                 s->byte_code.size - last_atom_start - 17);
1581                         pos += 4;
1582                         put_u32(&s->byte_code.buf[pos], quant_min);
1583                         pos += 4;
1584                         put_u32(&s->byte_code.buf[pos], quant_max);
1585                         pos += 4;
1586                         put_u32(&s->byte_code.buf[pos], len);
1587                         pos += 4;
1588                         goto done;
1589                     }
1590                 }
1591 
1592                 if (dbuf_error(&s->byte_code))
1593                     goto out_of_memory;
1594                 add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
1595                                                            s->byte_code.size - last_atom_start) == 0);
1596             } else {
1597                 add_zero_advance_check = FALSE;
1598             }
1599 
1600             {
1601                 int len, pos;
1602                 len = s->byte_code.size - last_atom_start;
1603                 if (quant_min == 0) {
1604                     /* need to reset the capture in case the atom is
1605                        not executed */
1606                     if (last_capture_count != s->capture_count) {
1607                         if (dbuf_insert(&s->byte_code, last_atom_start, 3))
1608                             goto out_of_memory;
1609                         s->byte_code.buf[last_atom_start++] = REOP_save_reset;
1610                         s->byte_code.buf[last_atom_start++] = last_capture_count;
1611                         s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
1612                     }
1613                     if (quant_max == 0) {
1614                         s->byte_code.size = last_atom_start;
1615                     } else if (quant_max == 1) {
1616                         if (dbuf_insert(&s->byte_code, last_atom_start, 5))
1617                             goto out_of_memory;
1618                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1619                             greedy;
1620                         put_u32(s->byte_code.buf + last_atom_start + 1, len);
1621                     } else if (quant_max == INT32_MAX) {
1622                         if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
1623                             goto out_of_memory;
1624                         s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1625                             greedy;
1626                         put_u32(s->byte_code.buf + last_atom_start + 1,
1627                                 len + 5 + add_zero_advance_check);
1628                         if (add_zero_advance_check) {
1629                             /* avoid infinite loop by stoping the
1630                                recursion if no advance was made in the
1631                                atom (only works if the atom has no
1632                                side effect) */
1633                             s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
1634                             re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
1635                         } else {
1636                             re_emit_goto(s, REOP_goto, last_atom_start);
1637                         }
1638                     } else {
1639                         if (dbuf_insert(&s->byte_code, last_atom_start, 10))
1640                             goto out_of_memory;
1641                         pos = last_atom_start;
1642                         s->byte_code.buf[pos++] = REOP_push_i32;
1643                         put_u32(s->byte_code.buf + pos, quant_max);
1644                         pos += 4;
1645                         s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1646                         put_u32(s->byte_code.buf + pos, len + 5);
1647                         re_emit_goto(s, REOP_loop, last_atom_start + 5);
1648                         re_emit_op(s, REOP_drop);
1649                     }
1650                 } else if (quant_min == 1 && quant_max == INT32_MAX &&
1651                            !add_zero_advance_check) {
1652                     re_emit_goto(s, REOP_split_next_first - greedy,
1653                                  last_atom_start);
1654                 } else {
1655                     if (quant_min == 1) {
1656                         /* nothing to add */
1657                     } else {
1658                         if (dbuf_insert(&s->byte_code, last_atom_start, 5))
1659                             goto out_of_memory;
1660                         s->byte_code.buf[last_atom_start] = REOP_push_i32;
1661                         put_u32(s->byte_code.buf + last_atom_start + 1,
1662                                 quant_min);
1663                         last_atom_start += 5;
1664                         re_emit_goto(s, REOP_loop, last_atom_start);
1665                         re_emit_op(s, REOP_drop);
1666                     }
1667                     if (quant_max == INT32_MAX) {
1668                         pos = s->byte_code.size;
1669                         re_emit_op_u32(s, REOP_split_goto_first + greedy,
1670                                        len + 5 + add_zero_advance_check);
1671                         if (add_zero_advance_check)
1672                             re_emit_op(s, REOP_push_char_pos);
1673                         /* copy the atom */
1674                         dbuf_put_self(&s->byte_code, last_atom_start, len);
1675                         if (add_zero_advance_check)
1676                             re_emit_goto(s, REOP_bne_char_pos, pos);
1677                         else
1678                             re_emit_goto(s, REOP_goto, pos);
1679                     } else if (quant_max > quant_min) {
1680                         re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
1681                         pos = s->byte_code.size;
1682                         re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
1683                         /* copy the atom */
1684                         dbuf_put_self(&s->byte_code, last_atom_start, len);
1685 
1686                         re_emit_goto(s, REOP_loop, pos);
1687                         re_emit_op(s, REOP_drop);
1688                     }
1689                 }
1690                 last_atom_start = -1;
1691             }
1692             break;
1693         default:
1694             break;
1695         }
1696     }
1697  done:
1698     s->buf_ptr = p;
1699     return 0;
1700  out_of_memory:
1701     return re_parse_out_of_memory(s);
1702 }
1703 
re_parse_alternative(REParseState * s,BOOL is_backward_dir)1704 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
1705 {
1706     const uint8_t *p;
1707     int ret;
1708     size_t start, term_start, end, term_size;
1709 
1710     start = s->byte_code.size;
1711     for(;;) {
1712         p = s->buf_ptr;
1713         if (p >= s->buf_end)
1714             break;
1715         if (*p == '|' || *p == ')')
1716             break;
1717         term_start = s->byte_code.size;
1718         ret = re_parse_term(s, is_backward_dir);
1719         if (ret)
1720             return ret;
1721         if (is_backward_dir) {
1722             /* reverse the order of the terms (XXX: inefficient, but
1723                speed is not really critical here) */
1724             end = s->byte_code.size;
1725             term_size = end - term_start;
1726             if (dbuf_realloc(&s->byte_code, end + term_size))
1727                 return -1;
1728             memmove(s->byte_code.buf + start + term_size,
1729                     s->byte_code.buf + start,
1730                     end - start);
1731             memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
1732                    term_size);
1733         }
1734     }
1735     return 0;
1736 }
1737 
re_parse_disjunction(REParseState * s,BOOL is_backward_dir)1738 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
1739 {
1740     int start, len, pos;
1741 
1742     if (lre_check_stack_overflow(s->opaque, 0))
1743         return re_parse_error(s, "stack overflow");
1744 
1745     start = s->byte_code.size;
1746     if (re_parse_alternative(s, is_backward_dir))
1747         return -1;
1748     while (*s->buf_ptr == '|') {
1749         s->buf_ptr++;
1750 
1751         len = s->byte_code.size - start;
1752 
1753         /* insert a split before the first alternative */
1754         if (dbuf_insert(&s->byte_code, start, 5)) {
1755             return re_parse_out_of_memory(s);
1756         }
1757         s->byte_code.buf[start] = REOP_split_next_first;
1758         put_u32(s->byte_code.buf + start + 1, len + 5);
1759 
1760         pos = re_emit_op_u32(s, REOP_goto, 0);
1761 
1762         if (re_parse_alternative(s, is_backward_dir))
1763             return -1;
1764 
1765         /* patch the goto */
1766         len = s->byte_code.size - (pos + 4);
1767         put_u32(s->byte_code.buf + pos, len);
1768     }
1769     return 0;
1770 }
1771 
1772 /* the control flow is recursive so the analysis can be linear */
compute_stack_size(const uint8_t * bc_buf,int bc_buf_len)1773 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1774 {
1775     int stack_size, stack_size_max, pos, opcode, len;
1776     uint32_t val;
1777 
1778     stack_size = 0;
1779     stack_size_max = 0;
1780     bc_buf += RE_HEADER_LEN;
1781     bc_buf_len -= RE_HEADER_LEN;
1782     pos = 0;
1783     while (pos < bc_buf_len) {
1784         opcode = bc_buf[pos];
1785         len = reopcode_info[opcode].size;
1786         assert(opcode < REOP_COUNT);
1787         assert((pos + len) <= bc_buf_len);
1788         switch(opcode) {
1789         case REOP_push_i32:
1790         case REOP_push_char_pos:
1791             stack_size++;
1792             if (stack_size > stack_size_max) {
1793                 if (stack_size > STACK_SIZE_MAX)
1794                     return -1;
1795                 stack_size_max = stack_size;
1796             }
1797             break;
1798         case REOP_drop:
1799         case REOP_bne_char_pos:
1800             assert(stack_size > 0);
1801             stack_size--;
1802             break;
1803         case REOP_range:
1804             val = get_u16(bc_buf + pos + 1);
1805             len += val * 4;
1806             break;
1807         case REOP_range32:
1808             val = get_u16(bc_buf + pos + 1);
1809             len += val * 8;
1810             break;
1811         }
1812         pos += len;
1813     }
1814     return stack_size_max;
1815 }
1816 
1817 /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
1818    Return NULL if error and allocate an error message in *perror_msg,
1819    otherwise the compiled bytecode and its length in plen.
1820 */
lre_compile(int * plen,char * error_msg,int error_msg_size,const char * buf,size_t buf_len,int re_flags,void * opaque)1821 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1822                      const char *buf, size_t buf_len, int re_flags,
1823                      void *opaque)
1824 {
1825     REParseState s_s, *s = &s_s;
1826     int stack_size;
1827     BOOL is_sticky;
1828 
1829     memset(s, 0, sizeof(*s));
1830     s->opaque = opaque;
1831     s->buf_ptr = (const uint8_t *)buf;
1832     s->buf_end = s->buf_ptr + buf_len;
1833     s->buf_start = s->buf_ptr;
1834     s->re_flags = re_flags;
1835     s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
1836     is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
1837     s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
1838     s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
1839     s->capture_count = 1;
1840     s->total_capture_count = -1;
1841     s->has_named_captures = -1;
1842 
1843     dbuf_init2(&s->byte_code, opaque, lre_realloc);
1844     dbuf_init2(&s->group_names, opaque, lre_realloc);
1845 
1846     dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
1847     dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
1848     dbuf_putc(&s->byte_code, 0); /* stack size */
1849     dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
1850 
1851     if (!is_sticky) {
1852         /* iterate thru all positions (about the same as .*?( ... ) )
1853            .  We do it without an explicit loop so that lock step
1854            thread execution will be possible in an optimized
1855            implementation */
1856         re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
1857         re_emit_op(s, REOP_any);
1858         re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
1859     }
1860     re_emit_op_u8(s, REOP_save_start, 0);
1861 
1862     if (re_parse_disjunction(s, FALSE)) {
1863     error:
1864         dbuf_free(&s->byte_code);
1865         dbuf_free(&s->group_names);
1866         pstrcpy(error_msg, error_msg_size, s->u.error_msg);
1867         *plen = 0;
1868         return NULL;
1869     }
1870 
1871     re_emit_op_u8(s, REOP_save_end, 0);
1872 
1873     re_emit_op(s, REOP_match);
1874 
1875     if (*s->buf_ptr != '\0') {
1876         re_parse_error(s, "extraneous characters at the end");
1877         goto error;
1878     }
1879 
1880     if (dbuf_error(&s->byte_code)) {
1881         re_parse_out_of_memory(s);
1882         goto error;
1883     }
1884 
1885     stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
1886     if (stack_size < 0) {
1887         re_parse_error(s, "too many imbricated quantifiers");
1888         goto error;
1889     }
1890 
1891     s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
1892     s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
1893     put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
1894 
1895     /* add the named groups if needed */
1896     if (s->group_names.size > (s->capture_count - 1)) {
1897         dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1898         s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
1899     }
1900     dbuf_free(&s->group_names);
1901 
1902 #ifdef DUMP_REOP
1903     lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
1904 #endif
1905 
1906     error_msg[0] = '\0';
1907     *plen = s->byte_code.size;
1908     return s->byte_code.buf;
1909 }
1910 
is_line_terminator(uint32_t c)1911 static BOOL is_line_terminator(uint32_t c)
1912 {
1913     return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
1914 }
1915 
is_word_char(uint32_t c)1916 static BOOL is_word_char(uint32_t c)
1917 {
1918     return ((c >= '0' && c <= '9') ||
1919             (c >= 'a' && c <= 'z') ||
1920             (c >= 'A' && c <= 'Z') ||
1921             (c == '_'));
1922 }
1923 
1924 #define GET_CHAR(c, cptr, cbuf_end)                                     \
1925     do {                                                                \
1926         if (cbuf_type == 0) {                                           \
1927             c = *cptr++;                                                \
1928         } else {                                                        \
1929             uint32_t __c1;                                              \
1930             c = *(uint16_t *)cptr;                                      \
1931             cptr += 2;                                                  \
1932             if (c >= 0xd800 && c < 0xdc00 &&                            \
1933                 cbuf_type == 2 && cptr < cbuf_end) {                    \
1934                 __c1 = *(uint16_t *)cptr;                               \
1935                 if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
1936                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1937                     cptr += 2;                                          \
1938                 }                                                       \
1939             }                                                           \
1940         }                                                               \
1941     } while (0)
1942 
1943 #define PEEK_CHAR(c, cptr, cbuf_end)             \
1944     do {                                         \
1945         if (cbuf_type == 0) {                    \
1946             c = cptr[0];                         \
1947         } else {                                 \
1948             uint32_t __c1;                                              \
1949             c = ((uint16_t *)cptr)[0];                                  \
1950             if (c >= 0xd800 && c < 0xdc00 &&                            \
1951                 cbuf_type == 2 && (cptr + 2) < cbuf_end) {              \
1952                 __c1 = ((uint16_t *)cptr)[1];                           \
1953                 if (__c1 >= 0xdc00 && __c1 < 0xe000) {                  \
1954                     c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1955                 }                                                       \
1956             }                                                           \
1957         }                                        \
1958     } while (0)
1959 
1960 #define PEEK_PREV_CHAR(c, cptr, cbuf_start)                 \
1961     do {                                         \
1962         if (cbuf_type == 0) {                    \
1963             c = cptr[-1];                        \
1964         } else {                                 \
1965             uint32_t __c1;                                              \
1966             c = ((uint16_t *)cptr)[-1];                                 \
1967             if (c >= 0xdc00 && c < 0xe000 &&                            \
1968                 cbuf_type == 2 && (cptr - 4) >= cbuf_start) {              \
1969                 __c1 = ((uint16_t *)cptr)[-2];                          \
1970                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
1971                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1972                 }                                                       \
1973             }                                                           \
1974         }                                                               \
1975     } while (0)
1976 
1977 #define GET_PREV_CHAR(c, cptr, cbuf_start)       \
1978     do {                                         \
1979         if (cbuf_type == 0) {                    \
1980             cptr--;                              \
1981             c = cptr[0];                         \
1982         } else {                                 \
1983             uint32_t __c1;                                              \
1984             cptr -= 2;                                                  \
1985             c = ((uint16_t *)cptr)[0];                                 \
1986             if (c >= 0xdc00 && c < 0xe000 &&                            \
1987                 cbuf_type == 2 && cptr > cbuf_start) {                  \
1988                 __c1 = ((uint16_t *)cptr)[-1];                          \
1989                 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) {                 \
1990                     cptr -= 2;                                          \
1991                     c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1992                 }                                                       \
1993             }                                                           \
1994         }                                                               \
1995     } while (0)
1996 
1997 #define PREV_CHAR(cptr, cbuf_start)       \
1998     do {                                  \
1999         if (cbuf_type == 0) {             \
2000             cptr--;                       \
2001         } else {                          \
2002             cptr -= 2;                          \
2003             if (cbuf_type == 2) {                                       \
2004                 c = ((uint16_t *)cptr)[0];                              \
2005                 if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) {   \
2006                     c = ((uint16_t *)cptr)[-1];                         \
2007                     if (c >= 0xd800 && c < 0xdc00)                      \
2008                         cptr -= 2;                                      \
2009                 }                                                       \
2010             }                                                           \
2011         }                                                               \
2012     } while (0)
2013 
2014 typedef uintptr_t StackInt;
2015 
2016 typedef enum {
2017     RE_EXEC_STATE_SPLIT,
2018     RE_EXEC_STATE_LOOKAHEAD,
2019     RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
2020     RE_EXEC_STATE_GREEDY_QUANT,
2021 } REExecStateEnum;
2022 
2023 typedef struct REExecState {
2024     REExecStateEnum type : 8;
2025     uint8_t stack_len;
2026     size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
2027     const uint8_t *cptr;
2028     const uint8_t *pc;
2029     void *buf[0];
2030 } REExecState;
2031 
2032 typedef struct {
2033     const uint8_t *cbuf;
2034     const uint8_t *cbuf_end;
2035     /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
2036     int cbuf_type;
2037     int capture_count;
2038     int stack_size_max;
2039     BOOL multi_line;
2040     BOOL ignore_case;
2041     BOOL is_utf16;
2042     void *opaque; /* used for stack overflow check */
2043 
2044     size_t state_size;
2045     uint8_t *state_stack;
2046     size_t state_stack_size;
2047     size_t state_stack_len;
2048 } REExecContext;
2049 
push_state(REExecContext * s,uint8_t ** capture,StackInt * stack,size_t stack_len,const uint8_t * pc,const uint8_t * cptr,REExecStateEnum type,size_t count)2050 static int push_state(REExecContext *s,
2051                       uint8_t **capture,
2052                       StackInt *stack, size_t stack_len,
2053                       const uint8_t *pc, const uint8_t *cptr,
2054                       REExecStateEnum type, size_t count)
2055 {
2056     REExecState *rs;
2057     uint8_t *new_stack;
2058     size_t new_size, i, n;
2059     StackInt *stack_buf;
2060 
2061     if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
2062         /* reallocate the stack */
2063         new_size = s->state_stack_size * 3 / 2;
2064         if (new_size < 8)
2065             new_size = 8;
2066         new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
2067         if (!new_stack)
2068             return -1;
2069         s->state_stack_size = new_size;
2070         s->state_stack = new_stack;
2071     }
2072     rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
2073     s->state_stack_len++;
2074     rs->type = type;
2075     rs->count = count;
2076     rs->stack_len = stack_len;
2077     rs->cptr = cptr;
2078     rs->pc = pc;
2079     n = 2 * s->capture_count;
2080     for(i = 0; i < n; i++)
2081         rs->buf[i] = capture[i];
2082     stack_buf = (StackInt *)(rs->buf + n);
2083     for(i = 0; i < stack_len; i++)
2084         stack_buf[i] = stack[i];
2085     return 0;
2086 }
2087 
2088 /* return 1 if match, 0 if not match or -1 if error. */
lre_exec_backtrack(REExecContext * s,uint8_t ** capture,StackInt * stack,int stack_len,const uint8_t * pc,const uint8_t * cptr,BOOL no_recurse)2089 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2090                                    StackInt *stack, int stack_len,
2091                                    const uint8_t *pc, const uint8_t *cptr,
2092                                    BOOL no_recurse)
2093 {
2094     int opcode, ret;
2095     int cbuf_type;
2096     uint32_t val, c;
2097     const uint8_t *cbuf_end;
2098 
2099     cbuf_type = s->cbuf_type;
2100     cbuf_end = s->cbuf_end;
2101 
2102     for(;;) {
2103         //        printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2104         opcode = *pc++;
2105         switch(opcode) {
2106         case REOP_match:
2107             {
2108                 REExecState *rs;
2109                 if (no_recurse)
2110                     return (intptr_t)cptr;
2111                 ret = 1;
2112                 goto recurse;
2113             no_match:
2114                 if (no_recurse)
2115                     return 0;
2116                 ret = 0;
2117             recurse:
2118                 for(;;) {
2119                     if (s->state_stack_len == 0)
2120                         return ret;
2121                     rs = (REExecState *)(s->state_stack +
2122                                          (s->state_stack_len - 1) * s->state_size);
2123                     if (rs->type == RE_EXEC_STATE_SPLIT) {
2124                         if (!ret) {
2125                         pop_state:
2126                             memcpy(capture, rs->buf,
2127                                    sizeof(capture[0]) * 2 * s->capture_count);
2128                         pop_state1:
2129                             pc = rs->pc;
2130                             cptr = rs->cptr;
2131                             stack_len = rs->stack_len;
2132                             memcpy(stack, rs->buf + 2 * s->capture_count,
2133                                    stack_len * sizeof(stack[0]));
2134                             s->state_stack_len--;
2135                             break;
2136                         }
2137                     } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2138                         if (!ret) {
2139                             uint32_t char_count, i;
2140                             memcpy(capture, rs->buf,
2141                                    sizeof(capture[0]) * 2 * s->capture_count);
2142                             stack_len = rs->stack_len;
2143                             memcpy(stack, rs->buf + 2 * s->capture_count,
2144                                    stack_len * sizeof(stack[0]));
2145                             pc = rs->pc;
2146                             cptr = rs->cptr;
2147                             /* go backward */
2148                             char_count = get_u32(pc + 12);
2149                             for(i = 0; i < char_count; i++) {
2150                                 PREV_CHAR(cptr, s->cbuf);
2151                             }
2152                             pc = (pc + 16) + (int)get_u32(pc);
2153                             rs->cptr = cptr;
2154                             rs->count--;
2155                             if (rs->count == 0) {
2156                                 s->state_stack_len--;
2157                             }
2158                             break;
2159                         }
2160                     } else {
2161                         ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2162                                (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2163                         if (ret) {
2164                             /* keep the capture in case of positive lookahead */
2165                             if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2166                                 goto pop_state1;
2167                             else
2168                                 goto pop_state;
2169                         }
2170                     }
2171                     s->state_stack_len--;
2172                 }
2173             }
2174             break;
2175         case REOP_char32:
2176             val = get_u32(pc);
2177             pc += 4;
2178             goto test_char;
2179         case REOP_char:
2180             val = get_u16(pc);
2181             pc += 2;
2182         test_char:
2183             if (cptr >= cbuf_end)
2184                 goto no_match;
2185             GET_CHAR(c, cptr, cbuf_end);
2186             if (s->ignore_case) {
2187                 c = lre_canonicalize(c, s->is_utf16);
2188             }
2189             if (val != c)
2190                 goto no_match;
2191             break;
2192         case REOP_split_goto_first:
2193         case REOP_split_next_first:
2194             {
2195                 const uint8_t *pc1;
2196 
2197                 val = get_u32(pc);
2198                 pc += 4;
2199                 if (opcode == REOP_split_next_first) {
2200                     pc1 = pc + (int)val;
2201                 } else {
2202                     pc1 = pc;
2203                     pc = pc + (int)val;
2204                 }
2205                 ret = push_state(s, capture, stack, stack_len,
2206                                  pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2207                 if (ret < 0)
2208                     return -1;
2209                 break;
2210             }
2211         case REOP_lookahead:
2212         case REOP_negative_lookahead:
2213             val = get_u32(pc);
2214             pc += 4;
2215             ret = push_state(s, capture, stack, stack_len,
2216                              pc + (int)val, cptr,
2217                              RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2218                              0);
2219             if (ret < 0)
2220                 return -1;
2221             break;
2222 
2223         case REOP_goto:
2224             val = get_u32(pc);
2225             pc += 4 + (int)val;
2226             break;
2227         case REOP_line_start:
2228             if (cptr == s->cbuf)
2229                 break;
2230             if (!s->multi_line)
2231                 goto no_match;
2232             PEEK_PREV_CHAR(c, cptr, s->cbuf);
2233             if (!is_line_terminator(c))
2234                 goto no_match;
2235             break;
2236         case REOP_line_end:
2237             if (cptr == cbuf_end)
2238                 break;
2239             if (!s->multi_line)
2240                 goto no_match;
2241             PEEK_CHAR(c, cptr, cbuf_end);
2242             if (!is_line_terminator(c))
2243                 goto no_match;
2244             break;
2245         case REOP_dot:
2246             if (cptr == cbuf_end)
2247                 goto no_match;
2248             GET_CHAR(c, cptr, cbuf_end);
2249             if (is_line_terminator(c))
2250                 goto no_match;
2251             break;
2252         case REOP_any:
2253             if (cptr == cbuf_end)
2254                 goto no_match;
2255             GET_CHAR(c, cptr, cbuf_end);
2256             break;
2257         case REOP_save_start:
2258         case REOP_save_end:
2259             val = *pc++;
2260             assert(val < s->capture_count);
2261             capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2262             break;
2263         case REOP_save_reset:
2264             {
2265                 uint32_t val2;
2266                 val = pc[0];
2267                 val2 = pc[1];
2268                 pc += 2;
2269                 assert(val2 < s->capture_count);
2270                 while (val <= val2) {
2271                     capture[2 * val] = NULL;
2272                     capture[2 * val + 1] = NULL;
2273                     val++;
2274                 }
2275             }
2276             break;
2277         case REOP_push_i32:
2278             val = get_u32(pc);
2279             pc += 4;
2280             stack[stack_len++] = val;
2281             break;
2282         case REOP_drop:
2283             stack_len--;
2284             break;
2285         case REOP_loop:
2286             val = get_u32(pc);
2287             pc += 4;
2288             if (--stack[stack_len - 1] != 0) {
2289                 pc += (int)val;
2290             }
2291             break;
2292         case REOP_push_char_pos:
2293             stack[stack_len++] = (uintptr_t)cptr;
2294             break;
2295         case REOP_bne_char_pos:
2296             val = get_u32(pc);
2297             pc += 4;
2298             if (stack[--stack_len] != (uintptr_t)cptr)
2299                 pc += (int)val;
2300             break;
2301         case REOP_word_boundary:
2302         case REOP_not_word_boundary:
2303             {
2304                 BOOL v1, v2;
2305                 /* char before */
2306                 if (cptr == s->cbuf) {
2307                     v1 = FALSE;
2308                 } else {
2309                     PEEK_PREV_CHAR(c, cptr, s->cbuf);
2310                     v1 = is_word_char(c);
2311                 }
2312                 /* current char */
2313                 if (cptr >= cbuf_end) {
2314                     v2 = FALSE;
2315                 } else {
2316                     PEEK_CHAR(c, cptr, cbuf_end);
2317                     v2 = is_word_char(c);
2318                 }
2319                 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2320                     goto no_match;
2321             }
2322             break;
2323         case REOP_back_reference:
2324         case REOP_backward_back_reference:
2325             {
2326                 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2327                 uint32_t c1, c2;
2328 
2329                 val = *pc++;
2330                 if (val >= s->capture_count)
2331                     goto no_match;
2332                 cptr1_start = capture[2 * val];
2333                 cptr1_end = capture[2 * val + 1];
2334                 if (!cptr1_start || !cptr1_end)
2335                     break;
2336                 if (opcode == REOP_back_reference) {
2337                     cptr1 = cptr1_start;
2338                     while (cptr1 < cptr1_end) {
2339                         if (cptr >= cbuf_end)
2340                             goto no_match;
2341                         GET_CHAR(c1, cptr1, cptr1_end);
2342                         GET_CHAR(c2, cptr, cbuf_end);
2343                         if (s->ignore_case) {
2344                             c1 = lre_canonicalize(c1, s->is_utf16);
2345                             c2 = lre_canonicalize(c2, s->is_utf16);
2346                         }
2347                         if (c1 != c2)
2348                             goto no_match;
2349                     }
2350                 } else {
2351                     cptr1 = cptr1_end;
2352                     while (cptr1 > cptr1_start) {
2353                         if (cptr == s->cbuf)
2354                             goto no_match;
2355                         GET_PREV_CHAR(c1, cptr1, cptr1_start);
2356                         GET_PREV_CHAR(c2, cptr, s->cbuf);
2357                         if (s->ignore_case) {
2358                             c1 = lre_canonicalize(c1, s->is_utf16);
2359                             c2 = lre_canonicalize(c2, s->is_utf16);
2360                         }
2361                         if (c1 != c2)
2362                             goto no_match;
2363                     }
2364                 }
2365             }
2366             break;
2367         case REOP_range:
2368             {
2369                 int n;
2370                 uint32_t low, high, idx_min, idx_max, idx;
2371 
2372                 n = get_u16(pc); /* n must be >= 1 */
2373                 pc += 2;
2374                 if (cptr >= cbuf_end)
2375                     goto no_match;
2376                 GET_CHAR(c, cptr, cbuf_end);
2377                 if (s->ignore_case) {
2378                     c = lre_canonicalize(c, s->is_utf16);
2379                 }
2380                 idx_min = 0;
2381                 low = get_u16(pc + 0 * 4);
2382                 if (c < low)
2383                     goto no_match;
2384                 idx_max = n - 1;
2385                 high = get_u16(pc + idx_max * 4 + 2);
2386                 /* 0xffff in for last value means +infinity */
2387                 if (unlikely(c >= 0xffff) && high == 0xffff)
2388                     goto range_match;
2389                 if (c > high)
2390                     goto no_match;
2391                 while (idx_min <= idx_max) {
2392                     idx = (idx_min + idx_max) / 2;
2393                     low = get_u16(pc + idx * 4);
2394                     high = get_u16(pc + idx * 4 + 2);
2395                     if (c < low)
2396                         idx_max = idx - 1;
2397                     else if (c > high)
2398                         idx_min = idx + 1;
2399                     else
2400                         goto range_match;
2401                 }
2402                 goto no_match;
2403             range_match:
2404                 pc += 4 * n;
2405             }
2406             break;
2407         case REOP_range32:
2408             {
2409                 int n;
2410                 uint32_t low, high, idx_min, idx_max, idx;
2411 
2412                 n = get_u16(pc); /* n must be >= 1 */
2413                 pc += 2;
2414                 if (cptr >= cbuf_end)
2415                     goto no_match;
2416                 GET_CHAR(c, cptr, cbuf_end);
2417                 if (s->ignore_case) {
2418                     c = lre_canonicalize(c, s->is_utf16);
2419                 }
2420                 idx_min = 0;
2421                 low = get_u32(pc + 0 * 8);
2422                 if (c < low)
2423                     goto no_match;
2424                 idx_max = n - 1;
2425                 high = get_u32(pc + idx_max * 8 + 4);
2426                 if (c > high)
2427                     goto no_match;
2428                 while (idx_min <= idx_max) {
2429                     idx = (idx_min + idx_max) / 2;
2430                     low = get_u32(pc + idx * 8);
2431                     high = get_u32(pc + idx * 8 + 4);
2432                     if (c < low)
2433                         idx_max = idx - 1;
2434                     else if (c > high)
2435                         idx_min = idx + 1;
2436                     else
2437                         goto range32_match;
2438                 }
2439                 goto no_match;
2440             range32_match:
2441                 pc += 8 * n;
2442             }
2443             break;
2444         case REOP_prev:
2445             /* go to the previous char */
2446             if (cptr == s->cbuf)
2447                 goto no_match;
2448             PREV_CHAR(cptr, s->cbuf);
2449             break;
2450         case REOP_simple_greedy_quant:
2451             {
2452                 uint32_t next_pos, quant_min, quant_max;
2453                 size_t q;
2454                 intptr_t res;
2455                 const uint8_t *pc1;
2456 
2457                 next_pos = get_u32(pc);
2458                 quant_min = get_u32(pc + 4);
2459                 quant_max = get_u32(pc + 8);
2460                 pc += 16;
2461                 pc1 = pc;
2462                 pc += (int)next_pos;
2463 
2464                 q = 0;
2465                 for(;;) {
2466                     res = lre_exec_backtrack(s, capture, stack, stack_len,
2467                                              pc1, cptr, TRUE);
2468                     if (res == -1)
2469                         return res;
2470                     if (!res)
2471                         break;
2472                     cptr = (uint8_t *)res;
2473                     q++;
2474                     if (q >= quant_max && quant_max != INT32_MAX)
2475                         break;
2476                 }
2477                 if (q < quant_min)
2478                     goto no_match;
2479                 if (q > quant_min) {
2480                     /* will examine all matches down to quant_min */
2481                     ret = push_state(s, capture, stack, stack_len,
2482                                      pc1 - 16, cptr,
2483                                      RE_EXEC_STATE_GREEDY_QUANT,
2484                                      q - quant_min);
2485                     if (ret < 0)
2486                         return -1;
2487                 }
2488             }
2489             break;
2490         default:
2491             abort();
2492         }
2493     }
2494 }
2495 
2496 /* Return 1 if match, 0 if not match or -1 if error. cindex is the
2497    starting position of the match and must be such as 0 <= cindex <=
2498    clen. */
lre_exec(uint8_t ** capture,const uint8_t * bc_buf,const uint8_t * cbuf,int cindex,int clen,int cbuf_type,void * opaque)2499 int lre_exec(uint8_t **capture,
2500              const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
2501              int cbuf_type, void *opaque)
2502 {
2503     REExecContext s_s, *s = &s_s;
2504     int re_flags, i, alloca_size, ret;
2505     StackInt *stack_buf;
2506 
2507     re_flags = bc_buf[RE_HEADER_FLAGS];
2508     s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
2509     s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2510     s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
2511     s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2512     s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2513     s->cbuf = cbuf;
2514     s->cbuf_end = cbuf + (clen << cbuf_type);
2515     s->cbuf_type = cbuf_type;
2516     if (s->cbuf_type == 1 && s->is_utf16)
2517         s->cbuf_type = 2;
2518     s->opaque = opaque;
2519 
2520     s->state_size = sizeof(REExecState) +
2521         s->capture_count * sizeof(capture[0]) * 2 +
2522         s->stack_size_max * sizeof(stack_buf[0]);
2523     s->state_stack = NULL;
2524     s->state_stack_len = 0;
2525     s->state_stack_size = 0;
2526 
2527     for(i = 0; i < s->capture_count * 2; i++)
2528         capture[i] = NULL;
2529     alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
2530     stack_buf = alloca(alloca_size);
2531     ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
2532                              cbuf + (cindex << cbuf_type), FALSE);
2533     lre_realloc(s->opaque, s->state_stack, 0);
2534     return ret;
2535 }
2536 
lre_get_capture_count(const uint8_t * bc_buf)2537 int lre_get_capture_count(const uint8_t *bc_buf)
2538 {
2539     return bc_buf[RE_HEADER_CAPTURE_COUNT];
2540 }
2541 
lre_get_flags(const uint8_t * bc_buf)2542 int lre_get_flags(const uint8_t *bc_buf)
2543 {
2544     return bc_buf[RE_HEADER_FLAGS];
2545 }
2546 
2547 /* Return NULL if no group names. Otherwise, return a pointer to
2548    'capture_count - 1' zero terminated UTF-8 strings. */
lre_get_groupnames(const uint8_t * bc_buf)2549 const char *lre_get_groupnames(const uint8_t *bc_buf)
2550 {
2551     uint32_t re_bytecode_len;
2552     if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
2553         return NULL;
2554     re_bytecode_len = get_u32(bc_buf + 3);
2555     return (const char *)(bc_buf + 7 + re_bytecode_len);
2556 }
2557 
2558 #ifdef TEST
2559 
lre_check_stack_overflow(void * opaque,size_t alloca_size)2560 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
2561 {
2562     return FALSE;
2563 }
2564 
lre_realloc(void * opaque,void * ptr,size_t size)2565 void *lre_realloc(void *opaque, void *ptr, size_t size)
2566 {
2567     return realloc(ptr, size);
2568 }
2569 
main(int argc,char ** argv)2570 int main(int argc, char **argv)
2571 {
2572     int len, ret, i;
2573     uint8_t *bc;
2574     char error_msg[64];
2575     uint8_t *capture[CAPTURE_COUNT_MAX * 2];
2576     const char *input;
2577     int input_len, capture_count;
2578 
2579     if (argc < 3) {
2580         printf("usage: %s regexp input\n", argv[0]);
2581         exit(1);
2582     }
2583     bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
2584                      strlen(argv[1]), 0, NULL);
2585     if (!bc) {
2586         fprintf(stderr, "error: %s\n", error_msg);
2587         exit(1);
2588     }
2589 
2590     input = argv[2];
2591     input_len = strlen(input);
2592 
2593     ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
2594     printf("ret=%d\n", ret);
2595     if (ret == 1) {
2596         capture_count = lre_get_capture_count(bc);
2597         for(i = 0; i < 2 * capture_count; i++) {
2598             uint8_t *ptr;
2599             ptr = capture[i];
2600             printf("%d: ", i);
2601             if (!ptr)
2602                 printf("<nil>");
2603             else
2604                 printf("%u", (int)(ptr - (uint8_t *)input));
2605             printf("\n");
2606         }
2607     }
2608     return 0;
2609 }
2610 #endif
2611