1 /*
2 * Regular Expression Engine
3 *
4 * Copyright (c) 2017-2018 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24 #include <stdlib.h>
25 #include <stdio.h>
26 #include <stdarg.h>
27 #include <inttypes.h>
28 #include <string.h>
29 #include <assert.h>
30
31 #include "cutils.h"
32 #include "libregexp.h"
33
34 /*
35 TODO:
36
37 - Add full unicode canonicalize rules for character ranges (not
38 really useful but needed for exact "ignorecase" compatibility).
39
40 - Add a lock step execution mode (=linear time execution guaranteed)
41 when the regular expression is "simple" i.e. no backreference nor
42 complicated lookahead. The opcodes are designed for this execution
43 model.
44 */
45
46 #if defined(TEST)
47 #define DUMP_REOP
48 #endif
49
50 typedef enum {
51 #define DEF(id, size) REOP_ ## id,
52 #include "libregexp-opcode.h"
53 #undef DEF
54 REOP_COUNT,
55 } REOPCodeEnum;
56
57 #define CAPTURE_COUNT_MAX 255
58 #define STACK_SIZE_MAX 255
59
60 /* unicode code points */
61 #define CP_LS 0x2028
62 #define CP_PS 0x2029
63
64 #define TMP_BUF_SIZE 128
65
66 typedef struct {
67 DynBuf byte_code;
68 const uint8_t *buf_ptr;
69 const uint8_t *buf_end;
70 const uint8_t *buf_start;
71 int re_flags;
72 BOOL is_utf16;
73 BOOL ignore_case;
74 BOOL dotall;
75 int capture_count;
76 int total_capture_count; /* -1 = not computed yet */
77 int has_named_captures; /* -1 = don't know, 0 = no, 1 = yes */
78 void *opaque;
79 DynBuf group_names;
80 union {
81 char error_msg[TMP_BUF_SIZE];
82 char tmp_buf[TMP_BUF_SIZE];
83 } u;
84 } REParseState;
85
86 typedef struct {
87 #ifdef DUMP_REOP
88 const char *name;
89 #endif
90 uint8_t size;
91 } REOpCode;
92
93 static const REOpCode reopcode_info[REOP_COUNT] = {
94 #ifdef DUMP_REOP
95 #define DEF(id, size) { #id, size },
96 #else
97 #define DEF(id, size) { size },
98 #endif
99 #include "libregexp-opcode.h"
100 #undef DEF
101 };
102
103 #define RE_HEADER_FLAGS 0
104 #define RE_HEADER_CAPTURE_COUNT 1
105 #define RE_HEADER_STACK_SIZE 2
106
107 #define RE_HEADER_LEN 7
108
is_digit(int c)109 static inline int is_digit(int c) {
110 return c >= '0' && c <= '9';
111 }
112
113 /* insert 'len' bytes at position 'pos'. Return < 0 if error. */
dbuf_insert(DynBuf * s,int pos,int len)114 static int dbuf_insert(DynBuf *s, int pos, int len)
115 {
116 if (dbuf_realloc(s, s->size + len))
117 return -1;
118 memmove(s->buf + pos + len, s->buf + pos, s->size - pos);
119 s->size += len;
120 return 0;
121 }
122
123 /* canonicalize with the specific JS regexp rules */
lre_canonicalize(uint32_t c,BOOL is_utf16)124 static uint32_t lre_canonicalize(uint32_t c, BOOL is_utf16)
125 {
126 uint32_t res[LRE_CC_RES_LEN_MAX];
127 int len;
128 if (is_utf16) {
129 if (likely(c < 128)) {
130 if (c >= 'A' && c <= 'Z')
131 c = c - 'A' + 'a';
132 } else {
133 lre_case_conv(res, c, 2);
134 c = res[0];
135 }
136 } else {
137 if (likely(c < 128)) {
138 if (c >= 'a' && c <= 'z')
139 c = c - 'a' + 'A';
140 } else {
141 /* legacy regexp: to upper case if single char >= 128 */
142 len = lre_case_conv(res, c, FALSE);
143 if (len == 1 && res[0] >= 128)
144 c = res[0];
145 }
146 }
147 return c;
148 }
149
150 static const uint16_t char_range_d[] = {
151 1,
152 0x0030, 0x0039 + 1,
153 };
154
155 /* code point ranges for Zs,Zl or Zp property */
156 static const uint16_t char_range_s[] = {
157 10,
158 0x0009, 0x000D + 1,
159 0x0020, 0x0020 + 1,
160 0x00A0, 0x00A0 + 1,
161 0x1680, 0x1680 + 1,
162 0x2000, 0x200A + 1,
163 /* 2028;LINE SEPARATOR;Zl;0;WS;;;;;N;;;;; */
164 /* 2029;PARAGRAPH SEPARATOR;Zp;0;B;;;;;N;;;;; */
165 0x2028, 0x2029 + 1,
166 0x202F, 0x202F + 1,
167 0x205F, 0x205F + 1,
168 0x3000, 0x3000 + 1,
169 /* FEFF;ZERO WIDTH NO-BREAK SPACE;Cf;0;BN;;;;;N;BYTE ORDER MARK;;;; */
170 0xFEFF, 0xFEFF + 1,
171 };
172
lre_is_space(int c)173 BOOL lre_is_space(int c)
174 {
175 int i, n, low, high;
176 n = (countof(char_range_s) - 1) / 2;
177 for(i = 0; i < n; i++) {
178 low = char_range_s[2 * i + 1];
179 if (c < low)
180 return FALSE;
181 high = char_range_s[2 * i + 2];
182 if (c < high)
183 return TRUE;
184 }
185 return FALSE;
186 }
187
188 uint32_t const lre_id_start_table_ascii[4] = {
189 /* $ A-Z _ a-z */
190 0x00000000, 0x00000010, 0x87FFFFFE, 0x07FFFFFE
191 };
192
193 uint32_t const lre_id_continue_table_ascii[4] = {
194 /* $ 0-9 A-Z _ a-z */
195 0x00000000, 0x03FF0010, 0x87FFFFFE, 0x07FFFFFE
196 };
197
198
199 static const uint16_t char_range_w[] = {
200 4,
201 0x0030, 0x0039 + 1,
202 0x0041, 0x005A + 1,
203 0x005F, 0x005F + 1,
204 0x0061, 0x007A + 1,
205 };
206
207 #define CLASS_RANGE_BASE 0x40000000
208
209 typedef enum {
210 CHAR_RANGE_d,
211 CHAR_RANGE_D,
212 CHAR_RANGE_s,
213 CHAR_RANGE_S,
214 CHAR_RANGE_w,
215 CHAR_RANGE_W,
216 } CharRangeEnum;
217
218 static const uint16_t *char_range_table[] = {
219 char_range_d,
220 char_range_s,
221 char_range_w,
222 };
223
cr_init_char_range(REParseState * s,CharRange * cr,uint32_t c)224 static int cr_init_char_range(REParseState *s, CharRange *cr, uint32_t c)
225 {
226 BOOL invert;
227 const uint16_t *c_pt;
228 int len, i;
229
230 invert = c & 1;
231 c_pt = char_range_table[c >> 1];
232 len = *c_pt++;
233 cr_init(cr, s->opaque, lre_realloc);
234 for(i = 0; i < len * 2; i++) {
235 if (cr_add_point(cr, c_pt[i]))
236 goto fail;
237 }
238 if (invert) {
239 if (cr_invert(cr))
240 goto fail;
241 }
242 return 0;
243 fail:
244 cr_free(cr);
245 return -1;
246 }
247
cr_canonicalize(CharRange * cr)248 static int cr_canonicalize(CharRange *cr)
249 {
250 CharRange a;
251 uint32_t pt[2];
252 int i, ret;
253
254 cr_init(&a, cr->mem_opaque, lre_realloc);
255 pt[0] = 'a';
256 pt[1] = 'z' + 1;
257 ret = cr_op(&a, cr->points, cr->len, pt, 2, CR_OP_INTER);
258 if (ret)
259 goto fail;
260 /* convert to upper case */
261 /* XXX: the generic unicode case would be much more complicated
262 and not really useful */
263 for(i = 0; i < a.len; i++) {
264 a.points[i] += 'A' - 'a';
265 }
266 /* Note: for simplicity we keep the lower case ranges */
267 ret = cr_union1(cr, a.points, a.len);
268 fail:
269 cr_free(&a);
270 return ret;
271 }
272
273 #ifdef DUMP_REOP
lre_dump_bytecode(const uint8_t * buf,int buf_len)274 static __maybe_unused void lre_dump_bytecode(const uint8_t *buf,
275 int buf_len)
276 {
277 int pos, len, opcode, bc_len, re_flags, i;
278 uint32_t val;
279
280 assert(buf_len >= RE_HEADER_LEN);
281
282 re_flags= buf[0];
283 bc_len = get_u32(buf + 3);
284 assert(bc_len + RE_HEADER_LEN <= buf_len);
285 printf("flags: 0x%x capture_count=%d stack_size=%d\n",
286 re_flags, buf[1], buf[2]);
287 if (re_flags & LRE_FLAG_NAMED_GROUPS) {
288 const char *p;
289 p = (char *)buf + RE_HEADER_LEN + bc_len;
290 printf("named groups: ");
291 for(i = 1; i < buf[1]; i++) {
292 if (i != 1)
293 printf(",");
294 printf("<%s>", p);
295 p += strlen(p) + 1;
296 }
297 printf("\n");
298 assert(p == (char *)(buf + buf_len));
299 }
300 printf("bytecode_len=%d\n", bc_len);
301
302 buf += RE_HEADER_LEN;
303 pos = 0;
304 while (pos < bc_len) {
305 printf("%5u: ", pos);
306 opcode = buf[pos];
307 len = reopcode_info[opcode].size;
308 if (opcode >= REOP_COUNT) {
309 printf(" invalid opcode=0x%02x\n", opcode);
310 break;
311 }
312 if ((pos + len) > bc_len) {
313 printf(" buffer overflow (opcode=0x%02x)\n", opcode);
314 break;
315 }
316 printf("%s", reopcode_info[opcode].name);
317 switch(opcode) {
318 case REOP_char:
319 val = get_u16(buf + pos + 1);
320 if (val >= ' ' && val <= 126)
321 printf(" '%c'", val);
322 else
323 printf(" 0x%04x", val);
324 break;
325 case REOP_char32:
326 val = get_u32(buf + pos + 1);
327 if (val >= ' ' && val <= 126)
328 printf(" '%c'", val);
329 else
330 printf(" 0x%08x", val);
331 break;
332 case REOP_goto:
333 case REOP_split_goto_first:
334 case REOP_split_next_first:
335 case REOP_loop:
336 case REOP_lookahead:
337 case REOP_negative_lookahead:
338 case REOP_bne_char_pos:
339 val = get_u32(buf + pos + 1);
340 val += (pos + 5);
341 printf(" %u", val);
342 break;
343 case REOP_simple_greedy_quant:
344 printf(" %u %u %u %u",
345 get_u32(buf + pos + 1) + (pos + 17),
346 get_u32(buf + pos + 1 + 4),
347 get_u32(buf + pos + 1 + 8),
348 get_u32(buf + pos + 1 + 12));
349 break;
350 case REOP_save_start:
351 case REOP_save_end:
352 case REOP_back_reference:
353 case REOP_backward_back_reference:
354 printf(" %u", buf[pos + 1]);
355 break;
356 case REOP_save_reset:
357 printf(" %u %u", buf[pos + 1], buf[pos + 2]);
358 break;
359 case REOP_push_i32:
360 val = get_u32(buf + pos + 1);
361 printf(" %d", val);
362 break;
363 case REOP_range:
364 {
365 int n, i;
366 n = get_u16(buf + pos + 1);
367 len += n * 4;
368 for(i = 0; i < n * 2; i++) {
369 val = get_u16(buf + pos + 3 + i * 2);
370 printf(" 0x%04x", val);
371 }
372 }
373 break;
374 case REOP_range32:
375 {
376 int n, i;
377 n = get_u16(buf + pos + 1);
378 len += n * 8;
379 for(i = 0; i < n * 2; i++) {
380 val = get_u32(buf + pos + 3 + i * 4);
381 printf(" 0x%08x", val);
382 }
383 }
384 break;
385 default:
386 break;
387 }
388 printf("\n");
389 pos += len;
390 }
391 }
392 #endif
393
re_emit_op(REParseState * s,int op)394 static void re_emit_op(REParseState *s, int op)
395 {
396 dbuf_putc(&s->byte_code, op);
397 }
398
399 /* return the offset of the u32 value */
re_emit_op_u32(REParseState * s,int op,uint32_t val)400 static int re_emit_op_u32(REParseState *s, int op, uint32_t val)
401 {
402 int pos;
403 dbuf_putc(&s->byte_code, op);
404 pos = s->byte_code.size;
405 dbuf_put_u32(&s->byte_code, val);
406 return pos;
407 }
408
re_emit_goto(REParseState * s,int op,uint32_t val)409 static int re_emit_goto(REParseState *s, int op, uint32_t val)
410 {
411 int pos;
412 dbuf_putc(&s->byte_code, op);
413 pos = s->byte_code.size;
414 dbuf_put_u32(&s->byte_code, val - (pos + 4));
415 return pos;
416 }
417
re_emit_op_u8(REParseState * s,int op,uint32_t val)418 static void re_emit_op_u8(REParseState *s, int op, uint32_t val)
419 {
420 dbuf_putc(&s->byte_code, op);
421 dbuf_putc(&s->byte_code, val);
422 }
423
re_emit_op_u16(REParseState * s,int op,uint32_t val)424 static void re_emit_op_u16(REParseState *s, int op, uint32_t val)
425 {
426 dbuf_putc(&s->byte_code, op);
427 dbuf_put_u16(&s->byte_code, val);
428 }
429
re_parse_error(REParseState * s,const char * fmt,...)430 static int __attribute__((format(printf, 2, 3))) re_parse_error(REParseState *s, const char *fmt, ...)
431 {
432 va_list ap;
433 va_start(ap, fmt);
434 vsnprintf(s->u.error_msg, sizeof(s->u.error_msg), fmt, ap);
435 va_end(ap);
436 return -1;
437 }
438
re_parse_out_of_memory(REParseState * s)439 static int re_parse_out_of_memory(REParseState *s)
440 {
441 return re_parse_error(s, "out of memory");
442 }
443
444 /* If allow_overflow is false, return -1 in case of
445 overflow. Otherwise return INT32_MAX. */
parse_digits(const uint8_t ** pp,BOOL allow_overflow)446 static int parse_digits(const uint8_t **pp, BOOL allow_overflow)
447 {
448 const uint8_t *p;
449 uint64_t v;
450 int c;
451
452 p = *pp;
453 v = 0;
454 for(;;) {
455 c = *p;
456 if (c < '0' || c > '9')
457 break;
458 v = v * 10 + c - '0';
459 if (v >= INT32_MAX) {
460 if (allow_overflow)
461 v = INT32_MAX;
462 else
463 return -1;
464 }
465 p++;
466 }
467 *pp = p;
468 return v;
469 }
470
re_parse_expect(REParseState * s,const uint8_t ** pp,int c)471 static int re_parse_expect(REParseState *s, const uint8_t **pp, int c)
472 {
473 const uint8_t *p;
474 p = *pp;
475 if (*p != c)
476 return re_parse_error(s, "expecting '%c'", c);
477 p++;
478 *pp = p;
479 return 0;
480 }
481
482 /* Parse an escape sequence, *pp points after the '\':
483 allow_utf16 value:
484 0 : no UTF-16 escapes allowed
485 1 : UTF-16 escapes allowed
486 2 : UTF-16 escapes allowed and escapes of surrogate pairs are
487 converted to a unicode character (unicode regexp case).
488
489 Return the unicode char and update *pp if recognized,
490 return -1 if malformed escape,
491 return -2 otherwise. */
lre_parse_escape(const uint8_t ** pp,int allow_utf16)492 int lre_parse_escape(const uint8_t **pp, int allow_utf16)
493 {
494 const uint8_t *p;
495 uint32_t c;
496
497 p = *pp;
498 c = *p++;
499 switch(c) {
500 case 'b':
501 c = '\b';
502 break;
503 case 'f':
504 c = '\f';
505 break;
506 case 'n':
507 c = '\n';
508 break;
509 case 'r':
510 c = '\r';
511 break;
512 case 't':
513 c = '\t';
514 break;
515 case 'v':
516 c = '\v';
517 break;
518 case 'x':
519 case 'u':
520 {
521 int h, n, i;
522 uint32_t c1;
523
524 if (*p == '{' && allow_utf16) {
525 p++;
526 c = 0;
527 for(;;) {
528 h = from_hex(*p++);
529 if (h < 0)
530 return -1;
531 c = (c << 4) | h;
532 if (c > 0x10FFFF)
533 return -1;
534 if (*p == '}')
535 break;
536 }
537 p++;
538 } else {
539 if (c == 'x') {
540 n = 2;
541 } else {
542 n = 4;
543 }
544
545 c = 0;
546 for(i = 0; i < n; i++) {
547 h = from_hex(*p++);
548 if (h < 0) {
549 return -1;
550 }
551 c = (c << 4) | h;
552 }
553 if (c >= 0xd800 && c < 0xdc00 &&
554 allow_utf16 == 2 && p[0] == '\\' && p[1] == 'u') {
555 /* convert an escaped surrogate pair into a
556 unicode char */
557 c1 = 0;
558 for(i = 0; i < 4; i++) {
559 h = from_hex(p[2 + i]);
560 if (h < 0)
561 break;
562 c1 = (c1 << 4) | h;
563 }
564 if (i == 4 && c1 >= 0xdc00 && c1 < 0xe000) {
565 p += 6;
566 c = (((c & 0x3ff) << 10) | (c1 & 0x3ff)) + 0x10000;
567 }
568 }
569 }
570 }
571 break;
572 case '0': case '1': case '2': case '3':
573 case '4': case '5': case '6': case '7':
574 c -= '0';
575 if (allow_utf16 == 2) {
576 /* only accept \0 not followed by digit */
577 if (c != 0 || is_digit(*p))
578 return -1;
579 } else {
580 /* parse a legacy octal sequence */
581 uint32_t v;
582 v = *p - '0';
583 if (v > 7)
584 break;
585 c = (c << 3) | v;
586 p++;
587 if (c >= 32)
588 break;
589 v = *p - '0';
590 if (v > 7)
591 break;
592 c = (c << 3) | v;
593 p++;
594 }
595 break;
596 default:
597 return -2;
598 }
599 *pp = p;
600 return c;
601 }
602
603 #ifdef CONFIG_ALL_UNICODE
604 /* XXX: we use the same chars for name and value */
is_unicode_char(int c)605 static BOOL is_unicode_char(int c)
606 {
607 return ((c >= '0' && c <= '9') ||
608 (c >= 'A' && c <= 'Z') ||
609 (c >= 'a' && c <= 'z') ||
610 (c == '_'));
611 }
612
parse_unicode_property(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL is_inv)613 static int parse_unicode_property(REParseState *s, CharRange *cr,
614 const uint8_t **pp, BOOL is_inv)
615 {
616 const uint8_t *p;
617 char name[64], value[64];
618 char *q;
619 BOOL script_ext;
620 int ret;
621
622 p = *pp;
623 if (*p != '{')
624 return re_parse_error(s, "expecting '{' after \\p");
625 p++;
626 q = name;
627 while (is_unicode_char(*p)) {
628 if ((q - name) >= sizeof(name) - 1)
629 goto unknown_property_name;
630 *q++ = *p++;
631 }
632 *q = '\0';
633 q = value;
634 if (*p == '=') {
635 p++;
636 while (is_unicode_char(*p)) {
637 if ((q - value) >= sizeof(value) - 1)
638 return re_parse_error(s, "unknown unicode property value");
639 *q++ = *p++;
640 }
641 }
642 *q = '\0';
643 if (*p != '}')
644 return re_parse_error(s, "expecting '}'");
645 p++;
646 // printf("name=%s value=%s\n", name, value);
647
648 if (!strcmp(name, "Script") || !strcmp(name, "sc")) {
649 script_ext = FALSE;
650 goto do_script;
651 } else if (!strcmp(name, "Script_Extensions") || !strcmp(name, "scx")) {
652 script_ext = TRUE;
653 do_script:
654 cr_init(cr, s->opaque, lre_realloc);
655 ret = unicode_script(cr, value, script_ext);
656 if (ret) {
657 cr_free(cr);
658 if (ret == -2)
659 return re_parse_error(s, "unknown unicode script");
660 else
661 goto out_of_memory;
662 }
663 } else if (!strcmp(name, "General_Category") || !strcmp(name, "gc")) {
664 cr_init(cr, s->opaque, lre_realloc);
665 ret = unicode_general_category(cr, value);
666 if (ret) {
667 cr_free(cr);
668 if (ret == -2)
669 return re_parse_error(s, "unknown unicode general category");
670 else
671 goto out_of_memory;
672 }
673 } else if (value[0] == '\0') {
674 cr_init(cr, s->opaque, lre_realloc);
675 ret = unicode_general_category(cr, name);
676 if (ret == -1) {
677 cr_free(cr);
678 goto out_of_memory;
679 }
680 if (ret < 0) {
681 ret = unicode_prop(cr, name);
682 if (ret) {
683 cr_free(cr);
684 if (ret == -2)
685 goto unknown_property_name;
686 else
687 goto out_of_memory;
688 }
689 }
690 } else {
691 unknown_property_name:
692 return re_parse_error(s, "unknown unicode property name");
693 }
694
695 if (is_inv) {
696 if (cr_invert(cr)) {
697 cr_free(cr);
698 return -1;
699 }
700 }
701 *pp = p;
702 return 0;
703 out_of_memory:
704 return re_parse_out_of_memory(s);
705 }
706 #endif /* CONFIG_ALL_UNICODE */
707
708 /* return -1 if error otherwise the character or a class range
709 (CLASS_RANGE_BASE). In case of class range, 'cr' is
710 initialized. Otherwise, it is ignored. */
get_class_atom(REParseState * s,CharRange * cr,const uint8_t ** pp,BOOL inclass)711 static int get_class_atom(REParseState *s, CharRange *cr,
712 const uint8_t **pp, BOOL inclass)
713 {
714 const uint8_t *p;
715 uint32_t c;
716 int ret;
717
718 p = *pp;
719
720 c = *p;
721 switch(c) {
722 case '\\':
723 p++;
724 if (p >= s->buf_end)
725 goto unexpected_end;
726 c = *p++;
727 switch(c) {
728 case 'd':
729 c = CHAR_RANGE_d;
730 goto class_range;
731 case 'D':
732 c = CHAR_RANGE_D;
733 goto class_range;
734 case 's':
735 c = CHAR_RANGE_s;
736 goto class_range;
737 case 'S':
738 c = CHAR_RANGE_S;
739 goto class_range;
740 case 'w':
741 c = CHAR_RANGE_w;
742 goto class_range;
743 case 'W':
744 c = CHAR_RANGE_W;
745 class_range:
746 if (cr_init_char_range(s, cr, c))
747 return -1;
748 c = CLASS_RANGE_BASE;
749 break;
750 case 'c':
751 c = *p;
752 if ((c >= 'a' && c <= 'z') ||
753 (c >= 'A' && c <= 'Z') ||
754 (((c >= '0' && c <= '9') || c == '_') &&
755 inclass && !s->is_utf16)) { /* Annex B.1.4 */
756 c &= 0x1f;
757 p++;
758 } else if (s->is_utf16) {
759 goto invalid_escape;
760 } else {
761 /* otherwise return '\' and 'c' */
762 p--;
763 c = '\\';
764 }
765 break;
766 #ifdef CONFIG_ALL_UNICODE
767 case 'p':
768 case 'P':
769 if (s->is_utf16) {
770 if (parse_unicode_property(s, cr, &p, (c == 'P')))
771 return -1;
772 c = CLASS_RANGE_BASE;
773 break;
774 }
775 /* fall thru */
776 #endif
777 default:
778 p--;
779 ret = lre_parse_escape(&p, s->is_utf16 * 2);
780 if (ret >= 0) {
781 c = ret;
782 } else {
783 if (ret == -2 && *p != '\0' && strchr("^$\\.*+?()[]{}|/", *p)) {
784 /* always valid to escape these characters */
785 goto normal_char;
786 } else if (s->is_utf16) {
787 invalid_escape:
788 return re_parse_error(s, "invalid escape sequence in regular expression");
789 } else {
790 /* just ignore the '\' */
791 goto normal_char;
792 }
793 }
794 break;
795 }
796 break;
797 case '\0':
798 if (p >= s->buf_end) {
799 unexpected_end:
800 return re_parse_error(s, "unexpected end");
801 }
802 /* fall thru */
803 default:
804 normal_char:
805 /* normal char */
806 if (c >= 128) {
807 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
808 if ((unsigned)c > 0xffff && !s->is_utf16) {
809 /* XXX: should handle non BMP-1 code points */
810 return re_parse_error(s, "malformed unicode char");
811 }
812 } else {
813 p++;
814 }
815 break;
816 }
817 *pp = p;
818 return c;
819 }
820
re_emit_range(REParseState * s,const CharRange * cr)821 static int re_emit_range(REParseState *s, const CharRange *cr)
822 {
823 int len, i;
824 uint32_t high;
825
826 len = (unsigned)cr->len / 2;
827 if (len >= 65535)
828 return re_parse_error(s, "too many ranges");
829 if (len == 0) {
830 /* not sure it can really happen. Emit a match that is always
831 false */
832 re_emit_op_u32(s, REOP_char32, -1);
833 } else {
834 high = cr->points[cr->len - 1];
835 if (high == UINT32_MAX)
836 high = cr->points[cr->len - 2];
837 if (high <= 0xffff) {
838 /* can use 16 bit ranges with the conversion that 0xffff =
839 infinity */
840 re_emit_op_u16(s, REOP_range, len);
841 for(i = 0; i < cr->len; i += 2) {
842 dbuf_put_u16(&s->byte_code, cr->points[i]);
843 high = cr->points[i + 1] - 1;
844 if (high == UINT32_MAX - 1)
845 high = 0xffff;
846 dbuf_put_u16(&s->byte_code, high);
847 }
848 } else {
849 re_emit_op_u16(s, REOP_range32, len);
850 for(i = 0; i < cr->len; i += 2) {
851 dbuf_put_u32(&s->byte_code, cr->points[i]);
852 dbuf_put_u32(&s->byte_code, cr->points[i + 1] - 1);
853 }
854 }
855 }
856 return 0;
857 }
858
re_parse_char_class(REParseState * s,const uint8_t ** pp)859 static int re_parse_char_class(REParseState *s, const uint8_t **pp)
860 {
861 const uint8_t *p;
862 uint32_t c1, c2;
863 CharRange cr_s, *cr = &cr_s;
864 CharRange cr1_s, *cr1 = &cr1_s;
865 BOOL invert;
866
867 cr_init(cr, s->opaque, lre_realloc);
868 p = *pp;
869 p++; /* skip '[' */
870 invert = FALSE;
871 if (*p == '^') {
872 p++;
873 invert = TRUE;
874 }
875 for(;;) {
876 if (*p == ']')
877 break;
878 c1 = get_class_atom(s, cr1, &p, TRUE);
879 if ((int)c1 < 0)
880 goto fail;
881 if (*p == '-' && p[1] != ']') {
882 const uint8_t *p0 = p + 1;
883 if (c1 >= CLASS_RANGE_BASE) {
884 if (s->is_utf16) {
885 cr_free(cr1);
886 goto invalid_class_range;
887 }
888 /* Annex B: match '-' character */
889 goto class_atom;
890 }
891 c2 = get_class_atom(s, cr1, &p0, TRUE);
892 if ((int)c2 < 0)
893 goto fail;
894 if (c2 >= CLASS_RANGE_BASE) {
895 cr_free(cr1);
896 if (s->is_utf16) {
897 goto invalid_class_range;
898 }
899 /* Annex B: match '-' character */
900 goto class_atom;
901 }
902 p = p0;
903 if (c2 < c1) {
904 invalid_class_range:
905 re_parse_error(s, "invalid class range");
906 goto fail;
907 }
908 if (cr_union_interval(cr, c1, c2))
909 goto memory_error;
910 } else {
911 class_atom:
912 if (c1 >= CLASS_RANGE_BASE) {
913 int ret;
914 ret = cr_union1(cr, cr1->points, cr1->len);
915 cr_free(cr1);
916 if (ret)
917 goto memory_error;
918 } else {
919 if (cr_union_interval(cr, c1, c1))
920 goto memory_error;
921 }
922 }
923 }
924 if (s->ignore_case) {
925 if (cr_canonicalize(cr))
926 goto memory_error;
927 }
928 if (invert) {
929 if (cr_invert(cr))
930 goto memory_error;
931 }
932 if (re_emit_range(s, cr))
933 goto fail;
934 cr_free(cr);
935 p++; /* skip ']' */
936 *pp = p;
937 return 0;
938 memory_error:
939 re_parse_out_of_memory(s);
940 fail:
941 cr_free(cr);
942 return -1;
943 }
944
945 /* Return:
946 1 if the opcodes in bc_buf[] always advance the character pointer.
947 0 if the character pointer may not be advanced.
948 -1 if the code may depend on side effects of its previous execution (backreference)
949 */
re_check_advance(const uint8_t * bc_buf,int bc_buf_len)950 static int re_check_advance(const uint8_t *bc_buf, int bc_buf_len)
951 {
952 int pos, opcode, ret, len, i;
953 uint32_t val, last;
954 BOOL has_back_reference;
955 uint8_t capture_bitmap[CAPTURE_COUNT_MAX];
956
957 ret = -2; /* not known yet */
958 pos = 0;
959 has_back_reference = FALSE;
960 memset(capture_bitmap, 0, sizeof(capture_bitmap));
961
962 while (pos < bc_buf_len) {
963 opcode = bc_buf[pos];
964 len = reopcode_info[opcode].size;
965 switch(opcode) {
966 case REOP_range:
967 val = get_u16(bc_buf + pos + 1);
968 len += val * 4;
969 goto simple_char;
970 case REOP_range32:
971 val = get_u16(bc_buf + pos + 1);
972 len += val * 8;
973 goto simple_char;
974 case REOP_char:
975 case REOP_char32:
976 case REOP_dot:
977 case REOP_any:
978 simple_char:
979 if (ret == -2)
980 ret = 1;
981 break;
982 case REOP_line_start:
983 case REOP_line_end:
984 case REOP_push_i32:
985 case REOP_push_char_pos:
986 case REOP_drop:
987 case REOP_word_boundary:
988 case REOP_not_word_boundary:
989 case REOP_prev:
990 /* no effect */
991 break;
992 case REOP_save_start:
993 case REOP_save_end:
994 val = bc_buf[pos + 1];
995 capture_bitmap[val] |= 1;
996 break;
997 case REOP_save_reset:
998 {
999 val = bc_buf[pos + 1];
1000 last = bc_buf[pos + 2];
1001 while (val < last)
1002 capture_bitmap[val++] |= 1;
1003 }
1004 break;
1005 case REOP_back_reference:
1006 case REOP_backward_back_reference:
1007 val = bc_buf[pos + 1];
1008 capture_bitmap[val] |= 2;
1009 has_back_reference = TRUE;
1010 break;
1011 default:
1012 /* safe behvior: we cannot predict the outcome */
1013 if (ret == -2)
1014 ret = 0;
1015 break;
1016 }
1017 pos += len;
1018 }
1019 if (has_back_reference) {
1020 /* check if there is back reference which references a capture
1021 made in the some code */
1022 for(i = 0; i < CAPTURE_COUNT_MAX; i++) {
1023 if (capture_bitmap[i] == 3)
1024 return -1;
1025 }
1026 }
1027 if (ret == -2)
1028 ret = 0;
1029 return ret;
1030 }
1031
1032 /* return -1 if a simple quantifier cannot be used. Otherwise return
1033 the number of characters in the atom. */
re_is_simple_quantifier(const uint8_t * bc_buf,int bc_buf_len)1034 static int re_is_simple_quantifier(const uint8_t *bc_buf, int bc_buf_len)
1035 {
1036 int pos, opcode, len, count;
1037 uint32_t val;
1038
1039 count = 0;
1040 pos = 0;
1041 while (pos < bc_buf_len) {
1042 opcode = bc_buf[pos];
1043 len = reopcode_info[opcode].size;
1044 switch(opcode) {
1045 case REOP_range:
1046 val = get_u16(bc_buf + pos + 1);
1047 len += val * 4;
1048 goto simple_char;
1049 case REOP_range32:
1050 val = get_u16(bc_buf + pos + 1);
1051 len += val * 8;
1052 goto simple_char;
1053 case REOP_char:
1054 case REOP_char32:
1055 case REOP_dot:
1056 case REOP_any:
1057 simple_char:
1058 count++;
1059 break;
1060 case REOP_line_start:
1061 case REOP_line_end:
1062 case REOP_word_boundary:
1063 case REOP_not_word_boundary:
1064 break;
1065 default:
1066 return -1;
1067 }
1068 pos += len;
1069 }
1070 return count;
1071 }
1072
1073 /* '*pp' is the first char after '<' */
re_parse_group_name(char * buf,int buf_size,const uint8_t ** pp,BOOL is_utf16)1074 static int re_parse_group_name(char *buf, int buf_size,
1075 const uint8_t **pp, BOOL is_utf16)
1076 {
1077 const uint8_t *p;
1078 uint32_t c;
1079 char *q;
1080
1081 p = *pp;
1082 q = buf;
1083 for(;;) {
1084 c = *p;
1085 if (c == '\\') {
1086 p++;
1087 if (*p != 'u')
1088 return -1;
1089 c = lre_parse_escape(&p, is_utf16 * 2);
1090 } else if (c == '>') {
1091 break;
1092 } else if (c >= 128) {
1093 c = unicode_from_utf8(p, UTF8_CHAR_LEN_MAX, &p);
1094 } else {
1095 p++;
1096 }
1097 if (c > 0x10FFFF)
1098 return -1;
1099 if (q == buf) {
1100 if (!lre_js_is_ident_first(c))
1101 return -1;
1102 } else {
1103 if (!lre_js_is_ident_next(c))
1104 return -1;
1105 }
1106 if ((q - buf + UTF8_CHAR_LEN_MAX + 1) > buf_size)
1107 return -1;
1108 if (c < 128) {
1109 *q++ = c;
1110 } else {
1111 q += unicode_to_utf8((uint8_t*)q, c);
1112 }
1113 }
1114 if (q == buf)
1115 return -1;
1116 *q = '\0';
1117 p++;
1118 *pp = p;
1119 return 0;
1120 }
1121
1122 /* if capture_name = NULL: return the number of captures + 1.
1123 Otherwise, return the capture index corresponding to capture_name
1124 or -1 if none */
re_parse_captures(REParseState * s,int * phas_named_captures,const char * capture_name)1125 static int re_parse_captures(REParseState *s, int *phas_named_captures,
1126 const char *capture_name)
1127 {
1128 const uint8_t *p;
1129 int capture_index;
1130 char name[TMP_BUF_SIZE];
1131
1132 capture_index = 1;
1133 *phas_named_captures = 0;
1134 for (p = s->buf_start; p < s->buf_end; p++) {
1135 switch (*p) {
1136 case '(':
1137 if (p[1] == '?') {
1138 if (p[2] == '<' && p[3] != '=' && p[3] != '!') {
1139 *phas_named_captures = 1;
1140 /* potential named capture */
1141 if (capture_name) {
1142 p += 3;
1143 if (re_parse_group_name(name, sizeof(name), &p,
1144 s->is_utf16) == 0) {
1145 if (!strcmp(name, capture_name))
1146 return capture_index;
1147 }
1148 }
1149 capture_index++;
1150 if (capture_index >= CAPTURE_COUNT_MAX)
1151 goto done;
1152 }
1153 } else {
1154 capture_index++;
1155 if (capture_index >= CAPTURE_COUNT_MAX)
1156 goto done;
1157 }
1158 break;
1159 case '\\':
1160 p++;
1161 break;
1162 case '[':
1163 for (p += 1 + (*p == ']'); p < s->buf_end && *p != ']'; p++) {
1164 if (*p == '\\')
1165 p++;
1166 }
1167 break;
1168 }
1169 }
1170 done:
1171 if (capture_name)
1172 return -1;
1173 else
1174 return capture_index;
1175 }
1176
re_count_captures(REParseState * s)1177 static int re_count_captures(REParseState *s)
1178 {
1179 if (s->total_capture_count < 0) {
1180 s->total_capture_count = re_parse_captures(s, &s->has_named_captures,
1181 NULL);
1182 }
1183 return s->total_capture_count;
1184 }
1185
re_has_named_captures(REParseState * s)1186 static BOOL re_has_named_captures(REParseState *s)
1187 {
1188 if (s->has_named_captures < 0)
1189 re_count_captures(s);
1190 return s->has_named_captures;
1191 }
1192
find_group_name(REParseState * s,const char * name)1193 static int find_group_name(REParseState *s, const char *name)
1194 {
1195 const char *p, *buf_end;
1196 size_t len, name_len;
1197 int capture_index;
1198
1199 name_len = strlen(name);
1200 p = (char *)s->group_names.buf;
1201 buf_end = (char *)s->group_names.buf + s->group_names.size;
1202 capture_index = 1;
1203 while (p < buf_end) {
1204 len = strlen(p);
1205 if (len == name_len && memcmp(name, p, name_len) == 0)
1206 return capture_index;
1207 p += len + 1;
1208 capture_index++;
1209 }
1210 return -1;
1211 }
1212
1213 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir);
1214
re_parse_term(REParseState * s,BOOL is_backward_dir)1215 static int re_parse_term(REParseState *s, BOOL is_backward_dir)
1216 {
1217 const uint8_t *p;
1218 int c, last_atom_start, quant_min, quant_max, last_capture_count;
1219 BOOL greedy, add_zero_advance_check, is_neg, is_backward_lookahead;
1220 CharRange cr_s, *cr = &cr_s;
1221
1222 last_atom_start = -1;
1223 last_capture_count = 0;
1224 p = s->buf_ptr;
1225 c = *p;
1226 switch(c) {
1227 case '^':
1228 p++;
1229 re_emit_op(s, REOP_line_start);
1230 break;
1231 case '$':
1232 p++;
1233 re_emit_op(s, REOP_line_end);
1234 break;
1235 case '.':
1236 p++;
1237 last_atom_start = s->byte_code.size;
1238 last_capture_count = s->capture_count;
1239 if (is_backward_dir)
1240 re_emit_op(s, REOP_prev);
1241 re_emit_op(s, s->dotall ? REOP_any : REOP_dot);
1242 if (is_backward_dir)
1243 re_emit_op(s, REOP_prev);
1244 break;
1245 case '{':
1246 if (s->is_utf16) {
1247 return re_parse_error(s, "syntax error");
1248 } else if (!is_digit(p[1])) {
1249 /* Annex B: we accept '{' not followed by digits as a
1250 normal atom */
1251 goto parse_class_atom;
1252 } else {
1253 const uint8_t *p1 = p + 1;
1254 /* Annex B: error if it is like a repetition count */
1255 parse_digits(&p1, TRUE);
1256 if (*p1 == ',') {
1257 p1++;
1258 if (is_digit(*p1)) {
1259 parse_digits(&p1, TRUE);
1260 }
1261 }
1262 if (*p1 != '}') {
1263 goto parse_class_atom;
1264 }
1265 }
1266 /* fall thru */
1267 case '*':
1268 case '+':
1269 case '?':
1270 return re_parse_error(s, "nothing to repeat");
1271 case '(':
1272 if (p[1] == '?') {
1273 if (p[2] == ':') {
1274 p += 3;
1275 last_atom_start = s->byte_code.size;
1276 last_capture_count = s->capture_count;
1277 s->buf_ptr = p;
1278 if (re_parse_disjunction(s, is_backward_dir))
1279 return -1;
1280 p = s->buf_ptr;
1281 if (re_parse_expect(s, &p, ')'))
1282 return -1;
1283 } else if ((p[2] == '=' || p[2] == '!')) {
1284 is_neg = (p[2] == '!');
1285 is_backward_lookahead = FALSE;
1286 p += 3;
1287 goto lookahead;
1288 } else if (p[2] == '<' &&
1289 (p[3] == '=' || p[3] == '!')) {
1290 int pos;
1291 is_neg = (p[3] == '!');
1292 is_backward_lookahead = TRUE;
1293 p += 4;
1294 /* lookahead */
1295 lookahead:
1296 /* Annex B allows lookahead to be used as an atom for
1297 the quantifiers */
1298 if (!s->is_utf16 && !is_backward_lookahead) {
1299 last_atom_start = s->byte_code.size;
1300 last_capture_count = s->capture_count;
1301 }
1302 pos = re_emit_op_u32(s, REOP_lookahead + is_neg, 0);
1303 s->buf_ptr = p;
1304 if (re_parse_disjunction(s, is_backward_lookahead))
1305 return -1;
1306 p = s->buf_ptr;
1307 if (re_parse_expect(s, &p, ')'))
1308 return -1;
1309 re_emit_op(s, REOP_match);
1310 /* jump after the 'match' after the lookahead is successful */
1311 if (dbuf_error(&s->byte_code))
1312 return -1;
1313 put_u32(s->byte_code.buf + pos, s->byte_code.size - (pos + 4));
1314 } else if (p[2] == '<') {
1315 p += 3;
1316 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1317 &p, s->is_utf16)) {
1318 return re_parse_error(s, "invalid group name");
1319 }
1320 if (find_group_name(s, s->u.tmp_buf) > 0) {
1321 return re_parse_error(s, "duplicate group name");
1322 }
1323 /* group name with a trailing zero */
1324 dbuf_put(&s->group_names, (uint8_t *)s->u.tmp_buf,
1325 strlen(s->u.tmp_buf) + 1);
1326 s->has_named_captures = 1;
1327 goto parse_capture;
1328 } else {
1329 return re_parse_error(s, "invalid group");
1330 }
1331 } else {
1332 int capture_index;
1333 p++;
1334 /* capture without group name */
1335 dbuf_putc(&s->group_names, 0);
1336 parse_capture:
1337 if (s->capture_count >= CAPTURE_COUNT_MAX)
1338 return re_parse_error(s, "too many captures");
1339 last_atom_start = s->byte_code.size;
1340 last_capture_count = s->capture_count;
1341 capture_index = s->capture_count++;
1342 re_emit_op_u8(s, REOP_save_start + is_backward_dir,
1343 capture_index);
1344
1345 s->buf_ptr = p;
1346 if (re_parse_disjunction(s, is_backward_dir))
1347 return -1;
1348 p = s->buf_ptr;
1349
1350 re_emit_op_u8(s, REOP_save_start + 1 - is_backward_dir,
1351 capture_index);
1352
1353 if (re_parse_expect(s, &p, ')'))
1354 return -1;
1355 }
1356 break;
1357 case '\\':
1358 switch(p[1]) {
1359 case 'b':
1360 case 'B':
1361 re_emit_op(s, REOP_word_boundary + (p[1] != 'b'));
1362 p += 2;
1363 break;
1364 case 'k':
1365 {
1366 const uint8_t *p1;
1367 int dummy_res;
1368
1369 p1 = p;
1370 if (p1[2] != '<') {
1371 /* annex B: we tolerate invalid group names in non
1372 unicode mode if there is no named capture
1373 definition */
1374 if (s->is_utf16 || re_has_named_captures(s))
1375 return re_parse_error(s, "expecting group name");
1376 else
1377 goto parse_class_atom;
1378 }
1379 p1 += 3;
1380 if (re_parse_group_name(s->u.tmp_buf, sizeof(s->u.tmp_buf),
1381 &p1, s->is_utf16)) {
1382 if (s->is_utf16 || re_has_named_captures(s))
1383 return re_parse_error(s, "invalid group name");
1384 else
1385 goto parse_class_atom;
1386 }
1387 c = find_group_name(s, s->u.tmp_buf);
1388 if (c < 0) {
1389 /* no capture name parsed before, try to look
1390 after (inefficient, but hopefully not common */
1391 c = re_parse_captures(s, &dummy_res, s->u.tmp_buf);
1392 if (c < 0) {
1393 if (s->is_utf16 || re_has_named_captures(s))
1394 return re_parse_error(s, "group name not defined");
1395 else
1396 goto parse_class_atom;
1397 }
1398 }
1399 p = p1;
1400 }
1401 goto emit_back_reference;
1402 case '0':
1403 p += 2;
1404 c = 0;
1405 if (s->is_utf16) {
1406 if (is_digit(*p)) {
1407 return re_parse_error(s, "invalid decimal escape in regular expression");
1408 }
1409 } else {
1410 /* Annex B.1.4: accept legacy octal */
1411 if (*p >= '0' && *p <= '7') {
1412 c = *p++ - '0';
1413 if (*p >= '0' && *p <= '7') {
1414 c = (c << 3) + *p++ - '0';
1415 }
1416 }
1417 }
1418 goto normal_char;
1419 case '1': case '2': case '3': case '4':
1420 case '5': case '6': case '7': case '8':
1421 case '9':
1422 {
1423 const uint8_t *q = ++p;
1424
1425 c = parse_digits(&p, FALSE);
1426 if (c < 0 || (c >= s->capture_count && c >= re_count_captures(s))) {
1427 if (!s->is_utf16) {
1428 /* Annex B.1.4: accept legacy octal */
1429 p = q;
1430 if (*p <= '7') {
1431 c = 0;
1432 if (*p <= '3')
1433 c = *p++ - '0';
1434 if (*p >= '0' && *p <= '7') {
1435 c = (c << 3) + *p++ - '0';
1436 if (*p >= '0' && *p <= '7') {
1437 c = (c << 3) + *p++ - '0';
1438 }
1439 }
1440 } else {
1441 c = *p++;
1442 }
1443 goto normal_char;
1444 }
1445 return re_parse_error(s, "back reference out of range in regular expression");
1446 }
1447 emit_back_reference:
1448 last_atom_start = s->byte_code.size;
1449 last_capture_count = s->capture_count;
1450 re_emit_op_u8(s, REOP_back_reference + is_backward_dir, c);
1451 }
1452 break;
1453 default:
1454 goto parse_class_atom;
1455 }
1456 break;
1457 case '[':
1458 last_atom_start = s->byte_code.size;
1459 last_capture_count = s->capture_count;
1460 if (is_backward_dir)
1461 re_emit_op(s, REOP_prev);
1462 if (re_parse_char_class(s, &p))
1463 return -1;
1464 if (is_backward_dir)
1465 re_emit_op(s, REOP_prev);
1466 break;
1467 case ']':
1468 case '}':
1469 if (s->is_utf16)
1470 return re_parse_error(s, "syntax error");
1471 goto parse_class_atom;
1472 default:
1473 parse_class_atom:
1474 c = get_class_atom(s, cr, &p, FALSE);
1475 if ((int)c < 0)
1476 return -1;
1477 normal_char:
1478 last_atom_start = s->byte_code.size;
1479 last_capture_count = s->capture_count;
1480 if (is_backward_dir)
1481 re_emit_op(s, REOP_prev);
1482 if (c >= CLASS_RANGE_BASE) {
1483 int ret;
1484 /* Note: canonicalization is not needed */
1485 ret = re_emit_range(s, cr);
1486 cr_free(cr);
1487 if (ret)
1488 return -1;
1489 } else {
1490 if (s->ignore_case)
1491 c = lre_canonicalize(c, s->is_utf16);
1492 if (c <= 0xffff)
1493 re_emit_op_u16(s, REOP_char, c);
1494 else
1495 re_emit_op_u32(s, REOP_char32, c);
1496 }
1497 if (is_backward_dir)
1498 re_emit_op(s, REOP_prev);
1499 break;
1500 }
1501
1502 /* quantifier */
1503 if (last_atom_start >= 0) {
1504 c = *p;
1505 switch(c) {
1506 case '*':
1507 p++;
1508 quant_min = 0;
1509 quant_max = INT32_MAX;
1510 goto quantifier;
1511 case '+':
1512 p++;
1513 quant_min = 1;
1514 quant_max = INT32_MAX;
1515 goto quantifier;
1516 case '?':
1517 p++;
1518 quant_min = 0;
1519 quant_max = 1;
1520 goto quantifier;
1521 case '{':
1522 {
1523 const uint8_t *p1 = p;
1524 /* As an extension (see ES6 annex B), we accept '{' not
1525 followed by digits as a normal atom */
1526 if (!is_digit(p[1])) {
1527 if (s->is_utf16)
1528 goto invalid_quant_count;
1529 break;
1530 }
1531 p++;
1532 quant_min = parse_digits(&p, TRUE);
1533 quant_max = quant_min;
1534 if (*p == ',') {
1535 p++;
1536 if (is_digit(*p)) {
1537 quant_max = parse_digits(&p, TRUE);
1538 if (quant_max < quant_min) {
1539 invalid_quant_count:
1540 return re_parse_error(s, "invalid repetition count");
1541 }
1542 } else {
1543 quant_max = INT32_MAX; /* infinity */
1544 }
1545 }
1546 if (*p != '}' && !s->is_utf16) {
1547 /* Annex B: normal atom if invalid '{' syntax */
1548 p = p1;
1549 break;
1550 }
1551 if (re_parse_expect(s, &p, '}'))
1552 return -1;
1553 }
1554 quantifier:
1555 greedy = TRUE;
1556 if (*p == '?') {
1557 p++;
1558 greedy = FALSE;
1559 }
1560 if (last_atom_start < 0) {
1561 return re_parse_error(s, "nothing to repeat");
1562 }
1563 if (greedy) {
1564 int len, pos;
1565
1566 if (quant_max > 0) {
1567 /* specific optimization for simple quantifiers */
1568 if (dbuf_error(&s->byte_code))
1569 goto out_of_memory;
1570 len = re_is_simple_quantifier(s->byte_code.buf + last_atom_start,
1571 s->byte_code.size - last_atom_start);
1572 if (len > 0) {
1573 re_emit_op(s, REOP_match);
1574
1575 if (dbuf_insert(&s->byte_code, last_atom_start, 17))
1576 goto out_of_memory;
1577 pos = last_atom_start;
1578 s->byte_code.buf[pos++] = REOP_simple_greedy_quant;
1579 put_u32(&s->byte_code.buf[pos],
1580 s->byte_code.size - last_atom_start - 17);
1581 pos += 4;
1582 put_u32(&s->byte_code.buf[pos], quant_min);
1583 pos += 4;
1584 put_u32(&s->byte_code.buf[pos], quant_max);
1585 pos += 4;
1586 put_u32(&s->byte_code.buf[pos], len);
1587 pos += 4;
1588 goto done;
1589 }
1590 }
1591
1592 if (dbuf_error(&s->byte_code))
1593 goto out_of_memory;
1594 add_zero_advance_check = (re_check_advance(s->byte_code.buf + last_atom_start,
1595 s->byte_code.size - last_atom_start) == 0);
1596 } else {
1597 add_zero_advance_check = FALSE;
1598 }
1599
1600 {
1601 int len, pos;
1602 len = s->byte_code.size - last_atom_start;
1603 if (quant_min == 0) {
1604 /* need to reset the capture in case the atom is
1605 not executed */
1606 if (last_capture_count != s->capture_count) {
1607 if (dbuf_insert(&s->byte_code, last_atom_start, 3))
1608 goto out_of_memory;
1609 s->byte_code.buf[last_atom_start++] = REOP_save_reset;
1610 s->byte_code.buf[last_atom_start++] = last_capture_count;
1611 s->byte_code.buf[last_atom_start++] = s->capture_count - 1;
1612 }
1613 if (quant_max == 0) {
1614 s->byte_code.size = last_atom_start;
1615 } else if (quant_max == 1) {
1616 if (dbuf_insert(&s->byte_code, last_atom_start, 5))
1617 goto out_of_memory;
1618 s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1619 greedy;
1620 put_u32(s->byte_code.buf + last_atom_start + 1, len);
1621 } else if (quant_max == INT32_MAX) {
1622 if (dbuf_insert(&s->byte_code, last_atom_start, 5 + add_zero_advance_check))
1623 goto out_of_memory;
1624 s->byte_code.buf[last_atom_start] = REOP_split_goto_first +
1625 greedy;
1626 put_u32(s->byte_code.buf + last_atom_start + 1,
1627 len + 5 + add_zero_advance_check);
1628 if (add_zero_advance_check) {
1629 /* avoid infinite loop by stoping the
1630 recursion if no advance was made in the
1631 atom (only works if the atom has no
1632 side effect) */
1633 s->byte_code.buf[last_atom_start + 1 + 4] = REOP_push_char_pos;
1634 re_emit_goto(s, REOP_bne_char_pos, last_atom_start);
1635 } else {
1636 re_emit_goto(s, REOP_goto, last_atom_start);
1637 }
1638 } else {
1639 if (dbuf_insert(&s->byte_code, last_atom_start, 10))
1640 goto out_of_memory;
1641 pos = last_atom_start;
1642 s->byte_code.buf[pos++] = REOP_push_i32;
1643 put_u32(s->byte_code.buf + pos, quant_max);
1644 pos += 4;
1645 s->byte_code.buf[pos++] = REOP_split_goto_first + greedy;
1646 put_u32(s->byte_code.buf + pos, len + 5);
1647 re_emit_goto(s, REOP_loop, last_atom_start + 5);
1648 re_emit_op(s, REOP_drop);
1649 }
1650 } else if (quant_min == 1 && quant_max == INT32_MAX &&
1651 !add_zero_advance_check) {
1652 re_emit_goto(s, REOP_split_next_first - greedy,
1653 last_atom_start);
1654 } else {
1655 if (quant_min == 1) {
1656 /* nothing to add */
1657 } else {
1658 if (dbuf_insert(&s->byte_code, last_atom_start, 5))
1659 goto out_of_memory;
1660 s->byte_code.buf[last_atom_start] = REOP_push_i32;
1661 put_u32(s->byte_code.buf + last_atom_start + 1,
1662 quant_min);
1663 last_atom_start += 5;
1664 re_emit_goto(s, REOP_loop, last_atom_start);
1665 re_emit_op(s, REOP_drop);
1666 }
1667 if (quant_max == INT32_MAX) {
1668 pos = s->byte_code.size;
1669 re_emit_op_u32(s, REOP_split_goto_first + greedy,
1670 len + 5 + add_zero_advance_check);
1671 if (add_zero_advance_check)
1672 re_emit_op(s, REOP_push_char_pos);
1673 /* copy the atom */
1674 dbuf_put_self(&s->byte_code, last_atom_start, len);
1675 if (add_zero_advance_check)
1676 re_emit_goto(s, REOP_bne_char_pos, pos);
1677 else
1678 re_emit_goto(s, REOP_goto, pos);
1679 } else if (quant_max > quant_min) {
1680 re_emit_op_u32(s, REOP_push_i32, quant_max - quant_min);
1681 pos = s->byte_code.size;
1682 re_emit_op_u32(s, REOP_split_goto_first + greedy, len + 5);
1683 /* copy the atom */
1684 dbuf_put_self(&s->byte_code, last_atom_start, len);
1685
1686 re_emit_goto(s, REOP_loop, pos);
1687 re_emit_op(s, REOP_drop);
1688 }
1689 }
1690 last_atom_start = -1;
1691 }
1692 break;
1693 default:
1694 break;
1695 }
1696 }
1697 done:
1698 s->buf_ptr = p;
1699 return 0;
1700 out_of_memory:
1701 return re_parse_out_of_memory(s);
1702 }
1703
re_parse_alternative(REParseState * s,BOOL is_backward_dir)1704 static int re_parse_alternative(REParseState *s, BOOL is_backward_dir)
1705 {
1706 const uint8_t *p;
1707 int ret;
1708 size_t start, term_start, end, term_size;
1709
1710 start = s->byte_code.size;
1711 for(;;) {
1712 p = s->buf_ptr;
1713 if (p >= s->buf_end)
1714 break;
1715 if (*p == '|' || *p == ')')
1716 break;
1717 term_start = s->byte_code.size;
1718 ret = re_parse_term(s, is_backward_dir);
1719 if (ret)
1720 return ret;
1721 if (is_backward_dir) {
1722 /* reverse the order of the terms (XXX: inefficient, but
1723 speed is not really critical here) */
1724 end = s->byte_code.size;
1725 term_size = end - term_start;
1726 if (dbuf_realloc(&s->byte_code, end + term_size))
1727 return -1;
1728 memmove(s->byte_code.buf + start + term_size,
1729 s->byte_code.buf + start,
1730 end - start);
1731 memcpy(s->byte_code.buf + start, s->byte_code.buf + end,
1732 term_size);
1733 }
1734 }
1735 return 0;
1736 }
1737
re_parse_disjunction(REParseState * s,BOOL is_backward_dir)1738 static int re_parse_disjunction(REParseState *s, BOOL is_backward_dir)
1739 {
1740 int start, len, pos;
1741
1742 if (lre_check_stack_overflow(s->opaque, 0))
1743 return re_parse_error(s, "stack overflow");
1744
1745 start = s->byte_code.size;
1746 if (re_parse_alternative(s, is_backward_dir))
1747 return -1;
1748 while (*s->buf_ptr == '|') {
1749 s->buf_ptr++;
1750
1751 len = s->byte_code.size - start;
1752
1753 /* insert a split before the first alternative */
1754 if (dbuf_insert(&s->byte_code, start, 5)) {
1755 return re_parse_out_of_memory(s);
1756 }
1757 s->byte_code.buf[start] = REOP_split_next_first;
1758 put_u32(s->byte_code.buf + start + 1, len + 5);
1759
1760 pos = re_emit_op_u32(s, REOP_goto, 0);
1761
1762 if (re_parse_alternative(s, is_backward_dir))
1763 return -1;
1764
1765 /* patch the goto */
1766 len = s->byte_code.size - (pos + 4);
1767 put_u32(s->byte_code.buf + pos, len);
1768 }
1769 return 0;
1770 }
1771
1772 /* the control flow is recursive so the analysis can be linear */
compute_stack_size(const uint8_t * bc_buf,int bc_buf_len)1773 static int compute_stack_size(const uint8_t *bc_buf, int bc_buf_len)
1774 {
1775 int stack_size, stack_size_max, pos, opcode, len;
1776 uint32_t val;
1777
1778 stack_size = 0;
1779 stack_size_max = 0;
1780 bc_buf += RE_HEADER_LEN;
1781 bc_buf_len -= RE_HEADER_LEN;
1782 pos = 0;
1783 while (pos < bc_buf_len) {
1784 opcode = bc_buf[pos];
1785 len = reopcode_info[opcode].size;
1786 assert(opcode < REOP_COUNT);
1787 assert((pos + len) <= bc_buf_len);
1788 switch(opcode) {
1789 case REOP_push_i32:
1790 case REOP_push_char_pos:
1791 stack_size++;
1792 if (stack_size > stack_size_max) {
1793 if (stack_size > STACK_SIZE_MAX)
1794 return -1;
1795 stack_size_max = stack_size;
1796 }
1797 break;
1798 case REOP_drop:
1799 case REOP_bne_char_pos:
1800 assert(stack_size > 0);
1801 stack_size--;
1802 break;
1803 case REOP_range:
1804 val = get_u16(bc_buf + pos + 1);
1805 len += val * 4;
1806 break;
1807 case REOP_range32:
1808 val = get_u16(bc_buf + pos + 1);
1809 len += val * 8;
1810 break;
1811 }
1812 pos += len;
1813 }
1814 return stack_size_max;
1815 }
1816
1817 /* 'buf' must be a zero terminated UTF-8 string of length buf_len.
1818 Return NULL if error and allocate an error message in *perror_msg,
1819 otherwise the compiled bytecode and its length in plen.
1820 */
lre_compile(int * plen,char * error_msg,int error_msg_size,const char * buf,size_t buf_len,int re_flags,void * opaque)1821 uint8_t *lre_compile(int *plen, char *error_msg, int error_msg_size,
1822 const char *buf, size_t buf_len, int re_flags,
1823 void *opaque)
1824 {
1825 REParseState s_s, *s = &s_s;
1826 int stack_size;
1827 BOOL is_sticky;
1828
1829 memset(s, 0, sizeof(*s));
1830 s->opaque = opaque;
1831 s->buf_ptr = (const uint8_t *)buf;
1832 s->buf_end = s->buf_ptr + buf_len;
1833 s->buf_start = s->buf_ptr;
1834 s->re_flags = re_flags;
1835 s->is_utf16 = ((re_flags & LRE_FLAG_UTF16) != 0);
1836 is_sticky = ((re_flags & LRE_FLAG_STICKY) != 0);
1837 s->ignore_case = ((re_flags & LRE_FLAG_IGNORECASE) != 0);
1838 s->dotall = ((re_flags & LRE_FLAG_DOTALL) != 0);
1839 s->capture_count = 1;
1840 s->total_capture_count = -1;
1841 s->has_named_captures = -1;
1842
1843 dbuf_init2(&s->byte_code, opaque, lre_realloc);
1844 dbuf_init2(&s->group_names, opaque, lre_realloc);
1845
1846 dbuf_putc(&s->byte_code, re_flags); /* first element is the flags */
1847 dbuf_putc(&s->byte_code, 0); /* second element is the number of captures */
1848 dbuf_putc(&s->byte_code, 0); /* stack size */
1849 dbuf_put_u32(&s->byte_code, 0); /* bytecode length */
1850
1851 if (!is_sticky) {
1852 /* iterate thru all positions (about the same as .*?( ... ) )
1853 . We do it without an explicit loop so that lock step
1854 thread execution will be possible in an optimized
1855 implementation */
1856 re_emit_op_u32(s, REOP_split_goto_first, 1 + 5);
1857 re_emit_op(s, REOP_any);
1858 re_emit_op_u32(s, REOP_goto, -(5 + 1 + 5));
1859 }
1860 re_emit_op_u8(s, REOP_save_start, 0);
1861
1862 if (re_parse_disjunction(s, FALSE)) {
1863 error:
1864 dbuf_free(&s->byte_code);
1865 dbuf_free(&s->group_names);
1866 pstrcpy(error_msg, error_msg_size, s->u.error_msg);
1867 *plen = 0;
1868 return NULL;
1869 }
1870
1871 re_emit_op_u8(s, REOP_save_end, 0);
1872
1873 re_emit_op(s, REOP_match);
1874
1875 if (*s->buf_ptr != '\0') {
1876 re_parse_error(s, "extraneous characters at the end");
1877 goto error;
1878 }
1879
1880 if (dbuf_error(&s->byte_code)) {
1881 re_parse_out_of_memory(s);
1882 goto error;
1883 }
1884
1885 stack_size = compute_stack_size(s->byte_code.buf, s->byte_code.size);
1886 if (stack_size < 0) {
1887 re_parse_error(s, "too many imbricated quantifiers");
1888 goto error;
1889 }
1890
1891 s->byte_code.buf[RE_HEADER_CAPTURE_COUNT] = s->capture_count;
1892 s->byte_code.buf[RE_HEADER_STACK_SIZE] = stack_size;
1893 put_u32(s->byte_code.buf + 3, s->byte_code.size - RE_HEADER_LEN);
1894
1895 /* add the named groups if needed */
1896 if (s->group_names.size > (s->capture_count - 1)) {
1897 dbuf_put(&s->byte_code, s->group_names.buf, s->group_names.size);
1898 s->byte_code.buf[RE_HEADER_FLAGS] |= LRE_FLAG_NAMED_GROUPS;
1899 }
1900 dbuf_free(&s->group_names);
1901
1902 #ifdef DUMP_REOP
1903 lre_dump_bytecode(s->byte_code.buf, s->byte_code.size);
1904 #endif
1905
1906 error_msg[0] = '\0';
1907 *plen = s->byte_code.size;
1908 return s->byte_code.buf;
1909 }
1910
is_line_terminator(uint32_t c)1911 static BOOL is_line_terminator(uint32_t c)
1912 {
1913 return (c == '\n' || c == '\r' || c == CP_LS || c == CP_PS);
1914 }
1915
is_word_char(uint32_t c)1916 static BOOL is_word_char(uint32_t c)
1917 {
1918 return ((c >= '0' && c <= '9') ||
1919 (c >= 'a' && c <= 'z') ||
1920 (c >= 'A' && c <= 'Z') ||
1921 (c == '_'));
1922 }
1923
1924 #define GET_CHAR(c, cptr, cbuf_end) \
1925 do { \
1926 if (cbuf_type == 0) { \
1927 c = *cptr++; \
1928 } else { \
1929 uint32_t __c1; \
1930 c = *(uint16_t *)cptr; \
1931 cptr += 2; \
1932 if (c >= 0xd800 && c < 0xdc00 && \
1933 cbuf_type == 2 && cptr < cbuf_end) { \
1934 __c1 = *(uint16_t *)cptr; \
1935 if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
1936 c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1937 cptr += 2; \
1938 } \
1939 } \
1940 } \
1941 } while (0)
1942
1943 #define PEEK_CHAR(c, cptr, cbuf_end) \
1944 do { \
1945 if (cbuf_type == 0) { \
1946 c = cptr[0]; \
1947 } else { \
1948 uint32_t __c1; \
1949 c = ((uint16_t *)cptr)[0]; \
1950 if (c >= 0xd800 && c < 0xdc00 && \
1951 cbuf_type == 2 && (cptr + 2) < cbuf_end) { \
1952 __c1 = ((uint16_t *)cptr)[1]; \
1953 if (__c1 >= 0xdc00 && __c1 < 0xe000) { \
1954 c = (((c & 0x3ff) << 10) | (__c1 & 0x3ff)) + 0x10000; \
1955 } \
1956 } \
1957 } \
1958 } while (0)
1959
1960 #define PEEK_PREV_CHAR(c, cptr, cbuf_start) \
1961 do { \
1962 if (cbuf_type == 0) { \
1963 c = cptr[-1]; \
1964 } else { \
1965 uint32_t __c1; \
1966 c = ((uint16_t *)cptr)[-1]; \
1967 if (c >= 0xdc00 && c < 0xe000 && \
1968 cbuf_type == 2 && (cptr - 4) >= cbuf_start) { \
1969 __c1 = ((uint16_t *)cptr)[-2]; \
1970 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
1971 c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1972 } \
1973 } \
1974 } \
1975 } while (0)
1976
1977 #define GET_PREV_CHAR(c, cptr, cbuf_start) \
1978 do { \
1979 if (cbuf_type == 0) { \
1980 cptr--; \
1981 c = cptr[0]; \
1982 } else { \
1983 uint32_t __c1; \
1984 cptr -= 2; \
1985 c = ((uint16_t *)cptr)[0]; \
1986 if (c >= 0xdc00 && c < 0xe000 && \
1987 cbuf_type == 2 && cptr > cbuf_start) { \
1988 __c1 = ((uint16_t *)cptr)[-1]; \
1989 if (__c1 >= 0xd800 && __c1 < 0xdc00 ) { \
1990 cptr -= 2; \
1991 c = (((__c1 & 0x3ff) << 10) | (c & 0x3ff)) + 0x10000; \
1992 } \
1993 } \
1994 } \
1995 } while (0)
1996
1997 #define PREV_CHAR(cptr, cbuf_start) \
1998 do { \
1999 if (cbuf_type == 0) { \
2000 cptr--; \
2001 } else { \
2002 cptr -= 2; \
2003 if (cbuf_type == 2) { \
2004 c = ((uint16_t *)cptr)[0]; \
2005 if (c >= 0xdc00 && c < 0xe000 && cptr > cbuf_start) { \
2006 c = ((uint16_t *)cptr)[-1]; \
2007 if (c >= 0xd800 && c < 0xdc00) \
2008 cptr -= 2; \
2009 } \
2010 } \
2011 } \
2012 } while (0)
2013
2014 typedef uintptr_t StackInt;
2015
2016 typedef enum {
2017 RE_EXEC_STATE_SPLIT,
2018 RE_EXEC_STATE_LOOKAHEAD,
2019 RE_EXEC_STATE_NEGATIVE_LOOKAHEAD,
2020 RE_EXEC_STATE_GREEDY_QUANT,
2021 } REExecStateEnum;
2022
2023 typedef struct REExecState {
2024 REExecStateEnum type : 8;
2025 uint8_t stack_len;
2026 size_t count; /* only used for RE_EXEC_STATE_GREEDY_QUANT */
2027 const uint8_t *cptr;
2028 const uint8_t *pc;
2029 void *buf[0];
2030 } REExecState;
2031
2032 typedef struct {
2033 const uint8_t *cbuf;
2034 const uint8_t *cbuf_end;
2035 /* 0 = 8 bit chars, 1 = 16 bit chars, 2 = 16 bit chars, UTF-16 */
2036 int cbuf_type;
2037 int capture_count;
2038 int stack_size_max;
2039 BOOL multi_line;
2040 BOOL ignore_case;
2041 BOOL is_utf16;
2042 void *opaque; /* used for stack overflow check */
2043
2044 size_t state_size;
2045 uint8_t *state_stack;
2046 size_t state_stack_size;
2047 size_t state_stack_len;
2048 } REExecContext;
2049
push_state(REExecContext * s,uint8_t ** capture,StackInt * stack,size_t stack_len,const uint8_t * pc,const uint8_t * cptr,REExecStateEnum type,size_t count)2050 static int push_state(REExecContext *s,
2051 uint8_t **capture,
2052 StackInt *stack, size_t stack_len,
2053 const uint8_t *pc, const uint8_t *cptr,
2054 REExecStateEnum type, size_t count)
2055 {
2056 REExecState *rs;
2057 uint8_t *new_stack;
2058 size_t new_size, i, n;
2059 StackInt *stack_buf;
2060
2061 if (unlikely((s->state_stack_len + 1) > s->state_stack_size)) {
2062 /* reallocate the stack */
2063 new_size = s->state_stack_size * 3 / 2;
2064 if (new_size < 8)
2065 new_size = 8;
2066 new_stack = lre_realloc(s->opaque, s->state_stack, new_size * s->state_size);
2067 if (!new_stack)
2068 return -1;
2069 s->state_stack_size = new_size;
2070 s->state_stack = new_stack;
2071 }
2072 rs = (REExecState *)(s->state_stack + s->state_stack_len * s->state_size);
2073 s->state_stack_len++;
2074 rs->type = type;
2075 rs->count = count;
2076 rs->stack_len = stack_len;
2077 rs->cptr = cptr;
2078 rs->pc = pc;
2079 n = 2 * s->capture_count;
2080 for(i = 0; i < n; i++)
2081 rs->buf[i] = capture[i];
2082 stack_buf = (StackInt *)(rs->buf + n);
2083 for(i = 0; i < stack_len; i++)
2084 stack_buf[i] = stack[i];
2085 return 0;
2086 }
2087
2088 /* return 1 if match, 0 if not match or -1 if error. */
lre_exec_backtrack(REExecContext * s,uint8_t ** capture,StackInt * stack,int stack_len,const uint8_t * pc,const uint8_t * cptr,BOOL no_recurse)2089 static intptr_t lre_exec_backtrack(REExecContext *s, uint8_t **capture,
2090 StackInt *stack, int stack_len,
2091 const uint8_t *pc, const uint8_t *cptr,
2092 BOOL no_recurse)
2093 {
2094 int opcode, ret;
2095 int cbuf_type;
2096 uint32_t val, c;
2097 const uint8_t *cbuf_end;
2098
2099 cbuf_type = s->cbuf_type;
2100 cbuf_end = s->cbuf_end;
2101
2102 for(;;) {
2103 // printf("top=%p: pc=%d\n", th_list.top, (int)(pc - (bc_buf + RE_HEADER_LEN)));
2104 opcode = *pc++;
2105 switch(opcode) {
2106 case REOP_match:
2107 {
2108 REExecState *rs;
2109 if (no_recurse)
2110 return (intptr_t)cptr;
2111 ret = 1;
2112 goto recurse;
2113 no_match:
2114 if (no_recurse)
2115 return 0;
2116 ret = 0;
2117 recurse:
2118 for(;;) {
2119 if (s->state_stack_len == 0)
2120 return ret;
2121 rs = (REExecState *)(s->state_stack +
2122 (s->state_stack_len - 1) * s->state_size);
2123 if (rs->type == RE_EXEC_STATE_SPLIT) {
2124 if (!ret) {
2125 pop_state:
2126 memcpy(capture, rs->buf,
2127 sizeof(capture[0]) * 2 * s->capture_count);
2128 pop_state1:
2129 pc = rs->pc;
2130 cptr = rs->cptr;
2131 stack_len = rs->stack_len;
2132 memcpy(stack, rs->buf + 2 * s->capture_count,
2133 stack_len * sizeof(stack[0]));
2134 s->state_stack_len--;
2135 break;
2136 }
2137 } else if (rs->type == RE_EXEC_STATE_GREEDY_QUANT) {
2138 if (!ret) {
2139 uint32_t char_count, i;
2140 memcpy(capture, rs->buf,
2141 sizeof(capture[0]) * 2 * s->capture_count);
2142 stack_len = rs->stack_len;
2143 memcpy(stack, rs->buf + 2 * s->capture_count,
2144 stack_len * sizeof(stack[0]));
2145 pc = rs->pc;
2146 cptr = rs->cptr;
2147 /* go backward */
2148 char_count = get_u32(pc + 12);
2149 for(i = 0; i < char_count; i++) {
2150 PREV_CHAR(cptr, s->cbuf);
2151 }
2152 pc = (pc + 16) + (int)get_u32(pc);
2153 rs->cptr = cptr;
2154 rs->count--;
2155 if (rs->count == 0) {
2156 s->state_stack_len--;
2157 }
2158 break;
2159 }
2160 } else {
2161 ret = ((rs->type == RE_EXEC_STATE_LOOKAHEAD && ret) ||
2162 (rs->type == RE_EXEC_STATE_NEGATIVE_LOOKAHEAD && !ret));
2163 if (ret) {
2164 /* keep the capture in case of positive lookahead */
2165 if (rs->type == RE_EXEC_STATE_LOOKAHEAD)
2166 goto pop_state1;
2167 else
2168 goto pop_state;
2169 }
2170 }
2171 s->state_stack_len--;
2172 }
2173 }
2174 break;
2175 case REOP_char32:
2176 val = get_u32(pc);
2177 pc += 4;
2178 goto test_char;
2179 case REOP_char:
2180 val = get_u16(pc);
2181 pc += 2;
2182 test_char:
2183 if (cptr >= cbuf_end)
2184 goto no_match;
2185 GET_CHAR(c, cptr, cbuf_end);
2186 if (s->ignore_case) {
2187 c = lre_canonicalize(c, s->is_utf16);
2188 }
2189 if (val != c)
2190 goto no_match;
2191 break;
2192 case REOP_split_goto_first:
2193 case REOP_split_next_first:
2194 {
2195 const uint8_t *pc1;
2196
2197 val = get_u32(pc);
2198 pc += 4;
2199 if (opcode == REOP_split_next_first) {
2200 pc1 = pc + (int)val;
2201 } else {
2202 pc1 = pc;
2203 pc = pc + (int)val;
2204 }
2205 ret = push_state(s, capture, stack, stack_len,
2206 pc1, cptr, RE_EXEC_STATE_SPLIT, 0);
2207 if (ret < 0)
2208 return -1;
2209 break;
2210 }
2211 case REOP_lookahead:
2212 case REOP_negative_lookahead:
2213 val = get_u32(pc);
2214 pc += 4;
2215 ret = push_state(s, capture, stack, stack_len,
2216 pc + (int)val, cptr,
2217 RE_EXEC_STATE_LOOKAHEAD + opcode - REOP_lookahead,
2218 0);
2219 if (ret < 0)
2220 return -1;
2221 break;
2222
2223 case REOP_goto:
2224 val = get_u32(pc);
2225 pc += 4 + (int)val;
2226 break;
2227 case REOP_line_start:
2228 if (cptr == s->cbuf)
2229 break;
2230 if (!s->multi_line)
2231 goto no_match;
2232 PEEK_PREV_CHAR(c, cptr, s->cbuf);
2233 if (!is_line_terminator(c))
2234 goto no_match;
2235 break;
2236 case REOP_line_end:
2237 if (cptr == cbuf_end)
2238 break;
2239 if (!s->multi_line)
2240 goto no_match;
2241 PEEK_CHAR(c, cptr, cbuf_end);
2242 if (!is_line_terminator(c))
2243 goto no_match;
2244 break;
2245 case REOP_dot:
2246 if (cptr == cbuf_end)
2247 goto no_match;
2248 GET_CHAR(c, cptr, cbuf_end);
2249 if (is_line_terminator(c))
2250 goto no_match;
2251 break;
2252 case REOP_any:
2253 if (cptr == cbuf_end)
2254 goto no_match;
2255 GET_CHAR(c, cptr, cbuf_end);
2256 break;
2257 case REOP_save_start:
2258 case REOP_save_end:
2259 val = *pc++;
2260 assert(val < s->capture_count);
2261 capture[2 * val + opcode - REOP_save_start] = (uint8_t *)cptr;
2262 break;
2263 case REOP_save_reset:
2264 {
2265 uint32_t val2;
2266 val = pc[0];
2267 val2 = pc[1];
2268 pc += 2;
2269 assert(val2 < s->capture_count);
2270 while (val <= val2) {
2271 capture[2 * val] = NULL;
2272 capture[2 * val + 1] = NULL;
2273 val++;
2274 }
2275 }
2276 break;
2277 case REOP_push_i32:
2278 val = get_u32(pc);
2279 pc += 4;
2280 stack[stack_len++] = val;
2281 break;
2282 case REOP_drop:
2283 stack_len--;
2284 break;
2285 case REOP_loop:
2286 val = get_u32(pc);
2287 pc += 4;
2288 if (--stack[stack_len - 1] != 0) {
2289 pc += (int)val;
2290 }
2291 break;
2292 case REOP_push_char_pos:
2293 stack[stack_len++] = (uintptr_t)cptr;
2294 break;
2295 case REOP_bne_char_pos:
2296 val = get_u32(pc);
2297 pc += 4;
2298 if (stack[--stack_len] != (uintptr_t)cptr)
2299 pc += (int)val;
2300 break;
2301 case REOP_word_boundary:
2302 case REOP_not_word_boundary:
2303 {
2304 BOOL v1, v2;
2305 /* char before */
2306 if (cptr == s->cbuf) {
2307 v1 = FALSE;
2308 } else {
2309 PEEK_PREV_CHAR(c, cptr, s->cbuf);
2310 v1 = is_word_char(c);
2311 }
2312 /* current char */
2313 if (cptr >= cbuf_end) {
2314 v2 = FALSE;
2315 } else {
2316 PEEK_CHAR(c, cptr, cbuf_end);
2317 v2 = is_word_char(c);
2318 }
2319 if (v1 ^ v2 ^ (REOP_not_word_boundary - opcode))
2320 goto no_match;
2321 }
2322 break;
2323 case REOP_back_reference:
2324 case REOP_backward_back_reference:
2325 {
2326 const uint8_t *cptr1, *cptr1_end, *cptr1_start;
2327 uint32_t c1, c2;
2328
2329 val = *pc++;
2330 if (val >= s->capture_count)
2331 goto no_match;
2332 cptr1_start = capture[2 * val];
2333 cptr1_end = capture[2 * val + 1];
2334 if (!cptr1_start || !cptr1_end)
2335 break;
2336 if (opcode == REOP_back_reference) {
2337 cptr1 = cptr1_start;
2338 while (cptr1 < cptr1_end) {
2339 if (cptr >= cbuf_end)
2340 goto no_match;
2341 GET_CHAR(c1, cptr1, cptr1_end);
2342 GET_CHAR(c2, cptr, cbuf_end);
2343 if (s->ignore_case) {
2344 c1 = lre_canonicalize(c1, s->is_utf16);
2345 c2 = lre_canonicalize(c2, s->is_utf16);
2346 }
2347 if (c1 != c2)
2348 goto no_match;
2349 }
2350 } else {
2351 cptr1 = cptr1_end;
2352 while (cptr1 > cptr1_start) {
2353 if (cptr == s->cbuf)
2354 goto no_match;
2355 GET_PREV_CHAR(c1, cptr1, cptr1_start);
2356 GET_PREV_CHAR(c2, cptr, s->cbuf);
2357 if (s->ignore_case) {
2358 c1 = lre_canonicalize(c1, s->is_utf16);
2359 c2 = lre_canonicalize(c2, s->is_utf16);
2360 }
2361 if (c1 != c2)
2362 goto no_match;
2363 }
2364 }
2365 }
2366 break;
2367 case REOP_range:
2368 {
2369 int n;
2370 uint32_t low, high, idx_min, idx_max, idx;
2371
2372 n = get_u16(pc); /* n must be >= 1 */
2373 pc += 2;
2374 if (cptr >= cbuf_end)
2375 goto no_match;
2376 GET_CHAR(c, cptr, cbuf_end);
2377 if (s->ignore_case) {
2378 c = lre_canonicalize(c, s->is_utf16);
2379 }
2380 idx_min = 0;
2381 low = get_u16(pc + 0 * 4);
2382 if (c < low)
2383 goto no_match;
2384 idx_max = n - 1;
2385 high = get_u16(pc + idx_max * 4 + 2);
2386 /* 0xffff in for last value means +infinity */
2387 if (unlikely(c >= 0xffff) && high == 0xffff)
2388 goto range_match;
2389 if (c > high)
2390 goto no_match;
2391 while (idx_min <= idx_max) {
2392 idx = (idx_min + idx_max) / 2;
2393 low = get_u16(pc + idx * 4);
2394 high = get_u16(pc + idx * 4 + 2);
2395 if (c < low)
2396 idx_max = idx - 1;
2397 else if (c > high)
2398 idx_min = idx + 1;
2399 else
2400 goto range_match;
2401 }
2402 goto no_match;
2403 range_match:
2404 pc += 4 * n;
2405 }
2406 break;
2407 case REOP_range32:
2408 {
2409 int n;
2410 uint32_t low, high, idx_min, idx_max, idx;
2411
2412 n = get_u16(pc); /* n must be >= 1 */
2413 pc += 2;
2414 if (cptr >= cbuf_end)
2415 goto no_match;
2416 GET_CHAR(c, cptr, cbuf_end);
2417 if (s->ignore_case) {
2418 c = lre_canonicalize(c, s->is_utf16);
2419 }
2420 idx_min = 0;
2421 low = get_u32(pc + 0 * 8);
2422 if (c < low)
2423 goto no_match;
2424 idx_max = n - 1;
2425 high = get_u32(pc + idx_max * 8 + 4);
2426 if (c > high)
2427 goto no_match;
2428 while (idx_min <= idx_max) {
2429 idx = (idx_min + idx_max) / 2;
2430 low = get_u32(pc + idx * 8);
2431 high = get_u32(pc + idx * 8 + 4);
2432 if (c < low)
2433 idx_max = idx - 1;
2434 else if (c > high)
2435 idx_min = idx + 1;
2436 else
2437 goto range32_match;
2438 }
2439 goto no_match;
2440 range32_match:
2441 pc += 8 * n;
2442 }
2443 break;
2444 case REOP_prev:
2445 /* go to the previous char */
2446 if (cptr == s->cbuf)
2447 goto no_match;
2448 PREV_CHAR(cptr, s->cbuf);
2449 break;
2450 case REOP_simple_greedy_quant:
2451 {
2452 uint32_t next_pos, quant_min, quant_max;
2453 size_t q;
2454 intptr_t res;
2455 const uint8_t *pc1;
2456
2457 next_pos = get_u32(pc);
2458 quant_min = get_u32(pc + 4);
2459 quant_max = get_u32(pc + 8);
2460 pc += 16;
2461 pc1 = pc;
2462 pc += (int)next_pos;
2463
2464 q = 0;
2465 for(;;) {
2466 res = lre_exec_backtrack(s, capture, stack, stack_len,
2467 pc1, cptr, TRUE);
2468 if (res == -1)
2469 return res;
2470 if (!res)
2471 break;
2472 cptr = (uint8_t *)res;
2473 q++;
2474 if (q >= quant_max && quant_max != INT32_MAX)
2475 break;
2476 }
2477 if (q < quant_min)
2478 goto no_match;
2479 if (q > quant_min) {
2480 /* will examine all matches down to quant_min */
2481 ret = push_state(s, capture, stack, stack_len,
2482 pc1 - 16, cptr,
2483 RE_EXEC_STATE_GREEDY_QUANT,
2484 q - quant_min);
2485 if (ret < 0)
2486 return -1;
2487 }
2488 }
2489 break;
2490 default:
2491 abort();
2492 }
2493 }
2494 }
2495
2496 /* Return 1 if match, 0 if not match or -1 if error. cindex is the
2497 starting position of the match and must be such as 0 <= cindex <=
2498 clen. */
lre_exec(uint8_t ** capture,const uint8_t * bc_buf,const uint8_t * cbuf,int cindex,int clen,int cbuf_type,void * opaque)2499 int lre_exec(uint8_t **capture,
2500 const uint8_t *bc_buf, const uint8_t *cbuf, int cindex, int clen,
2501 int cbuf_type, void *opaque)
2502 {
2503 REExecContext s_s, *s = &s_s;
2504 int re_flags, i, alloca_size, ret;
2505 StackInt *stack_buf;
2506
2507 re_flags = bc_buf[RE_HEADER_FLAGS];
2508 s->multi_line = (re_flags & LRE_FLAG_MULTILINE) != 0;
2509 s->ignore_case = (re_flags & LRE_FLAG_IGNORECASE) != 0;
2510 s->is_utf16 = (re_flags & LRE_FLAG_UTF16) != 0;
2511 s->capture_count = bc_buf[RE_HEADER_CAPTURE_COUNT];
2512 s->stack_size_max = bc_buf[RE_HEADER_STACK_SIZE];
2513 s->cbuf = cbuf;
2514 s->cbuf_end = cbuf + (clen << cbuf_type);
2515 s->cbuf_type = cbuf_type;
2516 if (s->cbuf_type == 1 && s->is_utf16)
2517 s->cbuf_type = 2;
2518 s->opaque = opaque;
2519
2520 s->state_size = sizeof(REExecState) +
2521 s->capture_count * sizeof(capture[0]) * 2 +
2522 s->stack_size_max * sizeof(stack_buf[0]);
2523 s->state_stack = NULL;
2524 s->state_stack_len = 0;
2525 s->state_stack_size = 0;
2526
2527 for(i = 0; i < s->capture_count * 2; i++)
2528 capture[i] = NULL;
2529 alloca_size = s->stack_size_max * sizeof(stack_buf[0]);
2530 stack_buf = alloca(alloca_size);
2531 ret = lre_exec_backtrack(s, capture, stack_buf, 0, bc_buf + RE_HEADER_LEN,
2532 cbuf + (cindex << cbuf_type), FALSE);
2533 lre_realloc(s->opaque, s->state_stack, 0);
2534 return ret;
2535 }
2536
lre_get_capture_count(const uint8_t * bc_buf)2537 int lre_get_capture_count(const uint8_t *bc_buf)
2538 {
2539 return bc_buf[RE_HEADER_CAPTURE_COUNT];
2540 }
2541
lre_get_flags(const uint8_t * bc_buf)2542 int lre_get_flags(const uint8_t *bc_buf)
2543 {
2544 return bc_buf[RE_HEADER_FLAGS];
2545 }
2546
2547 /* Return NULL if no group names. Otherwise, return a pointer to
2548 'capture_count - 1' zero terminated UTF-8 strings. */
lre_get_groupnames(const uint8_t * bc_buf)2549 const char *lre_get_groupnames(const uint8_t *bc_buf)
2550 {
2551 uint32_t re_bytecode_len;
2552 if ((lre_get_flags(bc_buf) & LRE_FLAG_NAMED_GROUPS) == 0)
2553 return NULL;
2554 re_bytecode_len = get_u32(bc_buf + 3);
2555 return (const char *)(bc_buf + 7 + re_bytecode_len);
2556 }
2557
2558 #ifdef TEST
2559
lre_check_stack_overflow(void * opaque,size_t alloca_size)2560 BOOL lre_check_stack_overflow(void *opaque, size_t alloca_size)
2561 {
2562 return FALSE;
2563 }
2564
lre_realloc(void * opaque,void * ptr,size_t size)2565 void *lre_realloc(void *opaque, void *ptr, size_t size)
2566 {
2567 return realloc(ptr, size);
2568 }
2569
main(int argc,char ** argv)2570 int main(int argc, char **argv)
2571 {
2572 int len, ret, i;
2573 uint8_t *bc;
2574 char error_msg[64];
2575 uint8_t *capture[CAPTURE_COUNT_MAX * 2];
2576 const char *input;
2577 int input_len, capture_count;
2578
2579 if (argc < 3) {
2580 printf("usage: %s regexp input\n", argv[0]);
2581 exit(1);
2582 }
2583 bc = lre_compile(&len, error_msg, sizeof(error_msg), argv[1],
2584 strlen(argv[1]), 0, NULL);
2585 if (!bc) {
2586 fprintf(stderr, "error: %s\n", error_msg);
2587 exit(1);
2588 }
2589
2590 input = argv[2];
2591 input_len = strlen(input);
2592
2593 ret = lre_exec(capture, bc, (uint8_t *)input, 0, input_len, 0, NULL);
2594 printf("ret=%d\n", ret);
2595 if (ret == 1) {
2596 capture_count = lre_get_capture_count(bc);
2597 for(i = 0; i < 2 * capture_count; i++) {
2598 uint8_t *ptr;
2599 ptr = capture[i];
2600 printf("%d: ", i);
2601 if (!ptr)
2602 printf("<nil>");
2603 else
2604 printf("%u", (int)(ptr - (uint8_t *)input));
2605 printf("\n");
2606 }
2607 }
2608 return 0;
2609 }
2610 #endif
2611