• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <stdlib.h>
2 #include <string.h>
3 
4 #include "iwre.h"
5 #include "cregex.h"
6 
7 #define REGEX_VM_MAX_MATCHES IWRE_MAX_MATCHES
8 
9 /* The VM executes one or more threads, each running a regular expression
10  * program, which is just a list of regular expression instructions. Each
11  * thread maintains two registers while it runs: a program counter (PC) and
12  * a string pointer (SP).
13  */
14 typedef struct {
15   int visited;
16   const cregex_program_instr_t *pc;
17   const char *matches[REGEX_VM_MAX_MATCHES];
18 } vm_thread;
19 
20 /* Run program on string */
21 static int vm_run(
22   const cregex_program_t *program,
23   const char             *string,
24   const char            **matches,
25   int                     nmatches);
26 
27 /* Run program on string (using a previously allocated buffer of at least
28  * vm_estimate_threads(program) threads)
29  */
30 static int vm_run_with_threads(
31   const cregex_program_t *program,
32   const char             *string,
33   const char            **matches,
34   int                     nmatches,
35   vm_thread              *threads);
36 
37 typedef struct {
38   int nthreads;
39   vm_thread *threads;
40 } vm_thread_list;
41 
vm_add_thread(vm_thread_list * list,const cregex_program_t * program,const cregex_program_instr_t * pc,const char * string,const char * sp,const char ** matches,int nmatches)42 static void vm_add_thread(
43   vm_thread_list               *list,
44   const cregex_program_t       *program,
45   const cregex_program_instr_t *pc,
46   const char                   *string,
47   const char                   *sp,
48   const char                  **matches,
49   int                           nmatches
50   ) {
51   if (list->threads[pc - program->instructions].visited == sp - string + 1) {
52     return;
53   }
54   list->threads[pc - program->instructions].visited = sp - string + 1;
55 
56   switch (pc->opcode) {
57     case REGEX_PROGRAM_OPCODE_MATCH:
58     /* fall-through */
59 
60     /* Characters */
61     case REGEX_PROGRAM_OPCODE_CHARACTER:
62     case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
63     case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
64     case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
65       list->threads[list->nthreads].pc = pc;
66       memcpy(list->threads[list->nthreads].matches, matches,
67              sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
68                                    ? nmatches
69                                    : REGEX_VM_MAX_MATCHES));
70       ++list->nthreads;
71       break;
72 
73     /* Control-flow */
74     case REGEX_PROGRAM_OPCODE_SPLIT:
75       vm_add_thread(list, program, pc->first, string, sp, matches, nmatches);
76       vm_add_thread(list, program, pc->second, string, sp, matches, nmatches);
77       break;
78     case REGEX_PROGRAM_OPCODE_JUMP:
79       vm_add_thread(list, program, pc->target, string, sp, matches, nmatches);
80       break;
81 
82     /* Assertions */
83     case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
84       if (sp == string) {
85         vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
86       }
87       break;
88     case REGEX_PROGRAM_OPCODE_ASSERT_END:
89       if (!*sp) {
90         vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
91       }
92       break;
93 
94     /* Saving */
95     case REGEX_PROGRAM_OPCODE_SAVE:
96       if (pc->save < nmatches && pc->save < REGEX_VM_MAX_MATCHES) {
97         const char *saved = matches[pc->save];
98         matches[pc->save] = sp;
99         vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
100         matches[pc->save] = saved;
101       } else {
102         vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
103       }
104       break;
105   }
106 }
107 
108 /* Upper bound of number of threads required to run program */
vm_estimate_threads(const cregex_program_t * program)109 static int vm_estimate_threads(const cregex_program_t *program) {
110   return program->ninstructions * 2;
111 }
112 
vm_run(const cregex_program_t * program,const char * string,const char ** matches,int nmatches)113 static int vm_run(
114   const cregex_program_t *program,
115   const char             *string,
116   const char            **matches,
117   int                     nmatches
118   ) {
119   size_t size = sizeof(vm_thread) * vm_estimate_threads(program);
120   vm_thread *threads;
121   int matched;
122 
123   if (!(threads = malloc(size))) {
124     return -1;
125   }
126 
127   matched = vm_run_with_threads(program, string, matches, nmatches, threads);
128   free(threads);
129   return matched;
130 }
131 
vm_run_with_threads(const cregex_program_t * program,const char * string,const char ** matches,int nmatches,vm_thread * threads)132 static int vm_run_with_threads(
133   const cregex_program_t *program,
134   const char             *string,
135   const char            **matches,
136   int                     nmatches,
137   vm_thread              *threads
138   ) {
139   vm_thread_list *current
140     = &(vm_thread_list) {
141     .nthreads = 0, .threads = threads
142     };
143   vm_thread_list *next = &(vm_thread_list) {
144     .nthreads = 0, .threads = threads + program->ninstructions
145   };
146   int matched = 0;
147 
148   memset(threads, 0, sizeof(vm_thread) * program->ninstructions * 2);
149 
150   vm_add_thread(current, program, program->instructions, string, string,
151                 matches, nmatches);
152 
153   for (const char *sp = string; ; ++sp) {
154     for (int i = 0; i < current->nthreads; ++i) {
155       vm_thread *thread = current->threads + i;
156       switch (thread->pc->opcode) {
157         case REGEX_PROGRAM_OPCODE_MATCH:
158           matched = 1;
159           current->nthreads = 0;
160           memcpy(matches, thread->matches,
161                  sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
162                                        ? nmatches
163                                        : REGEX_VM_MAX_MATCHES));
164           continue;
165 
166         /* Characters */
167         case REGEX_PROGRAM_OPCODE_CHARACTER:
168           if (*sp == thread->pc->ch) {
169             break;
170           }
171           continue;
172         case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
173           if (*sp) {
174             break;
175           }
176           continue;
177         case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
178           if (cregex_char_class_contains(thread->pc->klass, *sp)) {
179             break;
180           }
181           continue;
182         case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
183           if (!cregex_char_class_contains(thread->pc->klass, *sp)) {
184             break;
185           }
186           continue;
187 
188         /* Control-flow */
189         case REGEX_PROGRAM_OPCODE_SPLIT:
190         case REGEX_PROGRAM_OPCODE_JUMP:
191         /* fall-through */
192 
193         /* Assertions */
194         case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
195         case REGEX_PROGRAM_OPCODE_ASSERT_END:
196         /* fall-through */
197 
198         /* Saving */
199         case REGEX_PROGRAM_OPCODE_SAVE:
200           /* handled in vm_add_thread() */
201           abort();
202       }
203 
204       vm_add_thread(next, program, thread->pc + 1, string, sp + 1,
205                     thread->matches, nmatches);
206     }
207 
208     /* swap current and next thread list */
209     vm_thread_list *swap = current;
210     current = next;
211     next = swap;
212     next->nthreads = 0;
213 
214     /* done if no more threads are running or end of string reached */
215     if (current->nthreads == 0 || !*sp) {
216       break;
217     }
218   }
219 
220   return matched;
221 }
222 
cregex_program_run(const cregex_program_t * program,const char * string,const char ** matches,int nmatches)223 int cregex_program_run(
224   const cregex_program_t *program,
225   const char             *string,
226   const char            **matches,
227   int                     nmatches
228   ) {
229   return vm_run(program, string, matches, nmatches);
230 }
231