1 #include <stdlib.h>
2 #include <string.h>
3
4 #include "iwre.h"
5 #include "cregex.h"
6
7 #define REGEX_VM_MAX_MATCHES IWRE_MAX_MATCHES
8
9 /* The VM executes one or more threads, each running a regular expression
10 * program, which is just a list of regular expression instructions. Each
11 * thread maintains two registers while it runs: a program counter (PC) and
12 * a string pointer (SP).
13 */
14 typedef struct {
15 int visited;
16 const cregex_program_instr_t *pc;
17 const char *matches[REGEX_VM_MAX_MATCHES];
18 } vm_thread;
19
20 /* Run program on string */
21 static int vm_run(
22 const cregex_program_t *program,
23 const char *string,
24 const char **matches,
25 int nmatches);
26
27 /* Run program on string (using a previously allocated buffer of at least
28 * vm_estimate_threads(program) threads)
29 */
30 static int vm_run_with_threads(
31 const cregex_program_t *program,
32 const char *string,
33 const char **matches,
34 int nmatches,
35 vm_thread *threads);
36
37 typedef struct {
38 int nthreads;
39 vm_thread *threads;
40 } vm_thread_list;
41
vm_add_thread(vm_thread_list * list,const cregex_program_t * program,const cregex_program_instr_t * pc,const char * string,const char * sp,const char ** matches,int nmatches)42 static void vm_add_thread(
43 vm_thread_list *list,
44 const cregex_program_t *program,
45 const cregex_program_instr_t *pc,
46 const char *string,
47 const char *sp,
48 const char **matches,
49 int nmatches
50 ) {
51 if (list->threads[pc - program->instructions].visited == sp - string + 1) {
52 return;
53 }
54 list->threads[pc - program->instructions].visited = sp - string + 1;
55
56 switch (pc->opcode) {
57 case REGEX_PROGRAM_OPCODE_MATCH:
58 /* fall-through */
59
60 /* Characters */
61 case REGEX_PROGRAM_OPCODE_CHARACTER:
62 case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
63 case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
64 case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
65 list->threads[list->nthreads].pc = pc;
66 memcpy(list->threads[list->nthreads].matches, matches,
67 sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
68 ? nmatches
69 : REGEX_VM_MAX_MATCHES));
70 ++list->nthreads;
71 break;
72
73 /* Control-flow */
74 case REGEX_PROGRAM_OPCODE_SPLIT:
75 vm_add_thread(list, program, pc->first, string, sp, matches, nmatches);
76 vm_add_thread(list, program, pc->second, string, sp, matches, nmatches);
77 break;
78 case REGEX_PROGRAM_OPCODE_JUMP:
79 vm_add_thread(list, program, pc->target, string, sp, matches, nmatches);
80 break;
81
82 /* Assertions */
83 case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
84 if (sp == string) {
85 vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
86 }
87 break;
88 case REGEX_PROGRAM_OPCODE_ASSERT_END:
89 if (!*sp) {
90 vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
91 }
92 break;
93
94 /* Saving */
95 case REGEX_PROGRAM_OPCODE_SAVE:
96 if (pc->save < nmatches && pc->save < REGEX_VM_MAX_MATCHES) {
97 const char *saved = matches[pc->save];
98 matches[pc->save] = sp;
99 vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
100 matches[pc->save] = saved;
101 } else {
102 vm_add_thread(list, program, pc + 1, string, sp, matches, nmatches);
103 }
104 break;
105 }
106 }
107
108 /* Upper bound of number of threads required to run program */
vm_estimate_threads(const cregex_program_t * program)109 static int vm_estimate_threads(const cregex_program_t *program) {
110 return program->ninstructions * 2;
111 }
112
vm_run(const cregex_program_t * program,const char * string,const char ** matches,int nmatches)113 static int vm_run(
114 const cregex_program_t *program,
115 const char *string,
116 const char **matches,
117 int nmatches
118 ) {
119 size_t size = sizeof(vm_thread) * vm_estimate_threads(program);
120 vm_thread *threads;
121 int matched;
122
123 if (!(threads = malloc(size))) {
124 return -1;
125 }
126
127 matched = vm_run_with_threads(program, string, matches, nmatches, threads);
128 free(threads);
129 return matched;
130 }
131
vm_run_with_threads(const cregex_program_t * program,const char * string,const char ** matches,int nmatches,vm_thread * threads)132 static int vm_run_with_threads(
133 const cregex_program_t *program,
134 const char *string,
135 const char **matches,
136 int nmatches,
137 vm_thread *threads
138 ) {
139 vm_thread_list *current
140 = &(vm_thread_list) {
141 .nthreads = 0, .threads = threads
142 };
143 vm_thread_list *next = &(vm_thread_list) {
144 .nthreads = 0, .threads = threads + program->ninstructions
145 };
146 int matched = 0;
147
148 memset(threads, 0, sizeof(vm_thread) * program->ninstructions * 2);
149
150 vm_add_thread(current, program, program->instructions, string, string,
151 matches, nmatches);
152
153 for (const char *sp = string; ; ++sp) {
154 for (int i = 0; i < current->nthreads; ++i) {
155 vm_thread *thread = current->threads + i;
156 switch (thread->pc->opcode) {
157 case REGEX_PROGRAM_OPCODE_MATCH:
158 matched = 1;
159 current->nthreads = 0;
160 memcpy(matches, thread->matches,
161 sizeof(matches[0]) * ((nmatches <= REGEX_VM_MAX_MATCHES)
162 ? nmatches
163 : REGEX_VM_MAX_MATCHES));
164 continue;
165
166 /* Characters */
167 case REGEX_PROGRAM_OPCODE_CHARACTER:
168 if (*sp == thread->pc->ch) {
169 break;
170 }
171 continue;
172 case REGEX_PROGRAM_OPCODE_ANY_CHARACTER:
173 if (*sp) {
174 break;
175 }
176 continue;
177 case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS:
178 if (cregex_char_class_contains(thread->pc->klass, *sp)) {
179 break;
180 }
181 continue;
182 case REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED:
183 if (!cregex_char_class_contains(thread->pc->klass, *sp)) {
184 break;
185 }
186 continue;
187
188 /* Control-flow */
189 case REGEX_PROGRAM_OPCODE_SPLIT:
190 case REGEX_PROGRAM_OPCODE_JUMP:
191 /* fall-through */
192
193 /* Assertions */
194 case REGEX_PROGRAM_OPCODE_ASSERT_BEGIN:
195 case REGEX_PROGRAM_OPCODE_ASSERT_END:
196 /* fall-through */
197
198 /* Saving */
199 case REGEX_PROGRAM_OPCODE_SAVE:
200 /* handled in vm_add_thread() */
201 abort();
202 }
203
204 vm_add_thread(next, program, thread->pc + 1, string, sp + 1,
205 thread->matches, nmatches);
206 }
207
208 /* swap current and next thread list */
209 vm_thread_list *swap = current;
210 current = next;
211 next = swap;
212 next->nthreads = 0;
213
214 /* done if no more threads are running or end of string reached */
215 if (current->nthreads == 0 || !*sp) {
216 break;
217 }
218 }
219
220 return matched;
221 }
222
cregex_program_run(const cregex_program_t * program,const char * string,const char ** matches,int nmatches)223 int cregex_program_run(
224 const cregex_program_t *program,
225 const char *string,
226 const char **matches,
227 int nmatches
228 ) {
229 return vm_run(program, string, matches, nmatches);
230 }
231