1 #include <stdbool.h>
2 #include <stdlib.h>
3
4 #include "cregex.h"
5
6 typedef struct {
7 cregex_program_instr_t *pc;
8 int ncaptures;
9 } regex_compile_context;
10
count_instructions(const cregex_node_t * node)11 static int count_instructions(const cregex_node_t *node) {
12 switch (node->type) {
13 case REGEX_NODE_TYPE_EPSILON:
14 return 0;
15
16 /* Characters */
17 case REGEX_NODE_TYPE_CHARACTER:
18 case REGEX_NODE_TYPE_ANY_CHARACTER:
19 case REGEX_NODE_TYPE_CHARACTER_CLASS:
20 case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
21 return 1;
22
23 /* Composites */
24 case REGEX_NODE_TYPE_CONCATENATION:
25 return count_instructions(node->left) + count_instructions(node->right);
26 case REGEX_NODE_TYPE_ALTERNATION:
27 return 2 + count_instructions(node->left)
28 + count_instructions(node->right);
29
30 /* Quantifiers */
31 case REGEX_NODE_TYPE_QUANTIFIER: {
32 int num = count_instructions(node->quantified);
33 if (node->nmax >= node->nmin) {
34 return node->nmin * num + (node->nmax - node->nmin) * (num + 1);
35 }
36 return 1 + (node->nmin ? node->nmin * num : num + 1);
37 }
38
39 /* Anchors */
40 case REGEX_NODE_TYPE_ANCHOR_BEGIN:
41 case REGEX_NODE_TYPE_ANCHOR_END:
42 return 1;
43
44 /* Captures */
45 case REGEX_NODE_TYPE_CAPTURE:
46 return 2 + count_instructions(node->captured);
47 }
48
49 /* should not reach here */
50 return 0;
51 }
52
node_is_anchored(const cregex_node_t * node)53 static bool node_is_anchored(const cregex_node_t *node) {
54 switch (node->type) {
55 case REGEX_NODE_TYPE_EPSILON:
56 return false;
57
58 /* Characters */
59 case REGEX_NODE_TYPE_CHARACTER:
60 case REGEX_NODE_TYPE_ANY_CHARACTER:
61 case REGEX_NODE_TYPE_CHARACTER_CLASS:
62 case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
63 return false;
64
65 /* Composites */
66 case REGEX_NODE_TYPE_CONCATENATION:
67 return node_is_anchored(node->left);
68 case REGEX_NODE_TYPE_ALTERNATION:
69 return node_is_anchored(node->left) && node_is_anchored(node->right);
70
71 /* Quantifiers */
72 case REGEX_NODE_TYPE_QUANTIFIER:
73 return node_is_anchored(node->quantified);
74
75 /* Anchors */
76 case REGEX_NODE_TYPE_ANCHOR_BEGIN:
77 return true;
78 case REGEX_NODE_TYPE_ANCHOR_END:
79 return false;
80
81 /* Captures */
82 case REGEX_NODE_TYPE_CAPTURE:
83 return node_is_anchored(node->captured);
84 }
85
86 /* should not reach here */
87 return false;
88 }
89
emit(regex_compile_context * context,const cregex_program_instr_t * instruction)90 static inline cregex_program_instr_t* emit(
91 regex_compile_context *context,
92 const cregex_program_instr_t *instruction
93 ) {
94 *context->pc = *instruction;
95 return context->pc++;
96 }
97
compile_char_class(const cregex_node_t * node,cregex_program_instr_t * instruction)98 static cregex_program_instr_t* compile_char_class(
99 const cregex_node_t *node,
100 cregex_program_instr_t *instruction
101 ) {
102 const char *sp = node->from;
103
104 for ( ; ; ) {
105 int ch = *sp++;
106 switch (ch) {
107 case ']':
108 if (sp - 1 == node->from) {
109 goto CHARACTER;
110 }
111 return instruction;
112 case '\\':
113 ch = *sp++;
114 /* fall-through */
115 default:
116 CHARACTER:
117 if (*sp == '-' && sp[1] != ']') {
118 for ( ; ch <= sp[1]; ++ch) {
119 cregex_char_class_add(instruction->klass, ch);
120 }
121 sp += 2;
122 } else {
123 cregex_char_class_add(instruction->klass, ch);
124 }
125 break;
126 }
127 }
128 }
129
compile_context(regex_compile_context * context,const cregex_node_t * node)130 static cregex_program_instr_t* compile_context(
131 regex_compile_context *context,
132 const cregex_node_t *node
133 ) {
134 cregex_program_instr_t *bottom = context->pc, *split, *jump;
135 int ncaptures = context->ncaptures, capture;
136
137 switch (node->type) {
138 case REGEX_NODE_TYPE_EPSILON:
139 break;
140
141 /* Characters */
142 case REGEX_NODE_TYPE_CHARACTER:
143 emit(context,
144 &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_CHARACTER,
145 .ch = node->ch });
146 break;
147 case REGEX_NODE_TYPE_ANY_CHARACTER:
148 emit(context, &(cregex_program_instr_t) {
149 .opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER
150 });
151 break;
152 case REGEX_NODE_TYPE_CHARACTER_CLASS:
153 compile_char_class(
154 node,
155 emit(context, &(cregex_program_instr_t) {
156 .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS
157 }));
158 break;
159 case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
160 compile_char_class(
161 node,
162 emit(context,
163 &(cregex_program_instr_t) {
164 .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED
165 }));
166 break;
167
168 /* Composites */
169 case REGEX_NODE_TYPE_CONCATENATION:
170 compile_context(context, node->left);
171 compile_context(context, node->right);
172 break;
173 case REGEX_NODE_TYPE_ALTERNATION:
174 split = emit(context, &(cregex_program_instr_t) {
175 .opcode = REGEX_PROGRAM_OPCODE_SPLIT
176 });
177 split->first = compile_context(context, node->left);
178 jump = emit(context, &(cregex_program_instr_t) {
179 .opcode = REGEX_PROGRAM_OPCODE_JUMP
180 });
181 split->second = compile_context(context, node->right);
182 jump->target = context->pc;
183 break;
184
185 /* Quantifiers */
186 case REGEX_NODE_TYPE_QUANTIFIER: {
187 cregex_program_instr_t *last = NULL;
188 for (int i = 0; i < node->nmin; ++i) {
189 context->ncaptures = ncaptures;
190 last = compile_context(context, node->quantified);
191 }
192 if (node->nmax > node->nmin) {
193 for (int i = 0; i < node->nmax - node->nmin; ++i) {
194 context->ncaptures = ncaptures;
195 split
196 = emit(context, &(cregex_program_instr_t) {
197 .opcode = REGEX_PROGRAM_OPCODE_SPLIT
198 });
199 split->first = compile_context(context, node->quantified);
200 split->second = context->pc;
201 if (!node->greedy) {
202 cregex_program_instr_t *swap = split->first;
203 split->first = split->second;
204 split->second = swap;
205 }
206 }
207 } else if (node->nmax == -1) {
208 split = emit(context, &(cregex_program_instr_t) {
209 .opcode = REGEX_PROGRAM_OPCODE_SPLIT
210 });
211 if (node->nmin == 0) {
212 split->first = compile_context(context, node->quantified);
213 jump = emit(context, &(cregex_program_instr_t) {
214 .opcode = REGEX_PROGRAM_OPCODE_JUMP
215 });
216 split->second = context->pc;
217 jump->target = split;
218 } else {
219 split->first = last;
220 split->second = context->pc;
221 }
222 if (!node->greedy) {
223 cregex_program_instr_t *swap = split->first;
224 split->first = split->second;
225 split->second = swap;
226 }
227 }
228 break;
229 }
230
231 /* Anchors */
232 case REGEX_NODE_TYPE_ANCHOR_BEGIN:
233 emit(context, &(cregex_program_instr_t) {
234 .opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN
235 });
236 break;
237 case REGEX_NODE_TYPE_ANCHOR_END:
238 emit(context, &(cregex_program_instr_t) {
239 .opcode = REGEX_PROGRAM_OPCODE_ASSERT_END
240 });
241 break;
242
243 /* Captures */
244 case REGEX_NODE_TYPE_CAPTURE:
245 capture = context->ncaptures++ *2;
246 emit(context,
247 &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_SAVE,
248 .save = capture });
249 compile_context(context, node->captured);
250 emit(context,
251 &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_SAVE,
252 .save = capture + 1 });
253 break;
254 }
255
256 return bottom;
257 }
258
259 /* Compile a parsed pattern (using a previously allocated program with at least
260 * estimate_instructions(root) instructions).
261 */
compile_node_with_program(const cregex_node_t * root,cregex_program_t * program)262 static cregex_program_t* compile_node_with_program(
263 const cregex_node_t *root,
264 cregex_program_t *program
265 ) {
266 /* add capture node for entire match */
267 root = &(cregex_node_t) {
268 .type = REGEX_NODE_TYPE_CAPTURE,
269 .captured = (cregex_node_t*) root
270 };
271
272 cregex_node_t naroot = (cregex_node_t) {
273 .type = REGEX_NODE_TYPE_CONCATENATION,
274 .left
275 = &(cregex_node_t) {
276 .type = REGEX_NODE_TYPE_QUANTIFIER,
277 .nmin = 0,
278 .nmax = -1,
279 .greedy = 0,
280 .quantified = &(
281 cregex_node_t) {
282 .type = REGEX_NODE_TYPE_ANY_CHARACTER
283 }
284 },
285 .right = (cregex_node_t*) root
286 };
287
288 /* add .*? unless pattern starts with ^ */
289 if (!node_is_anchored(root)) {
290 root = &naroot;
291 }
292
293 /* compile */
294 regex_compile_context *context
295 = &(regex_compile_context) {
296 .pc = program->instructions, .ncaptures = 0
297 };
298 compile_context(context, root);
299
300 /* emit final match instruction */
301 emit(context,
302 &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_MATCH });
303
304 /* set total number of instructions */
305 program->ninstructions = context->pc - program->instructions;
306 return program;
307 }
308
309 /* Upper bound of number of instructions required to compile parsed pattern. */
estimate_instructions(const cregex_node_t * root)310 static int estimate_instructions(const cregex_node_t *root) {
311 return count_instructions(root)
312 /* .*? is added unless pattern starts with ^,
313 * save instructions are added for beginning and end of match,
314 * a final match instruction is added to the end of the program
315 */
316 + !node_is_anchored(root) * 3 + 2 + 1;
317 }
318
cregex_compile_node(const cregex_node_t * root)319 cregex_program_t* cregex_compile_node(const cregex_node_t *root) {
320 size_t size = sizeof(cregex_program_t)
321 + sizeof(cregex_program_instr_t) * estimate_instructions(root);
322 cregex_program_t *program;
323
324 if (!(program = malloc(size))) {
325 return NULL;
326 }
327
328 if (!compile_node_with_program(root, program)) {
329 free(program);
330 return NULL;
331 }
332
333 return program;
334 }
335
336 /* Free a compiled program */
cregex_compile_free(cregex_program_t * program)337 void cregex_compile_free(cregex_program_t *program) {
338 free(program);
339 }
340