• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #include <stdbool.h>
2 #include <stdlib.h>
3 
4 #include "cregex.h"
5 
6 typedef struct {
7   cregex_program_instr_t *pc;
8   int ncaptures;
9 } regex_compile_context;
10 
count_instructions(const cregex_node_t * node)11 static int count_instructions(const cregex_node_t *node) {
12   switch (node->type) {
13     case REGEX_NODE_TYPE_EPSILON:
14       return 0;
15 
16     /* Characters */
17     case REGEX_NODE_TYPE_CHARACTER:
18     case REGEX_NODE_TYPE_ANY_CHARACTER:
19     case REGEX_NODE_TYPE_CHARACTER_CLASS:
20     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
21       return 1;
22 
23     /* Composites */
24     case REGEX_NODE_TYPE_CONCATENATION:
25       return count_instructions(node->left) + count_instructions(node->right);
26     case REGEX_NODE_TYPE_ALTERNATION:
27       return 2 + count_instructions(node->left)
28              + count_instructions(node->right);
29 
30     /* Quantifiers */
31     case REGEX_NODE_TYPE_QUANTIFIER: {
32       int num = count_instructions(node->quantified);
33       if (node->nmax >= node->nmin) {
34         return node->nmin * num + (node->nmax - node->nmin) * (num + 1);
35       }
36       return 1 + (node->nmin ? node->nmin * num : num + 1);
37     }
38 
39     /* Anchors */
40     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
41     case REGEX_NODE_TYPE_ANCHOR_END:
42       return 1;
43 
44     /* Captures */
45     case REGEX_NODE_TYPE_CAPTURE:
46       return 2 + count_instructions(node->captured);
47   }
48 
49   /* should not reach here */
50   return 0;
51 }
52 
node_is_anchored(const cregex_node_t * node)53 static bool node_is_anchored(const cregex_node_t *node) {
54   switch (node->type) {
55     case REGEX_NODE_TYPE_EPSILON:
56       return false;
57 
58     /* Characters */
59     case REGEX_NODE_TYPE_CHARACTER:
60     case REGEX_NODE_TYPE_ANY_CHARACTER:
61     case REGEX_NODE_TYPE_CHARACTER_CLASS:
62     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
63       return false;
64 
65     /* Composites */
66     case REGEX_NODE_TYPE_CONCATENATION:
67       return node_is_anchored(node->left);
68     case REGEX_NODE_TYPE_ALTERNATION:
69       return node_is_anchored(node->left) && node_is_anchored(node->right);
70 
71     /* Quantifiers */
72     case REGEX_NODE_TYPE_QUANTIFIER:
73       return node_is_anchored(node->quantified);
74 
75     /* Anchors */
76     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
77       return true;
78     case REGEX_NODE_TYPE_ANCHOR_END:
79       return false;
80 
81     /* Captures */
82     case REGEX_NODE_TYPE_CAPTURE:
83       return node_is_anchored(node->captured);
84   }
85 
86   /* should not reach here */
87   return false;
88 }
89 
emit(regex_compile_context * context,const cregex_program_instr_t * instruction)90 static inline cregex_program_instr_t* emit(
91   regex_compile_context        *context,
92   const cregex_program_instr_t *instruction
93   ) {
94   *context->pc = *instruction;
95   return context->pc++;
96 }
97 
compile_char_class(const cregex_node_t * node,cregex_program_instr_t * instruction)98 static cregex_program_instr_t* compile_char_class(
99   const cregex_node_t    *node,
100   cregex_program_instr_t *instruction
101   ) {
102   const char *sp = node->from;
103 
104   for ( ; ; ) {
105     int ch = *sp++;
106     switch (ch) {
107       case ']':
108         if (sp - 1 == node->from) {
109           goto CHARACTER;
110         }
111         return instruction;
112       case '\\':
113         ch = *sp++;
114       /* fall-through */
115       default:
116 CHARACTER:
117         if (*sp == '-' && sp[1] != ']') {
118           for ( ; ch <= sp[1]; ++ch) {
119             cregex_char_class_add(instruction->klass, ch);
120           }
121           sp += 2;
122         } else {
123           cregex_char_class_add(instruction->klass, ch);
124         }
125         break;
126     }
127   }
128 }
129 
compile_context(regex_compile_context * context,const cregex_node_t * node)130 static cregex_program_instr_t* compile_context(
131   regex_compile_context *context,
132   const cregex_node_t   *node
133   ) {
134   cregex_program_instr_t *bottom = context->pc, *split, *jump;
135   int ncaptures = context->ncaptures, capture;
136 
137   switch (node->type) {
138     case REGEX_NODE_TYPE_EPSILON:
139       break;
140 
141     /* Characters */
142     case REGEX_NODE_TYPE_CHARACTER:
143       emit(context,
144            &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_CHARACTER,
145                                        .ch = node->ch });
146       break;
147     case REGEX_NODE_TYPE_ANY_CHARACTER:
148       emit(context, &(cregex_program_instr_t) {
149       .opcode = REGEX_PROGRAM_OPCODE_ANY_CHARACTER
150     });
151       break;
152     case REGEX_NODE_TYPE_CHARACTER_CLASS:
153       compile_char_class(
154         node,
155         emit(context, &(cregex_program_instr_t) {
156       .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS
157     }));
158       break;
159     case REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED:
160       compile_char_class(
161         node,
162         emit(context,
163              &(cregex_program_instr_t) {
164       .opcode = REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED
165     }));
166       break;
167 
168     /* Composites */
169     case REGEX_NODE_TYPE_CONCATENATION:
170       compile_context(context, node->left);
171       compile_context(context, node->right);
172       break;
173     case REGEX_NODE_TYPE_ALTERNATION:
174       split = emit(context, &(cregex_program_instr_t) {
175       .opcode = REGEX_PROGRAM_OPCODE_SPLIT
176     });
177       split->first = compile_context(context, node->left);
178       jump = emit(context, &(cregex_program_instr_t) {
179       .opcode = REGEX_PROGRAM_OPCODE_JUMP
180     });
181       split->second = compile_context(context, node->right);
182       jump->target = context->pc;
183       break;
184 
185     /* Quantifiers */
186     case REGEX_NODE_TYPE_QUANTIFIER: {
187       cregex_program_instr_t *last = NULL;
188       for (int i = 0; i < node->nmin; ++i) {
189         context->ncaptures = ncaptures;
190         last = compile_context(context, node->quantified);
191       }
192       if (node->nmax > node->nmin) {
193         for (int i = 0; i < node->nmax - node->nmin; ++i) {
194           context->ncaptures = ncaptures;
195           split
196             = emit(context, &(cregex_program_instr_t) {
197             .opcode = REGEX_PROGRAM_OPCODE_SPLIT
198           });
199           split->first = compile_context(context, node->quantified);
200           split->second = context->pc;
201           if (!node->greedy) {
202             cregex_program_instr_t *swap = split->first;
203             split->first = split->second;
204             split->second = swap;
205           }
206         }
207       } else if (node->nmax == -1) {
208         split = emit(context, &(cregex_program_instr_t) {
209           .opcode = REGEX_PROGRAM_OPCODE_SPLIT
210         });
211         if (node->nmin == 0) {
212           split->first = compile_context(context, node->quantified);
213           jump = emit(context, &(cregex_program_instr_t) {
214             .opcode = REGEX_PROGRAM_OPCODE_JUMP
215           });
216           split->second = context->pc;
217           jump->target = split;
218         } else {
219           split->first = last;
220           split->second = context->pc;
221         }
222         if (!node->greedy) {
223           cregex_program_instr_t *swap = split->first;
224           split->first = split->second;
225           split->second = swap;
226         }
227       }
228       break;
229     }
230 
231     /* Anchors */
232     case REGEX_NODE_TYPE_ANCHOR_BEGIN:
233       emit(context, &(cregex_program_instr_t) {
234       .opcode = REGEX_PROGRAM_OPCODE_ASSERT_BEGIN
235     });
236       break;
237     case REGEX_NODE_TYPE_ANCHOR_END:
238       emit(context, &(cregex_program_instr_t) {
239       .opcode = REGEX_PROGRAM_OPCODE_ASSERT_END
240     });
241       break;
242 
243     /* Captures */
244     case REGEX_NODE_TYPE_CAPTURE:
245       capture = context->ncaptures++ *2;
246       emit(context,
247            &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_SAVE,
248                                        .save = capture });
249       compile_context(context, node->captured);
250       emit(context,
251            &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_SAVE,
252                                        .save = capture + 1 });
253       break;
254   }
255 
256   return bottom;
257 }
258 
259 /* Compile a parsed pattern (using a previously allocated program with at least
260  * estimate_instructions(root) instructions).
261  */
compile_node_with_program(const cregex_node_t * root,cregex_program_t * program)262 static cregex_program_t* compile_node_with_program(
263   const cregex_node_t *root,
264   cregex_program_t    *program
265   ) {
266   /* add capture node for entire match */
267   root = &(cregex_node_t) {
268     .type = REGEX_NODE_TYPE_CAPTURE,
269     .captured = (cregex_node_t*) root
270   };
271 
272   cregex_node_t naroot = (cregex_node_t) {
273     .type = REGEX_NODE_TYPE_CONCATENATION,
274     .left
275       = &(cregex_node_t) {
276       .type = REGEX_NODE_TYPE_QUANTIFIER,
277       .nmin = 0,
278       .nmax = -1,
279       .greedy = 0,
280       .quantified = &(
281         cregex_node_t) {
282         .type = REGEX_NODE_TYPE_ANY_CHARACTER
283       }
284       },
285     .right = (cregex_node_t*) root
286   };
287 
288   /* add .*? unless pattern starts with ^ */
289   if (!node_is_anchored(root)) {
290     root = &naroot;
291   }
292 
293   /* compile */
294   regex_compile_context *context
295     = &(regex_compile_context) {
296     .pc = program->instructions, .ncaptures = 0
297     };
298   compile_context(context, root);
299 
300   /* emit final match instruction */
301   emit(context,
302        &(cregex_program_instr_t) { .opcode = REGEX_PROGRAM_OPCODE_MATCH });
303 
304   /* set total number of instructions */
305   program->ninstructions = context->pc - program->instructions;
306   return program;
307 }
308 
309 /* Upper bound of number of instructions required to compile parsed pattern. */
estimate_instructions(const cregex_node_t * root)310 static int estimate_instructions(const cregex_node_t *root) {
311   return count_instructions(root)
312          /* .*? is added unless pattern starts with ^,
313           * save instructions are added for beginning and end of match,
314           * a final match instruction is added to the end of the program
315           */
316          + !node_is_anchored(root) * 3 + 2 + 1;
317 }
318 
cregex_compile_node(const cregex_node_t * root)319 cregex_program_t* cregex_compile_node(const cregex_node_t *root) {
320   size_t size = sizeof(cregex_program_t)
321                 + sizeof(cregex_program_instr_t) * estimate_instructions(root);
322   cregex_program_t *program;
323 
324   if (!(program = malloc(size))) {
325     return NULL;
326   }
327 
328   if (!compile_node_with_program(root, program)) {
329     free(program);
330     return NULL;
331   }
332 
333   return program;
334 }
335 
336 /* Free a compiled program */
cregex_compile_free(cregex_program_t * program)337 void cregex_compile_free(cregex_program_t *program) {
338   free(program);
339 }
340