1 #include <assert.h>
2 #include <stdlib.h>
3 #include <string.h>
4
5 #include "cregex.h"
6
7 typedef struct {
8 const char *sp;
9 cregex_node_t *stack, *output;
10 } regex_parse_context;
11
12 /* Shunting-yard algorithm
13 * See https://en.wikipedia.org/wiki/Shunting-yard_algorithm
14 */
15
push(regex_parse_context * context,const cregex_node_t * node)16 static inline cregex_node_t* push(
17 regex_parse_context *context,
18 const cregex_node_t *node
19 ) {
20 assert(context->stack <= context->output);
21 *context->stack = *node;
22 return context->stack++;
23 }
24
drop(regex_parse_context * context)25 static inline cregex_node_t* drop(regex_parse_context *context) {
26 return --context->stack;
27 }
28
consume(regex_parse_context * context)29 static inline cregex_node_t* consume(regex_parse_context *context) {
30 *--context->output = *--context->stack;
31 return context->output;
32 }
33
concatenate(regex_parse_context * context,const cregex_node_t * bottom)34 static inline cregex_node_t* concatenate(
35 regex_parse_context *context,
36 const cregex_node_t *bottom
37 ) {
38 if (context->stack == bottom) {
39 push(context, &(cregex_node_t) { .type = REGEX_NODE_TYPE_EPSILON });
40 } else {
41 while (context->stack - 1 > bottom) {
42 cregex_node_t *right = consume(context);
43 cregex_node_t *left = consume(context);
44 push(context,
45 &(cregex_node_t) { .type = REGEX_NODE_TYPE_CONCATENATION,
46 .left = left,
47 .right = right });
48 }
49 }
50 return context->stack - 1;
51 }
52
parse_char_class(regex_parse_context * context)53 static cregex_node_t* parse_char_class(regex_parse_context *context) {
54 cregex_node_type type
55 = (*context->sp == '^')
56 ? (++context->sp, REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED)
57 : REGEX_NODE_TYPE_CHARACTER_CLASS;
58 const char *from = context->sp;
59
60 for ( ; ; ) {
61 int ch = *context->sp++;
62 switch (ch) {
63 case '\0':
64 /* premature end of character class */
65 return NULL;
66 case ']':
67 if (context->sp - 1 == from) {
68 goto CHARACTER;
69 }
70 return push(context,
71 &(cregex_node_t) {
72 .type = type, .from = from, .to = context->sp - 1
73 });
74 case '\\':
75 ch = *context->sp++;
76 /* fall-through */
77 default:
78 CHARACTER:
79 if (*context->sp == '-' && context->sp[1] != ']') {
80 if (context->sp[1] < ch) {
81 /* empty range in character class */
82 return NULL;
83 }
84 context->sp += 2;
85 }
86 break;
87 }
88 }
89 }
90
parse_interval(regex_parse_context * context)91 static cregex_node_t* parse_interval(regex_parse_context *context) {
92 const char *from = context->sp;
93 int nmin, nmax;
94
95 for (nmin = 0; *context->sp >= '0' && *context->sp <= '9'; ++context->sp)
96 nmin = (nmin * 10) + (*context->sp - '0');
97
98 if (*context->sp == ',') {
99 ++context->sp;
100 if (*from != ',' && *context->sp == '}') {
101 nmax = -1;
102 } else {
103 for (nmax = 0; *context->sp >= '0' && *context->sp <= '9';
104 ++context->sp)
105 nmax = (nmax * 10) + (*context->sp - '0');
106 if ( *(context->sp - 1) == ',' || *context->sp != '}'
107 || nmax < nmin) {
108 context->sp = from;
109 return NULL;
110 }
111 }
112 } else if (*from != '}' && *context->sp == '}') {
113 nmax = nmin;
114 } else {
115 context->sp = from;
116 return NULL;
117 }
118
119 ++context->sp;
120 return push(context,
121 &(cregex_node_t) {
122 .type = REGEX_NODE_TYPE_QUANTIFIER,
123 .nmin = nmin,
124 .nmax = nmax,
125 .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1,
126 .quantified = consume(context)
127 });
128 }
129
parse_context(regex_parse_context * context,int depth)130 static cregex_node_t* parse_context(regex_parse_context *context, int depth) {
131 cregex_node_t *bottom = context->stack;
132
133 for ( ; ; ) {
134 int ch = *context->sp++;
135 switch (ch) {
136 /* Characters */
137 case '\\':
138 ch = *context->sp++;
139 /* fall-through */
140 default:
141 CHARACTER:
142 push(context,
143 &(cregex_node_t) { .type = REGEX_NODE_TYPE_CHARACTER, .ch = ch });
144 break;
145 case '.':
146 push(context,
147 &(cregex_node_t) { .type = REGEX_NODE_TYPE_ANY_CHARACTER });
148 break;
149 case '[':
150 if (!parse_char_class(context)) {
151 return NULL;
152 }
153 break;
154
155 /* Composites */
156 case '|': {
157 cregex_node_t *left = concatenate(context, bottom), *right;
158 if (!(right = parse_context(context, depth))) {
159 return NULL;
160 }
161 if ( left->type == REGEX_NODE_TYPE_EPSILON
162 && right->type == left->type) {
163 drop(context);
164 } else if (left->type == REGEX_NODE_TYPE_EPSILON) {
165 right = consume(context);
166 drop(context);
167 push(context,
168 &(cregex_node_t) { .type = REGEX_NODE_TYPE_QUANTIFIER,
169 .nmin = 0,
170 .nmax = 1,
171 .greedy = 1,
172 .quantified = right });
173 } else if (right->type == REGEX_NODE_TYPE_EPSILON) {
174 drop(context);
175 left = consume(context);
176 push(context,
177 &(cregex_node_t) { .type = REGEX_NODE_TYPE_QUANTIFIER,
178 .nmin = 0,
179 .nmax = 1,
180 .greedy = 1,
181 .quantified = left });
182 } else {
183 right = consume(context);
184 left = consume(context);
185 push(context,
186 &(cregex_node_t) { .type = REGEX_NODE_TYPE_ALTERNATION,
187 .left = left,
188 .right = right });
189 }
190 return bottom;
191 }
192
193 #define QUANTIFIER(ch, min, max) \
194 case ch: \
195 if (context->stack == bottom) \
196 goto CHARACTER; \
197 push(context, \
198 &(cregex_node_t) { \
199 .type = REGEX_NODE_TYPE_QUANTIFIER, \
200 .nmin = min, \
201 .nmax = max, \
202 .greedy = (*context->sp == '?') ? (++context->sp, 0) : 1, \
203 .quantified = consume(context) \
204 } \
205 ); \
206 break
207
208 /* clang-format off */
209 /* Quantifiers */
210 QUANTIFIER('?', 0, 1);
211 QUANTIFIER('*', 0, -1);
212 QUANTIFIER('+', 1, -1);
213 /* clang-format on */
214 #undef QUANTIFIER
215
216 case '{':
217 if ((context->stack == bottom) || !parse_interval(context)) {
218 goto CHARACTER;
219 }
220 break;
221
222 /* Anchors */
223 case '^':
224 push(context,
225 &(cregex_node_t) { .type = REGEX_NODE_TYPE_ANCHOR_BEGIN });
226 break;
227 case '$':
228 push(context, &(cregex_node_t) { .type = REGEX_NODE_TYPE_ANCHOR_END });
229 break;
230
231 /* Captures */
232 case '(':
233 if (!parse_context(context, depth + 1)) {
234 return NULL;
235 }
236 push(context, &(cregex_node_t) { .type = REGEX_NODE_TYPE_CAPTURE,
237 .captured = consume(context) });
238 break;
239 case ')':
240 if (depth > 0) {
241 return concatenate(context, bottom);
242 }
243 /* unmatched close parenthesis */
244 return NULL;
245
246 /* End of string */
247 case '\0':
248 if (depth == 0) {
249 return concatenate(context, bottom);
250 }
251 /* unmatched open parenthesis */
252 return NULL;
253 }
254 }
255 }
256
estimate_nodes(const char * pattern)257 static inline int estimate_nodes(const char *pattern) {
258 return strlen(pattern) * 2;
259 }
260
261 /* Parse a pattern (using a previously allocated buffer of at least
262 * estimate_nodes(pattern) nodes).
263 */
parse_with_nodes(const char * pattern,cregex_node_t * nodes)264 static cregex_node_t* parse_with_nodes(
265 const char *pattern,
266 cregex_node_t *nodes
267 ) {
268 regex_parse_context *context
269 = &(regex_parse_context) {
270 .sp = pattern,
271 .stack = nodes,
272 .output = nodes + estimate_nodes(pattern)
273 };
274 return parse_context(context, 0);
275 }
276
cregex_parse(const char * pattern)277 cregex_node_t* cregex_parse(const char *pattern) {
278 size_t size = sizeof(cregex_node_t) * estimate_nodes(pattern);
279 cregex_node_t *nodes = malloc(size);
280 if (!nodes) {
281 return NULL;
282 }
283
284 if (!parse_with_nodes(pattern, nodes)) {
285 free(nodes);
286 return NULL;
287 }
288
289 return nodes;
290 }
291
cregex_parse_free(cregex_node_t * root)292 void cregex_parse_free(cregex_node_t *root) {
293 free(root);
294 }
295