1 #ifndef CREGEX_H
2 #define CREGEX_H
3
4 typedef enum {
5 REGEX_NODE_TYPE_EPSILON = 0,
6 /* Characters */
7 REGEX_NODE_TYPE_CHARACTER,
8 REGEX_NODE_TYPE_ANY_CHARACTER,
9 REGEX_NODE_TYPE_CHARACTER_CLASS,
10 REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED,
11 /* Composites */
12 REGEX_NODE_TYPE_CONCATENATION,
13 REGEX_NODE_TYPE_ALTERNATION,
14 /* Quantifiers */
15 REGEX_NODE_TYPE_QUANTIFIER,
16 /* Anchors */
17 REGEX_NODE_TYPE_ANCHOR_BEGIN,
18 REGEX_NODE_TYPE_ANCHOR_END,
19 /* Captures */
20 REGEX_NODE_TYPE_CAPTURE,
21 } cregex_node_type;
22
23 typedef struct cregex_node {
24 cregex_node_type type;
25 union {
26 /* REGEX_NODE_TYPE_CHARACTER */
27 struct {
28 int ch;
29 };
30 /* REGEX_NODE_TYPE_CHARACTER_CLASS,
31 * REGEX_NODE_TYPE_CHARACTER_CLASS_NEGATED
32 */
33 struct {
34 const char *from, *to;
35 };
36 /* REGEX_NODE_TYPE_QUANTIFIER */
37 struct {
38 int nmin, nmax, greedy;
39 struct cregex_node *quantified;
40 };
41 /* REGEX_NODE_TYPE_CONCATENATION,
42 * REGEX_NODE_TYPE_ALTERNATION
43 */
44 struct {
45 struct cregex_node *left, *right;
46 };
47 /* REGEX_NODE_TYPE_CAPTURE */
48 struct {
49 struct cregex_node *captured;
50 };
51 };
52 } cregex_node_t;
53
54 typedef enum {
55 REGEX_PROGRAM_OPCODE_MATCH = 0,
56 /* Characters */
57 REGEX_PROGRAM_OPCODE_CHARACTER,
58 REGEX_PROGRAM_OPCODE_ANY_CHARACTER,
59 REGEX_PROGRAM_OPCODE_CHARACTER_CLASS,
60 REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED,
61 /* Control-flow */
62 REGEX_PROGRAM_OPCODE_SPLIT,
63 REGEX_PROGRAM_OPCODE_JUMP,
64 /* Assertions */
65 REGEX_PROGRAM_OPCODE_ASSERT_BEGIN,
66 REGEX_PROGRAM_OPCODE_ASSERT_END,
67 /* Saving */
68 REGEX_PROGRAM_OPCODE_SAVE,
69 } cregex_program_opcode_t;
70
71 #include <limits.h>
72
73 typedef char cregex_char_class[(UCHAR_MAX + CHAR_BIT - 1) / CHAR_BIT];
74
cregex_char_class_contains(const cregex_char_class klass,int ch)75 static inline int cregex_char_class_contains(
76 const cregex_char_class klass,
77 int ch
78 ) {
79 return klass[ch / CHAR_BIT] & (1 << ch % CHAR_BIT);
80 }
81
cregex_char_class_add(cregex_char_class klass,int ch)82 static inline int cregex_char_class_add(cregex_char_class klass, int ch) {
83 klass[ch / CHAR_BIT] |= 1 << (ch % CHAR_BIT);
84 return ch;
85 }
86
87 typedef struct cregex_program_instr {
88 cregex_program_opcode_t opcode;
89 union {
90 /* REGEX_PROGRAM_OPCODE_CHARACTER */
91 struct {
92 int ch;
93 };
94 /* REGEX_PROGRAM_OPCODE_CHARACTER_CLASS,
95 * REGEX_PROGRAM_OPCODE_CHARACTER_CLASS_NEGATED
96 */
97 struct {
98 cregex_char_class klass;
99 };
100 /* REGEX_PROGRAM_OPCODE_SPLIT */
101 struct {
102 struct cregex_program_instr *first, *second;
103 };
104 /* REGEX_PROGRAM_OPCODE_JUMP */
105 struct {
106 struct cregex_program_instr *target;
107 };
108 /* REGEX_PROGRAM_OPCODE_SAVE */
109 struct {
110 int save;
111 };
112 };
113 } cregex_program_instr_t;
114
115 typedef struct {
116 int ninstructions;
117 cregex_program_instr_t instructions[];
118 } cregex_program_t;
119
120 /* Run program on string */
121 int cregex_program_run(
122 const cregex_program_t *program,
123 const char *string,
124 const char **matches,
125 int nmatches);
126
127 /* Compile a parsed pattern */
128 cregex_program_t* cregex_compile_node(const cregex_node_t *root);
129
130 /* Free a compiled program */
131 void cregex_compile_free(cregex_program_t *program);
132
133 /* Parse a pattern */
134 cregex_node_t* cregex_parse(const char *pattern);
135
136 /* Free a parsed pattern */
137 void cregex_parse_free(cregex_node_t *root);
138
139 #endif
140