• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 #ifndef _RURE_H
2 #define _RURE_H
3 
4 #include <stdbool.h>
5 #include <stdint.h>
6 #include <stdlib.h>
7 
8 #ifdef __cplusplus
9 extern "C" {
10 #endif
11 
12 /*
13  * rure is the type of a compiled regular expression.
14  *
15  * An rure can be safely used from multiple threads simultaneously.
16  */
17 typedef struct rure rure;
18 
19 /*
20  * rure_set is the type of a set of compiled regular expressions.
21  *
22  * A rure can be safely used from multiple threads simultaneously.
23  */
24 typedef struct rure_set rure_set;
25 
26 /*
27  * rure_options is the set of non-flag configuration options for compiling
28  * a regular expression. Currently, only two options are available: setting
29  * the size limit of the compiled program and setting the size limit of the
30  * cache of states that the DFA uses while searching.
31  *
32  * For most uses, the default settings will work fine, and NULL can be passed
33  * wherever a *rure_options is expected.
34 */
35 typedef struct rure_options rure_options;
36 
37 /*
38  * The flags listed below can be used in rure_compile to set the default
39  * flags. All flags can otherwise be toggled in the expression itself using
40  * standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)`
41  * disables it.
42  */
43 /* The case insensitive (i) flag. */
44 #define RURE_FLAG_CASEI (1 << 0)
45 /* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */
46 #define RURE_FLAG_MULTI (1 << 1)
47 /* The any character (s) flag. (. matches new line.) */
48 #define RURE_FLAG_DOTNL (1 << 2)
49 /* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */
50 #define RURE_FLAG_SWAP_GREED (1 << 3)
51 /* The ignore whitespace (x) flag. */
52 #define RURE_FLAG_SPACE (1 << 4)
53 /* The Unicode (u) flag. */
54 #define RURE_FLAG_UNICODE (1 << 5)
55 /* The default set of flags enabled when no flags are set. */
56 #define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE
57 
58 /*
59  * rure_match corresponds to the location of a single match in a haystack.
60  */
61 typedef struct rure_match {
62     /* The start position. */
63     size_t start;
64     /* The end position. */
65     size_t end;
66 } rure_match;
67 
68 /*
69  * rure_captures represents storage for sub-capture locations of a match.
70  *
71  * Computing the capture groups of a match can carry a significant performance
72  * penalty, so their use in the API is optional.
73  *
74  * An rure_captures value can be reused in multiple calls to rure_find_captures,
75  * so long as it is used with the compiled regular expression that created
76  * it.
77  *
78  * An rure_captures value may outlive its corresponding rure and can be freed
79  * independently.
80  *
81  * It is not safe to use from multiple threads simultaneously.
82  */
83 typedef struct rure_captures rure_captures;
84 
85 /*
86  * rure_iter is an iterator over successive non-overlapping matches in a
87  * particular haystack.
88  *
89  * An rure_iter value may not outlive its corresponding rure and should be freed
90  * before its corresponding rure is freed.
91  *
92  * It is not safe to use from multiple threads simultaneously.
93  */
94 typedef struct rure_iter rure_iter;
95 
96 /*
97  * rure_iter_capture_names is an iterator over the list of capture group names
98  * in this particular rure.
99  *
100  * An rure_iter_capture_names value may not outlive its corresponding rure,
101  * and should be freed before its corresponding rure is freed.
102  *
103  * It is not safe to use from multiple threads simultaneously.
104  */
105 typedef struct rure_iter_capture_names rure_iter_capture_names;
106 
107 /*
108  * rure_error is an error that caused compilation to fail.
109  *
110  * Most errors are syntax errors but an error can be returned if the compiled
111  * regular expression would be too big.
112  *
113  * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But
114  * you will not get access to the error if one occurred.)
115  *
116  * It is not safe to use from multiple threads simultaneously.
117  */
118 typedef struct rure_error rure_error;
119 
120 /*
121  * rure_compile_must compiles the given pattern into a regular expression. If
122  * compilation fails for any reason, an error message is printed to stderr and
123  * the process is aborted.
124  *
125  * The pattern given should be in UTF-8. For convenience, this accepts a C
126  * string, which means the pattern cannot usefully contain NUL. If your pattern
127  * may contain NUL, consider using a regular expression escape sequence, or
128  * just use rure_compile.
129  *
130  * This uses RURE_DEFAULT_FLAGS.
131  *
132  * The compiled expression returned may be used from multiple threads
133  * simultaneously.
134  */
135 rure *rure_compile_must(const char *pattern);
136 
137 /*
138  * rure_compile compiles the given pattern into a regular expression. The
139  * pattern must be valid UTF-8 and the length corresponds to the number of
140  * bytes in the pattern.
141  *
142  * flags is a bitfield. Valid values are constants declared with prefix
143  * RURE_FLAG_.
144  *
145  * options contains non-flag configuration settings. If it's NULL, default
146  * settings are used. options may be freed immediately after a call to
147  * rure_compile.
148  *
149  * error is set if there was a problem compiling the pattern (including if the
150  * pattern is not valid UTF-8). If error is NULL, then no error information
151  * is returned. In all cases, if an error occurs, NULL is returned.
152  *
153  * The compiled expression returned may be used from multiple threads
154  * simultaneously.
155  */
156 rure *rure_compile(const uint8_t *pattern, size_t length,
157                    uint32_t flags, rure_options *options,
158                    rure_error *error);
159 
160 /*
161  * rure_free frees the given compiled regular expression.
162  *
163  * This must be called at most once for any rure.
164  */
165 void rure_free(rure *re);
166 
167 /*
168  * rure_is_match returns true if and only if re matches anywhere in haystack.
169  *
170  * haystack may contain arbitrary bytes, but ASCII compatible text is more
171  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
172  * length should be the number of bytes in haystack.
173  *
174  * start is the position at which to start searching. Note that setting the
175  * start position is distinct from incrementing the pointer, since the regex
176  * engine may look at bytes before the start position to determine match
177  * information. For example, if the start position is greater than 0, then the
178  * \A ("begin text") anchor can never match.
179  *
180  * rure_is_match should be preferred to rure_find since it may be faster.
181  *
182  * N.B. The performance of this search is not impacted by the presence of
183  * capturing groups in your regular expression.
184  */
185 bool rure_is_match(rure *re, const uint8_t *haystack, size_t length,
186                    size_t start);
187 
188 /*
189  * rure_find returns true if and only if re matches anywhere in haystack.
190  * If a match is found, then its start and end offsets (in bytes) are set
191  * on the match pointer given.
192  *
193  * haystack may contain arbitrary bytes, but ASCII compatible text is more
194  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
195  * length should be the number of bytes in haystack.
196  *
197  * start is the position at which to start searching. Note that setting the
198  * start position is distinct from incrementing the pointer, since the regex
199  * engine may look at bytes before the start position to determine match
200  * information. For example, if the start position is greater than 0, then the
201  * \A ("begin text") anchor can never match.
202  *
203  * rure_find should be preferred to rure_find_captures since it may be faster.
204  *
205  * N.B. The performance of this search is not impacted by the presence of
206  * capturing groups in your regular expression.
207  */
208 bool rure_find(rure *re, const uint8_t *haystack, size_t length,
209                size_t start, rure_match *match);
210 
211 /*
212  * rure_find_captures returns true if and only if re matches anywhere in
213  * haystack. If a match is found, then all of its capture locations are stored
214  * in the captures pointer given.
215  *
216  * haystack may contain arbitrary bytes, but ASCII compatible text is more
217  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
218  * length should be the number of bytes in haystack.
219  *
220  * start is the position at which to start searching. Note that setting the
221  * start position is distinct from incrementing the pointer, since the regex
222  * engine may look at bytes before the start position to determine match
223  * information. For example, if the start position is greater than 0, then the
224  * \A ("begin text") anchor can never match.
225  *
226  * Only use this function if you specifically need access to capture locations.
227  * It is not necessary to use this function just because your regular
228  * expression contains capturing groups.
229  *
230  * Capture locations can be accessed using the rure_captures_* functions.
231  *
232  * N.B. The performance of this search can be impacted by the number of
233  * capturing groups. If you're using this function, it may be beneficial to
234  * use non-capturing groups (e.g., `(?:re)`) where possible.
235  */
236 bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length,
237                         size_t start, rure_captures *captures);
238 
239 /*
240  * rure_shortest_match returns true if and only if re matches anywhere in
241  * haystack. If a match is found, then its end location is stored in the
242  * pointer given. The end location is the place at which the regex engine
243  * determined that a match exists, but may occur before the end of the proper
244  * leftmost-first match.
245  *
246  * haystack may contain arbitrary bytes, but ASCII compatible text is more
247  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
248  * length should be the number of bytes in haystack.
249  *
250  * start is the position at which to start searching. Note that setting the
251  * start position is distinct from incrementing the pointer, since the regex
252  * engine may look at bytes before the start position to determine match
253  * information. For example, if the start position is greater than 0, then the
254  * \A ("begin text") anchor can never match.
255  *
256  * rure_shortest_match should be preferred to rure_find since it may be faster.
257  *
258  * N.B. The performance of this search is not impacted by the presence of
259  * capturing groups in your regular expression.
260  */
261 bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length,
262                          size_t start, size_t *end);
263 
264 /*
265  * rure_capture_name_index returns the capture index for the name given. If
266  * no such named capturing group exists in re, then -1 is returned.
267  *
268  * The capture index may be used with rure_captures_at.
269  *
270  * This function never returns 0 since the first capture group always
271  * corresponds to the entire match and is always unnamed.
272  */
273 int32_t rure_capture_name_index(rure *re, const char *name);
274 
275 /*
276  * rure_iter_capture_names_new creates a new capture_names iterator.
277  *
278  * An iterator will report all successive capture group names of re.
279  */
280 rure_iter_capture_names *rure_iter_capture_names_new(rure *re);
281 
282 /*
283  * rure_iter_capture_names_free frees the iterator given.
284  *
285  * It must be called at most once.
286  */
287 void rure_iter_capture_names_free(rure_iter_capture_names *it);
288 
289 /*
290  * rure_iter_capture_names_next advances the iterator and returns true
291  * if and only if another capture group name exists.
292  *
293  * The value of the capture group name is written to the provided pointer.
294  */
295 bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name);
296 
297 /*
298  * rure_iter_new creates a new iterator.
299  *
300  * An iterator will report all successive non-overlapping matches of re.
301  * When calling iterator functions, the same haystack and length must be
302  * supplied to all invocations. (Strict pointer equality is, however, not
303  * required.)
304  */
305 rure_iter *rure_iter_new(rure *re);
306 
307 /*
308  * rure_iter_free frees the iterator given.
309  *
310  * It must be called at most once.
311  */
312 void rure_iter_free(rure_iter *it);
313 
314 /*
315  * rure_iter_next advances the iterator and returns true if and only if a
316  * match was found. If a match is found, then the match pointer is set with the
317  * start and end location of the match, in bytes.
318  *
319  * If no match is found, then subsequent calls will return false indefinitely.
320  *
321  * haystack may contain arbitrary bytes, but ASCII compatible text is more
322  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
323  * length should be the number of bytes in haystack. The given haystack must
324  * be logically equivalent to all other haystacks given to this iterator.
325  *
326  * rure_iter_next should be preferred to rure_iter_next_captures since it may
327  * be faster.
328  *
329  * N.B. The performance of this search is not impacted by the presence of
330  * capturing groups in your regular expression.
331  */
332 bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length,
333                     rure_match *match);
334 
335 /*
336  * rure_iter_next_captures advances the iterator and returns true if and only if a
337  * match was found. If a match is found, then all of its capture locations are
338  * stored in the captures pointer given.
339  *
340  * If no match is found, then subsequent calls will return false indefinitely.
341  *
342  * haystack may contain arbitrary bytes, but ASCII compatible text is more
343  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
344  * length should be the number of bytes in haystack. The given haystack must
345  * be logically equivalent to all other haystacks given to this iterator.
346  *
347  * Only use this function if you specifically need access to capture locations.
348  * It is not necessary to use this function just because your regular
349  * expression contains capturing groups.
350  *
351  * Capture locations can be accessed using the rure_captures_* functions.
352  *
353  * N.B. The performance of this search can be impacted by the number of
354  * capturing groups. If you're using this function, it may be beneficial to
355  * use non-capturing groups (e.g., `(?:re)`) where possible.
356  */
357 bool rure_iter_next_captures(rure_iter *it,
358                              const uint8_t *haystack, size_t length,
359                              rure_captures *captures);
360 
361 /*
362  * rure_captures_new allocates storage for all capturing groups in re.
363  *
364  * An rure_captures value may be reused on subsequent calls to
365  * rure_find_captures or rure_iter_next_captures.
366  *
367  * An rure_captures value may be freed independently of re, although any
368  * particular rure_captures should be used only with the re given here.
369  *
370  * It is not safe to use an rure_captures value from multiple threads
371  * simultaneously.
372  */
373 rure_captures *rure_captures_new(rure *re);
374 
375 /*
376  * rure_captures_free frees the given captures.
377  *
378  * This must be called at most once.
379  */
380 void rure_captures_free(rure_captures *captures);
381 
382 /*
383  * rure_captures_at returns true if and only if the capturing group at the
384  * index given was part of a match. If so, the given match pointer is populated
385  * with the start and end location (in bytes) of the capturing group.
386  *
387  * If no capture group with the index i exists, then false is
388  * returned. (A capturing group exists if and only if i is less than
389  * rure_captures_len(captures).)
390  *
391  * Note that index 0 corresponds to the full match.
392  */
393 bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match);
394 
395 /*
396  * rure_captures_len returns the number of capturing groups in the given
397  * captures.
398  */
399 size_t rure_captures_len(rure_captures *captures);
400 
401 /*
402  * rure_options_new allocates space for options.
403  *
404  * Options may be freed immediately after a call to rure_compile, but otherwise
405  * may be freely used in multiple calls to rure_compile.
406  *
407  * It is not safe to set options from multiple threads simultaneously. It is
408  * safe to call rure_compile from multiple threads simultaneously using the
409  * same options pointer.
410  */
411 rure_options *rure_options_new();
412 
413 /*
414  * rure_options_free frees the given options.
415  *
416  * This must be called at most once.
417  */
418 void rure_options_free(rure_options *options);
419 
420 /*
421  * rure_options_size_limit sets the appoximate size limit of the compiled
422  * regular expression.
423  *
424  * This size limit roughly corresponds to the number of bytes occupied by a
425  * single compiled program. If the program would exceed this number, then a
426  * compilation error will be returned from rure_compile.
427  */
428 void rure_options_size_limit(rure_options *options, size_t limit);
429 
430 /*
431  * rure_options_dfa_size_limit sets the approximate size of the cache used by
432  * the DFA during search.
433  *
434  * This roughly corresponds to the number of bytes that the DFA will use while
435  * searching.
436  *
437  * Note that this is a *per thread* limit. There is no way to set a global
438  * limit. In particular, if a regular expression is used from multiple threads
439  * simultaneously, then each thread may use up to the number of bytes
440  * specified here.
441  */
442 void rure_options_dfa_size_limit(rure_options *options, size_t limit);
443 
444 /*
445  * rure_compile_set compiles the given list of patterns into a single regular
446  * expression which can be matched in a linear-scan. Each pattern in patterns
447  * must be valid UTF-8 and the length of each pattern in patterns corresponds
448  * to a byte length in patterns_lengths.
449  *
450  * The number of patterns to compile is specified by patterns_count. patterns
451  * must contain at least this many entries.
452  *
453  * flags is a bitfield. Valid values are constants declared with prefix
454  * RURE_FLAG_.
455  *
456  * options contains non-flag configuration settings. If it's NULL, default
457  * settings are used. options may be freed immediately after a call to
458  * rure_compile.
459  *
460  * error is set if there was a problem compiling the pattern.
461  *
462  * The compiled expression set returned may be used from multiple threads.
463  */
464 rure_set *rure_compile_set(const uint8_t **patterns,
465                            const size_t *patterns_lengths,
466                            size_t patterns_count,
467                            uint32_t flags,
468                            rure_options *options,
469                            rure_error *error);
470 
471 /*
472  * rure_set_free frees the given compiled regular expression set.
473  *
474  * This must be called at most once for any rure_set.
475  */
476 void rure_set_free(rure_set *re);
477 
478 /*
479  * rure_is_match returns true if and only if any regexes within the set
480  * match anywhere in the haystack. Once a match has been located, the
481  * matching engine will quit immediately.
482  *
483  * haystack may contain arbitrary bytes, but ASCII compatible text is more
484  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
485  * length should be the number of bytes in haystack.
486  *
487  * start is the position at which to start searching. Note that setting the
488  * start position is distinct from incrementing the pointer, since the regex
489  * engine may look at bytes before the start position to determine match
490  * information. For example, if the start position is greater than 0, then the
491  * \A ("begin text") anchor can never match.
492  */
493 bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length,
494                        size_t start);
495 
496 /*
497  * rure_set_matches compares each regex in the set against the haystack and
498  * modifies matches with the match result of each pattern. Match results are
499  * ordered in the same way as the rure_set was compiled. For example,
500  * index 0 of matches corresponds to the first pattern passed to
501  * `rure_compile_set`.
502  *
503  * haystack may contain arbitrary bytes, but ASCII compatible text is more
504  * useful. UTF-8 is even more useful. Other text encodings aren't supported.
505  * length should be the number of bytes in haystack.
506  *
507  * start is the position at which to start searching. Note that setting the
508  * start position is distinct from incrementing the pointer, since the regex
509  * engine may look at bytes before the start position to determine match
510  * information. For example, if the start position is greater than 0, then the
511  * \A ("begin text") anchor can never match.
512  *
513  * matches must be greater than or equal to the number of patterns the
514  * rure_set was compiled with.
515  *
516  * Only use this function if you specifically need to know which regexes
517  * matched within the set. To determine if any of the regexes matched without
518  * caring which, use rure_set_is_match.
519  */
520 bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length,
521                       size_t start, bool *matches);
522 
523 /*
524  * rure_set_len returns the number of patterns rure_set was compiled with.
525  */
526 size_t rure_set_len(rure_set *re);
527 
528 /*
529  * rure_error_new allocates space for an error.
530  *
531  * If error information is desired, then rure_error_new should be called
532  * to create an rure_error pointer, and that pointer can be passed to
533  * rure_compile. If an error occurred, then rure_compile will return NULL and
534  * the error pointer will be set. A message can then be extracted.
535  *
536  * It is not safe to use errors from multiple threads simultaneously. An error
537  * value may be reused on subsequent calls to rure_compile.
538  */
539 rure_error *rure_error_new();
540 
541 /*
542  * rure_error_free frees the error given.
543  *
544  * This must be called at most once.
545  */
546 void rure_error_free(rure_error *err);
547 
548 /*
549  * rure_error_message returns a NUL terminated string that describes the error
550  * message.
551  *
552  * The pointer returned must not be freed. Instead, it will be freed when
553  * rure_error_free is called. If err is used in subsequent calls to
554  * rure_compile, then this pointer may change or become invalid.
555  */
556 const char *rure_error_message(rure_error *err);
557 
558 /*
559  * rure_escape_must returns a NUL terminated string where all meta characters
560  * have been escaped. If escaping fails for any reason, an error message is
561  * printed to stderr and the process is aborted.
562  *
563  * The pattern given should be in UTF-8. For convenience, this accepts a C
564  * string, which means the pattern cannot contain a NUL byte. These correspond
565  * to the only two failure conditions of this function. That is, if the caller
566  * guarantees that the given pattern is valid UTF-8 and does not contain a
567  * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors).
568  *
569  * The pointer returned must not be freed directly. Instead, it should be freed
570  * by calling rure_cstring_free.
571  */
572 const char *rure_escape_must(const char *pattern);
573 
574 /*
575  * rure_cstring_free frees the string given.
576  *
577  * This must be called at most once per string.
578  */
579 void rure_cstring_free(char *s);
580 
581 #ifdef __cplusplus
582 }
583 #endif
584 
585 #endif
586