1 #ifndef _RURE_H 2 #define _RURE_H 3 4 #include <stdbool.h> 5 #include <stdint.h> 6 #include <stdlib.h> 7 8 #ifdef __cplusplus 9 extern "C" { 10 #endif 11 12 /* 13 * rure is the type of a compiled regular expression. 14 * 15 * An rure can be safely used from multiple threads simultaneously. 16 */ 17 typedef struct rure rure; 18 19 /* 20 * rure_set is the type of a set of compiled regular expressions. 21 * 22 * A rure can be safely used from multiple threads simultaneously. 23 */ 24 typedef struct rure_set rure_set; 25 26 /* 27 * rure_options is the set of non-flag configuration options for compiling 28 * a regular expression. Currently, only two options are available: setting 29 * the size limit of the compiled program and setting the size limit of the 30 * cache of states that the DFA uses while searching. 31 * 32 * For most uses, the default settings will work fine, and NULL can be passed 33 * wherever a *rure_options is expected. 34 */ 35 typedef struct rure_options rure_options; 36 37 /* 38 * The flags listed below can be used in rure_compile to set the default 39 * flags. All flags can otherwise be toggled in the expression itself using 40 * standard syntax, e.g., `(?i)` turns case insensitive matching on and `(?-i)` 41 * disables it. 42 */ 43 /* The case insensitive (i) flag. */ 44 #define RURE_FLAG_CASEI (1 << 0) 45 /* The multi-line matching (m) flag. (^ and $ match new line boundaries.) */ 46 #define RURE_FLAG_MULTI (1 << 1) 47 /* The any character (s) flag. (. matches new line.) */ 48 #define RURE_FLAG_DOTNL (1 << 2) 49 /* The greedy swap (U) flag. (e.g., + is ungreedy and +? is greedy.) */ 50 #define RURE_FLAG_SWAP_GREED (1 << 3) 51 /* The ignore whitespace (x) flag. */ 52 #define RURE_FLAG_SPACE (1 << 4) 53 /* The Unicode (u) flag. */ 54 #define RURE_FLAG_UNICODE (1 << 5) 55 /* The default set of flags enabled when no flags are set. */ 56 #define RURE_DEFAULT_FLAGS RURE_FLAG_UNICODE 57 58 /* 59 * rure_match corresponds to the location of a single match in a haystack. 60 */ 61 typedef struct rure_match { 62 /* The start position. */ 63 size_t start; 64 /* The end position. */ 65 size_t end; 66 } rure_match; 67 68 /* 69 * rure_captures represents storage for sub-capture locations of a match. 70 * 71 * Computing the capture groups of a match can carry a significant performance 72 * penalty, so their use in the API is optional. 73 * 74 * An rure_captures value can be reused in multiple calls to rure_find_captures, 75 * so long as it is used with the compiled regular expression that created 76 * it. 77 * 78 * An rure_captures value may outlive its corresponding rure and can be freed 79 * independently. 80 * 81 * It is not safe to use from multiple threads simultaneously. 82 */ 83 typedef struct rure_captures rure_captures; 84 85 /* 86 * rure_iter is an iterator over successive non-overlapping matches in a 87 * particular haystack. 88 * 89 * An rure_iter value may not outlive its corresponding rure and should be freed 90 * before its corresponding rure is freed. 91 * 92 * It is not safe to use from multiple threads simultaneously. 93 */ 94 typedef struct rure_iter rure_iter; 95 96 /* 97 * rure_iter_capture_names is an iterator over the list of capture group names 98 * in this particular rure. 99 * 100 * An rure_iter_capture_names value may not outlive its corresponding rure, 101 * and should be freed before its corresponding rure is freed. 102 * 103 * It is not safe to use from multiple threads simultaneously. 104 */ 105 typedef struct rure_iter_capture_names rure_iter_capture_names; 106 107 /* 108 * rure_error is an error that caused compilation to fail. 109 * 110 * Most errors are syntax errors but an error can be returned if the compiled 111 * regular expression would be too big. 112 * 113 * Whenever a function accepts an *rure_error, it is safe to pass NULL. (But 114 * you will not get access to the error if one occurred.) 115 * 116 * It is not safe to use from multiple threads simultaneously. 117 */ 118 typedef struct rure_error rure_error; 119 120 /* 121 * rure_compile_must compiles the given pattern into a regular expression. If 122 * compilation fails for any reason, an error message is printed to stderr and 123 * the process is aborted. 124 * 125 * The pattern given should be in UTF-8. For convenience, this accepts a C 126 * string, which means the pattern cannot usefully contain NUL. If your pattern 127 * may contain NUL, consider using a regular expression escape sequence, or 128 * just use rure_compile. 129 * 130 * This uses RURE_DEFAULT_FLAGS. 131 * 132 * The compiled expression returned may be used from multiple threads 133 * simultaneously. 134 */ 135 rure *rure_compile_must(const char *pattern); 136 137 /* 138 * rure_compile compiles the given pattern into a regular expression. The 139 * pattern must be valid UTF-8 and the length corresponds to the number of 140 * bytes in the pattern. 141 * 142 * flags is a bitfield. Valid values are constants declared with prefix 143 * RURE_FLAG_. 144 * 145 * options contains non-flag configuration settings. If it's NULL, default 146 * settings are used. options may be freed immediately after a call to 147 * rure_compile. 148 * 149 * error is set if there was a problem compiling the pattern (including if the 150 * pattern is not valid UTF-8). If error is NULL, then no error information 151 * is returned. In all cases, if an error occurs, NULL is returned. 152 * 153 * The compiled expression returned may be used from multiple threads 154 * simultaneously. 155 */ 156 rure *rure_compile(const uint8_t *pattern, size_t length, 157 uint32_t flags, rure_options *options, 158 rure_error *error); 159 160 /* 161 * rure_free frees the given compiled regular expression. 162 * 163 * This must be called at most once for any rure. 164 */ 165 void rure_free(rure *re); 166 167 /* 168 * rure_is_match returns true if and only if re matches anywhere in haystack. 169 * 170 * haystack may contain arbitrary bytes, but ASCII compatible text is more 171 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 172 * length should be the number of bytes in haystack. 173 * 174 * start is the position at which to start searching. Note that setting the 175 * start position is distinct from incrementing the pointer, since the regex 176 * engine may look at bytes before the start position to determine match 177 * information. For example, if the start position is greater than 0, then the 178 * \A ("begin text") anchor can never match. 179 * 180 * rure_is_match should be preferred to rure_find since it may be faster. 181 * 182 * N.B. The performance of this search is not impacted by the presence of 183 * capturing groups in your regular expression. 184 */ 185 bool rure_is_match(rure *re, const uint8_t *haystack, size_t length, 186 size_t start); 187 188 /* 189 * rure_find returns true if and only if re matches anywhere in haystack. 190 * If a match is found, then its start and end offsets (in bytes) are set 191 * on the match pointer given. 192 * 193 * haystack may contain arbitrary bytes, but ASCII compatible text is more 194 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 195 * length should be the number of bytes in haystack. 196 * 197 * start is the position at which to start searching. Note that setting the 198 * start position is distinct from incrementing the pointer, since the regex 199 * engine may look at bytes before the start position to determine match 200 * information. For example, if the start position is greater than 0, then the 201 * \A ("begin text") anchor can never match. 202 * 203 * rure_find should be preferred to rure_find_captures since it may be faster. 204 * 205 * N.B. The performance of this search is not impacted by the presence of 206 * capturing groups in your regular expression. 207 */ 208 bool rure_find(rure *re, const uint8_t *haystack, size_t length, 209 size_t start, rure_match *match); 210 211 /* 212 * rure_find_captures returns true if and only if re matches anywhere in 213 * haystack. If a match is found, then all of its capture locations are stored 214 * in the captures pointer given. 215 * 216 * haystack may contain arbitrary bytes, but ASCII compatible text is more 217 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 218 * length should be the number of bytes in haystack. 219 * 220 * start is the position at which to start searching. Note that setting the 221 * start position is distinct from incrementing the pointer, since the regex 222 * engine may look at bytes before the start position to determine match 223 * information. For example, if the start position is greater than 0, then the 224 * \A ("begin text") anchor can never match. 225 * 226 * Only use this function if you specifically need access to capture locations. 227 * It is not necessary to use this function just because your regular 228 * expression contains capturing groups. 229 * 230 * Capture locations can be accessed using the rure_captures_* functions. 231 * 232 * N.B. The performance of this search can be impacted by the number of 233 * capturing groups. If you're using this function, it may be beneficial to 234 * use non-capturing groups (e.g., `(?:re)`) where possible. 235 */ 236 bool rure_find_captures(rure *re, const uint8_t *haystack, size_t length, 237 size_t start, rure_captures *captures); 238 239 /* 240 * rure_shortest_match returns true if and only if re matches anywhere in 241 * haystack. If a match is found, then its end location is stored in the 242 * pointer given. The end location is the place at which the regex engine 243 * determined that a match exists, but may occur before the end of the proper 244 * leftmost-first match. 245 * 246 * haystack may contain arbitrary bytes, but ASCII compatible text is more 247 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 248 * length should be the number of bytes in haystack. 249 * 250 * start is the position at which to start searching. Note that setting the 251 * start position is distinct from incrementing the pointer, since the regex 252 * engine may look at bytes before the start position to determine match 253 * information. For example, if the start position is greater than 0, then the 254 * \A ("begin text") anchor can never match. 255 * 256 * rure_shortest_match should be preferred to rure_find since it may be faster. 257 * 258 * N.B. The performance of this search is not impacted by the presence of 259 * capturing groups in your regular expression. 260 */ 261 bool rure_shortest_match(rure *re, const uint8_t *haystack, size_t length, 262 size_t start, size_t *end); 263 264 /* 265 * rure_capture_name_index returns the capture index for the name given. If 266 * no such named capturing group exists in re, then -1 is returned. 267 * 268 * The capture index may be used with rure_captures_at. 269 * 270 * This function never returns 0 since the first capture group always 271 * corresponds to the entire match and is always unnamed. 272 */ 273 int32_t rure_capture_name_index(rure *re, const char *name); 274 275 /* 276 * rure_iter_capture_names_new creates a new capture_names iterator. 277 * 278 * An iterator will report all successive capture group names of re. 279 */ 280 rure_iter_capture_names *rure_iter_capture_names_new(rure *re); 281 282 /* 283 * rure_iter_capture_names_free frees the iterator given. 284 * 285 * It must be called at most once. 286 */ 287 void rure_iter_capture_names_free(rure_iter_capture_names *it); 288 289 /* 290 * rure_iter_capture_names_next advances the iterator and returns true 291 * if and only if another capture group name exists. 292 * 293 * The value of the capture group name is written to the provided pointer. 294 */ 295 bool rure_iter_capture_names_next(rure_iter_capture_names *it, char **name); 296 297 /* 298 * rure_iter_new creates a new iterator. 299 * 300 * An iterator will report all successive non-overlapping matches of re. 301 * When calling iterator functions, the same haystack and length must be 302 * supplied to all invocations. (Strict pointer equality is, however, not 303 * required.) 304 */ 305 rure_iter *rure_iter_new(rure *re); 306 307 /* 308 * rure_iter_free frees the iterator given. 309 * 310 * It must be called at most once. 311 */ 312 void rure_iter_free(rure_iter *it); 313 314 /* 315 * rure_iter_next advances the iterator and returns true if and only if a 316 * match was found. If a match is found, then the match pointer is set with the 317 * start and end location of the match, in bytes. 318 * 319 * If no match is found, then subsequent calls will return false indefinitely. 320 * 321 * haystack may contain arbitrary bytes, but ASCII compatible text is more 322 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 323 * length should be the number of bytes in haystack. The given haystack must 324 * be logically equivalent to all other haystacks given to this iterator. 325 * 326 * rure_iter_next should be preferred to rure_iter_next_captures since it may 327 * be faster. 328 * 329 * N.B. The performance of this search is not impacted by the presence of 330 * capturing groups in your regular expression. 331 */ 332 bool rure_iter_next(rure_iter *it, const uint8_t *haystack, size_t length, 333 rure_match *match); 334 335 /* 336 * rure_iter_next_captures advances the iterator and returns true if and only if a 337 * match was found. If a match is found, then all of its capture locations are 338 * stored in the captures pointer given. 339 * 340 * If no match is found, then subsequent calls will return false indefinitely. 341 * 342 * haystack may contain arbitrary bytes, but ASCII compatible text is more 343 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 344 * length should be the number of bytes in haystack. The given haystack must 345 * be logically equivalent to all other haystacks given to this iterator. 346 * 347 * Only use this function if you specifically need access to capture locations. 348 * It is not necessary to use this function just because your regular 349 * expression contains capturing groups. 350 * 351 * Capture locations can be accessed using the rure_captures_* functions. 352 * 353 * N.B. The performance of this search can be impacted by the number of 354 * capturing groups. If you're using this function, it may be beneficial to 355 * use non-capturing groups (e.g., `(?:re)`) where possible. 356 */ 357 bool rure_iter_next_captures(rure_iter *it, 358 const uint8_t *haystack, size_t length, 359 rure_captures *captures); 360 361 /* 362 * rure_captures_new allocates storage for all capturing groups in re. 363 * 364 * An rure_captures value may be reused on subsequent calls to 365 * rure_find_captures or rure_iter_next_captures. 366 * 367 * An rure_captures value may be freed independently of re, although any 368 * particular rure_captures should be used only with the re given here. 369 * 370 * It is not safe to use an rure_captures value from multiple threads 371 * simultaneously. 372 */ 373 rure_captures *rure_captures_new(rure *re); 374 375 /* 376 * rure_captures_free frees the given captures. 377 * 378 * This must be called at most once. 379 */ 380 void rure_captures_free(rure_captures *captures); 381 382 /* 383 * rure_captures_at returns true if and only if the capturing group at the 384 * index given was part of a match. If so, the given match pointer is populated 385 * with the start and end location (in bytes) of the capturing group. 386 * 387 * If no capture group with the index i exists, then false is 388 * returned. (A capturing group exists if and only if i is less than 389 * rure_captures_len(captures).) 390 * 391 * Note that index 0 corresponds to the full match. 392 */ 393 bool rure_captures_at(rure_captures *captures, size_t i, rure_match *match); 394 395 /* 396 * rure_captures_len returns the number of capturing groups in the given 397 * captures. 398 */ 399 size_t rure_captures_len(rure_captures *captures); 400 401 /* 402 * rure_options_new allocates space for options. 403 * 404 * Options may be freed immediately after a call to rure_compile, but otherwise 405 * may be freely used in multiple calls to rure_compile. 406 * 407 * It is not safe to set options from multiple threads simultaneously. It is 408 * safe to call rure_compile from multiple threads simultaneously using the 409 * same options pointer. 410 */ 411 rure_options *rure_options_new(); 412 413 /* 414 * rure_options_free frees the given options. 415 * 416 * This must be called at most once. 417 */ 418 void rure_options_free(rure_options *options); 419 420 /* 421 * rure_options_size_limit sets the appoximate size limit of the compiled 422 * regular expression. 423 * 424 * This size limit roughly corresponds to the number of bytes occupied by a 425 * single compiled program. If the program would exceed this number, then a 426 * compilation error will be returned from rure_compile. 427 */ 428 void rure_options_size_limit(rure_options *options, size_t limit); 429 430 /* 431 * rure_options_dfa_size_limit sets the approximate size of the cache used by 432 * the DFA during search. 433 * 434 * This roughly corresponds to the number of bytes that the DFA will use while 435 * searching. 436 * 437 * Note that this is a *per thread* limit. There is no way to set a global 438 * limit. In particular, if a regular expression is used from multiple threads 439 * simultaneously, then each thread may use up to the number of bytes 440 * specified here. 441 */ 442 void rure_options_dfa_size_limit(rure_options *options, size_t limit); 443 444 /* 445 * rure_compile_set compiles the given list of patterns into a single regular 446 * expression which can be matched in a linear-scan. Each pattern in patterns 447 * must be valid UTF-8 and the length of each pattern in patterns corresponds 448 * to a byte length in patterns_lengths. 449 * 450 * The number of patterns to compile is specified by patterns_count. patterns 451 * must contain at least this many entries. 452 * 453 * flags is a bitfield. Valid values are constants declared with prefix 454 * RURE_FLAG_. 455 * 456 * options contains non-flag configuration settings. If it's NULL, default 457 * settings are used. options may be freed immediately after a call to 458 * rure_compile. 459 * 460 * error is set if there was a problem compiling the pattern. 461 * 462 * The compiled expression set returned may be used from multiple threads. 463 */ 464 rure_set *rure_compile_set(const uint8_t **patterns, 465 const size_t *patterns_lengths, 466 size_t patterns_count, 467 uint32_t flags, 468 rure_options *options, 469 rure_error *error); 470 471 /* 472 * rure_set_free frees the given compiled regular expression set. 473 * 474 * This must be called at most once for any rure_set. 475 */ 476 void rure_set_free(rure_set *re); 477 478 /* 479 * rure_is_match returns true if and only if any regexes within the set 480 * match anywhere in the haystack. Once a match has been located, the 481 * matching engine will quit immediately. 482 * 483 * haystack may contain arbitrary bytes, but ASCII compatible text is more 484 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 485 * length should be the number of bytes in haystack. 486 * 487 * start is the position at which to start searching. Note that setting the 488 * start position is distinct from incrementing the pointer, since the regex 489 * engine may look at bytes before the start position to determine match 490 * information. For example, if the start position is greater than 0, then the 491 * \A ("begin text") anchor can never match. 492 */ 493 bool rure_set_is_match(rure_set *re, const uint8_t *haystack, size_t length, 494 size_t start); 495 496 /* 497 * rure_set_matches compares each regex in the set against the haystack and 498 * modifies matches with the match result of each pattern. Match results are 499 * ordered in the same way as the rure_set was compiled. For example, 500 * index 0 of matches corresponds to the first pattern passed to 501 * `rure_compile_set`. 502 * 503 * haystack may contain arbitrary bytes, but ASCII compatible text is more 504 * useful. UTF-8 is even more useful. Other text encodings aren't supported. 505 * length should be the number of bytes in haystack. 506 * 507 * start is the position at which to start searching. Note that setting the 508 * start position is distinct from incrementing the pointer, since the regex 509 * engine may look at bytes before the start position to determine match 510 * information. For example, if the start position is greater than 0, then the 511 * \A ("begin text") anchor can never match. 512 * 513 * matches must be greater than or equal to the number of patterns the 514 * rure_set was compiled with. 515 * 516 * Only use this function if you specifically need to know which regexes 517 * matched within the set. To determine if any of the regexes matched without 518 * caring which, use rure_set_is_match. 519 */ 520 bool rure_set_matches(rure_set *re, const uint8_t *haystack, size_t length, 521 size_t start, bool *matches); 522 523 /* 524 * rure_set_len returns the number of patterns rure_set was compiled with. 525 */ 526 size_t rure_set_len(rure_set *re); 527 528 /* 529 * rure_error_new allocates space for an error. 530 * 531 * If error information is desired, then rure_error_new should be called 532 * to create an rure_error pointer, and that pointer can be passed to 533 * rure_compile. If an error occurred, then rure_compile will return NULL and 534 * the error pointer will be set. A message can then be extracted. 535 * 536 * It is not safe to use errors from multiple threads simultaneously. An error 537 * value may be reused on subsequent calls to rure_compile. 538 */ 539 rure_error *rure_error_new(); 540 541 /* 542 * rure_error_free frees the error given. 543 * 544 * This must be called at most once. 545 */ 546 void rure_error_free(rure_error *err); 547 548 /* 549 * rure_error_message returns a NUL terminated string that describes the error 550 * message. 551 * 552 * The pointer returned must not be freed. Instead, it will be freed when 553 * rure_error_free is called. If err is used in subsequent calls to 554 * rure_compile, then this pointer may change or become invalid. 555 */ 556 const char *rure_error_message(rure_error *err); 557 558 /* 559 * rure_escape_must returns a NUL terminated string where all meta characters 560 * have been escaped. If escaping fails for any reason, an error message is 561 * printed to stderr and the process is aborted. 562 * 563 * The pattern given should be in UTF-8. For convenience, this accepts a C 564 * string, which means the pattern cannot contain a NUL byte. These correspond 565 * to the only two failure conditions of this function. That is, if the caller 566 * guarantees that the given pattern is valid UTF-8 and does not contain a 567 * NUL byte, then this is guaranteed to succeed (modulo out-of-memory errors). 568 * 569 * The pointer returned must not be freed directly. Instead, it should be freed 570 * by calling rure_cstring_free. 571 */ 572 const char *rure_escape_must(const char *pattern); 573 574 /* 575 * rure_cstring_free frees the string given. 576 * 577 * This must be called at most once per string. 578 */ 579 void rure_cstring_free(char *s); 580 581 #ifdef __cplusplus 582 } 583 #endif 584 585 #endif 586