1 /*
2 * This example code shows how to iterate over all regex matches in a file,
3 * emit the match location and print the contents of a capturing group.
4 */
5
6 #include <fcntl.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <sys/mman.h>
11 #include <sys/stat.h>
12 #include <sys/types.h>
13 #include <unistd.h>
14
15 #include "rure.h"
16
main()17 int main() {
18 /* Open a file and mmap it. */
19 int fd = open("sherlock.txt", O_RDONLY);
20 if (fd == -1) {
21 perror("failed to open sherlock.txt");
22 exit(1);
23 }
24 struct stat status;
25 if (fstat(fd, &status) == -1) {
26 perror("failed to stat sherlock.txt");
27 exit(1);
28 }
29 if ((uintmax_t)status.st_size > SIZE_MAX) {
30 perror("file too big");
31 exit(1);
32 }
33 if (status.st_size == 0) {
34 perror("file empty");
35 exit(1);
36 }
37 size_t sherlock_len = (size_t)status.st_size;
38 const uint8_t *sherlock = (const uint8_t *)mmap(
39 NULL, status.st_size, PROT_READ, MAP_PRIVATE, fd, 0);
40 close(fd);
41 if (sherlock == MAP_FAILED) {
42 perror("could not mmap file");
43 exit(1);
44 }
45
46 /*
47 * Compile the regular expression. A more convenient routine,
48 * rure_compile_must, is also available, which will abort the process if
49 * and print an error message to stderr if the regex compilation fails.
50 * We show the full gory details here as an example.
51 */
52 const char *pattern = "(\\w+)\\s+Holmes";
53 size_t pattern_len = strlen(pattern);
54 rure_error *err = rure_error_new();
55 rure *re = rure_compile((const uint8_t *)pattern, pattern_len,
56 RURE_FLAG_UNICODE | RURE_FLAG_CASEI, NULL, err);
57 if (NULL == re) {
58 /* A null regex means compilation failed and an error exists. */
59 printf("compilation of %s failed: %s\n",
60 pattern, rure_error_message(err));
61 rure_error_free(err);
62 munmap((char*)sherlock, sherlock_len);
63 exit(1);
64 }
65 rure_error_free(err);
66
67 /*
68 * Create an iterator to find all successive non-overlapping matches.
69 * For each match, we extract the location of the capturing group.
70 */
71 rure_match group0 = {0};
72 rure_match group1 = {0};
73 rure_captures *caps = rure_captures_new(re);
74 rure_iter *it = rure_iter_new(re);
75
76 while (rure_iter_next_captures(it, sherlock, sherlock_len, caps)) {
77 /*
78 * Get the location of the full match and the capturing group.
79 * We know that both accesses are successful since the body of the
80 * loop only executes if there is a match and both capture groups
81 * must match in order for the entire regex to match.
82 *
83 * N.B. The zeroth group corresponds to the full match of the regex.
84 */
85 rure_captures_at(caps, 0, &group0);
86 rure_captures_at(caps, 1, &group1);
87 printf("%.*s (match at: %zu, %zu)\n",
88 (int)(group1.end - group1.start),
89 sherlock + group1.start,
90 group0.start, group0.end);
91 }
92
93 /* Free all our resources. */
94 munmap((char*)sherlock, sherlock_len);
95 rure_captures_free(caps);
96 rure_iter_free(it);
97 rure_free(re);
98 return 0;
99 }
100