1 /* Pattern Matchers for Regular Expressions.
2 Copyright (C) 1992, 1998, 2000, 2005-2006, 2010, 2013 Free Software
3 Foundation, Inc.
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation; either version 3 of the License, or
8 (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program. If not, see <https://www.gnu.org/licenses/>. */
17
18 #ifdef HAVE_CONFIG_H
19 # include <config.h>
20 #endif
21
22 /* Specification. */
23 #include "libgrep.h"
24
25 #include <ctype.h>
26 #include <stdbool.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <regex.h>
30
31 #include "error.h"
32 #include "exitfail.h"
33 #include "xalloc.h"
34
/* IN_CTYPE_DOMAIN (c) tells whether C may be handed to the <ctype.h>
   classification macros.  Only when the headers are not known to be
   standard C and an isascii() is available do we need to ask it;
   in every other configuration all values are accepted.  */
#if !defined (STDC_HEADERS) && (defined (isascii) || defined (HAVE_ISASCII))
# define IN_CTYPE_DOMAIN(c) isascii(c)
#else
# define IN_CTYPE_DOMAIN(c) 1
#endif

/* Nonzero if C is an alphanumeric character.  */
#define ISALNUM(C) (IN_CTYPE_DOMAIN (C) && isalnum (C))

/* Nonzero if C can be part of a word: an alphanumeric or an underscore.  */
#define IS_WORD_CONSTITUENT(C) (ISALNUM(C) || (C) == '_')
42
43 struct patterns
44 {
45 /* Regex compiled regexp. */
46 struct re_pattern_buffer regexbuf;
47 struct re_registers regs; /* This is here on account of a BRAIN-DEAD
48 Q@#%!# library interface in regex.c. */
49 };
50
/* The result of compiling a (possibly multi-line) pattern: the matching
   options recorded at compile time plus one compiled entry per
   sub-pattern.  */
struct compiled_regex
{
  bool match_words;   /* Accept only matches on word boundaries.  */
  bool match_lines;   /* Accept only matches that span a whole line.  */
  char eolbyte;       /* The byte that terminates a line in the input.  */

  /* Array of PCOUNT compiled sub-patterns.  */
  struct patterns *patterns;
  size_t pcount;
};
60
61 static void *
compile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte,reg_syntax_t syntax)62 compile (const char *pattern, size_t pattern_size,
63 bool match_icase, bool match_words, bool match_lines, char eolbyte,
64 reg_syntax_t syntax)
65 {
66 struct compiled_regex *cregex;
67
68 cregex = XMALLOC (struct compiled_regex);
69 memset (cregex, '\0', sizeof (struct compiled_regex));
70 cregex->match_words = match_words;
71 cregex->match_lines = match_lines;
72 cregex->eolbyte = eolbyte;
73 cregex->patterns = NULL;
74 cregex->pcount = 0;
75
76 re_set_syntax (syntax);
77
78 /* For GNU regex compiler we have to pass the patterns separately to detect
79 errors like "[\nallo\n]\n". The patterns here are "[", "allo" and "]"
80 GNU regex should have raised a syntax error. The same for backref, where
81 the backref should have been local to each pattern. */
82 {
83 const char *sep;
84 size_t total = pattern_size;
85 const char *motif = pattern;
86
87 do
88 {
89 size_t len;
90 const char *err;
91
92 sep = (const char *) memchr (motif, '\n', total);
93 if (sep)
94 {
95 len = sep - motif;
96 sep++;
97 total -= (len + 1);
98 }
99 else
100 {
101 len = total;
102 total = 0;
103 }
104
105 cregex->patterns = xrealloc (cregex->patterns, (cregex->pcount + 1) * sizeof (struct patterns));
106 memset (&cregex->patterns[cregex->pcount], '\0', sizeof (struct patterns));
107
108 if ((err = re_compile_pattern (motif, len,
109 &cregex->patterns[cregex->pcount].regexbuf)) != NULL)
110 error (exit_failure, 0, "%s", err);
111 cregex->pcount++;
112
113 motif = sep;
114 }
115 while (sep && total != 0);
116 }
117
118 return cregex;
119 }
120
121 static void *
Gcompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)122 Gcompile (const char *pattern, size_t pattern_size,
123 bool match_icase, bool match_words, bool match_lines, char eolbyte)
124 {
125 return compile (pattern, pattern_size,
126 match_icase, match_words, match_lines, eolbyte,
127 RE_SYNTAX_GREP | RE_HAT_LISTS_NOT_NEWLINE);
128 }
129
130 static void *
Ecompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)131 Ecompile (const char *pattern, size_t pattern_size,
132 bool match_icase, bool match_words, bool match_lines, char eolbyte)
133 {
134 return compile (pattern, pattern_size,
135 match_icase, match_words, match_lines, eolbyte,
136 RE_SYNTAX_POSIX_EGREP);
137 }
138
139 static void *
AWKcompile(const char * pattern,size_t pattern_size,bool match_icase,bool match_words,bool match_lines,char eolbyte)140 AWKcompile (const char *pattern, size_t pattern_size,
141 bool match_icase, bool match_words, bool match_lines, char eolbyte)
142 {
143 return compile (pattern, pattern_size,
144 match_icase, match_words, match_lines, eolbyte,
145 RE_SYNTAX_AWK);
146 }
147
148 static size_t
EGexecute(const void * compiled_pattern,const char * buf,size_t buf_size,size_t * match_size,bool exact)149 EGexecute (const void *compiled_pattern,
150 const char *buf, size_t buf_size,
151 size_t *match_size, bool exact)
152 {
153 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
154 char eol = cregex->eolbyte;
155 register const char *buflim = buf + buf_size;
156 register const char *beg;
157 register const char *end;
158
159 for (beg = buf; beg < buflim; beg = end)
160 {
161 size_t i;
162
163 end = (const char *) memchr (beg, eol, buflim - beg);
164 if (end == NULL)
165 end = buflim;
166 /* Here, either end < buflim && *end == eol, or end == buflim. */
167
168 for (i = 0; i < cregex->pcount; i++)
169 {
170 int start, len;
171
172 cregex->patterns[i].regexbuf.not_eol = 0;
173 if (0 <= (start = re_search (&cregex->patterns[i].regexbuf, beg,
174 end - beg, 0,
175 end - beg, &cregex->patterns[i].regs)))
176 {
177 len = cregex->patterns[i].regs.end[0] - start;
178 if (exact)
179 {
180 *match_size = len;
181 return start;
182 }
183 if (cregex->match_lines)
184 {
185 if (len == end - beg) /* implies start == 0 */
186 goto success;
187 }
188 else if (cregex->match_words)
189 {
190 /* If -w, check if the match aligns with word boundaries.
191 We do this iteratively because:
192 (a) the line may contain more than one occurence of the
193 pattern, and
194 (b) Several alternatives in the pattern might be valid at
195 a given point, and we may need to consider a shorter
196 one to find a word boundary. */
197 while (start >= 0)
198 {
199 if ((start == 0 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start - 1]))
200 && (start + len == end - beg
201 || !IS_WORD_CONSTITUENT ((unsigned char) beg[start + len])))
202 goto success;
203 if (len > 0)
204 {
205 /* Try a shorter length anchored at the same place. */
206 --len;
207 cregex->patterns[i].regexbuf.not_eol = 1;
208 len = re_match (&cregex->patterns[i].regexbuf, beg,
209 start + len, start,
210 &cregex->patterns[i].regs);
211 }
212 if (len <= 0)
213 {
214 /* Try looking further on. */
215 if (start == end - beg)
216 break;
217 ++start;
218 cregex->patterns[i].regexbuf.not_eol = 0;
219 start = re_search (&cregex->patterns[i].regexbuf, beg,
220 end - beg,
221 start, end - beg - start,
222 &cregex->patterns[i].regs);
223 len = cregex->patterns[i].regs.end[0] - start;
224 }
225 }
226 }
227 else
228 goto success;
229 }
230 }
231
232 if (end < buflim)
233 end++;
234 }
235 return (size_t) -1;
236
237 success:
238 *match_size = end - beg;
239 return beg - buf;
240 }
241
242 static void
EGfree(void * compiled_pattern)243 EGfree (void *compiled_pattern)
244 {
245 struct compiled_regex *cregex = (struct compiled_regex *) compiled_pattern;
246
247 free (cregex->patterns);
248 free (cregex);
249 }
250
251 /* POSIX Basic Regular Expressions */
252 matcher_t matcher_grep =
253 {
254 Gcompile,
255 EGexecute,
256 EGfree
257 };
258
259 /* POSIX Extended Regular Expressions */
260 matcher_t matcher_egrep =
261 {
262 Ecompile,
263 EGexecute,
264 EGfree
265 };
266
267 /* AWK Regular Expressions */
268 matcher_t matcher_awk =
269 {
270 AWKcompile,
271 EGexecute,
272 EGfree
273 };
274