1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Rewrite POSIX and other features in re
6 // to use simple extended regular expression features.
7 // Also sort and simplify character classes.
8
9 #include "util/util.h"
10 #include "re2/regexp.h"
11 #include "re2/walker-inl.h"
12
13 namespace re2 {
14
15 // Parses the regexp src and then simplifies it and sets *dst to the
16 // string representation of the simplified form. Returns true on success.
17 // Returns false and sets *error (if error != NULL) on error.
SimplifyRegexp(const StringPiece & src,ParseFlags flags,string * dst,RegexpStatus * status)18 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
19 string* dst,
20 RegexpStatus* status) {
21 Regexp* re = Parse(src, flags, status);
22 if (re == NULL)
23 return false;
24 Regexp* sre = re->Simplify();
25 re->Decref();
26 if (sre == NULL) {
27 // Should not happen, since Simplify never fails.
28 LOG(ERROR) << "Simplify failed on " << src;
29 if (status) {
30 status->set_code(kRegexpInternalError);
31 status->set_error_arg(src);
32 }
33 return false;
34 }
35 *dst = sre->ToString();
36 sre->Decref();
37 return true;
38 }
39
40 // Assuming the simple_ flags on the children are accurate,
41 // is this Regexp* simple?
ComputeSimple()42 bool Regexp::ComputeSimple() {
43 Regexp** subs;
44 switch (op_) {
45 case kRegexpNoMatch:
46 case kRegexpEmptyMatch:
47 case kRegexpLiteral:
48 case kRegexpLiteralString:
49 case kRegexpBeginLine:
50 case kRegexpEndLine:
51 case kRegexpBeginText:
52 case kRegexpWordBoundary:
53 case kRegexpNoWordBoundary:
54 case kRegexpEndText:
55 case kRegexpAnyChar:
56 case kRegexpAnyByte:
57 case kRegexpHaveMatch:
58 return true;
59 case kRegexpConcat:
60 case kRegexpAlternate:
61 // These are simple as long as the subpieces are simple.
62 subs = sub();
63 for (int i = 0; i < nsub_; i++)
64 if (!subs[i]->simple_)
65 return false;
66 return true;
67 case kRegexpCharClass:
68 // Simple as long as the char class is not empty, not full.
69 if (ccb_ != NULL)
70 return !ccb_->empty() && !ccb_->full();
71 return !cc_->empty() && !cc_->full();
72 case kRegexpCapture:
73 subs = sub();
74 return subs[0]->simple_;
75 case kRegexpStar:
76 case kRegexpPlus:
77 case kRegexpQuest:
78 subs = sub();
79 if (!subs[0]->simple_)
80 return false;
81 switch (subs[0]->op_) {
82 case kRegexpStar:
83 case kRegexpPlus:
84 case kRegexpQuest:
85 case kRegexpEmptyMatch:
86 case kRegexpNoMatch:
87 return false;
88 default:
89 break;
90 }
91 return true;
92 case kRegexpRepeat:
93 return false;
94 }
95 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
96 return false;
97 }
98
99 // Walker subclass used by Simplify.
100 // The simplify walk is purely post-recursive: given the simplified children,
101 // PostVisit creates the simplified result.
102 // The child_args are simplified Regexp*s.
103 class SimplifyWalker : public Regexp::Walker<Regexp*> {
104 public:
SimplifyWalker()105 SimplifyWalker() {}
106 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
107 virtual Regexp* PostVisit(Regexp* re,
108 Regexp* parent_arg,
109 Regexp* pre_arg,
110 Regexp** child_args, int nchild_args);
111 virtual Regexp* Copy(Regexp* re);
112 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
113
114 private:
115 // These functions are declared inside SimplifyWalker so that
116 // they can edit the private fields of the Regexps they construct.
117
118 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
119 // Caller must Decref return value when done with it.
120 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
121
122 // Simplifies the expression re{min,max} in terms of *, +, and ?.
123 // Returns a new regexp. Does not edit re. Does not consume reference to re.
124 // Caller must Decref return value when done with it.
125 static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
126 Regexp::ParseFlags parse_flags);
127
128 // Simplifies a character class by expanding any named classes
129 // into rune ranges. Does not edit re. Does not consume ref to re.
130 // Caller must Decref return value when done with it.
131 static Regexp* SimplifyCharClass(Regexp* re);
132
133 DISALLOW_EVIL_CONSTRUCTORS(SimplifyWalker);
134 };
135
136 // Simplifies a regular expression, returning a new regexp.
137 // The new regexp uses traditional Unix egrep features only,
138 // plus the Perl (?:) non-capturing parentheses.
139 // Otherwise, no POSIX or Perl additions. The new regexp
140 // captures exactly the same subexpressions (with the same indices)
141 // as the original.
142 // Does not edit current object.
143 // Caller must Decref() return value when done with it.
144
Simplify()145 Regexp* Regexp::Simplify() {
146 if (simple_)
147 return Incref();
148 SimplifyWalker w;
149 return w.Walk(this, NULL);
150 }
151
152 #define Simplify DontCallSimplify // Avoid accidental recursion
153
Copy(Regexp * re)154 Regexp* SimplifyWalker::Copy(Regexp* re) {
155 return re->Incref();
156 }
157
ShortVisit(Regexp * re,Regexp * parent_arg)158 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
159 // This should never be called, since we use Walk and not
160 // WalkExponential.
161 LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
162 return re->Incref();
163 }
164
PreVisit(Regexp * re,Regexp * parent_arg,bool * stop)165 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
166 if (re->simple_) {
167 *stop = true;
168 return re->Incref();
169 }
170 return NULL;
171 }
172
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)173 Regexp* SimplifyWalker::PostVisit(Regexp* re,
174 Regexp* parent_arg,
175 Regexp* pre_arg,
176 Regexp** child_args,
177 int nchild_args) {
178 switch (re->op()) {
179 case kRegexpNoMatch:
180 case kRegexpEmptyMatch:
181 case kRegexpLiteral:
182 case kRegexpLiteralString:
183 case kRegexpBeginLine:
184 case kRegexpEndLine:
185 case kRegexpBeginText:
186 case kRegexpWordBoundary:
187 case kRegexpNoWordBoundary:
188 case kRegexpEndText:
189 case kRegexpAnyChar:
190 case kRegexpAnyByte:
191 case kRegexpHaveMatch:
192 // All these are always simple.
193 re->simple_ = true;
194 return re->Incref();
195
196 case kRegexpConcat:
197 case kRegexpAlternate: {
198 // These are simple as long as the subpieces are simple.
199 // Two passes to avoid allocation in the common case.
200 bool changed = false;
201 Regexp** subs = re->sub();
202 for (int i = 0; i < re->nsub_; i++) {
203 Regexp* sub = subs[i];
204 Regexp* newsub = child_args[i];
205 if (newsub != sub) {
206 changed = true;
207 break;
208 }
209 }
210 if (!changed) {
211 for (int i = 0; i < re->nsub_; i++) {
212 Regexp* newsub = child_args[i];
213 newsub->Decref();
214 }
215 re->simple_ = true;
216 return re->Incref();
217 }
218 Regexp* nre = new Regexp(re->op(), re->parse_flags());
219 nre->AllocSub(re->nsub_);
220 Regexp** nre_subs = nre->sub();
221 for (int i = 0; i <re->nsub_; i++)
222 nre_subs[i] = child_args[i];
223 nre->simple_ = true;
224 return nre;
225 }
226
227 case kRegexpCapture: {
228 Regexp* newsub = child_args[0];
229 if (newsub == re->sub()[0]) {
230 newsub->Decref();
231 re->simple_ = true;
232 return re->Incref();
233 }
234 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
235 nre->AllocSub(1);
236 nre->sub()[0] = newsub;
237 nre->cap_ = re->cap_;
238 nre->simple_ = true;
239 return nre;
240 }
241
242 case kRegexpStar:
243 case kRegexpPlus:
244 case kRegexpQuest: {
245 Regexp* newsub = child_args[0];
246 // Special case: repeat the empty string as much as
247 // you want, but it's still the empty string.
248 if (newsub->op() == kRegexpEmptyMatch)
249 return newsub;
250
251 // These are simple as long as the subpiece is simple.
252 if (newsub == re->sub()[0]) {
253 newsub->Decref();
254 re->simple_ = true;
255 return re->Incref();
256 }
257
258 // These are also idempotent if flags are constant.
259 if (re->op() == newsub->op() &&
260 re->parse_flags() == newsub->parse_flags())
261 return newsub;
262
263 Regexp* nre = new Regexp(re->op(), re->parse_flags());
264 nre->AllocSub(1);
265 nre->sub()[0] = newsub;
266 nre->simple_ = true;
267 return nre;
268 }
269
270 case kRegexpRepeat: {
271 Regexp* newsub = child_args[0];
272 // Special case: repeat the empty string as much as
273 // you want, but it's still the empty string.
274 if (newsub->op() == kRegexpEmptyMatch)
275 return newsub;
276
277 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
278 re->parse_flags());
279 newsub->Decref();
280 nre->simple_ = true;
281 return nre;
282 }
283
284 case kRegexpCharClass: {
285 Regexp* nre = SimplifyCharClass(re);
286 nre->simple_ = true;
287 return nre;
288 }
289 }
290
291 LOG(ERROR) << "Simplify case not handled: " << re->op();
292 return re->Incref();
293 }
294
295 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
296 // Returns a new Regexp, handing the ref to the caller.
Concat2(Regexp * re1,Regexp * re2,Regexp::ParseFlags parse_flags)297 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
298 Regexp::ParseFlags parse_flags) {
299 Regexp* re = new Regexp(kRegexpConcat, parse_flags);
300 re->AllocSub(2);
301 Regexp** subs = re->sub();
302 subs[0] = re1;
303 subs[1] = re2;
304 return re;
305 }
306
307 // Simplifies the expression re{min,max} in terms of *, +, and ?.
308 // Returns a new regexp. Does not edit re. Does not consume reference to re.
309 // Caller must Decref return value when done with it.
310 // The result will *not* necessarily have the right capturing parens
311 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
312 // but in the Regexp* representation, both (x) are marked as $1.
SimplifyRepeat(Regexp * re,int min,int max,Regexp::ParseFlags f)313 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
314 Regexp::ParseFlags f) {
315 // x{n,} means at least n matches of x.
316 if (max == -1) {
317 // Special case: x{0,} is x*
318 if (min == 0)
319 return Regexp::Star(re->Incref(), f);
320
321 // Special case: x{1,} is x+
322 if (min == 1)
323 return Regexp::Plus(re->Incref(), f);
324
325 // General case: x{4,} is xxxx+
326 Regexp* nre = new Regexp(kRegexpConcat, f);
327 nre->AllocSub(min);
328 VLOG(1) << "Simplify " << min;
329 Regexp** nre_subs = nre->sub();
330 for (int i = 0; i < min-1; i++)
331 nre_subs[i] = re->Incref();
332 nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
333 return nre;
334 }
335
336 // Special case: (x){0} matches only empty string.
337 if (min == 0 && max == 0)
338 return new Regexp(kRegexpEmptyMatch, f);
339
340 // Special case: x{1} is just x.
341 if (min == 1 && max == 1)
342 return re->Incref();
343
344 // General case: x{n,m} means n copies of x and m copies of x?.
345 // The machine will do less work if we nest the final m copies,
346 // so that x{2,5} = xx(x(x(x)?)?)?
347
348 // Build leading prefix: xx. Capturing only on the last one.
349 Regexp* nre = NULL;
350 if (min > 0) {
351 nre = new Regexp(kRegexpConcat, f);
352 nre->AllocSub(min);
353 Regexp** nre_subs = nre->sub();
354 for (int i = 0; i < min; i++)
355 nre_subs[i] = re->Incref();
356 }
357
358 // Build and attach suffix: (x(x(x)?)?)?
359 if (max > min) {
360 Regexp* suf = Regexp::Quest(re->Incref(), f);
361 for (int i = min+1; i < max; i++)
362 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
363 if (nre == NULL)
364 nre = suf;
365 else
366 nre = Concat2(nre, suf, f);
367 }
368
369 if (nre == NULL) {
370 // Some degenerate case, like min > max, or min < max < 0.
371 // This shouldn't happen, because the parser rejects such regexps.
372 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
373 return new Regexp(kRegexpNoMatch, f);
374 }
375
376 return nre;
377 }
378
379 // Simplifies a character class.
380 // Caller must Decref return value when done with it.
SimplifyCharClass(Regexp * re)381 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
382 CharClass* cc = re->cc();
383
384 // Special cases
385 if (cc->empty())
386 return new Regexp(kRegexpNoMatch, re->parse_flags());
387 if (cc->full())
388 return new Regexp(kRegexpAnyChar, re->parse_flags());
389
390 return re->Incref();
391 }
392
393 } // namespace re2
394