1 // Copyright 2006 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Rewrite POSIX and other features in re
6 // to use simple extended regular expression features.
7 // Also sort and simplify character classes.
8
9 #include <string>
10
11 #include "util/util.h"
12 #include "util/logging.h"
13 #include "util/utf.h"
14 #include "re2/pod_array.h"
15 #include "re2/regexp.h"
16 #include "re2/walker-inl.h"
17
18 namespace re2 {
19
20 // Parses the regexp src and then simplifies it and sets *dst to the
21 // string representation of the simplified form. Returns true on success.
22 // Returns false and sets *error (if error != NULL) on error.
SimplifyRegexp(const StringPiece & src,ParseFlags flags,std::string * dst,RegexpStatus * status)23 bool Regexp::SimplifyRegexp(const StringPiece& src, ParseFlags flags,
24 std::string* dst, RegexpStatus* status) {
25 Regexp* re = Parse(src, flags, status);
26 if (re == NULL)
27 return false;
28 Regexp* sre = re->Simplify();
29 re->Decref();
30 if (sre == NULL) {
31 // Should not happen, since Simplify never fails.
32 LOG(ERROR) << "Simplify failed on " << src;
33 if (status) {
34 status->set_code(kRegexpInternalError);
35 status->set_error_arg(src);
36 }
37 return false;
38 }
39 *dst = sre->ToString();
40 sre->Decref();
41 return true;
42 }
43
44 // Assuming the simple_ flags on the children are accurate,
45 // is this Regexp* simple?
ComputeSimple()46 bool Regexp::ComputeSimple() {
47 Regexp** subs;
48 switch (op_) {
49 case kRegexpNoMatch:
50 case kRegexpEmptyMatch:
51 case kRegexpLiteral:
52 case kRegexpLiteralString:
53 case kRegexpBeginLine:
54 case kRegexpEndLine:
55 case kRegexpBeginText:
56 case kRegexpWordBoundary:
57 case kRegexpNoWordBoundary:
58 case kRegexpEndText:
59 case kRegexpAnyChar:
60 case kRegexpAnyByte:
61 case kRegexpHaveMatch:
62 return true;
63 case kRegexpConcat:
64 case kRegexpAlternate:
65 // These are simple as long as the subpieces are simple.
66 subs = sub();
67 for (int i = 0; i < nsub_; i++)
68 if (!subs[i]->simple())
69 return false;
70 return true;
71 case kRegexpCharClass:
72 // Simple as long as the char class is not empty, not full.
73 if (ccb_ != NULL)
74 return !ccb_->empty() && !ccb_->full();
75 return !cc_->empty() && !cc_->full();
76 case kRegexpCapture:
77 subs = sub();
78 return subs[0]->simple();
79 case kRegexpStar:
80 case kRegexpPlus:
81 case kRegexpQuest:
82 subs = sub();
83 if (!subs[0]->simple())
84 return false;
85 switch (subs[0]->op_) {
86 case kRegexpStar:
87 case kRegexpPlus:
88 case kRegexpQuest:
89 case kRegexpEmptyMatch:
90 case kRegexpNoMatch:
91 return false;
92 default:
93 break;
94 }
95 return true;
96 case kRegexpRepeat:
97 return false;
98 }
99 LOG(DFATAL) << "Case not handled in ComputeSimple: " << op_;
100 return false;
101 }
102
103 // Walker subclass used by Simplify.
104 // Coalesces runs of star/plus/quest/repeat of the same literal along with any
105 // occurrences of that literal into repeats of that literal. It also works for
106 // char classes, any char and any byte.
107 // PostVisit creates the coalesced result, which should then be simplified.
108 class CoalesceWalker : public Regexp::Walker<Regexp*> {
109 public:
CoalesceWalker()110 CoalesceWalker() {}
111 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
112 Regexp** child_args, int nchild_args);
113 virtual Regexp* Copy(Regexp* re);
114 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
115
116 private:
117 // These functions are declared inside CoalesceWalker so that
118 // they can edit the private fields of the Regexps they construct.
119
120 // Returns true if r1 and r2 can be coalesced. In particular, ensures that
121 // the parse flags are consistent. (They will not be checked again later.)
122 static bool CanCoalesce(Regexp* r1, Regexp* r2);
123
124 // Coalesces *r1ptr and *r2ptr. In most cases, the array elements afterwards
125 // will be empty match and the coalesced op. In other cases, where part of a
126 // literal string was removed to be coalesced, the array elements afterwards
127 // will be the coalesced op and the remainder of the literal string.
128 static void DoCoalesce(Regexp** r1ptr, Regexp** r2ptr);
129
130 CoalesceWalker(const CoalesceWalker&) = delete;
131 CoalesceWalker& operator=(const CoalesceWalker&) = delete;
132 };
133
134 // Walker subclass used by Simplify.
135 // The simplify walk is purely post-recursive: given the simplified children,
136 // PostVisit creates the simplified result.
137 // The child_args are simplified Regexp*s.
138 class SimplifyWalker : public Regexp::Walker<Regexp*> {
139 public:
SimplifyWalker()140 SimplifyWalker() {}
141 virtual Regexp* PreVisit(Regexp* re, Regexp* parent_arg, bool* stop);
142 virtual Regexp* PostVisit(Regexp* re, Regexp* parent_arg, Regexp* pre_arg,
143 Regexp** child_args, int nchild_args);
144 virtual Regexp* Copy(Regexp* re);
145 virtual Regexp* ShortVisit(Regexp* re, Regexp* parent_arg);
146
147 private:
148 // These functions are declared inside SimplifyWalker so that
149 // they can edit the private fields of the Regexps they construct.
150
151 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
152 // Caller must Decref return value when done with it.
153 static Regexp* Concat2(Regexp* re1, Regexp* re2, Regexp::ParseFlags flags);
154
155 // Simplifies the expression re{min,max} in terms of *, +, and ?.
156 // Returns a new regexp. Does not edit re. Does not consume reference to re.
157 // Caller must Decref return value when done with it.
158 static Regexp* SimplifyRepeat(Regexp* re, int min, int max,
159 Regexp::ParseFlags parse_flags);
160
161 // Simplifies a character class by expanding any named classes
162 // into rune ranges. Does not edit re. Does not consume ref to re.
163 // Caller must Decref return value when done with it.
164 static Regexp* SimplifyCharClass(Regexp* re);
165
166 SimplifyWalker(const SimplifyWalker&) = delete;
167 SimplifyWalker& operator=(const SimplifyWalker&) = delete;
168 };
169
170 // Simplifies a regular expression, returning a new regexp.
171 // The new regexp uses traditional Unix egrep features only,
172 // plus the Perl (?:) non-capturing parentheses.
173 // Otherwise, no POSIX or Perl additions. The new regexp
174 // captures exactly the same subexpressions (with the same indices)
175 // as the original.
176 // Does not edit current object.
177 // Caller must Decref() return value when done with it.
178
Simplify()179 Regexp* Regexp::Simplify() {
180 CoalesceWalker cw;
181 Regexp* cre = cw.Walk(this, NULL);
182 if (cre == NULL)
183 return cre;
184 SimplifyWalker sw;
185 Regexp* sre = sw.Walk(cre, NULL);
186 cre->Decref();
187 return sre;
188 }
189
190 #define Simplify DontCallSimplify // Avoid accidental recursion
191
192 // Utility function for PostVisit implementations that compares re->sub() with
193 // child_args to determine whether any child_args changed. In the common case,
194 // where nothing changed, calls Decref() for all child_args and returns false,
195 // so PostVisit must return re->Incref(). Otherwise, returns true.
ChildArgsChanged(Regexp * re,Regexp ** child_args)196 static bool ChildArgsChanged(Regexp* re, Regexp** child_args) {
197 for (int i = 0; i < re->nsub(); i++) {
198 Regexp* sub = re->sub()[i];
199 Regexp* newsub = child_args[i];
200 if (newsub != sub)
201 return true;
202 }
203 for (int i = 0; i < re->nsub(); i++) {
204 Regexp* newsub = child_args[i];
205 newsub->Decref();
206 }
207 return false;
208 }
209
Copy(Regexp * re)210 Regexp* CoalesceWalker::Copy(Regexp* re) {
211 return re->Incref();
212 }
213
ShortVisit(Regexp * re,Regexp * parent_arg)214 Regexp* CoalesceWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
215 // This should never be called, since we use Walk and not
216 // WalkExponential.
217 LOG(DFATAL) << "CoalesceWalker::ShortVisit called";
218 return re->Incref();
219 }
220
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)221 Regexp* CoalesceWalker::PostVisit(Regexp* re,
222 Regexp* parent_arg,
223 Regexp* pre_arg,
224 Regexp** child_args,
225 int nchild_args) {
226 if (re->nsub() == 0)
227 return re->Incref();
228
229 if (re->op() != kRegexpConcat) {
230 if (!ChildArgsChanged(re, child_args))
231 return re->Incref();
232
233 // Something changed. Build a new op.
234 Regexp* nre = new Regexp(re->op(), re->parse_flags());
235 nre->AllocSub(re->nsub());
236 Regexp** nre_subs = nre->sub();
237 for (int i = 0; i < re->nsub(); i++)
238 nre_subs[i] = child_args[i];
239 // Repeats and Captures have additional data that must be copied.
240 if (re->op() == kRegexpRepeat) {
241 nre->min_ = re->min();
242 nre->max_ = re->max();
243 } else if (re->op() == kRegexpCapture) {
244 nre->cap_ = re->cap();
245 }
246 return nre;
247 }
248
249 bool can_coalesce = false;
250 for (int i = 0; i < re->nsub(); i++) {
251 if (i+1 < re->nsub() &&
252 CanCoalesce(child_args[i], child_args[i+1])) {
253 can_coalesce = true;
254 break;
255 }
256 }
257 if (!can_coalesce) {
258 if (!ChildArgsChanged(re, child_args))
259 return re->Incref();
260
261 // Something changed. Build a new op.
262 Regexp* nre = new Regexp(re->op(), re->parse_flags());
263 nre->AllocSub(re->nsub());
264 Regexp** nre_subs = nre->sub();
265 for (int i = 0; i < re->nsub(); i++)
266 nre_subs[i] = child_args[i];
267 return nre;
268 }
269
270 for (int i = 0; i < re->nsub(); i++) {
271 if (i+1 < re->nsub() &&
272 CanCoalesce(child_args[i], child_args[i+1]))
273 DoCoalesce(&child_args[i], &child_args[i+1]);
274 }
275 // Determine how many empty matches were left by DoCoalesce.
276 int n = 0;
277 for (int i = n; i < re->nsub(); i++) {
278 if (child_args[i]->op() == kRegexpEmptyMatch)
279 n++;
280 }
281 // Build a new op.
282 Regexp* nre = new Regexp(re->op(), re->parse_flags());
283 nre->AllocSub(re->nsub() - n);
284 Regexp** nre_subs = nre->sub();
285 for (int i = 0, j = 0; i < re->nsub(); i++) {
286 if (child_args[i]->op() == kRegexpEmptyMatch) {
287 child_args[i]->Decref();
288 continue;
289 }
290 nre_subs[j] = child_args[i];
291 j++;
292 }
293 return nre;
294 }
295
CanCoalesce(Regexp * r1,Regexp * r2)296 bool CoalesceWalker::CanCoalesce(Regexp* r1, Regexp* r2) {
297 // r1 must be a star/plus/quest/repeat of a literal, char class, any char or
298 // any byte.
299 if ((r1->op() == kRegexpStar ||
300 r1->op() == kRegexpPlus ||
301 r1->op() == kRegexpQuest ||
302 r1->op() == kRegexpRepeat) &&
303 (r1->sub()[0]->op() == kRegexpLiteral ||
304 r1->sub()[0]->op() == kRegexpCharClass ||
305 r1->sub()[0]->op() == kRegexpAnyChar ||
306 r1->sub()[0]->op() == kRegexpAnyByte)) {
307 // r2 must be a star/plus/quest/repeat of the same literal, char class,
308 // any char or any byte.
309 if ((r2->op() == kRegexpStar ||
310 r2->op() == kRegexpPlus ||
311 r2->op() == kRegexpQuest ||
312 r2->op() == kRegexpRepeat) &&
313 Regexp::Equal(r1->sub()[0], r2->sub()[0]) &&
314 // The parse flags must be consistent.
315 ((r1->parse_flags() & Regexp::NonGreedy) ==
316 (r2->parse_flags() & Regexp::NonGreedy))) {
317 return true;
318 }
319 // ... OR an occurrence of that literal, char class, any char or any byte
320 if (Regexp::Equal(r1->sub()[0], r2)) {
321 return true;
322 }
323 // ... OR a literal string that begins with that literal.
324 if (r1->sub()[0]->op() == kRegexpLiteral &&
325 r2->op() == kRegexpLiteralString &&
326 r2->runes()[0] == r1->sub()[0]->rune() &&
327 // The parse flags must be consistent.
328 ((r1->sub()[0]->parse_flags() & Regexp::FoldCase) ==
329 (r2->parse_flags() & Regexp::FoldCase))) {
330 return true;
331 }
332 }
333 return false;
334 }
335
DoCoalesce(Regexp ** r1ptr,Regexp ** r2ptr)336 void CoalesceWalker::DoCoalesce(Regexp** r1ptr, Regexp** r2ptr) {
337 Regexp* r1 = *r1ptr;
338 Regexp* r2 = *r2ptr;
339
340 Regexp* nre = Regexp::Repeat(
341 r1->sub()[0]->Incref(), r1->parse_flags(), 0, 0);
342
343 switch (r1->op()) {
344 case kRegexpStar:
345 nre->min_ = 0;
346 nre->max_ = -1;
347 break;
348
349 case kRegexpPlus:
350 nre->min_ = 1;
351 nre->max_ = -1;
352 break;
353
354 case kRegexpQuest:
355 nre->min_ = 0;
356 nre->max_ = 1;
357 break;
358
359 case kRegexpRepeat:
360 nre->min_ = r1->min();
361 nre->max_ = r1->max();
362 break;
363
364 default:
365 LOG(DFATAL) << "DoCoalesce failed: r1->op() is " << r1->op();
366 nre->Decref();
367 return;
368 }
369
370 switch (r2->op()) {
371 case kRegexpStar:
372 nre->max_ = -1;
373 goto LeaveEmpty;
374
375 case kRegexpPlus:
376 nre->min_++;
377 nre->max_ = -1;
378 goto LeaveEmpty;
379
380 case kRegexpQuest:
381 if (nre->max() != -1)
382 nre->max_++;
383 goto LeaveEmpty;
384
385 case kRegexpRepeat:
386 nre->min_ += r2->min();
387 if (r2->max() == -1)
388 nre->max_ = -1;
389 else if (nre->max() != -1)
390 nre->max_ += r2->max();
391 goto LeaveEmpty;
392
393 case kRegexpLiteral:
394 case kRegexpCharClass:
395 case kRegexpAnyChar:
396 case kRegexpAnyByte:
397 nre->min_++;
398 if (nre->max() != -1)
399 nre->max_++;
400 goto LeaveEmpty;
401
402 LeaveEmpty:
403 *r1ptr = new Regexp(kRegexpEmptyMatch, Regexp::NoParseFlags);
404 *r2ptr = nre;
405 break;
406
407 case kRegexpLiteralString: {
408 Rune r = r1->sub()[0]->rune();
409 // Determine how much of the literal string is removed.
410 // We know that we have at least one rune. :)
411 int n = 1;
412 while (n < r2->nrunes() && r2->runes()[n] == r)
413 n++;
414 nre->min_ += n;
415 if (nre->max() != -1)
416 nre->max_ += n;
417 if (n == r2->nrunes())
418 goto LeaveEmpty;
419 *r1ptr = nre;
420 *r2ptr = Regexp::LiteralString(
421 &r2->runes()[n], r2->nrunes() - n, r2->parse_flags());
422 break;
423 }
424
425 default:
426 LOG(DFATAL) << "DoCoalesce failed: r2->op() is " << r2->op();
427 nre->Decref();
428 return;
429 }
430
431 r1->Decref();
432 r2->Decref();
433 }
434
Copy(Regexp * re)435 Regexp* SimplifyWalker::Copy(Regexp* re) {
436 return re->Incref();
437 }
438
ShortVisit(Regexp * re,Regexp * parent_arg)439 Regexp* SimplifyWalker::ShortVisit(Regexp* re, Regexp* parent_arg) {
440 // This should never be called, since we use Walk and not
441 // WalkExponential.
442 LOG(DFATAL) << "SimplifyWalker::ShortVisit called";
443 return re->Incref();
444 }
445
PreVisit(Regexp * re,Regexp * parent_arg,bool * stop)446 Regexp* SimplifyWalker::PreVisit(Regexp* re, Regexp* parent_arg, bool* stop) {
447 if (re->simple()) {
448 *stop = true;
449 return re->Incref();
450 }
451 return NULL;
452 }
453
PostVisit(Regexp * re,Regexp * parent_arg,Regexp * pre_arg,Regexp ** child_args,int nchild_args)454 Regexp* SimplifyWalker::PostVisit(Regexp* re,
455 Regexp* parent_arg,
456 Regexp* pre_arg,
457 Regexp** child_args,
458 int nchild_args) {
459 switch (re->op()) {
460 case kRegexpNoMatch:
461 case kRegexpEmptyMatch:
462 case kRegexpLiteral:
463 case kRegexpLiteralString:
464 case kRegexpBeginLine:
465 case kRegexpEndLine:
466 case kRegexpBeginText:
467 case kRegexpWordBoundary:
468 case kRegexpNoWordBoundary:
469 case kRegexpEndText:
470 case kRegexpAnyChar:
471 case kRegexpAnyByte:
472 case kRegexpHaveMatch:
473 // All these are always simple.
474 re->simple_ = true;
475 return re->Incref();
476
477 case kRegexpConcat:
478 case kRegexpAlternate: {
479 // These are simple as long as the subpieces are simple.
480 if (!ChildArgsChanged(re, child_args)) {
481 re->simple_ = true;
482 return re->Incref();
483 }
484 Regexp* nre = new Regexp(re->op(), re->parse_flags());
485 nre->AllocSub(re->nsub());
486 Regexp** nre_subs = nre->sub();
487 for (int i = 0; i < re->nsub(); i++)
488 nre_subs[i] = child_args[i];
489 nre->simple_ = true;
490 return nre;
491 }
492
493 case kRegexpCapture: {
494 Regexp* newsub = child_args[0];
495 if (newsub == re->sub()[0]) {
496 newsub->Decref();
497 re->simple_ = true;
498 return re->Incref();
499 }
500 Regexp* nre = new Regexp(kRegexpCapture, re->parse_flags());
501 nre->AllocSub(1);
502 nre->sub()[0] = newsub;
503 nre->cap_ = re->cap();
504 nre->simple_ = true;
505 return nre;
506 }
507
508 case kRegexpStar:
509 case kRegexpPlus:
510 case kRegexpQuest: {
511 Regexp* newsub = child_args[0];
512 // Special case: repeat the empty string as much as
513 // you want, but it's still the empty string.
514 if (newsub->op() == kRegexpEmptyMatch)
515 return newsub;
516
517 // These are simple as long as the subpiece is simple.
518 if (newsub == re->sub()[0]) {
519 newsub->Decref();
520 re->simple_ = true;
521 return re->Incref();
522 }
523
524 // These are also idempotent if flags are constant.
525 if (re->op() == newsub->op() &&
526 re->parse_flags() == newsub->parse_flags())
527 return newsub;
528
529 Regexp* nre = new Regexp(re->op(), re->parse_flags());
530 nre->AllocSub(1);
531 nre->sub()[0] = newsub;
532 nre->simple_ = true;
533 return nre;
534 }
535
536 case kRegexpRepeat: {
537 Regexp* newsub = child_args[0];
538 // Special case: repeat the empty string as much as
539 // you want, but it's still the empty string.
540 if (newsub->op() == kRegexpEmptyMatch)
541 return newsub;
542
543 Regexp* nre = SimplifyRepeat(newsub, re->min_, re->max_,
544 re->parse_flags());
545 newsub->Decref();
546 nre->simple_ = true;
547 return nre;
548 }
549
550 case kRegexpCharClass: {
551 Regexp* nre = SimplifyCharClass(re);
552 nre->simple_ = true;
553 return nre;
554 }
555 }
556
557 LOG(ERROR) << "Simplify case not handled: " << re->op();
558 return re->Incref();
559 }
560
561 // Creates a concatenation of two Regexp, consuming refs to re1 and re2.
562 // Returns a new Regexp, handing the ref to the caller.
Concat2(Regexp * re1,Regexp * re2,Regexp::ParseFlags parse_flags)563 Regexp* SimplifyWalker::Concat2(Regexp* re1, Regexp* re2,
564 Regexp::ParseFlags parse_flags) {
565 Regexp* re = new Regexp(kRegexpConcat, parse_flags);
566 re->AllocSub(2);
567 Regexp** subs = re->sub();
568 subs[0] = re1;
569 subs[1] = re2;
570 return re;
571 }
572
573 // Simplifies the expression re{min,max} in terms of *, +, and ?.
574 // Returns a new regexp. Does not edit re. Does not consume reference to re.
575 // Caller must Decref return value when done with it.
576 // The result will *not* necessarily have the right capturing parens
577 // if you call ToString() and re-parse it: (x){2} becomes (x)(x),
578 // but in the Regexp* representation, both (x) are marked as $1.
SimplifyRepeat(Regexp * re,int min,int max,Regexp::ParseFlags f)579 Regexp* SimplifyWalker::SimplifyRepeat(Regexp* re, int min, int max,
580 Regexp::ParseFlags f) {
581 // x{n,} means at least n matches of x.
582 if (max == -1) {
583 // Special case: x{0,} is x*
584 if (min == 0)
585 return Regexp::Star(re->Incref(), f);
586
587 // Special case: x{1,} is x+
588 if (min == 1)
589 return Regexp::Plus(re->Incref(), f);
590
591 // General case: x{4,} is xxxx+
592 PODArray<Regexp*> nre_subs(min);
593 for (int i = 0; i < min-1; i++)
594 nre_subs[i] = re->Incref();
595 nre_subs[min-1] = Regexp::Plus(re->Incref(), f);
596 return Regexp::Concat(nre_subs.data(), min, f);
597 }
598
599 // Special case: (x){0} matches only empty string.
600 if (min == 0 && max == 0)
601 return new Regexp(kRegexpEmptyMatch, f);
602
603 // Special case: x{1} is just x.
604 if (min == 1 && max == 1)
605 return re->Incref();
606
607 // General case: x{n,m} means n copies of x and m copies of x?.
608 // The machine will do less work if we nest the final m copies,
609 // so that x{2,5} = xx(x(x(x)?)?)?
610
611 // Build leading prefix: xx. Capturing only on the last one.
612 Regexp* nre = NULL;
613 if (min > 0) {
614 PODArray<Regexp*> nre_subs(min);
615 for (int i = 0; i < min; i++)
616 nre_subs[i] = re->Incref();
617 nre = Regexp::Concat(nre_subs.data(), min, f);
618 }
619
620 // Build and attach suffix: (x(x(x)?)?)?
621 if (max > min) {
622 Regexp* suf = Regexp::Quest(re->Incref(), f);
623 for (int i = min+1; i < max; i++)
624 suf = Regexp::Quest(Concat2(re->Incref(), suf, f), f);
625 if (nre == NULL)
626 nre = suf;
627 else
628 nre = Concat2(nre, suf, f);
629 }
630
631 if (nre == NULL) {
632 // Some degenerate case, like min > max, or min < max < 0.
633 // This shouldn't happen, because the parser rejects such regexps.
634 LOG(DFATAL) << "Malformed repeat " << re->ToString() << " " << min << " " << max;
635 return new Regexp(kRegexpNoMatch, f);
636 }
637
638 return nre;
639 }
640
641 // Simplifies a character class.
642 // Caller must Decref return value when done with it.
SimplifyCharClass(Regexp * re)643 Regexp* SimplifyWalker::SimplifyCharClass(Regexp* re) {
644 CharClass* cc = re->cc();
645
646 // Special cases
647 if (cc->empty())
648 return new Regexp(kRegexpNoMatch, re->parse_flags());
649 if (cc->full())
650 return new Regexp(kRegexpAnyChar, re->parse_flags());
651
652 return re->Incref();
653 }
654
655 } // namespace re2
656