1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp. (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 // * the regexp contains a repetition of an empty string,
11 // like (a*)* or (a*)+. In this case, PCRE will treat
12 // the repetition sequence as ending with an empty string,
13 // while this library does not.
14 // * Perl and PCRE differ on whether \v matches \n.
15 // For historical reasons, this library implements the Perl behavior.
16 // * Perl and PCRE allow $ in one-line mode to match either the very
17 // end of the text or just before a \n at the end of the text.
18 // This library requires it to match only the end of the text.
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 // match the end of the text if the last character is a \n.
21 // This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24
25 #include "util/util.h"
26 #include "re2/regexp.h"
27 #include "re2/walker-inl.h"
28
29 namespace re2 {
30
31 // Returns whether re might match an empty string.
32 static bool CanBeEmptyString(Regexp *re);
33
34 // Walker class to compute whether library handles a regexp
35 // exactly as PCRE would. See comment at top for conditions.
36
37 class PCREWalker : public Regexp::Walker<bool> {
38 public:
PCREWalker()39 PCREWalker() {}
40 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg, bool* child_args,
41 int nchild_args);
42
ShortVisit(Regexp * re,bool a)43 bool ShortVisit(Regexp* re, bool a) {
44 // Should never be called: we use Walk not WalkExponential.
45 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
46 return a;
47 }
48 };
49
50 // Called after visiting each of re's children and accumulating
51 // the return values in child_args. So child_args contains whether
52 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)53 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
54 bool* child_args, int nchild_args) {
55 // If children failed, so do we.
56 for (int i = 0; i < nchild_args; i++)
57 if (!child_args[i])
58 return false;
59
60 // Otherwise look for other reasons to fail.
61 switch (re->op()) {
62 // Look for repeated empty string.
63 case kRegexpStar:
64 case kRegexpPlus:
65 case kRegexpQuest:
66 if (CanBeEmptyString(re->sub()[0]))
67 return false;
68 break;
69 case kRegexpRepeat:
70 if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
71 return false;
72 break;
73
74 // Look for \v
75 case kRegexpLiteral:
76 if (re->rune() == '\v')
77 return false;
78 break;
79
80 // Look for $ in single-line mode.
81 case kRegexpEndText:
82 case kRegexpEmptyMatch:
83 if (re->parse_flags() & Regexp::WasDollar)
84 return false;
85 break;
86
87 // Look for ^ in multi-line mode.
88 case kRegexpBeginLine:
89 // No condition: in single-line mode ^ becomes kRegexpBeginText.
90 return false;
91
92 default:
93 break;
94 }
95
96 // Not proven guilty.
97 return true;
98 }
99
100 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()101 bool Regexp::MimicsPCRE() {
102 PCREWalker w;
103 return w.Walk(this, true);
104 }
105
106
107 // Walker class to compute whether a Regexp can match an empty string.
108 // It is okay to overestimate. For example, \b\B cannot match an empty
109 // string, because \b and \B are mutually exclusive, but this isn't
110 // that smart and will say it can. Spurious empty strings
111 // will reduce the number of regexps we sanity check against PCRE,
112 // but they won't break anything.
113
114 class EmptyStringWalker : public Regexp::Walker<bool> {
115 public:
EmptyStringWalker()116 EmptyStringWalker() { }
117 bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
118 bool* child_args, int nchild_args);
119
ShortVisit(Regexp * re,bool a)120 bool ShortVisit(Regexp* re, bool a) {
121 // Should never be called: we use Walk not WalkExponential.
122 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
123 return a;
124 }
125
126 private:
127 DISALLOW_EVIL_CONSTRUCTORS(EmptyStringWalker);
128 };
129
130 // Called after visiting re's children. child_args contains the return
131 // value from each of the children's PostVisits (i.e., whether each child
132 // can match an empty string). Returns whether this clause can match an
133 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)134 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
135 bool* child_args, int nchild_args) {
136 switch (re->op()) {
137 case kRegexpNoMatch: // never empty
138 case kRegexpLiteral:
139 case kRegexpAnyChar:
140 case kRegexpAnyByte:
141 case kRegexpCharClass:
142 case kRegexpLiteralString:
143 return false;
144
145 case kRegexpEmptyMatch: // always empty
146 case kRegexpBeginLine: // always empty, when they match
147 case kRegexpEndLine:
148 case kRegexpNoWordBoundary:
149 case kRegexpWordBoundary:
150 case kRegexpBeginText:
151 case kRegexpEndText:
152 case kRegexpStar: // can always be empty
153 case kRegexpQuest:
154 case kRegexpHaveMatch:
155 return true;
156
157 case kRegexpConcat: // can be empty if all children can
158 for (int i = 0; i < nchild_args; i++)
159 if (!child_args[i])
160 return false;
161 return true;
162
163 case kRegexpAlternate: // can be empty if any child can
164 for (int i = 0; i < nchild_args; i++)
165 if (child_args[i])
166 return true;
167 return false;
168
169 case kRegexpPlus: // can be empty if the child can
170 case kRegexpCapture:
171 return child_args[0];
172
173 case kRegexpRepeat: // can be empty if child can or is x{0}
174 return child_args[0] || re->min() == 0;
175 }
176 return false;
177 }
178
179 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)180 static bool CanBeEmptyString(Regexp* re) {
181 EmptyStringWalker w;
182 return w.Walk(re, true);
183 }
184
185 } // namespace re2
186