1 // Copyright 2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Determine whether this library should match PCRE exactly
6 // for a particular Regexp. (If so, the testing framework can
7 // check that it does.)
8 //
9 // This library matches PCRE except in these cases:
10 // * the regexp contains a repetition of an empty string,
11 // like (a*)* or (a*)+. In this case, PCRE will treat
12 // the repetition sequence as ending with an empty string,
13 // while this library does not.
14 // * Perl and PCRE differ on whether \v matches \n.
15 // For historical reasons, this library implements the Perl behavior.
16 // * Perl and PCRE allow $ in one-line mode to match either the very
17 // end of the text or just before a \n at the end of the text.
18 // This library requires it to match only the end of the text.
19 // * Similarly, Perl and PCRE do not allow ^ in multi-line mode to
20 // match the end of the text if the last character is a \n.
21 // This library does allow it.
22 //
23 // Regexp::MimicsPCRE checks for any of these conditions.
24
25 #include "util/util.h"
26 #include "util/logging.h"
27 #include "re2/regexp.h"
28 #include "re2/walker-inl.h"
29
30 namespace re2 {
31
32 // Returns whether re might match an empty string.
33 static bool CanBeEmptyString(Regexp *re);
34
35 // Walker class to compute whether library handles a regexp
36 // exactly as PCRE would. See comment at top for conditions.
37
38 class PCREWalker : public Regexp::Walker<bool> {
39 public:
PCREWalker()40 PCREWalker() {}
41
42 virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
43 bool* child_args, int nchild_args);
44
ShortVisit(Regexp * re,bool a)45 virtual bool ShortVisit(Regexp* re, bool a) {
46 // Should never be called: we use Walk(), not WalkExponential().
47 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
48 LOG(DFATAL) << "PCREWalker::ShortVisit called";
49 #endif
50 return a;
51 }
52
53 private:
54 PCREWalker(const PCREWalker&) = delete;
55 PCREWalker& operator=(const PCREWalker&) = delete;
56 };
57
58 // Called after visiting each of re's children and accumulating
59 // the return values in child_args. So child_args contains whether
60 // this library mimics PCRE for those subexpressions.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)61 bool PCREWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
62 bool* child_args, int nchild_args) {
63 // If children failed, so do we.
64 for (int i = 0; i < nchild_args; i++)
65 if (!child_args[i])
66 return false;
67
68 // Otherwise look for other reasons to fail.
69 switch (re->op()) {
70 // Look for repeated empty string.
71 case kRegexpStar:
72 case kRegexpPlus:
73 case kRegexpQuest:
74 if (CanBeEmptyString(re->sub()[0]))
75 return false;
76 break;
77 case kRegexpRepeat:
78 if (re->max() == -1 && CanBeEmptyString(re->sub()[0]))
79 return false;
80 break;
81
82 // Look for \v
83 case kRegexpLiteral:
84 if (re->rune() == '\v')
85 return false;
86 break;
87
88 // Look for $ in single-line mode.
89 case kRegexpEndText:
90 case kRegexpEmptyMatch:
91 if (re->parse_flags() & Regexp::WasDollar)
92 return false;
93 break;
94
95 // Look for ^ in multi-line mode.
96 case kRegexpBeginLine:
97 // No condition: in single-line mode ^ becomes kRegexpBeginText.
98 return false;
99
100 default:
101 break;
102 }
103
104 // Not proven guilty.
105 return true;
106 }
107
108 // Returns whether this regexp's behavior will mimic PCRE's exactly.
MimicsPCRE()109 bool Regexp::MimicsPCRE() {
110 PCREWalker w;
111 return w.Walk(this, true);
112 }
113
114
115 // Walker class to compute whether a Regexp can match an empty string.
116 // It is okay to overestimate. For example, \b\B cannot match an empty
117 // string, because \b and \B are mutually exclusive, but this isn't
118 // that smart and will say it can. Spurious empty strings
119 // will reduce the number of regexps we sanity check against PCRE,
120 // but they won't break anything.
121
122 class EmptyStringWalker : public Regexp::Walker<bool> {
123 public:
EmptyStringWalker()124 EmptyStringWalker() {}
125
126 virtual bool PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
127 bool* child_args, int nchild_args);
128
ShortVisit(Regexp * re,bool a)129 virtual bool ShortVisit(Regexp* re, bool a) {
130 // Should never be called: we use Walk(), not WalkExponential().
131 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
132 LOG(DFATAL) << "EmptyStringWalker::ShortVisit called";
133 #endif
134 return a;
135 }
136
137 private:
138 EmptyStringWalker(const EmptyStringWalker&) = delete;
139 EmptyStringWalker& operator=(const EmptyStringWalker&) = delete;
140 };
141
142 // Called after visiting re's children. child_args contains the return
143 // value from each of the children's PostVisits (i.e., whether each child
144 // can match an empty string). Returns whether this clause can match an
145 // empty string.
PostVisit(Regexp * re,bool parent_arg,bool pre_arg,bool * child_args,int nchild_args)146 bool EmptyStringWalker::PostVisit(Regexp* re, bool parent_arg, bool pre_arg,
147 bool* child_args, int nchild_args) {
148 switch (re->op()) {
149 case kRegexpNoMatch: // never empty
150 case kRegexpLiteral:
151 case kRegexpAnyChar:
152 case kRegexpAnyByte:
153 case kRegexpCharClass:
154 case kRegexpLiteralString:
155 return false;
156
157 case kRegexpEmptyMatch: // always empty
158 case kRegexpBeginLine: // always empty, when they match
159 case kRegexpEndLine:
160 case kRegexpNoWordBoundary:
161 case kRegexpWordBoundary:
162 case kRegexpBeginText:
163 case kRegexpEndText:
164 case kRegexpStar: // can always be empty
165 case kRegexpQuest:
166 case kRegexpHaveMatch:
167 return true;
168
169 case kRegexpConcat: // can be empty if all children can
170 for (int i = 0; i < nchild_args; i++)
171 if (!child_args[i])
172 return false;
173 return true;
174
175 case kRegexpAlternate: // can be empty if any child can
176 for (int i = 0; i < nchild_args; i++)
177 if (child_args[i])
178 return true;
179 return false;
180
181 case kRegexpPlus: // can be empty if the child can
182 case kRegexpCapture:
183 return child_args[0];
184
185 case kRegexpRepeat: // can be empty if child can or is x{0}
186 return child_args[0] || re->min() == 0;
187 }
188 return false;
189 }
190
191 // Returns whether re can match an empty string.
CanBeEmptyString(Regexp * re)192 static bool CanBeEmptyString(Regexp* re) {
193 EmptyStringWalker w;
194 return w.Walk(re, true);
195 }
196
197 } // namespace re2
198