• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Regular expression generator: generates all possible
6 // regular expressions within parameters (see regexp_generator.h for details).
7 
8 // The regexp generator first generates a sequence of commands in a simple
9 // postfix language.  Each command in the language is a string,
10 // like "a" or "%s*" or "%s|%s".
11 //
12 // To evaluate a command, enough arguments are popped from the value stack to
13 // plug into the %s slots.  Then the result is pushed onto the stack.
14 // For example, the command sequence
15 //      a b %s%s c
16 // results in the stack
17 //      ab c
18 //
19 // GeneratePostfix generates all possible command sequences.
20 // Then RunPostfix turns each sequence into a regular expression
21 // and passes the regexp to HandleRegexp.
22 
23 #include "re2/testing/regexp_generator.h"
24 
25 #include <stddef.h>
26 #include <stdint.h>
27 #include <stdio.h>
28 #include <string.h>
29 
30 #include <memory>
31 #include <random>
32 #include <stack>
33 #include <string>
34 #include <vector>
35 
36 #include "absl/base/macros.h"
37 #include "absl/log/absl_check.h"
38 #include "absl/log/absl_log.h"
39 #include "absl/strings/escaping.h"
40 #include "absl/strings/str_format.h"
41 #include "absl/strings/string_view.h"
42 #include "util/utf.h"
43 
44 namespace re2 {
45 
46 // Returns a vector of the egrep regexp operators.
EgrepOps()47 const std::vector<std::string>& RegexpGenerator::EgrepOps() {
48   static const char *ops[] = {
49     "%s%s",
50     "%s|%s",
51     "%s*",
52     "%s+",
53     "%s?",
54     "%s\\C*",
55   };
56   static std::vector<std::string> v(ops, ops + ABSL_ARRAYSIZE(ops));
57   return v;
58 }
59 
RegexpGenerator(int maxatoms,int maxops,const std::vector<std::string> & atoms,const std::vector<std::string> & ops)60 RegexpGenerator::RegexpGenerator(int maxatoms, int maxops,
61                                  const std::vector<std::string>& atoms,
62                                  const std::vector<std::string>& ops)
63     : maxatoms_(maxatoms), maxops_(maxops), atoms_(atoms), ops_(ops) {
64   // Degenerate case.
65   if (atoms_.empty())
66     maxatoms_ = 0;
67   if (ops_.empty())
68     maxops_ = 0;
69 }
70 
71 // Generates all possible regular expressions (within the parameters),
72 // calling HandleRegexp for each one.
Generate()73 void RegexpGenerator::Generate() {
74   std::vector<std::string> postfix;
75   GeneratePostfix(&postfix, 0, 0, 0);
76 }
77 
78 // Generates random regular expressions, calling HandleRegexp for each one.
GenerateRandom(int32_t seed,int n)79 void RegexpGenerator::GenerateRandom(int32_t seed, int n) {
80   rng_.seed(seed);
81 
82   for (int i = 0; i < n; i++) {
83     std::vector<std::string> postfix;
84     GenerateRandomPostfix(&postfix, 0, 0, 0);
85   }
86 }
87 
88 // Counts and returns the number of occurrences of "%s" in s.
CountArgs(const std::string & s)89 static int CountArgs(const std::string& s) {
90   const char *p = s.c_str();
91   int n = 0;
92   while ((p = strstr(p, "%s")) != NULL) {
93     p += 2;
94     n++;
95   }
96   return n;
97 }
98 
99 // Generates all possible postfix command sequences.
100 // Each sequence is handed off to RunPostfix to generate a regular expression.
101 // The arguments are:
102 //   post:  the current postfix sequence
103 //   nstk:  the number of elements that would be on the stack after executing
104 //          the sequence
105 //   ops:   the number of operators used in the sequence
106 //   atoms: the number of atoms used in the sequence
107 // For example, if post were ["a", "b", "%s%s", "c"],
108 // then nstk = 2, ops = 1, atoms = 3.
109 //
110 // The initial call should be GeneratePostfix([empty vector], 0, 0, 0).
111 //
GeneratePostfix(std::vector<std::string> * post,int nstk,int ops,int atoms)112 void RegexpGenerator::GeneratePostfix(std::vector<std::string>* post,
113                                       int nstk, int ops, int atoms) {
114   if (nstk == 1)
115     RunPostfix(*post);
116 
117   // Early out: if used too many operators or can't
118   // get back down to a single expression on the stack
119   // using binary operators, give up.
120   if (ops + nstk - 1 > maxops_)
121     return;
122 
123   // Add atoms if there is room.
124   if (atoms < maxatoms_) {
125     for (size_t i = 0; i < atoms_.size(); i++) {
126       post->push_back(atoms_[i]);
127       GeneratePostfix(post, nstk + 1, ops, atoms + 1);
128       post->pop_back();
129     }
130   }
131 
132   // Add operators if there are enough arguments.
133   if (ops < maxops_) {
134     for (size_t i = 0; i < ops_.size(); i++) {
135       const std::string& fmt = ops_[i];
136       int nargs = CountArgs(fmt);
137       if (nargs <= nstk) {
138         post->push_back(fmt);
139         GeneratePostfix(post, nstk - nargs + 1, ops + 1, atoms);
140         post->pop_back();
141       }
142     }
143   }
144 }
145 
146 // Generates a random postfix command sequence.
147 // Stops and returns true once a single sequence has been generated.
GenerateRandomPostfix(std::vector<std::string> * post,int nstk,int ops,int atoms)148 bool RegexpGenerator::GenerateRandomPostfix(std::vector<std::string>* post,
149                                             int nstk, int ops, int atoms) {
150   std::uniform_int_distribution<int> random_stop(0, maxatoms_ - atoms);
151   std::uniform_int_distribution<int> random_bit(0, 1);
152   std::uniform_int_distribution<int> random_ops_index(
153       0, static_cast<int>(ops_.size()) - 1);
154   std::uniform_int_distribution<int> random_atoms_index(
155       0, static_cast<int>(atoms_.size()) - 1);
156 
157   for (;;) {
158     // Stop if we get to a single element, but only sometimes.
159     if (nstk == 1 && random_stop(rng_) == 0) {
160       RunPostfix(*post);
161       return true;
162     }
163 
164     // Early out: if used too many operators or can't
165     // get back down to a single expression on the stack
166     // using binary operators, give up.
167     if (ops + nstk - 1 > maxops_)
168       return false;
169 
170     // Add operators if there are enough arguments.
171     if (ops < maxops_ && random_bit(rng_) == 0) {
172       const std::string& fmt = ops_[random_ops_index(rng_)];
173       int nargs = CountArgs(fmt);
174       if (nargs <= nstk) {
175         post->push_back(fmt);
176         bool ret = GenerateRandomPostfix(post, nstk - nargs + 1,
177                                          ops + 1, atoms);
178         post->pop_back();
179         if (ret)
180           return true;
181       }
182     }
183 
184     // Add atoms if there is room.
185     if (atoms < maxatoms_ && random_bit(rng_) == 0) {
186       post->push_back(atoms_[random_atoms_index(rng_)]);
187       bool ret = GenerateRandomPostfix(post, nstk + 1, ops, atoms + 1);
188       post->pop_back();
189       if (ret)
190         return true;
191     }
192   }
193 }
194 
195 // Interprets the postfix command sequence to create a regular expression
196 // passed to HandleRegexp.  The results of operators like %s|%s are wrapped
197 // in (?: ) to avoid needing to maintain a precedence table.
RunPostfix(const std::vector<std::string> & post)198 void RegexpGenerator::RunPostfix(const std::vector<std::string>& post) {
199   std::stack<std::string> regexps;
200   for (size_t i = 0; i < post.size(); i++) {
201     switch (CountArgs(post[i])) {
202       default:
203         ABSL_LOG(FATAL) << "Bad operator: " << post[i];
204       case 0:
205         regexps.push(post[i]);
206         break;
207       case 1: {
208         auto fmt = absl::ParsedFormat<'s'>::New(post[i]);
209         ABSL_CHECK(fmt != nullptr);
210         std::string a = regexps.top();
211         regexps.pop();
212         regexps.push("(?:" + absl::StrFormat(*fmt, a) + ")");
213         break;
214       }
215       case 2: {
216         auto fmt = absl::ParsedFormat<'s', 's'>::New(post[i]);
217         ABSL_CHECK(fmt != nullptr);
218         std::string b = regexps.top();
219         regexps.pop();
220         std::string a = regexps.top();
221         regexps.pop();
222         regexps.push("(?:" + absl::StrFormat(*fmt, a, b) + ")");
223         break;
224       }
225     }
226   }
227 
228   if (regexps.size() != 1) {
229     // Internal error - should never happen.
230     absl::PrintF("Bad regexp program:\n");
231     for (size_t i = 0; i < post.size(); i++) {
232       absl::PrintF("  %s\n", absl::CEscape(post[i]));
233     }
234     absl::PrintF("Stack after running program:\n");
235     while (!regexps.empty()) {
236       absl::PrintF("  %s\n", absl::CEscape(regexps.top()));
237       regexps.pop();
238     }
239     ABSL_LOG(FATAL) << "Bad regexp program.";
240   }
241 
242   HandleRegexp(regexps.top());
243   HandleRegexp("^(?:" + regexps.top() + ")$");
244   HandleRegexp("^(?:" + regexps.top() + ")");
245   HandleRegexp("(?:" + regexps.top() + ")$");
246 }
247 
248 // Split s into an vector of strings, one for each UTF-8 character.
Explode(absl::string_view s)249 std::vector<std::string> Explode(absl::string_view s) {
250   std::vector<std::string> v;
251 
252   for (const char *q = s.data(); q < s.data() + s.size(); ) {
253     const char* p = q;
254     Rune r;
255     q += chartorune(&r, q);
256     v.push_back(std::string(p, q - p));
257   }
258 
259   return v;
260 }
261 
262 // Split string everywhere a substring is found, returning
263 // vector of pieces.
Split(absl::string_view sep,absl::string_view s)264 std::vector<std::string> Split(absl::string_view sep, absl::string_view s) {
265   std::vector<std::string> v;
266 
267   if (sep.empty())
268     return Explode(s);
269 
270   const char *p = s.data();
271   for (const char *q = s.data(); q + sep.size() <= s.data() + s.size(); q++) {
272     if (absl::string_view(q, sep.size()) == sep) {
273       v.push_back(std::string(p, q - p));
274       p = q + sep.size();
275       q = p - 1;  // -1 for ++ in loop
276       continue;
277     }
278   }
279   if (p < s.data() + s.size())
280     v.push_back(std::string(p, s.data() + s.size() - p));
281   return v;
282 }
283 
284 }  // namespace re2
285