1 // Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include "util/test.h"
6 #include "util/thread.h"
7 #include "re2/prog.h"
8 #include "re2/re2.h"
9 #include "re2/regexp.h"
10 #include "re2/testing/regexp_generator.h"
11 #include "re2/testing/string_generator.h"
12
13 DECLARE_bool(re2_dfa_bail_when_slow);
14
15 DEFINE_int32(size, 8, "log2(number of DFA nodes)");
16 DEFINE_int32(repeat, 2, "Repetition count.");
17 DEFINE_int32(threads, 4, "number of threads");
18
19 namespace re2 {
20
21 // Check that multithreaded access to DFA class works.
22
23 // Helper thread: builds entire DFA for prog.
24 class BuildThread : public Thread {
25 public:
BuildThread(Prog * prog)26 BuildThread(Prog* prog) : prog_(prog) {}
Run()27 virtual void Run() {
28 CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
29 }
30
31 private:
32 Prog* prog_;
33 };
34
TEST(Multithreaded,BuildEntireDFA)35 TEST(Multithreaded, BuildEntireDFA) {
36 // Create regexp with 2^FLAGS_size states in DFA.
37 string s = "a";
38 for (int i = 0; i < FLAGS_size; i++)
39 s += "[ab]";
40 s += "b";
41
42 // Check that single-threaded code works.
43 {
44 //LOG(INFO) << s;
45 Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
46 CHECK(re);
47 Prog* prog = re->CompileToProg(0);
48 CHECK(prog);
49 BuildThread* t = new BuildThread(prog);
50 t->SetJoinable(true);
51 t->Start();
52 t->Join();
53 delete t;
54 delete prog;
55 re->Decref();
56 }
57
58 // Build the DFA simultaneously in a bunch of threads.
59 for (int i = 0; i < FLAGS_repeat; i++) {
60 Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
61 CHECK(re);
62 Prog* prog = re->CompileToProg(0);
63 CHECK(prog);
64
65 vector<BuildThread*> threads;
66 for (int j = 0; j < FLAGS_threads; j++) {
67 BuildThread *t = new BuildThread(prog);
68 t->SetJoinable(true);
69 threads.push_back(t);
70 }
71 for (int j = 0; j < FLAGS_threads; j++)
72 threads[j]->Start();
73 for (int j = 0; j < FLAGS_threads; j++) {
74 threads[j]->Join();
75 delete threads[j];
76 }
77
78 // One more compile, to make sure everything is okay.
79 prog->BuildEntireDFA(Prog::kFirstMatch);
80 delete prog;
81 re->Decref();
82 }
83 }
84
85 // Check that DFA size requirements are followed.
86 // BuildEntireDFA will, like SearchDFA, stop building out
87 // the DFA once the memory limits are reached.
TEST(SingleThreaded,BuildEntireDFA)88 TEST(SingleThreaded, BuildEntireDFA) {
89 // Create regexp with 2^30 states in DFA.
90 string s = "a";
91 for (int i = 0; i < 30; i++)
92 s += "[ab]";
93 s += "b";
94
95 //LOG(INFO) << s;
96 Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
97 CHECK(re);
98 int max = 24;
99 for (int i = 17; i < max; i++) {
100 int limit = 1<<i;
101 int usage;
102 //int progusage, dfamem;
103 {
104 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
105 Prog* prog = re->CompileToProg(limit);
106 CHECK(prog);
107 //progusage = m.HeapGrowth();
108 //dfamem = prog->dfa_mem();
109 prog->BuildEntireDFA(Prog::kFirstMatch);
110 prog->BuildEntireDFA(Prog::kLongestMatch);
111 usage = m.HeapGrowth();
112 delete prog;
113 }
114 if (!UsingMallocCounter)
115 continue;
116 //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
117 // limit, progusage, dfamem, usage);
118 CHECK_GT(usage, limit*9/10);
119 CHECK_LT(usage, limit + (16<<10)); // 16kB of slop okay
120 }
121 re->Decref();
122 }
123
124 // Generates and returns a string over binary alphabet {0,1} that contains
125 // all possible binary sequences of length n as subsequences. The obvious
126 // brute force method would generate a string of length n * 2^n, but this
127 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
128 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
129 // Such a string is useful for testing a DFA. If you have a DFA
130 // where distinct last n bytes implies distinct states, then running on a
131 // DeBruijn string causes the DFA to need to create a new state at every
132 // position in the input, never reusing any states until it gets to the
133 // end of the string. This is the worst possible case for DFA execution.
DeBruijnString(int n)134 static string DeBruijnString(int n) {
135 CHECK_LT(n, 8*sizeof(int));
136 CHECK_GT(n, 0);
137
138 vector<bool> did(1<<n);
139 for (int i = 0; i < 1<<n; i++)
140 did[i] = false;
141
142 string s;
143 for (int i = 0; i < n-1; i++)
144 s.append("0");
145 int bits = 0;
146 int mask = (1<<n) - 1;
147 for (int i = 0; i < (1<<n); i++) {
148 bits <<= 1;
149 bits &= mask;
150 if (!did[bits|1]) {
151 bits |= 1;
152 s.append("1");
153 } else {
154 s.append("0");
155 }
156 CHECK(!did[bits]);
157 did[bits] = true;
158 }
159 return s;
160 }
161
162 // Test that the DFA gets the right result even if it runs
163 // out of memory during a search. The regular expression
164 // 0[01]{n}$ matches a binary string of 0s and 1s only if
165 // the (n+1)th-to-last character is a 0. Matching this in
166 // a single forward pass (as done by the DFA) requires
167 // keeping one bit for each of the last n+1 characters
168 // (whether each was a 0), or 2^(n+1) possible states.
169 // If we run this regexp to search in a string that contains
170 // every possible n-character binary string as a substring,
171 // then it will have to run through at least 2^n states.
172 // States are big data structures -- certainly more than 1 byte --
173 // so if the DFA can search correctly while staying within a
174 // 2^n byte limit, it must be handling out-of-memory conditions
175 // gracefully.
TEST(SingleThreaded,SearchDFA)176 TEST(SingleThreaded, SearchDFA) {
177 // Choice of n is mostly arbitrary, except that:
178 // * making n too big makes the test run for too long.
179 // * making n too small makes the DFA refuse to run,
180 // because it has so little memory compared to the program size.
181 // Empirically, n = 18 is a good compromise between the two.
182 const int n = 18;
183
184 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
185 Regexp::LikePerl, NULL);
186 CHECK(re);
187
188 // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
189 // which is not a match for 0[01]{n}$. Adding one more 0 is a match.
190 string no_match = DeBruijnString(n);
191 string match = no_match + "0";
192
193 // The De Bruijn string is the worst case input for this regexp.
194 // By default, the DFA will notice that it is flushing its cache
195 // too frequently and will bail out early, so that RE2 can use the
196 // NFA implementation instead. (The DFA loses its speed advantage
197 // if it can't get a good cache hit rate.)
198 // Tell the DFA to trudge along instead.
199 FLAGS_re2_dfa_bail_when_slow = false;
200
201 int64 usage;
202 int64 peak_usage;
203 {
204 testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
205 Prog* prog = re->CompileToProg(1<<n);
206 CHECK(prog);
207 for (int i = 0; i < 10; i++) {
208 bool matched, failed = false;
209 matched = prog->SearchDFA(match, NULL,
210 Prog::kUnanchored, Prog::kFirstMatch,
211 NULL, &failed, NULL);
212 CHECK(!failed);
213 CHECK(matched);
214 matched = prog->SearchDFA(no_match, NULL,
215 Prog::kUnanchored, Prog::kFirstMatch,
216 NULL, &failed, NULL);
217 CHECK(!failed);
218 CHECK(!matched);
219 }
220 usage = m.HeapGrowth();
221 peak_usage = m.PeakHeapGrowth();
222 delete prog;
223 }
224 re->Decref();
225
226 if (!UsingMallocCounter)
227 return;
228 //LOG(INFO) << "usage " << usage << " " << peak_usage;
229 CHECK_LT(usage, 1<<n);
230 CHECK_LT(peak_usage, 1<<n);
231 }
232
233 // Helper thread: searches for match, which should match,
234 // and no_match, which should not.
235 class SearchThread : public Thread {
236 public:
SearchThread(Prog * prog,const StringPiece & match,const StringPiece & no_match)237 SearchThread(Prog* prog, const StringPiece& match,
238 const StringPiece& no_match)
239 : prog_(prog), match_(match), no_match_(no_match) {}
240
Run()241 virtual void Run() {
242 for (int i = 0; i < 2; i++) {
243 bool matched, failed = false;
244 matched = prog_->SearchDFA(match_, NULL,
245 Prog::kUnanchored, Prog::kFirstMatch,
246 NULL, &failed, NULL);
247 CHECK(!failed);
248 CHECK(matched);
249 matched = prog_->SearchDFA(no_match_, NULL,
250 Prog::kUnanchored, Prog::kFirstMatch,
251 NULL, &failed, NULL);
252 CHECK(!failed);
253 CHECK(!matched);
254 }
255 }
256
257 private:
258 Prog* prog_;
259 StringPiece match_;
260 StringPiece no_match_;
261 };
262
TEST(Multithreaded,SearchDFA)263 TEST(Multithreaded, SearchDFA) {
264 // Same as single-threaded test above.
265 const int n = 18;
266 Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
267 Regexp::LikePerl, NULL);
268 CHECK(re);
269 string no_match = DeBruijnString(n);
270 string match = no_match + "0";
271 FLAGS_re2_dfa_bail_when_slow = false;
272
273 // Check that single-threaded code works.
274 {
275 Prog* prog = re->CompileToProg(1<<n);
276 CHECK(prog);
277 SearchThread* t = new SearchThread(prog, match, no_match);
278 t->SetJoinable(true);
279 t->Start();
280 t->Join();
281 delete t;
282 delete prog;
283 }
284
285 // Run the search simultaneously in a bunch of threads.
286 // Reuse same flags for Multithreaded.BuildDFA above.
287 for (int i = 0; i < FLAGS_repeat; i++) {
288 //LOG(INFO) << "Search " << i;
289 Prog* prog = re->CompileToProg(1<<n);
290 CHECK(prog);
291
292 vector<SearchThread*> threads;
293 for (int j = 0; j < FLAGS_threads; j++) {
294 SearchThread *t = new SearchThread(prog, match, no_match);
295 t->SetJoinable(true);
296 threads.push_back(t);
297 }
298 for (int j = 0; j < FLAGS_threads; j++)
299 threads[j]->Start();
300 for (int j = 0; j < FLAGS_threads; j++) {
301 threads[j]->Join();
302 delete threads[j];
303 }
304 delete prog;
305 }
306 re->Decref();
307 }
308
309 struct ReverseTest {
310 const char *regexp;
311 const char *text;
312 bool match;
313 };
314
315 // Test that reverse DFA handles anchored/unanchored correctly.
316 // It's in the DFA interface but not used by RE2.
317 ReverseTest reverse_tests[] = {
318 { "\\A(a|b)", "abc", true },
319 { "(a|b)\\z", "cba", true },
320 { "\\A(a|b)", "cba", false },
321 { "(a|b)\\z", "abc", false },
322 };
323
TEST(DFA,ReverseMatch)324 TEST(DFA, ReverseMatch) {
325 int nfail = 0;
326 for (int i = 0; i < arraysize(reverse_tests); i++) {
327 const ReverseTest& t = reverse_tests[i];
328 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
329 CHECK(re);
330 Prog *prog = re->CompileToReverseProg(0);
331 CHECK(prog);
332 bool failed = false;
333 bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
334 if (matched != t.match) {
335 LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
336 nfail++;
337 }
338 delete prog;
339 re->Decref();
340 }
341 EXPECT_EQ(nfail, 0);
342 }
343
344 } // namespace re2
345