• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include "util/test.h"
6 #include "util/thread.h"
7 #include "re2/prog.h"
8 #include "re2/re2.h"
9 #include "re2/regexp.h"
10 #include "re2/testing/regexp_generator.h"
11 #include "re2/testing/string_generator.h"
12 
13 DECLARE_bool(re2_dfa_bail_when_slow);
14 
15 DEFINE_int32(size, 8, "log2(number of DFA nodes)");
16 DEFINE_int32(repeat, 2, "Repetition count.");
17 DEFINE_int32(threads, 4, "number of threads");
18 
19 namespace re2 {
20 
21 // Check that multithreaded access to DFA class works.
22 
23 // Helper thread: builds entire DFA for prog.
24 class BuildThread : public Thread {
25  public:
BuildThread(Prog * prog)26   BuildThread(Prog* prog) : prog_(prog) {}
Run()27   virtual void Run() {
28     CHECK(prog_->BuildEntireDFA(Prog::kFirstMatch));
29   }
30 
31  private:
32   Prog* prog_;
33 };
34 
TEST(Multithreaded,BuildEntireDFA)35 TEST(Multithreaded, BuildEntireDFA) {
36   // Create regexp with 2^FLAGS_size states in DFA.
37   string s = "a";
38   for (int i = 0; i < FLAGS_size; i++)
39     s += "[ab]";
40   s += "b";
41 
42   // Check that single-threaded code works.
43   {
44     //LOG(INFO) << s;
45     Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
46     CHECK(re);
47     Prog* prog = re->CompileToProg(0);
48     CHECK(prog);
49     BuildThread* t = new BuildThread(prog);
50     t->SetJoinable(true);
51     t->Start();
52     t->Join();
53     delete t;
54     delete prog;
55     re->Decref();
56   }
57 
58   // Build the DFA simultaneously in a bunch of threads.
59   for (int i = 0; i < FLAGS_repeat; i++) {
60     Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
61     CHECK(re);
62     Prog* prog = re->CompileToProg(0);
63     CHECK(prog);
64 
65     vector<BuildThread*> threads;
66     for (int j = 0; j < FLAGS_threads; j++) {
67       BuildThread *t = new BuildThread(prog);
68       t->SetJoinable(true);
69       threads.push_back(t);
70     }
71     for (int j = 0; j < FLAGS_threads; j++)
72       threads[j]->Start();
73     for (int j = 0; j < FLAGS_threads; j++) {
74       threads[j]->Join();
75       delete threads[j];
76     }
77 
78     // One more compile, to make sure everything is okay.
79     prog->BuildEntireDFA(Prog::kFirstMatch);
80     delete prog;
81     re->Decref();
82   }
83 }
84 
85 // Check that DFA size requirements are followed.
86 // BuildEntireDFA will, like SearchDFA, stop building out
87 // the DFA once the memory limits are reached.
TEST(SingleThreaded,BuildEntireDFA)88 TEST(SingleThreaded, BuildEntireDFA) {
89   // Create regexp with 2^30 states in DFA.
90   string s = "a";
91   for (int i = 0; i < 30; i++)
92     s += "[ab]";
93   s += "b";
94 
95   //LOG(INFO) << s;
96   Regexp* re = Regexp::Parse(s.c_str(), Regexp::LikePerl, NULL);
97   CHECK(re);
98   int max = 24;
99   for (int i = 17; i < max; i++) {
100     int limit = 1<<i;
101     int usage;
102     //int progusage, dfamem;
103     {
104       testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
105       Prog* prog = re->CompileToProg(limit);
106       CHECK(prog);
107       //progusage = m.HeapGrowth();
108       //dfamem = prog->dfa_mem();
109       prog->BuildEntireDFA(Prog::kFirstMatch);
110       prog->BuildEntireDFA(Prog::kLongestMatch);
111       usage = m.HeapGrowth();
112       delete prog;
113     }
114     if (!UsingMallocCounter)
115       continue;
116     //LOG(INFO) << StringPrintf("Limit %d: prog used %d, DFA budget %d, total %d\n",
117     //                          limit, progusage, dfamem, usage);
118     CHECK_GT(usage, limit*9/10);
119     CHECK_LT(usage, limit + (16<<10));  // 16kB of slop okay
120   }
121   re->Decref();
122 }
123 
124 // Generates and returns a string over binary alphabet {0,1} that contains
125 // all possible binary sequences of length n as subsequences.  The obvious
126 // brute force method would generate a string of length n * 2^n, but this
127 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
128 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
129 // Such a string is useful for testing a DFA.  If you have a DFA
130 // where distinct last n bytes implies distinct states, then running on a
131 // DeBruijn string causes the DFA to need to create a new state at every
132 // position in the input, never reusing any states until it gets to the
133 // end of the string.  This is the worst possible case for DFA execution.
DeBruijnString(int n)134 static string DeBruijnString(int n) {
135   CHECK_LT(n, 8*sizeof(int));
136   CHECK_GT(n, 0);
137 
138   vector<bool> did(1<<n);
139   for (int i = 0; i < 1<<n; i++)
140     did[i] = false;
141 
142   string s;
143   for (int i = 0; i < n-1; i++)
144     s.append("0");
145   int bits = 0;
146   int mask = (1<<n) - 1;
147   for (int i = 0; i < (1<<n); i++) {
148     bits <<= 1;
149     bits &= mask;
150     if (!did[bits|1]) {
151       bits |= 1;
152       s.append("1");
153     } else {
154       s.append("0");
155     }
156     CHECK(!did[bits]);
157     did[bits] = true;
158   }
159   return s;
160 }
161 
162 // Test that the DFA gets the right result even if it runs
163 // out of memory during a search.  The regular expression
164 // 0[01]{n}$ matches a binary string of 0s and 1s only if
165 // the (n+1)th-to-last character is a 0.  Matching this in
166 // a single forward pass (as done by the DFA) requires
167 // keeping one bit for each of the last n+1 characters
168 // (whether each was a 0), or 2^(n+1) possible states.
169 // If we run this regexp to search in a string that contains
170 // every possible n-character binary string as a substring,
171 // then it will have to run through at least 2^n states.
172 // States are big data structures -- certainly more than 1 byte --
173 // so if the DFA can search correctly while staying within a
174 // 2^n byte limit, it must be handling out-of-memory conditions
175 // gracefully.
TEST(SingleThreaded,SearchDFA)176 TEST(SingleThreaded, SearchDFA) {
177   // Choice of n is mostly arbitrary, except that:
178   //   * making n too big makes the test run for too long.
179   //   * making n too small makes the DFA refuse to run,
180   //     because it has so little memory compared to the program size.
181   // Empirically, n = 18 is a good compromise between the two.
182   const int n = 18;
183 
184   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
185                              Regexp::LikePerl, NULL);
186   CHECK(re);
187 
188   // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
189   // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
190   string no_match = DeBruijnString(n);
191   string match = no_match + "0";
192 
193   // The De Bruijn string is the worst case input for this regexp.
194   // By default, the DFA will notice that it is flushing its cache
195   // too frequently and will bail out early, so that RE2 can use the
196   // NFA implementation instead.  (The DFA loses its speed advantage
197   // if it can't get a good cache hit rate.)
198   // Tell the DFA to trudge along instead.
199   FLAGS_re2_dfa_bail_when_slow = false;
200 
201   int64 usage;
202   int64 peak_usage;
203   {
204     testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
205     Prog* prog = re->CompileToProg(1<<n);
206     CHECK(prog);
207     for (int i = 0; i < 10; i++) {
208       bool matched, failed = false;
209       matched = prog->SearchDFA(match, NULL,
210                                 Prog::kUnanchored, Prog::kFirstMatch,
211                                 NULL, &failed, NULL);
212       CHECK(!failed);
213       CHECK(matched);
214       matched = prog->SearchDFA(no_match, NULL,
215                                 Prog::kUnanchored, Prog::kFirstMatch,
216                                 NULL, &failed, NULL);
217       CHECK(!failed);
218       CHECK(!matched);
219     }
220     usage = m.HeapGrowth();
221     peak_usage = m.PeakHeapGrowth();
222     delete prog;
223   }
224   re->Decref();
225 
226   if (!UsingMallocCounter)
227     return;
228   //LOG(INFO) << "usage " << usage << " " << peak_usage;
229   CHECK_LT(usage, 1<<n);
230   CHECK_LT(peak_usage, 1<<n);
231 }
232 
233 // Helper thread: searches for match, which should match,
234 // and no_match, which should not.
235 class SearchThread : public Thread {
236  public:
SearchThread(Prog * prog,const StringPiece & match,const StringPiece & no_match)237   SearchThread(Prog* prog, const StringPiece& match,
238                const StringPiece& no_match)
239     : prog_(prog), match_(match), no_match_(no_match) {}
240 
Run()241   virtual void Run() {
242     for (int i = 0; i < 2; i++) {
243       bool matched, failed = false;
244       matched = prog_->SearchDFA(match_, NULL,
245                                  Prog::kUnanchored, Prog::kFirstMatch,
246                                  NULL, &failed, NULL);
247       CHECK(!failed);
248       CHECK(matched);
249       matched = prog_->SearchDFA(no_match_, NULL,
250                                  Prog::kUnanchored, Prog::kFirstMatch,
251                                  NULL, &failed, NULL);
252       CHECK(!failed);
253       CHECK(!matched);
254     }
255   }
256 
257  private:
258   Prog* prog_;
259   StringPiece match_;
260   StringPiece no_match_;
261 };
262 
TEST(Multithreaded,SearchDFA)263 TEST(Multithreaded, SearchDFA) {
264   // Same as single-threaded test above.
265   const int n = 18;
266   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
267                              Regexp::LikePerl, NULL);
268   CHECK(re);
269   string no_match = DeBruijnString(n);
270   string match = no_match + "0";
271   FLAGS_re2_dfa_bail_when_slow = false;
272 
273   // Check that single-threaded code works.
274   {
275     Prog* prog = re->CompileToProg(1<<n);
276     CHECK(prog);
277     SearchThread* t = new SearchThread(prog, match, no_match);
278     t->SetJoinable(true);
279     t->Start();
280     t->Join();
281     delete t;
282     delete prog;
283   }
284 
285   // Run the search simultaneously in a bunch of threads.
286   // Reuse same flags for Multithreaded.BuildDFA above.
287   for (int i = 0; i < FLAGS_repeat; i++) {
288     //LOG(INFO) << "Search " << i;
289     Prog* prog = re->CompileToProg(1<<n);
290     CHECK(prog);
291 
292     vector<SearchThread*> threads;
293     for (int j = 0; j < FLAGS_threads; j++) {
294       SearchThread *t = new SearchThread(prog, match, no_match);
295       t->SetJoinable(true);
296       threads.push_back(t);
297     }
298     for (int j = 0; j < FLAGS_threads; j++)
299       threads[j]->Start();
300     for (int j = 0; j < FLAGS_threads; j++) {
301       threads[j]->Join();
302       delete threads[j];
303     }
304     delete prog;
305   }
306   re->Decref();
307 }
308 
309 struct ReverseTest {
310   const char *regexp;
311   const char *text;
312   bool match;
313 };
314 
315 // Test that reverse DFA handles anchored/unanchored correctly.
316 // It's in the DFA interface but not used by RE2.
317 ReverseTest reverse_tests[] = {
318   { "\\A(a|b)", "abc", true },
319   { "(a|b)\\z", "cba", true },
320   { "\\A(a|b)", "cba", false },
321   { "(a|b)\\z", "abc", false },
322 };
323 
TEST(DFA,ReverseMatch)324 TEST(DFA, ReverseMatch) {
325   int nfail = 0;
326   for (int i = 0; i < arraysize(reverse_tests); i++) {
327     const ReverseTest& t = reverse_tests[i];
328     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
329     CHECK(re);
330     Prog *prog = re->CompileToReverseProg(0);
331     CHECK(prog);
332     bool failed = false;
333     bool matched = prog->SearchDFA(t.text, NULL, Prog::kUnanchored, Prog::kFirstMatch, NULL, &failed, NULL);
334     if (matched != t.match) {
335       LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
336       nfail++;
337     }
338     delete prog;
339     re->Decref();
340   }
341   EXPECT_EQ(nfail, 0);
342 }
343 
344 }  // namespace re2
345