• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006-2008 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stdint.h>
6 #include <string>
7 #include <thread>
8 #include <vector>
9 
10 #include "util/test.h"
11 #include "util/flags.h"
12 #include "util/logging.h"
13 #include "util/malloc_counter.h"
14 #include "util/strutil.h"
15 #include "re2/prog.h"
16 #include "re2/re2.h"
17 #include "re2/regexp.h"
18 #include "re2/testing/regexp_generator.h"
19 #include "re2/testing/string_generator.h"
20 
21 static const bool UsingMallocCounter = false;
22 
23 DEFINE_FLAG(int, size, 8, "log2(number of DFA nodes)");
24 DEFINE_FLAG(int, repeat, 2, "Repetition count.");
25 DEFINE_FLAG(int, threads, 4, "number of threads");
26 
27 namespace re2 {
28 
29 // Check that multithreaded access to DFA class works.
30 
31 // Helper function: builds entire DFA for prog.
DoBuild(Prog * prog)32 static void DoBuild(Prog* prog) {
33   ASSERT_TRUE(prog->BuildEntireDFA(Prog::kFirstMatch, nullptr));
34 }
35 
TEST(Multithreaded,BuildEntireDFA)36 TEST(Multithreaded, BuildEntireDFA) {
37   // Create regexp with 2^FLAGS_size states in DFA.
38   std::string s = "a";
39   for (int i = 0; i < GetFlag(FLAGS_size); i++)
40     s += "[ab]";
41   s += "b";
42   Regexp* re = Regexp::Parse(s, Regexp::LikePerl, NULL);
43   ASSERT_TRUE(re != NULL);
44 
45   // Check that single-threaded code works.
46   {
47     Prog* prog = re->CompileToProg(0);
48     ASSERT_TRUE(prog != NULL);
49 
50     std::thread t(DoBuild, prog);
51     t.join();
52 
53     delete prog;
54   }
55 
56   // Build the DFA simultaneously in a bunch of threads.
57   for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
58     Prog* prog = re->CompileToProg(0);
59     ASSERT_TRUE(prog != NULL);
60 
61     std::vector<std::thread> threads;
62     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
63       threads.emplace_back(DoBuild, prog);
64     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
65       threads[j].join();
66 
67     // One more compile, to make sure everything is okay.
68     prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
69     delete prog;
70   }
71 
72   re->Decref();
73 }
74 
75 // Check that DFA size requirements are followed.
76 // BuildEntireDFA will, like SearchDFA, stop building out
77 // the DFA once the memory limits are reached.
TEST(SingleThreaded,BuildEntireDFA)78 TEST(SingleThreaded, BuildEntireDFA) {
79   // Create regexp with 2^30 states in DFA.
80   Regexp* re = Regexp::Parse("a[ab]{30}b", Regexp::LikePerl, NULL);
81   ASSERT_TRUE(re != NULL);
82 
83   for (int i = 17; i < 24; i++) {
84     int64_t limit = int64_t{1}<<i;
85     int64_t usage;
86     //int64_t progusage, dfamem;
87     {
88       testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
89       Prog* prog = re->CompileToProg(limit);
90       ASSERT_TRUE(prog != NULL);
91       //progusage = m.HeapGrowth();
92       //dfamem = prog->dfa_mem();
93       prog->BuildEntireDFA(Prog::kFirstMatch, nullptr);
94       prog->BuildEntireDFA(Prog::kLongestMatch, nullptr);
95       usage = m.HeapGrowth();
96       delete prog;
97     }
98     if (UsingMallocCounter) {
99       //LOG(INFO) << "limit " << limit << ", "
100       //          << "prog usage " << progusage << ", "
101       //          << "DFA budget " << dfamem << ", "
102       //          << "total " << usage;
103       // Tolerate +/- 10%.
104       ASSERT_GT(usage, limit*9/10);
105       ASSERT_LT(usage, limit*11/10);
106     }
107   }
108   re->Decref();
109 }
110 
111 // Generates and returns a string over binary alphabet {0,1} that contains
112 // all possible binary sequences of length n as subsequences.  The obvious
113 // brute force method would generate a string of length n * 2^n, but this
114 // generates a string of length n + 2^n - 1 called a De Bruijn cycle.
115 // See Knuth, The Art of Computer Programming, Vol 2, Exercise 3.2.2 #17.
116 // Such a string is useful for testing a DFA.  If you have a DFA
117 // where distinct last n bytes implies distinct states, then running on a
118 // DeBruijn string causes the DFA to need to create a new state at every
119 // position in the input, never reusing any states until it gets to the
120 // end of the string.  This is the worst possible case for DFA execution.
DeBruijnString(int n)121 static std::string DeBruijnString(int n) {
122   CHECK_LT(n, static_cast<int>(8*sizeof(int)));
123   CHECK_GT(n, 0);
124 
125   std::vector<bool> did(size_t{1}<<n);
126   for (int i = 0; i < 1<<n; i++)
127     did[i] = false;
128 
129   std::string s;
130   for (int i = 0; i < n-1; i++)
131     s.append("0");
132   int bits = 0;
133   int mask = (1<<n) - 1;
134   for (int i = 0; i < (1<<n); i++) {
135     bits <<= 1;
136     bits &= mask;
137     if (!did[bits|1]) {
138       bits |= 1;
139       s.append("1");
140     } else {
141       s.append("0");
142     }
143     CHECK(!did[bits]);
144     did[bits] = true;
145   }
146   return s;
147 }
148 
149 // Test that the DFA gets the right result even if it runs
150 // out of memory during a search.  The regular expression
151 // 0[01]{n}$ matches a binary string of 0s and 1s only if
152 // the (n+1)th-to-last character is a 0.  Matching this in
153 // a single forward pass (as done by the DFA) requires
154 // keeping one bit for each of the last n+1 characters
155 // (whether each was a 0), or 2^(n+1) possible states.
156 // If we run this regexp to search in a string that contains
157 // every possible n-character binary string as a substring,
158 // then it will have to run through at least 2^n states.
159 // States are big data structures -- certainly more than 1 byte --
160 // so if the DFA can search correctly while staying within a
161 // 2^n byte limit, it must be handling out-of-memory conditions
162 // gracefully.
TEST(SingleThreaded,SearchDFA)163 TEST(SingleThreaded, SearchDFA) {
164   // The De Bruijn string is the worst case input for this regexp.
165   // By default, the DFA will notice that it is flushing its cache
166   // too frequently and will bail out early, so that RE2 can use the
167   // NFA implementation instead.  (The DFA loses its speed advantage
168   // if it can't get a good cache hit rate.)
169   // Tell the DFA to trudge along instead.
170   Prog::TEST_dfa_should_bail_when_slow(false);
171 
172   // Choice of n is mostly arbitrary, except that:
173   //   * making n too big makes the test run for too long.
174   //   * making n too small makes the DFA refuse to run,
175   //     because it has so little memory compared to the program size.
176   // Empirically, n = 18 is a good compromise between the two.
177   const int n = 18;
178 
179   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
180                              Regexp::LikePerl, NULL);
181   ASSERT_TRUE(re != NULL);
182 
183   // The De Bruijn string for n ends with a 1 followed by n 0s in a row,
184   // which is not a match for 0[01]{n}$.  Adding one more 0 is a match.
185   std::string no_match = DeBruijnString(n);
186   std::string match = no_match + "0";
187 
188   int64_t usage;
189   int64_t peak_usage;
190   {
191     testing::MallocCounter m(testing::MallocCounter::THIS_THREAD_ONLY);
192     Prog* prog = re->CompileToProg(1<<n);
193     ASSERT_TRUE(prog != NULL);
194     for (int i = 0; i < 10; i++) {
195       bool matched = false;
196       bool failed = false;
197       matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
198                                 Prog::kFirstMatch, NULL, &failed, NULL);
199       ASSERT_FALSE(failed);
200       ASSERT_TRUE(matched);
201       matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
202                                 Prog::kFirstMatch, NULL, &failed, NULL);
203       ASSERT_FALSE(failed);
204       ASSERT_FALSE(matched);
205     }
206     usage = m.HeapGrowth();
207     peak_usage = m.PeakHeapGrowth();
208     delete prog;
209   }
210   if (UsingMallocCounter) {
211     //LOG(INFO) << "usage " << usage << ", "
212     //          << "peak usage " << peak_usage;
213     ASSERT_LT(usage, 1<<n);
214     ASSERT_LT(peak_usage, 1<<n);
215   }
216   re->Decref();
217 
218   // Reset to original behaviour.
219   Prog::TEST_dfa_should_bail_when_slow(true);
220 }
221 
222 // Helper function: searches for match, which should match,
223 // and no_match, which should not.
DoSearch(Prog * prog,const StringPiece & match,const StringPiece & no_match)224 static void DoSearch(Prog* prog, const StringPiece& match,
225                      const StringPiece& no_match) {
226   for (int i = 0; i < 2; i++) {
227     bool matched = false;
228     bool failed = false;
229     matched = prog->SearchDFA(match, StringPiece(), Prog::kUnanchored,
230                               Prog::kFirstMatch, NULL, &failed, NULL);
231     ASSERT_FALSE(failed);
232     ASSERT_TRUE(matched);
233     matched = prog->SearchDFA(no_match, StringPiece(), Prog::kUnanchored,
234                               Prog::kFirstMatch, NULL, &failed, NULL);
235     ASSERT_FALSE(failed);
236     ASSERT_FALSE(matched);
237   }
238 }
239 
TEST(Multithreaded,SearchDFA)240 TEST(Multithreaded, SearchDFA) {
241   Prog::TEST_dfa_should_bail_when_slow(false);
242 
243   // Same as single-threaded test above.
244   const int n = 18;
245   Regexp* re = Regexp::Parse(StringPrintf("0[01]{%d}$", n),
246                              Regexp::LikePerl, NULL);
247   ASSERT_TRUE(re != NULL);
248   std::string no_match = DeBruijnString(n);
249   std::string match = no_match + "0";
250 
251   // Check that single-threaded code works.
252   {
253     Prog* prog = re->CompileToProg(1<<n);
254     ASSERT_TRUE(prog != NULL);
255 
256     std::thread t(DoSearch, prog, match, no_match);
257     t.join();
258 
259     delete prog;
260   }
261 
262   // Run the search simultaneously in a bunch of threads.
263   // Reuse same flags for Multithreaded.BuildDFA above.
264   for (int i = 0; i < GetFlag(FLAGS_repeat); i++) {
265     Prog* prog = re->CompileToProg(1<<n);
266     ASSERT_TRUE(prog != NULL);
267 
268     std::vector<std::thread> threads;
269     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
270       threads.emplace_back(DoSearch, prog, match, no_match);
271     for (int j = 0; j < GetFlag(FLAGS_threads); j++)
272       threads[j].join();
273 
274     delete prog;
275   }
276 
277   re->Decref();
278 
279   // Reset to original behaviour.
280   Prog::TEST_dfa_should_bail_when_slow(true);
281 }
282 
283 struct ReverseTest {
284   const char* regexp;
285   const char* text;
286   bool match;
287 };
288 
289 // Test that reverse DFA handles anchored/unanchored correctly.
290 // It's in the DFA interface but not used by RE2.
291 ReverseTest reverse_tests[] = {
292   { "\\A(a|b)", "abc", true },
293   { "(a|b)\\z", "cba", true },
294   { "\\A(a|b)", "cba", false },
295   { "(a|b)\\z", "abc", false },
296 };
297 
TEST(DFA,ReverseMatch)298 TEST(DFA, ReverseMatch) {
299   int nfail = 0;
300   for (size_t i = 0; i < arraysize(reverse_tests); i++) {
301     const ReverseTest& t = reverse_tests[i];
302     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
303     ASSERT_TRUE(re != NULL);
304     Prog* prog = re->CompileToReverseProg(0);
305     ASSERT_TRUE(prog != NULL);
306     bool failed = false;
307     bool matched = prog->SearchDFA(t.text, StringPiece(), Prog::kUnanchored,
308                                    Prog::kFirstMatch, NULL, &failed, NULL);
309     if (matched != t.match) {
310       LOG(ERROR) << t.regexp << " on " << t.text << ": want " << t.match;
311       nfail++;
312     }
313     delete prog;
314     re->Decref();
315   }
316   EXPECT_EQ(nfail, 0);
317 }
318 
319 struct CallbackTest {
320   const char* regexp;
321   const char* dump;
322 };
323 
324 // Test that DFA::BuildAllStates() builds the expected DFA states
325 // and issues the expected callbacks. These test cases reflect the
326 // very compact encoding of the callbacks, but that also makes them
327 // very difficult to understand, so let's work through "\\Aa\\z".
328 // There are three slots per DFA state because the bytemap has two
329 // equivalence classes and there is a third slot for kByteEndText:
330 //   0: all bytes that are not 'a'
331 //   1: the byte 'a'
332 //   2: kByteEndText
333 // -1 means that there is no transition from that DFA state to any
334 // other DFA state for that slot. The valid transitions are thus:
335 //   state 0 --slot 1--> state 1
336 //   state 1 --slot 2--> state 2
337 // The double brackets indicate that state 2 is a matching state.
338 // Putting it together, this means that the DFA must consume the
339 // byte 'a' and then hit end of text. Q.E.D.
340 CallbackTest callback_tests[] = {
341   { "\\Aa\\z", "[-1,1,-1] [-1,-1,2] [[-1,-1,-1]]" },
342   { "\\Aab\\z", "[-1,1,-1,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
343   { "\\Aa*b\\z", "[-1,0,1,-1] [-1,-1,-1,2] [[-1,-1,-1,-1]]" },
344   { "\\Aa+b\\z", "[-1,1,-1,-1] [-1,1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
345   { "\\Aa?b\\z", "[-1,1,2,-1] [-1,-1,2,-1] [-1,-1,-1,3] [[-1,-1,-1,-1]]" },
346   { "\\Aa\\C*\\z", "[-1,1,-1] [1,1,2] [[-1,-1,-1]]" },
347   { "\\Aa\\C*", "[-1,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
348   { "a\\C*", "[0,1,-1] [2,2,3] [[2,2,2]] [[-1,-1,-1]]" },
349   { "\\C*", "[1,2] [[1,1]] [[-1,-1]]" },
350   { "a", "[0,1,-1] [2,2,2] [[-1,-1,-1]]"} ,
351 };
352 
TEST(DFA,Callback)353 TEST(DFA, Callback) {
354   int nfail = 0;
355   for (size_t i = 0; i < arraysize(callback_tests); i++) {
356     const CallbackTest& t = callback_tests[i];
357     Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
358     ASSERT_TRUE(re != NULL);
359     Prog* prog = re->CompileToProg(0);
360     ASSERT_TRUE(prog != NULL);
361     std::string dump;
362     prog->BuildEntireDFA(Prog::kLongestMatch, [&](const int* next, bool match) {
363       ASSERT_TRUE(next != NULL);
364       if (!dump.empty())
365         dump += " ";
366       dump += match ? "[[" : "[";
367       for (int b = 0; b < prog->bytemap_range() + 1; b++)
368         dump += StringPrintf("%d,", next[b]);
369       dump.pop_back();
370       dump += match ? "]]" : "]";
371     });
372     if (dump != t.dump) {
373       LOG(ERROR) << t.regexp << " bytemap:\n" << prog->DumpByteMap();
374       LOG(ERROR) << t.regexp << " dump:\ngot " << dump << "\nwant " << t.dump;
375       nfail++;
376     }
377     delete prog;
378     re->Decref();
379   }
380   EXPECT_EQ(nfail, 0);
381 }
382 
383 }  // namespace re2
384