• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <string>
8 
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "re2/regexp.h"
12 #include "re2/prog.h"
13 
14 namespace re2 {
15 
16 // Simple input/output tests checking that
17 // the regexp compiles to the expected code.
18 // These are just to sanity check the basic implementation.
19 // The real confidence tests happen by testing the NFA/DFA
20 // that run the compiled code.
21 
22 struct Test {
23   const char* regexp;
24   const char* code;
25 };
26 
27 static Test tests[] = {
28   { "a",
29     "3. byte [61-61] 0 -> 4\n"
30     "4. match! 0\n" },
31   { "ab",
32     "3. byte [61-61] 0 -> 4\n"
33     "4. byte [62-62] 0 -> 5\n"
34     "5. match! 0\n" },
35   { "a|c",
36     "3+ byte [61-61] 0 -> 5\n"
37     "4. byte [63-63] 0 -> 5\n"
38     "5. match! 0\n" },
39   { "a|b",
40     "3. byte [61-62] 0 -> 4\n"
41     "4. match! 0\n" },
42   { "[ab]",
43     "3. byte [61-62] 0 -> 4\n"
44     "4. match! 0\n" },
45   { "a+",
46     "3. byte [61-61] 0 -> 4\n"
47     "4+ nop -> 3\n"
48     "5. match! 0\n" },
49   { "a+?",
50     "3. byte [61-61] 0 -> 4\n"
51     "4+ match! 0\n"
52     "5. nop -> 3\n" },
53   { "a*",
54     "3+ byte [61-61] 1 -> 3\n"
55     "4. match! 0\n" },
56   { "a*?",
57     "3+ match! 0\n"
58     "4. byte [61-61] 0 -> 3\n" },
59   { "a?",
60     "3+ byte [61-61] 1 -> 5\n"
61     "4. nop -> 5\n"
62     "5. match! 0\n" },
63   { "a??",
64     "3+ nop -> 5\n"
65     "4. byte [61-61] 0 -> 5\n"
66     "5. match! 0\n" },
67   { "a{4}",
68     "3. byte [61-61] 0 -> 4\n"
69     "4. byte [61-61] 0 -> 5\n"
70     "5. byte [61-61] 0 -> 6\n"
71     "6. byte [61-61] 0 -> 7\n"
72     "7. match! 0\n" },
73   { "(a)",
74     "3. capture 2 -> 4\n"
75     "4. byte [61-61] 0 -> 5\n"
76     "5. capture 3 -> 6\n"
77     "6. match! 0\n" },
78   { "(?:a)",
79     "3. byte [61-61] 0 -> 4\n"
80     "4. match! 0\n" },
81   { "",
82     "3. match! 0\n" },
83   { ".",
84     "3+ byte [00-09] 0 -> 5\n"
85     "4. byte [0b-ff] 0 -> 5\n"
86     "5. match! 0\n" },
87   { "[^ab]",
88     "3+ byte [00-09] 0 -> 6\n"
89     "4+ byte [0b-60] 0 -> 6\n"
90     "5. byte [63-ff] 0 -> 6\n"
91     "6. match! 0\n" },
92   { "[Aa]",
93     "3. byte/i [61-61] 0 -> 4\n"
94     "4. match! 0\n" },
95   { "\\C+",
96     "3. byte [00-ff] 0 -> 4\n"
97     "4+ altmatch -> 5 | 6\n"
98     "5+ nop -> 3\n"
99     "6. match! 0\n" },
100   { "\\C*",
101     "3+ altmatch -> 4 | 5\n"
102     "4+ byte [00-ff] 1 -> 3\n"
103     "5. match! 0\n" },
104   { "\\C?",
105     "3+ byte [00-ff] 1 -> 5\n"
106     "4. nop -> 5\n"
107     "5. match! 0\n" },
108   // Issue 20992936
109   { "[[-`]",
110     "3. byte [5b-60] 0 -> 4\n"
111     "4. match! 0\n" },
112 };
113 
TEST(TestRegexpCompileToProg,Simple)114 TEST(TestRegexpCompileToProg, Simple) {
115   int failed = 0;
116   for (size_t i = 0; i < arraysize(tests); i++) {
117     const re2::Test& t = tests[i];
118     Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
119     if (re == NULL) {
120       LOG(ERROR) << "Cannot parse: " << t.regexp;
121       failed++;
122       continue;
123     }
124     Prog* prog = re->CompileToProg(0);
125     if (prog == NULL) {
126       LOG(ERROR) << "Cannot compile: " << t.regexp;
127       re->Decref();
128       failed++;
129       continue;
130     }
131     ASSERT_TRUE(re->CompileToProg(1) == NULL);
132     std::string s = prog->Dump();
133     if (s != t.code) {
134       LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
135       LOG(ERROR) << "Want:\n" << t.code;
136       LOG(ERROR) << "Got:\n" << s;
137       failed++;
138     }
139     delete prog;
140     re->Decref();
141   }
142   EXPECT_EQ(failed, 0);
143 }
144 
DumpByteMap(StringPiece pattern,Regexp::ParseFlags flags,std::string * bytemap)145 static void DumpByteMap(StringPiece pattern, Regexp::ParseFlags flags,
146                         std::string* bytemap) {
147   Regexp* re = Regexp::Parse(pattern, flags, NULL);
148   EXPECT_TRUE(re != NULL);
149 
150   Prog* prog = re->CompileToProg(0);
151   EXPECT_TRUE(prog != NULL);
152   *bytemap = prog->DumpByteMap();
153   delete prog;
154 
155   re->Decref();
156 }
157 
TEST(TestCompile,Latin1Ranges)158 TEST(TestCompile, Latin1Ranges) {
159   // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
160 
161   std::string bytemap;
162 
163   DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
164   EXPECT_EQ("[00-09] -> 0\n"
165             "[0a-0a] -> 1\n"
166             "[0b-ff] -> 0\n",
167             bytemap);
168 }
169 
TEST(TestCompile,OtherByteMapTests)170 TEST(TestCompile, OtherByteMapTests) {
171   std::string bytemap;
172 
173   // Test that "absent" ranges are mapped to the same byte class.
174   DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
175   EXPECT_EQ("[00-2f] -> 0\n"
176             "[30-39] -> 1\n"
177             "[3a-40] -> 0\n"
178             "[41-46] -> 1\n"
179             "[47-60] -> 0\n"
180             "[61-66] -> 1\n"
181             "[67-ff] -> 0\n",
182             bytemap);
183 
184   // Test the byte classes for \b.
185   DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
186   EXPECT_EQ("[00-2f] -> 0\n"
187             "[30-39] -> 1\n"
188             "[3a-40] -> 0\n"
189             "[41-5a] -> 1\n"
190             "[5b-5e] -> 0\n"
191             "[5f-5f] -> 1\n"
192             "[60-60] -> 0\n"
193             "[61-7a] -> 1\n"
194             "[7b-ff] -> 0\n",
195             bytemap);
196 
197   // Bug in the ASCII case-folding optimization created too many byte classes.
198   DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
199   EXPECT_EQ("[00-5e] -> 0\n"
200             "[5f-5f] -> 1\n"
201             "[60-ff] -> 0\n",
202             bytemap);
203 }
204 
TEST(TestCompile,UTF8Ranges)205 TEST(TestCompile, UTF8Ranges) {
206   // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
207   // Once, erroneously split between 0x3f and 0x40 because it is
208   // a 6-bit boundary.
209 
210   std::string bytemap;
211 
212   DumpByteMap(".", Regexp::PerlX, &bytemap);
213   EXPECT_EQ("[00-09] -> 0\n"
214             "[0a-0a] -> 1\n"
215             "[0b-7f] -> 0\n"
216             "[80-8f] -> 2\n"
217             "[90-9f] -> 3\n"
218             "[a0-bf] -> 4\n"
219             "[c0-c1] -> 1\n"
220             "[c2-df] -> 5\n"
221             "[e0-e0] -> 6\n"
222             "[e1-ef] -> 7\n"
223             "[f0-f0] -> 8\n"
224             "[f1-f3] -> 9\n"
225             "[f4-f4] -> 10\n"
226             "[f5-ff] -> 1\n",
227             bytemap);
228 }
229 
TEST(TestCompile,InsufficientMemory)230 TEST(TestCompile, InsufficientMemory) {
231   Regexp* re = Regexp::Parse(
232       "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
233       Regexp::LikePerl, NULL);
234   EXPECT_TRUE(re != NULL);
235   Prog* prog = re->CompileToProg(920);
236   // If the memory budget has been exhausted, compilation should fail
237   // and return NULL instead of trying to do anything with NoMatch().
238   EXPECT_TRUE(prog == NULL);
239   re->Decref();
240 }
241 
Dump(StringPiece pattern,Regexp::ParseFlags flags,std::string * forward,std::string * reverse)242 static void Dump(StringPiece pattern, Regexp::ParseFlags flags,
243                  std::string* forward, std::string* reverse) {
244   Regexp* re = Regexp::Parse(pattern, flags, NULL);
245   EXPECT_TRUE(re != NULL);
246 
247   if (forward != NULL) {
248     Prog* prog = re->CompileToProg(0);
249     EXPECT_TRUE(prog != NULL);
250     *forward = prog->Dump();
251     delete prog;
252   }
253 
254   if (reverse != NULL) {
255     Prog* prog = re->CompileToReverseProg(0);
256     EXPECT_TRUE(prog != NULL);
257     *reverse = prog->Dump();
258     delete prog;
259   }
260 
261   re->Decref();
262 }
263 
TEST(TestCompile,Bug26705922)264 TEST(TestCompile, Bug26705922) {
265   // Bug in the compiler caused inefficient bytecode to be generated for Unicode
266   // groups: common suffixes were cached, but common prefixes were not factored.
267 
268   std::string forward, reverse;
269 
270   Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
271   EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
272             "4. byte [90-90] 0 -> 5\n"
273             "5. byte [80-80] 0 -> 6\n"
274             "6+ byte [80-80] 0 -> 8\n"
275             "7. byte [90-90] 0 -> 8\n"
276             "8. match! 0\n",
277             forward);
278   EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
279             "4. byte [90-90] 0 -> 5\n"
280             "5. byte [80-80] 0 -> 6\n"
281             "6. byte [90-90] 0 -> 7\n"
282             "7. byte [f0-f0] 0 -> 8\n"
283             "8. match! 0\n",
284             reverse);
285 
286   Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
287   EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
288             "4. byte [f0-f0] 0 -> 8\n"
289             "5. byte [80-bf] 0 -> 6\n"
290             "6. byte [80-bf] 0 -> 7\n"
291             "7. match! 0\n"
292             "8. byte [90-90] 0 -> 5\n",
293             forward);
294   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
295             "4. byte [80-bf] 0 -> 5\n"
296             "5+ byte [e8-ef] 0 -> 7\n"
297             "6. byte [90-90] 0 -> 8\n"
298             "7. match! 0\n"
299             "8. byte [f0-f0] 0 -> 7\n",
300             reverse);
301 
302   Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, NULL, &reverse);
303   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
304             "4+ byte [c2-df] 0 -> 7\n"
305             "5+ byte [a0-bf] 1 -> 8\n"
306             "6. byte [80-bf] 0 -> 9\n"
307             "7. match! 0\n"
308             "8. byte [e0-e0] 0 -> 7\n"
309             "9+ byte [e1-ef] 0 -> 7\n"
310             "10+ byte [90-bf] 1 -> 13\n"
311             "11+ byte [80-bf] 1 -> 14\n"
312             "12. byte [80-8f] 0 -> 15\n"
313             "13. byte [f0-f0] 0 -> 7\n"
314             "14. byte [f1-f3] 0 -> 7\n"
315             "15. byte [f4-f4] 0 -> 7\n",
316             reverse);
317 }
318 
TEST(TestCompile,Bug35237384)319 TEST(TestCompile, Bug35237384) {
320   // Bug in the compiler caused inefficient bytecode to be generated for
321   // nested nullable subexpressions.
322 
323   std::string forward;
324 
325   Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
326   EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
327             "4. nop -> 5\n"
328             "5+ byte [61-61] 1 -> 5\n"
329             "6. nop -> 7\n"
330             "7+ byte [61-61] 1 -> 7\n"
331             "8. match! 0\n",
332             forward);
333 
334   Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
335   EXPECT_EQ("3+ nop -> 6\n"
336             "4+ nop -> 8\n"
337             "5. nop -> 21\n"
338             "6+ byte [61-61] 1 -> 6\n"
339             "7. nop -> 3\n"
340             "8+ byte [62-62] 1 -> 8\n"
341             "9. nop -> 3\n"
342             "10+ byte [61-61] 1 -> 10\n"
343             "11. nop -> 21\n"
344             "12+ byte [62-62] 1 -> 12\n"
345             "13. nop -> 21\n"
346             "14+ byte [61-61] 1 -> 14\n"
347             "15. nop -> 18\n"
348             "16+ byte [62-62] 1 -> 16\n"
349             "17. nop -> 18\n"
350             "18+ nop -> 14\n"
351             "19+ nop -> 16\n"
352             "20. match! 0\n"
353             "21+ nop -> 10\n"
354             "22+ nop -> 12\n"
355             "23. nop -> 18\n",
356       forward);
357 
358   Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
359   EXPECT_EQ("3+ nop -> 36\n"
360             "4+ nop -> 31\n"
361             "5. nop -> 33\n"
362             "6+ byte [00-09] 0 -> 8\n"
363             "7. byte [0b-ff] 0 -> 8\n"
364             "8+ nop -> 6\n"
365             "9+ nop -> 29\n"
366             "10. nop -> 28\n"
367             "11+ byte [00-09] 0 -> 13\n"
368             "12. byte [0b-ff] 0 -> 13\n"
369             "13+ nop -> 11\n"
370             "14+ nop -> 26\n"
371             "15. nop -> 28\n"
372             "16+ byte [00-09] 0 -> 18\n"
373             "17. byte [0b-ff] 0 -> 18\n"
374             "18+ nop -> 16\n"
375             "19+ nop -> 36\n"
376             "20. nop -> 33\n"
377             "21+ byte [00-09] 0 -> 23\n"
378             "22. byte [0b-ff] 0 -> 23\n"
379             "23+ nop -> 21\n"
380             "24+ nop -> 31\n"
381             "25. nop -> 33\n"
382             "26+ nop -> 28\n"
383             "27. byte [53-53] 0 -> 11\n"
384             "28. match! 0\n"
385             "29+ nop -> 28\n"
386             "30. byte [53-53] 0 -> 6\n"
387             "31+ nop -> 33\n"
388             "32. byte [53-53] 0 -> 21\n"
389             "33+ nop -> 29\n"
390             "34+ nop -> 26\n"
391             "35. nop -> 28\n"
392             "36+ nop -> 33\n"
393             "37. byte [53-53] 0 -> 16\n",
394       forward);
395 }
396 
397 }  // namespace re2
398