• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2007 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test prog.cc, compile.cc
6 
7 #include <stddef.h>
8 
9 #include <string>
10 
11 #include "absl/base/macros.h"
12 #include "absl/log/absl_log.h"
13 #include "absl/strings/string_view.h"
14 #include "gtest/gtest.h"
15 #include "re2/prog.h"
16 #include "re2/regexp.h"
17 
18 namespace re2 {
19 
20 // Simple input/output tests checking that
21 // the regexp compiles to the expected code.
22 // These are just to sanity check the basic implementation.
23 // The real confidence tests happen by testing the NFA/DFA
24 // that run the compiled code.
25 
26 struct Test {
27   const char* regexp;
28   const char* code;
29 };
30 
31 static Test tests[] = {
32   { "a",
33     "3. byte [61-61] 0 -> 4\n"
34     "4. match! 0\n" },
35   { "ab",
36     "3. byte [61-61] 0 -> 4\n"
37     "4. byte [62-62] 0 -> 5\n"
38     "5. match! 0\n" },
39   { "a|c",
40     "3+ byte [61-61] 0 -> 5\n"
41     "4. byte [63-63] 0 -> 5\n"
42     "5. match! 0\n" },
43   { "a|b",
44     "3. byte [61-62] 0 -> 4\n"
45     "4. match! 0\n" },
46   { "[ab]",
47     "3. byte [61-62] 0 -> 4\n"
48     "4. match! 0\n" },
49   { "a+",
50     "3. byte [61-61] 0 -> 4\n"
51     "4+ nop -> 3\n"
52     "5. match! 0\n" },
53   { "a+?",
54     "3. byte [61-61] 0 -> 4\n"
55     "4+ match! 0\n"
56     "5. nop -> 3\n" },
57   { "a*",
58     "3+ byte [61-61] 1 -> 3\n"
59     "4. match! 0\n" },
60   { "a*?",
61     "3+ match! 0\n"
62     "4. byte [61-61] 0 -> 3\n" },
63   { "a?",
64     "3+ byte [61-61] 1 -> 5\n"
65     "4. nop -> 5\n"
66     "5. match! 0\n" },
67   { "a??",
68     "3+ nop -> 5\n"
69     "4. byte [61-61] 0 -> 5\n"
70     "5. match! 0\n" },
71   { "a{4}",
72     "3. byte [61-61] 0 -> 4\n"
73     "4. byte [61-61] 0 -> 5\n"
74     "5. byte [61-61] 0 -> 6\n"
75     "6. byte [61-61] 0 -> 7\n"
76     "7. match! 0\n" },
77   { "(a)",
78     "3. capture 2 -> 4\n"
79     "4. byte [61-61] 0 -> 5\n"
80     "5. capture 3 -> 6\n"
81     "6. match! 0\n" },
82   { "(?:a)",
83     "3. byte [61-61] 0 -> 4\n"
84     "4. match! 0\n" },
85   { "",
86     "3. match! 0\n" },
87   { ".",
88     "3+ byte [00-09] 0 -> 5\n"
89     "4. byte [0b-ff] 0 -> 5\n"
90     "5. match! 0\n" },
91   { "[^ab]",
92     "3+ byte [00-09] 0 -> 6\n"
93     "4+ byte [0b-60] 0 -> 6\n"
94     "5. byte [63-ff] 0 -> 6\n"
95     "6. match! 0\n" },
96   { "[Aa]",
97     "3. byte/i [61-61] 0 -> 4\n"
98     "4. match! 0\n" },
99   { "\\C+",
100     "3. byte [00-ff] 0 -> 4\n"
101     "4+ altmatch -> 5 | 6\n"
102     "5+ nop -> 3\n"
103     "6. match! 0\n" },
104   { "\\C*",
105     "3+ altmatch -> 4 | 5\n"
106     "4+ byte [00-ff] 1 -> 3\n"
107     "5. match! 0\n" },
108   { "\\C?",
109     "3+ byte [00-ff] 1 -> 5\n"
110     "4. nop -> 5\n"
111     "5. match! 0\n" },
112   // Issue 20992936
113   { "[[-`]",
114     "3. byte [5b-60] 0 -> 4\n"
115     "4. match! 0\n" },
116   // Issue 310
117   { "(?:|a)*",
118     "3+ nop -> 7\n"
119     "4. nop -> 9\n"
120     "5+ nop -> 7\n"
121     "6. nop -> 9\n"
122     "7+ nop -> 5\n"
123     "8. byte [61-61] 0 -> 5\n"
124     "9. match! 0\n" },
125   { "(?:|a)+",
126     "3+ nop -> 5\n"
127     "4. byte [61-61] 0 -> 5\n"
128     "5+ nop -> 3\n"
129     "6. match! 0\n" },
130 };
131 
TEST(TestRegexpCompileToProg,Simple)132 TEST(TestRegexpCompileToProg, Simple) {
133   int failed = 0;
134   for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
135     const re2::Test& t = tests[i];
136     Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
137     if (re == NULL) {
138       ABSL_LOG(ERROR) << "Cannot parse: " << t.regexp;
139       failed++;
140       continue;
141     }
142     Prog* prog = re->CompileToProg(0);
143     if (prog == NULL) {
144       ABSL_LOG(ERROR) << "Cannot compile: " << t.regexp;
145       re->Decref();
146       failed++;
147       continue;
148     }
149     ASSERT_TRUE(re->CompileToProg(1) == NULL);
150     std::string s = prog->Dump();
151     if (s != t.code) {
152       ABSL_LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
153       ABSL_LOG(ERROR) << "Want:\n" << t.code;
154       ABSL_LOG(ERROR) << "Got:\n" << s;
155       failed++;
156     }
157     delete prog;
158     re->Decref();
159   }
160   EXPECT_EQ(failed, 0);
161 }
162 
DumpByteMap(absl::string_view pattern,Regexp::ParseFlags flags,std::string * bytemap)163 static void DumpByteMap(absl::string_view pattern, Regexp::ParseFlags flags,
164                         std::string* bytemap) {
165   Regexp* re = Regexp::Parse(pattern, flags, NULL);
166   EXPECT_TRUE(re != NULL);
167 
168   {
169     Prog* prog = re->CompileToProg(0);
170     EXPECT_TRUE(prog != NULL);
171     *bytemap = prog->DumpByteMap();
172     delete prog;
173   }
174 
175   {
176     Prog* prog = re->CompileToReverseProg(0);
177     EXPECT_TRUE(prog != NULL);
178     EXPECT_EQ(*bytemap, prog->DumpByteMap());
179     delete prog;
180   }
181 
182   re->Decref();
183 }
184 
TEST(TestCompile,Latin1Ranges)185 TEST(TestCompile, Latin1Ranges) {
186   // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
187 
188   std::string bytemap;
189 
190   DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
191   EXPECT_EQ("[00-09] -> 0\n"
192             "[0a-0a] -> 1\n"
193             "[0b-ff] -> 0\n",
194             bytemap);
195 }
196 
TEST(TestCompile,OtherByteMapTests)197 TEST(TestCompile, OtherByteMapTests) {
198   std::string bytemap;
199 
200   // Test that "absent" ranges are mapped to the same byte class.
201   DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
202   EXPECT_EQ("[00-2f] -> 0\n"
203             "[30-39] -> 1\n"
204             "[3a-40] -> 0\n"
205             "[41-46] -> 1\n"
206             "[47-60] -> 0\n"
207             "[61-66] -> 1\n"
208             "[67-ff] -> 0\n",
209             bytemap);
210 
211   // Test the byte classes for \b.
212   DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
213   EXPECT_EQ("[00-2f] -> 0\n"
214             "[30-39] -> 1\n"
215             "[3a-40] -> 0\n"
216             "[41-5a] -> 1\n"
217             "[5b-5e] -> 0\n"
218             "[5f-5f] -> 1\n"
219             "[60-60] -> 0\n"
220             "[61-7a] -> 1\n"
221             "[7b-ff] -> 0\n",
222             bytemap);
223 
224   // Bug in the ASCII case-folding optimization created too many byte classes.
225   DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
226   EXPECT_EQ("[00-5e] -> 0\n"
227             "[5f-5f] -> 1\n"
228             "[60-ff] -> 0\n",
229             bytemap);
230 }
231 
TEST(TestCompile,UTF8Ranges)232 TEST(TestCompile, UTF8Ranges) {
233   // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
234   // Once, erroneously split between 0x3f and 0x40 because it is
235   // a 6-bit boundary.
236 
237   std::string bytemap;
238 
239   DumpByteMap(".", Regexp::PerlX, &bytemap);
240   EXPECT_EQ("[00-09] -> 0\n"
241             "[0a-0a] -> 1\n"
242             "[0b-7f] -> 0\n"
243             "[80-bf] -> 2\n"
244             "[c0-c1] -> 1\n"
245             "[c2-df] -> 3\n"
246             "[e0-ef] -> 4\n"
247             "[f0-f4] -> 5\n"
248             "[f5-ff] -> 1\n",
249             bytemap);
250 }
251 
TEST(TestCompile,InsufficientMemory)252 TEST(TestCompile, InsufficientMemory) {
253   Regexp* re = Regexp::Parse(
254       "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
255       Regexp::LikePerl, NULL);
256   EXPECT_TRUE(re != NULL);
257   Prog* prog = re->CompileToProg(850);
258   // If the memory budget has been exhausted, compilation should fail
259   // and return NULL instead of trying to do anything with NoMatch().
260   EXPECT_TRUE(prog == NULL);
261   re->Decref();
262 }
263 
Dump(absl::string_view pattern,Regexp::ParseFlags flags,std::string * forward,std::string * reverse)264 static void Dump(absl::string_view pattern, Regexp::ParseFlags flags,
265                  std::string* forward, std::string* reverse) {
266   Regexp* re = Regexp::Parse(pattern, flags, NULL);
267   EXPECT_TRUE(re != NULL);
268 
269   if (forward != NULL) {
270     Prog* prog = re->CompileToProg(0);
271     EXPECT_TRUE(prog != NULL);
272     *forward = prog->Dump();
273     delete prog;
274   }
275 
276   if (reverse != NULL) {
277     Prog* prog = re->CompileToReverseProg(0);
278     EXPECT_TRUE(prog != NULL);
279     *reverse = prog->Dump();
280     delete prog;
281   }
282 
283   re->Decref();
284 }
285 
TEST(TestCompile,Bug26705922)286 TEST(TestCompile, Bug26705922) {
287   // Bug in the compiler caused inefficient bytecode to be generated for Unicode
288   // groups: common suffixes were cached, but common prefixes were not factored.
289 
290   std::string forward, reverse;
291 
292   Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
293   EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
294             "4. byte [90-90] 0 -> 5\n"
295             "5. byte [80-80] 0 -> 6\n"
296             "6+ byte [80-80] 0 -> 8\n"
297             "7. byte [90-90] 0 -> 8\n"
298             "8. match! 0\n",
299             forward);
300   EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
301             "4. byte [90-90] 0 -> 5\n"
302             "5. byte [80-80] 0 -> 6\n"
303             "6. byte [90-90] 0 -> 7\n"
304             "7. byte [f0-f0] 0 -> 8\n"
305             "8. match! 0\n",
306             reverse);
307 
308   Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
309   EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
310             "4. byte [f0-f0] 0 -> 8\n"
311             "5. byte [80-bf] 0 -> 6\n"
312             "6. byte [80-bf] 0 -> 7\n"
313             "7. match! 0\n"
314             "8. byte [90-90] 0 -> 5\n",
315             forward);
316   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
317             "4. byte [80-bf] 0 -> 5\n"
318             "5+ byte [e8-ef] 0 -> 7\n"
319             "6. byte [90-90] 0 -> 8\n"
320             "7. match! 0\n"
321             "8. byte [f0-f0] 0 -> 7\n",
322             reverse);
323 
324   Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
325   EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
326             "4+ byte [e0-ef] 0 -> 8\n"
327             "5. byte [f0-f4] 0 -> 9\n"
328             "6. byte [80-bf] 0 -> 7\n"
329             "7. match! 0\n"
330             "8. byte [80-bf] 0 -> 6\n"
331             "9. byte [80-bf] 0 -> 8\n",
332             forward);
333   EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
334             "4+ byte [c2-df] 0 -> 6\n"
335             "5. byte [80-bf] 0 -> 7\n"
336             "6. match! 0\n"
337             "7+ byte [e0-ef] 0 -> 6\n"
338             "8. byte [80-bf] 0 -> 9\n"
339             "9. byte [f0-f4] 0 -> 6\n",
340             reverse);
341 }
342 
TEST(TestCompile,Bug35237384)343 TEST(TestCompile, Bug35237384) {
344   // Bug in the compiler caused inefficient bytecode to be generated for
345   // nested nullable subexpressions.
346 
347   std::string forward;
348 
349   Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
350   EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
351             "4. nop -> 5\n"
352             "5+ byte [61-61] 1 -> 5\n"
353             "6. nop -> 7\n"
354             "7+ byte [61-61] 1 -> 7\n"
355             "8. match! 0\n",
356             forward);
357 
358   Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
359   EXPECT_EQ("3+ nop -> 28\n"
360             "4. nop -> 30\n"
361             "5+ byte [61-61] 1 -> 5\n"
362             "6. nop -> 32\n"
363             "7+ byte [61-61] 1 -> 7\n"
364             "8. nop -> 26\n"
365             "9+ byte [61-61] 1 -> 9\n"
366             "10. nop -> 20\n"
367             "11+ byte [62-62] 1 -> 11\n"
368             "12. nop -> 20\n"
369             "13+ byte [62-62] 1 -> 13\n"
370             "14. nop -> 26\n"
371             "15+ byte [62-62] 1 -> 15\n"
372             "16. nop -> 32\n"
373             "17+ nop -> 9\n"
374             "18. nop -> 11\n"
375             "19. match! 0\n"
376             "20+ nop -> 17\n"
377             "21. nop -> 19\n"
378             "22+ nop -> 7\n"
379             "23. nop -> 13\n"
380             "24+ nop -> 17\n"
381             "25. nop -> 19\n"
382             "26+ nop -> 22\n"
383             "27. nop -> 24\n"
384             "28+ nop -> 5\n"
385             "29. nop -> 15\n"
386             "30+ nop -> 22\n"
387             "31. nop -> 24\n"
388             "32+ nop -> 28\n"
389             "33. nop -> 30\n",
390       forward);
391 
392   Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
393   EXPECT_EQ("3+ nop -> 36\n"
394             "4+ nop -> 31\n"
395             "5. nop -> 33\n"
396             "6+ byte [00-09] 0 -> 8\n"
397             "7. byte [0b-ff] 0 -> 8\n"
398             "8+ nop -> 6\n"
399             "9+ nop -> 29\n"
400             "10. nop -> 28\n"
401             "11+ byte [00-09] 0 -> 13\n"
402             "12. byte [0b-ff] 0 -> 13\n"
403             "13+ nop -> 11\n"
404             "14+ nop -> 26\n"
405             "15. nop -> 28\n"
406             "16+ byte [00-09] 0 -> 18\n"
407             "17. byte [0b-ff] 0 -> 18\n"
408             "18+ nop -> 16\n"
409             "19+ nop -> 36\n"
410             "20. nop -> 33\n"
411             "21+ byte [00-09] 0 -> 23\n"
412             "22. byte [0b-ff] 0 -> 23\n"
413             "23+ nop -> 21\n"
414             "24+ nop -> 31\n"
415             "25. nop -> 33\n"
416             "26+ nop -> 28\n"
417             "27. byte [53-53] 0 -> 11\n"
418             "28. match! 0\n"
419             "29+ nop -> 28\n"
420             "30. byte [53-53] 0 -> 6\n"
421             "31+ nop -> 33\n"
422             "32. byte [53-53] 0 -> 21\n"
423             "33+ nop -> 29\n"
424             "34+ nop -> 26\n"
425             "35. nop -> 28\n"
426             "36+ nop -> 33\n"
427             "37. byte [53-53] 0 -> 16\n",
428       forward);
429 }
430 
431 }  // namespace re2
432