1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Test prog.cc, compile.cc
6
7 #include <stddef.h>
8
9 #include <string>
10
11 #include "absl/base/macros.h"
12 #include "absl/log/absl_log.h"
13 #include "absl/strings/string_view.h"
14 #include "gtest/gtest.h"
15 #include "re2/prog.h"
16 #include "re2/regexp.h"
17
18 namespace re2 {
19
20 // Simple input/output tests checking that
21 // the regexp compiles to the expected code.
22 // These are just to sanity check the basic implementation.
23 // The real confidence tests happen by testing the NFA/DFA
24 // that run the compiled code.
25
26 struct Test {
27 const char* regexp;
28 const char* code;
29 };
30
31 static Test tests[] = {
32 { "a",
33 "3. byte [61-61] 0 -> 4\n"
34 "4. match! 0\n" },
35 { "ab",
36 "3. byte [61-61] 0 -> 4\n"
37 "4. byte [62-62] 0 -> 5\n"
38 "5. match! 0\n" },
39 { "a|c",
40 "3+ byte [61-61] 0 -> 5\n"
41 "4. byte [63-63] 0 -> 5\n"
42 "5. match! 0\n" },
43 { "a|b",
44 "3. byte [61-62] 0 -> 4\n"
45 "4. match! 0\n" },
46 { "[ab]",
47 "3. byte [61-62] 0 -> 4\n"
48 "4. match! 0\n" },
49 { "a+",
50 "3. byte [61-61] 0 -> 4\n"
51 "4+ nop -> 3\n"
52 "5. match! 0\n" },
53 { "a+?",
54 "3. byte [61-61] 0 -> 4\n"
55 "4+ match! 0\n"
56 "5. nop -> 3\n" },
57 { "a*",
58 "3+ byte [61-61] 1 -> 3\n"
59 "4. match! 0\n" },
60 { "a*?",
61 "3+ match! 0\n"
62 "4. byte [61-61] 0 -> 3\n" },
63 { "a?",
64 "3+ byte [61-61] 1 -> 5\n"
65 "4. nop -> 5\n"
66 "5. match! 0\n" },
67 { "a??",
68 "3+ nop -> 5\n"
69 "4. byte [61-61] 0 -> 5\n"
70 "5. match! 0\n" },
71 { "a{4}",
72 "3. byte [61-61] 0 -> 4\n"
73 "4. byte [61-61] 0 -> 5\n"
74 "5. byte [61-61] 0 -> 6\n"
75 "6. byte [61-61] 0 -> 7\n"
76 "7. match! 0\n" },
77 { "(a)",
78 "3. capture 2 -> 4\n"
79 "4. byte [61-61] 0 -> 5\n"
80 "5. capture 3 -> 6\n"
81 "6. match! 0\n" },
82 { "(?:a)",
83 "3. byte [61-61] 0 -> 4\n"
84 "4. match! 0\n" },
85 { "",
86 "3. match! 0\n" },
87 { ".",
88 "3+ byte [00-09] 0 -> 5\n"
89 "4. byte [0b-ff] 0 -> 5\n"
90 "5. match! 0\n" },
91 { "[^ab]",
92 "3+ byte [00-09] 0 -> 6\n"
93 "4+ byte [0b-60] 0 -> 6\n"
94 "5. byte [63-ff] 0 -> 6\n"
95 "6. match! 0\n" },
96 { "[Aa]",
97 "3. byte/i [61-61] 0 -> 4\n"
98 "4. match! 0\n" },
99 { "\\C+",
100 "3. byte [00-ff] 0 -> 4\n"
101 "4+ altmatch -> 5 | 6\n"
102 "5+ nop -> 3\n"
103 "6. match! 0\n" },
104 { "\\C*",
105 "3+ altmatch -> 4 | 5\n"
106 "4+ byte [00-ff] 1 -> 3\n"
107 "5. match! 0\n" },
108 { "\\C?",
109 "3+ byte [00-ff] 1 -> 5\n"
110 "4. nop -> 5\n"
111 "5. match! 0\n" },
112 // Issue 20992936
113 { "[[-`]",
114 "3. byte [5b-60] 0 -> 4\n"
115 "4. match! 0\n" },
116 // Issue 310
117 { "(?:|a)*",
118 "3+ nop -> 7\n"
119 "4. nop -> 9\n"
120 "5+ nop -> 7\n"
121 "6. nop -> 9\n"
122 "7+ nop -> 5\n"
123 "8. byte [61-61] 0 -> 5\n"
124 "9. match! 0\n" },
125 { "(?:|a)+",
126 "3+ nop -> 5\n"
127 "4. byte [61-61] 0 -> 5\n"
128 "5+ nop -> 3\n"
129 "6. match! 0\n" },
130 };
131
TEST(TestRegexpCompileToProg,Simple)132 TEST(TestRegexpCompileToProg, Simple) {
133 int failed = 0;
134 for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
135 const re2::Test& t = tests[i];
136 Regexp* re = Regexp::Parse(t.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
137 if (re == NULL) {
138 ABSL_LOG(ERROR) << "Cannot parse: " << t.regexp;
139 failed++;
140 continue;
141 }
142 Prog* prog = re->CompileToProg(0);
143 if (prog == NULL) {
144 ABSL_LOG(ERROR) << "Cannot compile: " << t.regexp;
145 re->Decref();
146 failed++;
147 continue;
148 }
149 ASSERT_TRUE(re->CompileToProg(1) == NULL);
150 std::string s = prog->Dump();
151 if (s != t.code) {
152 ABSL_LOG(ERROR) << "Incorrect compiled code for: " << t.regexp;
153 ABSL_LOG(ERROR) << "Want:\n" << t.code;
154 ABSL_LOG(ERROR) << "Got:\n" << s;
155 failed++;
156 }
157 delete prog;
158 re->Decref();
159 }
160 EXPECT_EQ(failed, 0);
161 }
162
DumpByteMap(absl::string_view pattern,Regexp::ParseFlags flags,std::string * bytemap)163 static void DumpByteMap(absl::string_view pattern, Regexp::ParseFlags flags,
164 std::string* bytemap) {
165 Regexp* re = Regexp::Parse(pattern, flags, NULL);
166 EXPECT_TRUE(re != NULL);
167
168 {
169 Prog* prog = re->CompileToProg(0);
170 EXPECT_TRUE(prog != NULL);
171 *bytemap = prog->DumpByteMap();
172 delete prog;
173 }
174
175 {
176 Prog* prog = re->CompileToReverseProg(0);
177 EXPECT_TRUE(prog != NULL);
178 EXPECT_EQ(*bytemap, prog->DumpByteMap());
179 delete prog;
180 }
181
182 re->Decref();
183 }
184
TEST(TestCompile,Latin1Ranges)185 TEST(TestCompile, Latin1Ranges) {
186 // The distinct byte ranges involved in the Latin-1 dot ([^\n]).
187
188 std::string bytemap;
189
190 DumpByteMap(".", Regexp::PerlX|Regexp::Latin1, &bytemap);
191 EXPECT_EQ("[00-09] -> 0\n"
192 "[0a-0a] -> 1\n"
193 "[0b-ff] -> 0\n",
194 bytemap);
195 }
196
TEST(TestCompile,OtherByteMapTests)197 TEST(TestCompile, OtherByteMapTests) {
198 std::string bytemap;
199
200 // Test that "absent" ranges are mapped to the same byte class.
201 DumpByteMap("[0-9A-Fa-f]+", Regexp::PerlX|Regexp::Latin1, &bytemap);
202 EXPECT_EQ("[00-2f] -> 0\n"
203 "[30-39] -> 1\n"
204 "[3a-40] -> 0\n"
205 "[41-46] -> 1\n"
206 "[47-60] -> 0\n"
207 "[61-66] -> 1\n"
208 "[67-ff] -> 0\n",
209 bytemap);
210
211 // Test the byte classes for \b.
212 DumpByteMap("\\b", Regexp::LikePerl|Regexp::Latin1, &bytemap);
213 EXPECT_EQ("[00-2f] -> 0\n"
214 "[30-39] -> 1\n"
215 "[3a-40] -> 0\n"
216 "[41-5a] -> 1\n"
217 "[5b-5e] -> 0\n"
218 "[5f-5f] -> 1\n"
219 "[60-60] -> 0\n"
220 "[61-7a] -> 1\n"
221 "[7b-ff] -> 0\n",
222 bytemap);
223
224 // Bug in the ASCII case-folding optimization created too many byte classes.
225 DumpByteMap("[^_]", Regexp::LikePerl|Regexp::Latin1, &bytemap);
226 EXPECT_EQ("[00-5e] -> 0\n"
227 "[5f-5f] -> 1\n"
228 "[60-ff] -> 0\n",
229 bytemap);
230 }
231
TEST(TestCompile,UTF8Ranges)232 TEST(TestCompile, UTF8Ranges) {
233 // The distinct byte ranges involved in the UTF-8 dot ([^\n]).
234 // Once, erroneously split between 0x3f and 0x40 because it is
235 // a 6-bit boundary.
236
237 std::string bytemap;
238
239 DumpByteMap(".", Regexp::PerlX, &bytemap);
240 EXPECT_EQ("[00-09] -> 0\n"
241 "[0a-0a] -> 1\n"
242 "[0b-7f] -> 0\n"
243 "[80-bf] -> 2\n"
244 "[c0-c1] -> 1\n"
245 "[c2-df] -> 3\n"
246 "[e0-ef] -> 4\n"
247 "[f0-f4] -> 5\n"
248 "[f5-ff] -> 1\n",
249 bytemap);
250 }
251
TEST(TestCompile,InsufficientMemory)252 TEST(TestCompile, InsufficientMemory) {
253 Regexp* re = Regexp::Parse(
254 "^(?P<name1>[^\\s]+)\\s+(?P<name2>[^\\s]+)\\s+(?P<name3>.+)$",
255 Regexp::LikePerl, NULL);
256 EXPECT_TRUE(re != NULL);
257 Prog* prog = re->CompileToProg(850);
258 // If the memory budget has been exhausted, compilation should fail
259 // and return NULL instead of trying to do anything with NoMatch().
260 EXPECT_TRUE(prog == NULL);
261 re->Decref();
262 }
263
Dump(absl::string_view pattern,Regexp::ParseFlags flags,std::string * forward,std::string * reverse)264 static void Dump(absl::string_view pattern, Regexp::ParseFlags flags,
265 std::string* forward, std::string* reverse) {
266 Regexp* re = Regexp::Parse(pattern, flags, NULL);
267 EXPECT_TRUE(re != NULL);
268
269 if (forward != NULL) {
270 Prog* prog = re->CompileToProg(0);
271 EXPECT_TRUE(prog != NULL);
272 *forward = prog->Dump();
273 delete prog;
274 }
275
276 if (reverse != NULL) {
277 Prog* prog = re->CompileToReverseProg(0);
278 EXPECT_TRUE(prog != NULL);
279 *reverse = prog->Dump();
280 delete prog;
281 }
282
283 re->Decref();
284 }
285
TEST(TestCompile,Bug26705922)286 TEST(TestCompile, Bug26705922) {
287 // Bug in the compiler caused inefficient bytecode to be generated for Unicode
288 // groups: common suffixes were cached, but common prefixes were not factored.
289
290 std::string forward, reverse;
291
292 Dump("[\\x{10000}\\x{10010}]", Regexp::LikePerl, &forward, &reverse);
293 EXPECT_EQ("3. byte [f0-f0] 0 -> 4\n"
294 "4. byte [90-90] 0 -> 5\n"
295 "5. byte [80-80] 0 -> 6\n"
296 "6+ byte [80-80] 0 -> 8\n"
297 "7. byte [90-90] 0 -> 8\n"
298 "8. match! 0\n",
299 forward);
300 EXPECT_EQ("3+ byte [80-80] 0 -> 5\n"
301 "4. byte [90-90] 0 -> 5\n"
302 "5. byte [80-80] 0 -> 6\n"
303 "6. byte [90-90] 0 -> 7\n"
304 "7. byte [f0-f0] 0 -> 8\n"
305 "8. match! 0\n",
306 reverse);
307
308 Dump("[\\x{8000}-\\x{10FFF}]", Regexp::LikePerl, &forward, &reverse);
309 EXPECT_EQ("3+ byte [e8-ef] 0 -> 5\n"
310 "4. byte [f0-f0] 0 -> 8\n"
311 "5. byte [80-bf] 0 -> 6\n"
312 "6. byte [80-bf] 0 -> 7\n"
313 "7. match! 0\n"
314 "8. byte [90-90] 0 -> 5\n",
315 forward);
316 EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
317 "4. byte [80-bf] 0 -> 5\n"
318 "5+ byte [e8-ef] 0 -> 7\n"
319 "6. byte [90-90] 0 -> 8\n"
320 "7. match! 0\n"
321 "8. byte [f0-f0] 0 -> 7\n",
322 reverse);
323
324 Dump("[\\x{80}-\\x{10FFFF}]", Regexp::LikePerl, &forward, &reverse);
325 EXPECT_EQ("3+ byte [c2-df] 0 -> 6\n"
326 "4+ byte [e0-ef] 0 -> 8\n"
327 "5. byte [f0-f4] 0 -> 9\n"
328 "6. byte [80-bf] 0 -> 7\n"
329 "7. match! 0\n"
330 "8. byte [80-bf] 0 -> 6\n"
331 "9. byte [80-bf] 0 -> 8\n",
332 forward);
333 EXPECT_EQ("3. byte [80-bf] 0 -> 4\n"
334 "4+ byte [c2-df] 0 -> 6\n"
335 "5. byte [80-bf] 0 -> 7\n"
336 "6. match! 0\n"
337 "7+ byte [e0-ef] 0 -> 6\n"
338 "8. byte [80-bf] 0 -> 9\n"
339 "9. byte [f0-f4] 0 -> 6\n",
340 reverse);
341 }
342
TEST(TestCompile,Bug35237384)343 TEST(TestCompile, Bug35237384) {
344 // Bug in the compiler caused inefficient bytecode to be generated for
345 // nested nullable subexpressions.
346
347 std::string forward;
348
349 Dump("a**{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
350 EXPECT_EQ("3+ byte [61-61] 1 -> 3\n"
351 "4. nop -> 5\n"
352 "5+ byte [61-61] 1 -> 5\n"
353 "6. nop -> 7\n"
354 "7+ byte [61-61] 1 -> 7\n"
355 "8. match! 0\n",
356 forward);
357
358 Dump("(a*|b*)*{3,}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
359 EXPECT_EQ("3+ nop -> 28\n"
360 "4. nop -> 30\n"
361 "5+ byte [61-61] 1 -> 5\n"
362 "6. nop -> 32\n"
363 "7+ byte [61-61] 1 -> 7\n"
364 "8. nop -> 26\n"
365 "9+ byte [61-61] 1 -> 9\n"
366 "10. nop -> 20\n"
367 "11+ byte [62-62] 1 -> 11\n"
368 "12. nop -> 20\n"
369 "13+ byte [62-62] 1 -> 13\n"
370 "14. nop -> 26\n"
371 "15+ byte [62-62] 1 -> 15\n"
372 "16. nop -> 32\n"
373 "17+ nop -> 9\n"
374 "18. nop -> 11\n"
375 "19. match! 0\n"
376 "20+ nop -> 17\n"
377 "21. nop -> 19\n"
378 "22+ nop -> 7\n"
379 "23. nop -> 13\n"
380 "24+ nop -> 17\n"
381 "25. nop -> 19\n"
382 "26+ nop -> 22\n"
383 "27. nop -> 24\n"
384 "28+ nop -> 5\n"
385 "29. nop -> 15\n"
386 "30+ nop -> 22\n"
387 "31. nop -> 24\n"
388 "32+ nop -> 28\n"
389 "33. nop -> 30\n",
390 forward);
391
392 Dump("((|S.+)+|(|S.+)+|){2}", Regexp::Latin1|Regexp::NeverCapture, &forward, NULL);
393 EXPECT_EQ("3+ nop -> 36\n"
394 "4+ nop -> 31\n"
395 "5. nop -> 33\n"
396 "6+ byte [00-09] 0 -> 8\n"
397 "7. byte [0b-ff] 0 -> 8\n"
398 "8+ nop -> 6\n"
399 "9+ nop -> 29\n"
400 "10. nop -> 28\n"
401 "11+ byte [00-09] 0 -> 13\n"
402 "12. byte [0b-ff] 0 -> 13\n"
403 "13+ nop -> 11\n"
404 "14+ nop -> 26\n"
405 "15. nop -> 28\n"
406 "16+ byte [00-09] 0 -> 18\n"
407 "17. byte [0b-ff] 0 -> 18\n"
408 "18+ nop -> 16\n"
409 "19+ nop -> 36\n"
410 "20. nop -> 33\n"
411 "21+ byte [00-09] 0 -> 23\n"
412 "22. byte [0b-ff] 0 -> 23\n"
413 "23+ nop -> 21\n"
414 "24+ nop -> 31\n"
415 "25. nop -> 33\n"
416 "26+ nop -> 28\n"
417 "27. byte [53-53] 0 -> 11\n"
418 "28. match! 0\n"
419 "29+ nop -> 28\n"
420 "30. byte [53-53] 0 -> 6\n"
421 "31+ nop -> 33\n"
422 "32. byte [53-53] 0 -> 21\n"
423 "33+ nop -> 29\n"
424 "34+ nop -> 26\n"
425 "35. nop -> 28\n"
426 "36+ nop -> 33\n"
427 "37. byte [53-53] 0 -> 16\n",
428 forward);
429 }
430
431 } // namespace re2
432