1 // Copyright 2007 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 // Test prog.cc, compile.cc
6
7 #include <string>
8 #include <vector>
9 #include "util/test.h"
10 #include "re2/regexp.h"
11 #include "re2/prog.h"
12
13 DEFINE_string(show, "", "regular expression to compile and dump");
14
15 namespace re2 {
16
17 // Simple input/output tests checking that
18 // the regexp compiles to the expected code.
19 // These are just to sanity check the basic implementation.
20 // The real confidence tests happen by testing the NFA/DFA
21 // that run the compiled code.
22
// A single golden-output case for the compiler: a pattern plus the
// program listing that compiling it is expected to produce.
struct Test {
  // Pattern source text; the Simple test hands this to Regexp::Parse.
  const char* regexp;

  // Listing that Prog::Dump() must return for the compiled pattern.
  const char* code;
};
27
// Golden cases for the Simple test below.  Each regexp is parsed with
// Regexp::PerlX|Regexp::Latin1 and compiled; the text returned by
// Prog::Dump() must then equal `code` exactly, including the order in
// which instructions are listed.  Byte ranges in the listings are
// written in hex (61 is 'a', 62 is 'b', 63 is 'c').
static Test tests[] = {
  // Single literal and concatenation.
  { "a",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  { "ab",
    "1. byte [61-61] -> 2\n"
    "2. byte [62-62] -> 3\n"
    "3. match! 0\n" },
  // Alternation.  "a|c" needs an alt instruction, while "a|b" is
  // expected to come out as the same single byte range as "[ab]".
  { "a|c",
    "3. alt -> 1 | 2\n"
    "1. byte [61-61] -> 4\n"
    "2. byte [63-63] -> 4\n"
    "4. match! 0\n" },
  { "a|b",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  { "[ab]",
    "1. byte [61-62] -> 2\n"
    "2. match! 0\n" },
  // Repetition operators.  The non-greedy (trailing '?') variants are
  // expected to differ from the greedy ones in the order of the two
  // alt targets.
  { "a+",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 1 | 3\n"
    "3. match! 0\n" },
  { "a+?",
    "1. byte [61-61] -> 2\n"
    "2. alt -> 3 | 1\n"
    "3. match! 0\n" },
  { "a*",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 2\n"
    "3. match! 0\n" },
  { "a*?",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 2\n" },
  { "a?",
    "2. alt -> 1 | 3\n"
    "1. byte [61-61] -> 3\n"
    "3. match! 0\n" },
  { "a??",
    "2. alt -> 3 | 1\n"
    "3. match! 0\n"
    "1. byte [61-61] -> 3\n" },
  // Counted repetition is expected to be unrolled into four byte
  // instructions.
  { "a{4}",
    "1. byte [61-61] -> 2\n"
    "2. byte [61-61] -> 3\n"
    "3. byte [61-61] -> 4\n"
    "4. byte [61-61] -> 5\n"
    "5. match! 0\n" },
  // A capturing group adds capture instructions around the body;
  // a non-capturing group adds nothing.
  { "(a)",
    "2. capture 2 -> 1\n"
    "1. byte [61-61] -> 3\n"
    "3. capture 3 -> 4\n"
    "4. match! 0\n" },
  { "(?:a)",
    "1. byte [61-61] -> 2\n"
    "2. match! 0\n" },
  // The empty regexp is expected to compile to a lone match instruction.
  { "",
    "2. match! 0\n" },
  // Dot and a negated class in Latin-1 mode.  Byte 0a ('\n') is absent
  // from both expected listings.
  { ".",
    "3. alt -> 1 | 2\n"
    "1. byte [00-09] -> 4\n"
    "2. byte [0b-ff] -> 4\n"
    "4. match! 0\n" },
  { "[^ab]",
    "5. alt -> 3 | 4\n"
    "3. alt -> 1 | 2\n"
    "4. byte [63-ff] -> 6\n"
    "1. byte [00-09] -> 6\n"
    "2. byte [0b-60] -> 6\n"
    "6. match! 0\n" },
  // Upper/lower pair: expected as one "byte/i" instruction on the
  // lowercase byte (presumably a case-folding match -- confirm against
  // the Dump() format).
  { "[Aa]",
    "1. byte/i [61-61] -> 2\n"
    "2. match! 0\n" },
};
103
// Runs every entry of tests[]: parses the pattern, compiles it, and
// compares the dumped program against the expected listing.  Mismatches
// are logged and counted so that all cases run before the final check.
TEST(TestRegexpCompileToProg, Simple) {
  int num_failures = 0;
  for (int idx = 0; idx < arraysize(tests); idx++) {
    const re2::Test& tc = tests[idx];

    Regexp* parsed =
        Regexp::Parse(tc.regexp, Regexp::PerlX|Regexp::Latin1, NULL);
    if (parsed == NULL) {
      LOG(ERROR) << "Cannot parse: " << tc.regexp;
      num_failures++;
      continue;
    }

    Prog* compiled = parsed->CompileToProg(0);
    if (compiled != NULL) {
      // Compiling again with an argument of 1 is required to fail.
      CHECK(parsed->CompileToProg(1) == NULL);

      string dump = compiled->Dump();
      if (dump != tc.code) {
        LOG(ERROR) << "Incorrect compiled code for: " << tc.regexp;
        LOG(ERROR) << "Want:\n" << tc.code;
        LOG(ERROR) << "Got:\n" << dump;
        num_failures++;
      }
      delete compiled;
    } else {
      LOG(ERROR) << "Cannot compile: " << tc.regexp;
      num_failures++;
    }

    // Reached on both the compiled and the failed-to-compile paths.
    parsed->Decref();
  }
  EXPECT_EQ(num_failures, 0);
}
134
// The distinct byte ranges involved in the UTF-8 dot ([^\n]).
// Once, erroneously split between 0x3f and 0x40 because it is
// a 6-bit boundary.
//
// Entries are in ascending order and together cover 0x00-0xFF with no
// gaps or overlaps.  The ByteRanges test uses an entry's index as the
// expected bytemap value for every byte in [lo, hi], so a gap here
// means the bytes in the gap are silently left unchecked.
static struct UTF8ByteRange {
  int lo;  // first byte of the range (inclusive)
  int hi;  // last byte of the range (inclusive)
} utf8ranges[] = {
  { 0x00, 0x09 },
  { 0x0A, 0x0A },  // '\n', the one byte that [^\n] excludes
  { 0x0B, 0x7F },  // begins right after '\n' so 0x0B-0x0F are covered too
  { 0x80, 0x8F },
  { 0x90, 0x9F },
  { 0xA0, 0xBF },
  { 0xC0, 0xC1 },
  { 0xC2, 0xDF },
  { 0xE0, 0xE0 },
  { 0xE1, 0xEF },
  { 0xF0, 0xF0 },
  { 0xF1, 0xF3 },
  { 0xF4, 0xF4 },
  { 0xF5, 0xFF },
};
157
// Checks the byte classes computed for the UTF-8 dot: the program must
// report exactly arraysize(utf8ranges) classes, and every byte inside
// utf8ranges[i] must map to class i.
TEST(TestCompile, ByteRanges) {
  Regexp* re = Regexp::Parse(".", Regexp::PerlX, NULL);
  EXPECT_TRUE(re != NULL);
  if (re == NULL)
    return;  // EXPECT_* may be non-fatal; never dereference a failed parse.

  Prog* prog = re->CompileToProg(0);
  EXPECT_TRUE(prog != NULL);
  if (prog == NULL) {
    re->Decref();  // Still release the parsed regexp before bailing out.
    return;
  }

  EXPECT_EQ(prog->bytemap_range(), arraysize(utf8ranges));
  // Both bounds of each range are inclusive.
  for (int i = 0; i < arraysize(utf8ranges); i++)
    for (int j = utf8ranges[i].lo; j <= utf8ranges[i].hi; j++)
      EXPECT_EQ(prog->bytemap()[j], i) << " byte " << j;

  delete prog;
  re->Decref();
}
170
171 } // namespace re2
172