1 // Copyright 2006-2008 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include <string.h>
6 #include <string>
7 #include <vector>
8
9 #include "util/test.h"
10 #include "util/logging.h"
11 #include "util/strutil.h"
12 #include "re2/prog.h"
13 #include "re2/re2.h"
14 #include "re2/regexp.h"
15 #include "re2/testing/exhaustive_tester.h"
16 #include "re2/testing/regexp_generator.h"
17 #include "re2/testing/string_generator.h"
18
19 namespace re2 {
20
21 // Test that C++ strings are compared as uint8s, not int8s.
22 // PossibleMatchRange doesn't depend on this, but callers probably will.
TEST(CplusplusStrings,EightBit)23 TEST(CplusplusStrings, EightBit) {
24 std::string s = "\x70";
25 std::string t = "\xA0";
26 EXPECT_LT(s, t);
27 }
28
29 struct PrefixTest {
30 const char* regexp;
31 int maxlen;
32 const char* min;
33 const char* max;
34 };
35
36 static PrefixTest tests[] = {
37 { "", 10, "", "", },
38 { "Abcdef", 10, "Abcdef", "Abcdef" },
39 { "abc(def|ghi)", 10, "abcdef", "abcghi" },
40 { "a+hello", 10, "aa", "ahello" },
41 { "a*hello", 10, "a", "hello" },
42 { "def|abc", 10, "abc", "def" },
43 { "a(b)(c)[d]", 10, "abcd", "abcd" },
44 { "ab(cab|cat)", 10, "abcab", "abcat" },
45 { "ab(cab|ca)x", 10, "abcabx", "abcax" },
46 { "(ab|x)(c|de)", 10, "abc", "xde" },
47 { "(ab|x)?(c|z)?", 10, "", "z" },
48 { "[^\\s\\S]", 10, "", "" },
49 { "(abc)+", 5, "abc", "abcac" },
50 { "(abc)+", 2, "ab", "ac" },
51 { "(abc)+", 1, "a", "b" },
52 { "[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
53 { "a*", 10, "", "ab" },
54
55 { "(?i)Abcdef", 10, "ABCDEF", "abcdef" },
56 { "(?i)abc(def|ghi)", 10, "ABCDEF", "abcghi" },
57 { "(?i)a+hello", 10, "AA", "ahello" },
58 { "(?i)a*hello", 10, "A", "hello" },
59 { "(?i)def|abc", 10, "ABC", "def" },
60 { "(?i)a(b)(c)[d]", 10, "ABCD", "abcd" },
61 { "(?i)ab(cab|cat)", 10, "ABCAB", "abcat" },
62 { "(?i)ab(cab|ca)x", 10, "ABCABX", "abcax" },
63 { "(?i)(ab|x)(c|de)", 10, "ABC", "xde" },
64 { "(?i)(ab|x)?(c|z)?", 10, "", "z" },
65 { "(?i)[^\\s\\S]", 10, "", "" },
66 { "(?i)(abc)+", 5, "ABC", "abcac" },
67 { "(?i)(abc)+", 2, "AB", "ac" },
68 { "(?i)(abc)+", 1, "A", "b" },
69 { "(?i)[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
70 { "(?i)a*", 10, "", "ab" },
71 { "(?i)A*", 10, "", "ab" },
72
73 { "\\AAbcdef", 10, "Abcdef", "Abcdef" },
74 { "\\Aabc(def|ghi)", 10, "abcdef", "abcghi" },
75 { "\\Aa+hello", 10, "aa", "ahello" },
76 { "\\Aa*hello", 10, "a", "hello" },
77 { "\\Adef|abc", 10, "abc", "def" },
78 { "\\Aa(b)(c)[d]", 10, "abcd", "abcd" },
79 { "\\Aab(cab|cat)", 10, "abcab", "abcat" },
80 { "\\Aab(cab|ca)x", 10, "abcabx", "abcax" },
81 { "\\A(ab|x)(c|de)", 10, "abc", "xde" },
82 { "\\A(ab|x)?(c|z)?", 10, "", "z" },
83 { "\\A[^\\s\\S]", 10, "", "" },
84 { "\\A(abc)+", 5, "abc", "abcac" },
85 { "\\A(abc)+", 2, "ab", "ac" },
86 { "\\A(abc)+", 1, "a", "b" },
87 { "\\A[a\xC3\xA1]", 4, "a", "\xC3\xA1" },
88 { "\\Aa*", 10, "", "ab" },
89
90 { "(?i)\\AAbcdef", 10, "ABCDEF", "abcdef" },
91 { "(?i)\\Aabc(def|ghi)", 10, "ABCDEF", "abcghi" },
92 { "(?i)\\Aa+hello", 10, "AA", "ahello" },
93 { "(?i)\\Aa*hello", 10, "A", "hello" },
94 { "(?i)\\Adef|abc", 10, "ABC", "def" },
95 { "(?i)\\Aa(b)(c)[d]", 10, "ABCD", "abcd" },
96 { "(?i)\\Aab(cab|cat)", 10, "ABCAB", "abcat" },
97 { "(?i)\\Aab(cab|ca)x", 10, "ABCABX", "abcax" },
98 { "(?i)\\A(ab|x)(c|de)", 10, "ABC", "xde" },
99 { "(?i)\\A(ab|x)?(c|z)?", 10, "", "z" },
100 { "(?i)\\A[^\\s\\S]", 10, "", "" },
101 { "(?i)\\A(abc)+", 5, "ABC", "abcac" },
102 { "(?i)\\A(abc)+", 2, "AB", "ac" },
103 { "(?i)\\A(abc)+", 1, "A", "b" },
104 { "(?i)\\A[a\xC3\xA1]", 4, "A", "\xC3\xA1" },
105 { "(?i)\\Aa*", 10, "", "ab" },
106 { "(?i)\\AA*", 10, "", "ab" },
107 };
108
TEST(PossibleMatchRange,HandWritten)109 TEST(PossibleMatchRange, HandWritten) {
110 for (size_t i = 0; i < arraysize(tests); i++) {
111 for (size_t j = 0; j < 2; j++) {
112 const PrefixTest& t = tests[i];
113 std::string min, max;
114 if (j == 0) {
115 LOG(INFO) << "Checking regexp=" << CEscape(t.regexp);
116 Regexp* re = Regexp::Parse(t.regexp, Regexp::LikePerl, NULL);
117 ASSERT_TRUE(re != NULL);
118 Prog* prog = re->CompileToProg(0);
119 ASSERT_TRUE(prog != NULL);
120 ASSERT_TRUE(prog->PossibleMatchRange(&min, &max, t.maxlen))
121 << " " << t.regexp;
122 delete prog;
123 re->Decref();
124 } else {
125 ASSERT_TRUE(RE2(t.regexp).PossibleMatchRange(&min, &max, t.maxlen));
126 }
127 EXPECT_EQ(t.min, min) << t.regexp;
128 EXPECT_EQ(t.max, max) << t.regexp;
129 }
130 }
131 }
132
133 // Test cases where PossibleMatchRange should return false.
TEST(PossibleMatchRange,Failures)134 TEST(PossibleMatchRange, Failures) {
135 std::string min, max;
136
137 // Fails because no room to write max.
138 EXPECT_FALSE(RE2("abc").PossibleMatchRange(&min, &max, 0));
139
140 // Fails because there is no max -- any non-empty string matches
141 // or begins a match. Have to use Latin-1 input, because there
142 // are no valid UTF-8 strings beginning with byte 0xFF.
143 EXPECT_FALSE(RE2("[\\s\\S]+", RE2::Latin1).
144 PossibleMatchRange(&min, &max, 10))
145 << "min=" << CEscape(min) << ", max=" << CEscape(max);
146 EXPECT_FALSE(RE2("[\\0-\xFF]+", RE2::Latin1).
147 PossibleMatchRange(&min, &max, 10))
148 << "min=" << CEscape(min) << ", max=" << CEscape(max);
149 EXPECT_FALSE(RE2(".+hello", RE2::Latin1).
150 PossibleMatchRange(&min, &max, 10))
151 << "min=" << CEscape(min) << ", max=" << CEscape(max);
152 EXPECT_FALSE(RE2(".*hello", RE2::Latin1).
153 PossibleMatchRange(&min, &max, 10))
154 << "min=" << CEscape(min) << ", max=" << CEscape(max);
155 EXPECT_FALSE(RE2(".*", RE2::Latin1).
156 PossibleMatchRange(&min, &max, 10))
157 << "min=" << CEscape(min) << ", max=" << CEscape(max);
158 EXPECT_FALSE(RE2("\\C*").
159 PossibleMatchRange(&min, &max, 10))
160 << "min=" << CEscape(min) << ", max=" << CEscape(max);
161
162 // Fails because it's a malformed regexp.
163 EXPECT_FALSE(RE2("*hello").PossibleMatchRange(&min, &max, 10))
164 << "min=" << CEscape(min) << ", max=" << CEscape(max);
165 }
166
167 // Exhaustive test: generate all regexps within parameters,
168 // then generate all strings of a given length over a given alphabet,
169 // then check that the prefix information agrees with whether
170 // the regexp matches each of the strings.
171 class PossibleMatchTester : public RegexpGenerator {
172 public:
PossibleMatchTester(int maxatoms,int maxops,const std::vector<std::string> & alphabet,const std::vector<std::string> & ops,int maxstrlen,const std::vector<std::string> & stralphabet)173 PossibleMatchTester(int maxatoms,
174 int maxops,
175 const std::vector<std::string>& alphabet,
176 const std::vector<std::string>& ops,
177 int maxstrlen,
178 const std::vector<std::string>& stralphabet)
179 : RegexpGenerator(maxatoms, maxops, alphabet, ops),
180 strgen_(maxstrlen, stralphabet),
181 regexps_(0), tests_(0) { }
182
regexps()183 int regexps() { return regexps_; }
tests()184 int tests() { return tests_; }
185
186 // Needed for RegexpGenerator interface.
187 void HandleRegexp(const std::string& regexp);
188
189 private:
190 StringGenerator strgen_;
191
192 int regexps_; // Number of HandleRegexp calls
193 int tests_; // Number of regexp tests.
194
195 PossibleMatchTester(const PossibleMatchTester&) = delete;
196 PossibleMatchTester& operator=(const PossibleMatchTester&) = delete;
197 };
198
199 // Processes a single generated regexp.
200 // Checks that all accepted strings agree with the prefix range.
HandleRegexp(const std::string & regexp)201 void PossibleMatchTester::HandleRegexp(const std::string& regexp) {
202 regexps_++;
203
204 VLOG(3) << CEscape(regexp);
205
206 RE2 re(regexp, RE2::Latin1);
207 ASSERT_EQ(re.error(), "");
208
209 std::string min, max;
210 if(!re.PossibleMatchRange(&min, &max, 10)) {
211 // There's no good max for "\\C*". Can't use strcmp
212 // because sometimes it gets embedded in more
213 // complicated expressions.
214 if(strstr(regexp.c_str(), "\\C*"))
215 return;
216 LOG(QFATAL) << "PossibleMatchRange failed on: " << CEscape(regexp);
217 }
218
219 strgen_.Reset();
220 while (strgen_.HasNext()) {
221 const StringPiece& s = strgen_.Next();
222 tests_++;
223 if (!RE2::FullMatch(s, re))
224 continue;
225 ASSERT_GE(s, min) << " regexp: " << regexp << " max: " << max;
226 ASSERT_LE(s, max) << " regexp: " << regexp << " min: " << min;
227 }
228 }
229
TEST(PossibleMatchRange,Exhaustive)230 TEST(PossibleMatchRange, Exhaustive) {
231 int natom = 3;
232 int noperator = 3;
233 int stringlen = 5;
234 if (RE2_DEBUG_MODE) {
235 natom = 2;
236 noperator = 3;
237 stringlen = 3;
238 }
239 PossibleMatchTester t(natom, noperator, Split(" ", "a b [0-9]"),
240 RegexpGenerator::EgrepOps(),
241 stringlen, Explode("ab4"));
242 t.Generate();
243 LOG(INFO) << t.regexps() << " regexps, "
244 << t.tests() << " tests";
245 }
246
247 } // namespace re2
248