1 // Copyright 2009 The RE2 Authors. All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4
5 #include <stddef.h>
6
7 #include <string>
8
9 #include "absl/base/macros.h"
10 #include "gtest/gtest.h"
11 #include "re2/prog.h"
12 #include "re2/regexp.h"
13
14 namespace re2 {
15
16 struct PrefixTest {
17 const char* regexp;
18 bool return_value;
19 const char* prefix;
20 bool foldcase;
21 const char* suffix;
22 };
23
24 static PrefixTest tests[] = {
25 // Empty cases.
26 { "", false },
27 { "(?m)^", false },
28 { "(?-m)^", false },
29
30 // If the regexp has no ^, there's no required prefix.
31 { "abc", false },
32
33 // If the regexp immediately goes into
34 // something not a literal match, there's no required prefix.
35 { "^a*", false },
36 { "^(abc)", false },
37
38 // Otherwise, it should work.
39 { "^abc$", true, "abc", false, "(?-m:$)" },
40 { "^abc", true, "abc", false, "" },
41 { "^(?i)abc", true, "abc", true, "" },
42 { "^abcd*", true, "abc", false, "d*" },
43 { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
44 { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
45 { "^☺abc", true, "☺abc", false, "" },
46 };
47
TEST(RequiredPrefix,SimpleTests)48 TEST(RequiredPrefix, SimpleTests) {
49 for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
50 const PrefixTest& t = tests[i];
51 for (size_t j = 0; j < 2; j++) {
52 Regexp::ParseFlags flags = Regexp::LikePerl;
53 if (j == 0)
54 flags = flags | Regexp::Latin1;
55 Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
56 ASSERT_TRUE(re != NULL) << " " << t.regexp;
57
58 std::string p;
59 bool f;
60 Regexp* s;
61 ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
62 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
63 << " " << re->Dump();
64 if (t.return_value) {
65 ASSERT_EQ(p, std::string(t.prefix))
66 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
67 ASSERT_EQ(f, t.foldcase)
68 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
69 ASSERT_EQ(s->ToString(), std::string(t.suffix))
70 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
71 s->Decref();
72 }
73 re->Decref();
74 }
75 }
76 }
77
78 static PrefixTest for_accel_tests[] = {
79 // Empty cases.
80 { "", false },
81 { "(?m)^", false },
82 { "(?-m)^", false },
83
84 // If the regexp has a ^, there's no required prefix.
85 { "^abc", false },
86
87 // If the regexp immediately goes into
88 // something not a literal match, there's no required prefix.
89 { "a*", false },
90
91 // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
92 // capturing groups, but doesn't try to glue prefix fragments together.
93 { "(a?)def", false },
94 { "(ab?)def", true, "a", false },
95 { "(abc?)def", true, "ab", false },
96 { "(()a)def", false },
97 { "((a)b)def", true, "a", false },
98 { "((ab)c)def", true, "ab", false },
99
100 // Otherwise, it should work.
101 { "abc$", true, "abc", false },
102 { "abc", true, "abc", false },
103 { "(?i)abc", true, "abc", true },
104 { "abcd*", true, "abc", false },
105 { "[Aa][Bb]cd*", true, "ab", true },
106 { "ab[Cc]d*", true, "ab", false },
107 { "☺abc", true, "☺abc", false },
108 };
109
TEST(RequiredPrefixForAccel,SimpleTests)110 TEST(RequiredPrefixForAccel, SimpleTests) {
111 for (size_t i = 0; i < ABSL_ARRAYSIZE(for_accel_tests); i++) {
112 const PrefixTest& t = for_accel_tests[i];
113 for (size_t j = 0; j < 2; j++) {
114 Regexp::ParseFlags flags = Regexp::LikePerl;
115 if (j == 0)
116 flags = flags | Regexp::Latin1;
117 Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
118 ASSERT_TRUE(re != NULL) << " " << t.regexp;
119
120 std::string p;
121 bool f;
122 ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
123 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
124 << " " << re->Dump();
125 if (t.return_value) {
126 ASSERT_EQ(p, std::string(t.prefix))
127 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
128 ASSERT_EQ(f, t.foldcase)
129 << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
130 }
131 re->Decref();
132 }
133 }
134 }
135
TEST(RequiredPrefixForAccel,CaseFoldingForKAndS)136 TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
137 Regexp* re;
138 std::string p;
139 bool f;
140
141 // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
142 re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
143 ASSERT_TRUE(re != NULL);
144 ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
145 ASSERT_EQ(p, "klm");
146 ASSERT_EQ(f, true);
147 re->Decref();
148
149 re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
150 ASSERT_TRUE(re != NULL);
151 ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
152 ASSERT_EQ(p, "stu");
153 ASSERT_EQ(f, true);
154 re->Decref();
155
156 // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
157 // This is because they match U+212A and U+017F, respectively, and
158 // so the parser ends up emitting character classes, not literals.
159 re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
160 ASSERT_TRUE(re != NULL);
161 ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
162 re->Decref();
163
164 re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
165 ASSERT_TRUE(re != NULL);
166 ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
167 re->Decref();
168 }
169
170 static const char* prefix_accel_tests[] = {
171 "aababc\\d+",
172 "(?i)AABABC\\d+",
173 };
174
TEST(PrefixAccel,SimpleTests)175 TEST(PrefixAccel, SimpleTests) {
176 for (size_t i = 0; i < ABSL_ARRAYSIZE(prefix_accel_tests); i++) {
177 const char* pattern = prefix_accel_tests[i];
178 Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
179 ASSERT_TRUE(re != NULL);
180 Prog* prog = re->CompileToProg(0);
181 ASSERT_TRUE(prog != NULL);
182 ASSERT_TRUE(prog->can_prefix_accel());
183 for (int j = 0; j < 100; j++) {
184 std::string text(j, 'a');
185 const char* p = reinterpret_cast<const char*>(
186 prog->PrefixAccel(text.data(), text.size()));
187 EXPECT_TRUE(p == NULL);
188 text.append("aababc");
189 for (int k = 0; k < 100; k++) {
190 text.append(k, 'a');
191 p = reinterpret_cast<const char*>(
192 prog->PrefixAccel(text.data(), text.size()));
193 EXPECT_EQ(j, p - text.data());
194 }
195 }
196 delete prog;
197 re->Decref();
198 }
199 }
200
201 } // namespace re2
202