• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2009 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 #include <stddef.h>
6 
7 #include <string>
8 
9 #include "absl/base/macros.h"
10 #include "gtest/gtest.h"
11 #include "re2/prog.h"
12 #include "re2/regexp.h"
13 
14 namespace re2 {
15 
16 struct PrefixTest {
17   const char* regexp;
18   bool return_value;
19   const char* prefix;
20   bool foldcase;
21   const char* suffix;
22 };
23 
24 static PrefixTest tests[] = {
25   // Empty cases.
26   { "", false },
27   { "(?m)^", false },
28   { "(?-m)^", false },
29 
30   // If the regexp has no ^, there's no required prefix.
31   { "abc", false },
32 
33   // If the regexp immediately goes into
34   // something not a literal match, there's no required prefix.
35   { "^a*",  false },
36   { "^(abc)", false },
37 
38   // Otherwise, it should work.
39   { "^abc$", true, "abc", false, "(?-m:$)" },
40   { "^abc", true, "abc", false, "" },
41   { "^(?i)abc", true, "abc", true, "" },
42   { "^abcd*", true, "abc", false, "d*" },
43   { "^[Aa][Bb]cd*", true, "ab", true, "cd*" },
44   { "^ab[Cc]d*", true, "ab", false, "[Cc]d*" },
45   { "^☺abc", true, "☺abc", false, "" },
46 };
47 
TEST(RequiredPrefix,SimpleTests)48 TEST(RequiredPrefix, SimpleTests) {
49   for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
50     const PrefixTest& t = tests[i];
51     for (size_t j = 0; j < 2; j++) {
52       Regexp::ParseFlags flags = Regexp::LikePerl;
53       if (j == 0)
54         flags = flags | Regexp::Latin1;
55       Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
56       ASSERT_TRUE(re != NULL) << " " << t.regexp;
57 
58       std::string p;
59       bool f;
60       Regexp* s;
61       ASSERT_EQ(t.return_value, re->RequiredPrefix(&p, &f, &s))
62         << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
63         << " " << re->Dump();
64       if (t.return_value) {
65         ASSERT_EQ(p, std::string(t.prefix))
66           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
67         ASSERT_EQ(f, t.foldcase)
68           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
69         ASSERT_EQ(s->ToString(), std::string(t.suffix))
70           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
71         s->Decref();
72       }
73       re->Decref();
74     }
75   }
76 }
77 
78 static PrefixTest for_accel_tests[] = {
79   // Empty cases.
80   { "", false },
81   { "(?m)^", false },
82   { "(?-m)^", false },
83 
84   // If the regexp has a ^, there's no required prefix.
85   { "^abc", false },
86 
87   // If the regexp immediately goes into
88   // something not a literal match, there's no required prefix.
89   { "a*",  false },
90 
91   // Unlike RequiredPrefix(), RequiredPrefixForAccel() can "see through"
92   // capturing groups, but doesn't try to glue prefix fragments together.
93   { "(a?)def", false },
94   { "(ab?)def", true, "a", false },
95   { "(abc?)def", true, "ab", false },
96   { "(()a)def", false },
97   { "((a)b)def", true, "a", false },
98   { "((ab)c)def", true, "ab", false },
99 
100   // Otherwise, it should work.
101   { "abc$", true, "abc", false },
102   { "abc", true, "abc", false },
103   { "(?i)abc", true, "abc", true },
104   { "abcd*", true, "abc", false },
105   { "[Aa][Bb]cd*", true, "ab", true },
106   { "ab[Cc]d*", true, "ab", false },
107   { "☺abc", true, "☺abc", false },
108 };
109 
TEST(RequiredPrefixForAccel,SimpleTests)110 TEST(RequiredPrefixForAccel, SimpleTests) {
111   for (size_t i = 0; i < ABSL_ARRAYSIZE(for_accel_tests); i++) {
112     const PrefixTest& t = for_accel_tests[i];
113     for (size_t j = 0; j < 2; j++) {
114       Regexp::ParseFlags flags = Regexp::LikePerl;
115       if (j == 0)
116         flags = flags | Regexp::Latin1;
117       Regexp* re = Regexp::Parse(t.regexp, flags, NULL);
118       ASSERT_TRUE(re != NULL) << " " << t.regexp;
119 
120       std::string p;
121       bool f;
122       ASSERT_EQ(t.return_value, re->RequiredPrefixForAccel(&p, &f))
123         << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8")
124         << " " << re->Dump();
125       if (t.return_value) {
126         ASSERT_EQ(p, std::string(t.prefix))
127           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
128         ASSERT_EQ(f, t.foldcase)
129           << " " << t.regexp << " " << (j == 0 ? "latin1" : "utf8");
130       }
131       re->Decref();
132     }
133   }
134 }
135 
TEST(RequiredPrefixForAccel,CaseFoldingForKAndS)136 TEST(RequiredPrefixForAccel, CaseFoldingForKAndS) {
137   Regexp* re;
138   std::string p;
139   bool f;
140 
141   // With Latin-1 encoding, `(?i)` prefixes can include 'k' and 's'.
142   re = Regexp::Parse("(?i)KLM", Regexp::LikePerl|Regexp::Latin1, NULL);
143   ASSERT_TRUE(re != NULL);
144   ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
145   ASSERT_EQ(p, "klm");
146   ASSERT_EQ(f, true);
147   re->Decref();
148 
149   re = Regexp::Parse("(?i)STU", Regexp::LikePerl|Regexp::Latin1, NULL);
150   ASSERT_TRUE(re != NULL);
151   ASSERT_TRUE(re->RequiredPrefixForAccel(&p, &f));
152   ASSERT_EQ(p, "stu");
153   ASSERT_EQ(f, true);
154   re->Decref();
155 
156   // With UTF-8 encoding, `(?i)` prefixes can't include 'k' and 's'.
157   // This is because they match U+212A and U+017F, respectively, and
158   // so the parser ends up emitting character classes, not literals.
159   re = Regexp::Parse("(?i)KLM", Regexp::LikePerl, NULL);
160   ASSERT_TRUE(re != NULL);
161   ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
162   re->Decref();
163 
164   re = Regexp::Parse("(?i)STU", Regexp::LikePerl, NULL);
165   ASSERT_TRUE(re != NULL);
166   ASSERT_FALSE(re->RequiredPrefixForAccel(&p, &f));
167   re->Decref();
168 }
169 
170 static const char* prefix_accel_tests[] = {
171     "aababc\\d+",
172     "(?i)AABABC\\d+",
173 };
174 
TEST(PrefixAccel,SimpleTests)175 TEST(PrefixAccel, SimpleTests) {
176   for (size_t i = 0; i < ABSL_ARRAYSIZE(prefix_accel_tests); i++) {
177     const char* pattern = prefix_accel_tests[i];
178     Regexp* re = Regexp::Parse(pattern, Regexp::LikePerl, NULL);
179     ASSERT_TRUE(re != NULL);
180     Prog* prog = re->CompileToProg(0);
181     ASSERT_TRUE(prog != NULL);
182     ASSERT_TRUE(prog->can_prefix_accel());
183     for (int j = 0; j < 100; j++) {
184       std::string text(j, 'a');
185       const char* p = reinterpret_cast<const char*>(
186           prog->PrefixAccel(text.data(), text.size()));
187       EXPECT_TRUE(p == NULL);
188       text.append("aababc");
189       for (int k = 0; k < 100; k++) {
190         text.append(k, 'a');
191         p = reinterpret_cast<const char*>(
192             prog->PrefixAccel(text.data(), text.size()));
193         EXPECT_EQ(j, p - text.data());
194       }
195     }
196     delete prog;
197     re->Decref();
198   }
199 }
200 
201 }  // namespace re2
202