• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2006 The RE2 Authors.  All Rights Reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 
5 // Test parse.cc, dump.cc, and tostring.cc.
6 
7 #include <stddef.h>
8 
9 #include <string>
10 
11 #include "absl/base/macros.h"
12 #include "absl/log/absl_log.h"
13 #include "gtest/gtest.h"
14 #include "re2/regexp.h"
15 
16 namespace re2 {
17 
18 // In the past, we used 1<<30 here and zeroed the bit later, but that
19 // has undefined behaviour, so now we use an internal-only flag because
20 // otherwise we would have to introduce a new flag value just for this.
21 static const Regexp::ParseFlags TestZeroFlags = Regexp::WasDollar;
22 
23 struct Test {
24   const char* regexp;
25   const char* parse;
26   Regexp::ParseFlags flags;
27 };
28 
29 static Regexp::ParseFlags kTestFlags = Regexp::MatchNL |
30                                        Regexp::PerlX |
31                                        Regexp::PerlClasses |
32                                        Regexp::UnicodeGroups;
33 
34 static Test tests[] = {
35   // Base cases
36   { "a", "lit{a}" },
37   { "a.", "cat{lit{a}dot{}}" },
38   { "a.b", "cat{lit{a}dot{}lit{b}}" },
39   { "ab", "str{ab}" },
40   { "a.b.c", "cat{lit{a}dot{}lit{b}dot{}lit{c}}" },
41   { "abc", "str{abc}" },
42   { "a|^", "alt{lit{a}bol{}}" },
43   { "a|b", "cc{0x61-0x62}" },
44   { "(a)", "cap{lit{a}}" },
45   { "(a)|b", "alt{cap{lit{a}}lit{b}}" },
46   { "a*", "star{lit{a}}" },
47   { "a+", "plus{lit{a}}" },
48   { "a?", "que{lit{a}}" },
49   { "a{2}", "rep{2,2 lit{a}}" },
50   { "a{2,3}", "rep{2,3 lit{a}}" },
51   { "a{2,}", "rep{2,-1 lit{a}}" },
52   { "a*?", "nstar{lit{a}}" },
53   { "a+?", "nplus{lit{a}}" },
54   { "a??", "nque{lit{a}}" },
55   { "a{2}?", "nrep{2,2 lit{a}}" },
56   { "a{2,3}?", "nrep{2,3 lit{a}}" },
57   { "a{2,}?", "nrep{2,-1 lit{a}}" },
58   { "", "emp{}" },
59   { "|", "alt{emp{}emp{}}" },
60   { "|x|", "alt{emp{}lit{x}emp{}}" },
61   { ".", "dot{}" },
62   { "^", "bol{}" },
63   { "$", "eol{}" },
64   { "\\|", "lit{|}" },
65   { "\\(", "lit{(}" },
66   { "\\)", "lit{)}" },
67   { "\\*", "lit{*}" },
68   { "\\+", "lit{+}" },
69   { "\\?", "lit{?}" },
70   { "{", "lit{{}" },
71   { "}", "lit{}}" },
72   { "\\.", "lit{.}" },
73   { "\\^", "lit{^}" },
74   { "\\$", "lit{$}" },
75   { "\\\\", "lit{\\}" },
76   { "[ace]", "cc{0x61 0x63 0x65}" },
77   { "[abc]", "cc{0x61-0x63}" },
78   { "[a-z]", "cc{0x61-0x7a}" },
79   { "[a]", "lit{a}" },
80   { "\\-", "lit{-}" },
81   { "-", "lit{-}" },
82   { "\\_", "lit{_}" },
83 
84   // Posix and Perl extensions
85   { "[[:lower:]]", "cc{0x61-0x7a}" },
86   { "[a-z]", "cc{0x61-0x7a}" },
87   { "[^[:lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
88   { "[[:^lower:]]", "cc{0-0x60 0x7b-0x10ffff}" },
89   { "(?i)[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
90   { "(?i)[a-z]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
91   { "(?i)[^[:lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
92   { "(?i)[[:^lower:]]", "cc{0-0x40 0x5b-0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
93   { "\\d", "cc{0x30-0x39}" },
94   { "\\D", "cc{0-0x2f 0x3a-0x10ffff}" },
95   { "\\s", "cc{0x9-0xa 0xc-0xd 0x20}" },
96   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}" },
97   { "\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a}" },
98   { "\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x10ffff}" },
99   { "(?i)\\w", "cc{0x30-0x39 0x41-0x5a 0x5f 0x61-0x7a 0x17f 0x212a}" },
100   { "(?i)\\W", "cc{0-0x2f 0x3a-0x40 0x5b-0x5e 0x60 0x7b-0x17e 0x180-0x2129 0x212b-0x10ffff}" },
101   { "[^\\\\]", "cc{0-0x5b 0x5d-0x10ffff}" },
102   { "\\C", "byte{}" },
103 
104   // Unicode, negatives, and a double negative.
105   { "\\p{Braille}", "cc{0x2800-0x28ff}" },
106   { "\\P{Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
107   { "\\p{^Braille}", "cc{0-0x27ff 0x2900-0x10ffff}" },
108   { "\\P{^Braille}", "cc{0x2800-0x28ff}" },
109 
110   // More interesting regular expressions.
111   { "a{,2}", "str{a{,2}}" },
112   { "\\.\\^\\$\\\\", "str{.^$\\}" },
113   { "[a-zABC]", "cc{0x41-0x43 0x61-0x7a}" },
114   { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
115   { "[\xce\xb1-\xce\xb5\xe2\x98\xba]", "cc{0x3b1-0x3b5 0x263a}" },  // utf-8
116   { "a*{", "cat{star{lit{a}}lit{{}}" },
117 
118   // Test precedences
119   { "(?:ab)*", "star{str{ab}}" },
120   { "(ab)*", "star{cap{str{ab}}}" },
121   { "ab|cd", "alt{str{ab}str{cd}}" },
122   { "a(b|c)d", "cat{lit{a}cap{cc{0x62-0x63}}lit{d}}" },
123 
124   // Test squashing of **, ++, ?? et cetera.
125   { "(?:(?:a)*)*", "star{lit{a}}" },
126   { "(?:(?:a)+)+", "plus{lit{a}}" },
127   { "(?:(?:a)?)?", "que{lit{a}}" },
128   { "(?:(?:a)*)+", "star{lit{a}}" },
129   { "(?:(?:a)*)?", "star{lit{a}}" },
130   { "(?:(?:a)+)*", "star{lit{a}}" },
131   { "(?:(?:a)+)?", "star{lit{a}}" },
132   { "(?:(?:a)?)*", "star{lit{a}}" },
133   { "(?:(?:a)?)+", "star{lit{a}}" },
134 
135   // Test flattening.
136   { "(?:a)", "lit{a}" },
137   { "(?:ab)(?:cd)", "str{abcd}" },
138   { "(?:a|b)|(?:c|d)", "cc{0x61-0x64}" },
139   { "a|c", "cc{0x61 0x63}" },
140   { "a|[cd]", "cc{0x61 0x63-0x64}" },
141   { "a|.", "dot{}" },
142   { "[ab]|c", "cc{0x61-0x63}" },
143   { "[ab]|[cd]", "cc{0x61-0x64}" },
144   { "[ab]|.", "dot{}" },
145   { ".|c", "dot{}" },
146   { ".|[cd]", "dot{}" },
147   { ".|.", "dot{}" },
148 
149   // Test Perl quoted literals
150   { "\\Q+|*?{[\\E", "str{+|*?{[}" },
151   { "\\Q+\\E+", "plus{lit{+}}" },
152   { "\\Q\\\\E", "lit{\\}" },
153   { "\\Q\\\\\\E", "str{\\\\}" },
154   { "\\Qa\\E*", "star{lit{a}}" },
155   { "\\Qab\\E*", "cat{lit{a}star{lit{b}}}" },
156   { "\\Qabc\\E*", "cat{str{ab}star{lit{c}}}" },
157 
158   // Test Perl \A and \z
159   { "(?m)^", "bol{}" },
160   { "(?m)$", "eol{}" },
161   { "(?-m)^", "bot{}" },
162   { "(?-m)$", "eot{}" },
163   { "(?m)\\A", "bot{}" },
164   { "(?m)\\z", "eot{\\z}" },
165   { "(?-m)\\A", "bot{}" },
166   { "(?-m)\\z", "eot{\\z}" },
167 
168   // Test named captures
169   { "(?P<name>a)", "cap{name:lit{a}}" },
170   { "(?P<中文>a)", "cap{中文:lit{a}}" },
171   { "(?<name>a)", "cap{name:lit{a}}" },
172   { "(?<中文>a)", "cap{中文:lit{a}}" },
173 
174   // Case-folded literals
175   { "[Aa]", "litfold{a}" },
176 
177   // Strings
178   { "abcde", "str{abcde}" },
179   { "[Aa][Bb]cd", "cat{strfold{ab}str{cd}}" },
180 
181   // Reported bug involving \n leaking in despite use of NeverNL.
182   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
183   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
184   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
185   { "[^ ]", "cc{0-0x9 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
186   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", TestZeroFlags },
187   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
188   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
189   { "[^ \f]", "cc{0-0x9 0xb 0xd-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
190   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", TestZeroFlags },
191   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
192   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
193   { "[^ \r]", "cc{0-0x9 0xb-0xc 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
194   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", TestZeroFlags },
195   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
196   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
197   { "[^ \v]", "cc{0-0x9 0xc-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
198   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", TestZeroFlags },
199   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::FoldCase },
200   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
201   { "[^ \t]", "cc{0-0x8 0xb-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
202   { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
203   { "[^ \r\f\v]", "cc{0-0x9 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
204   { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
205   { "[^ \r\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
206   { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
207   { "[^ \r\n\f\t\v]", "cc{0-0x8 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
208   { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL },
209   { "[^ \r\n\f\t]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}", Regexp::NeverNL | Regexp::FoldCase },
210   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
211     Regexp::PerlClasses },
212   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
213     Regexp::PerlClasses | Regexp::FoldCase },
214   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
215     Regexp::PerlClasses | Regexp::NeverNL },
216   { "[^\t-\n\f-\r ]", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
217     Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
218   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
219     Regexp::PerlClasses },
220   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
221     Regexp::PerlClasses | Regexp::FoldCase },
222   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
223     Regexp::PerlClasses | Regexp::NeverNL },
224   { "\\S", "cc{0-0x8 0xb 0xe-0x1f 0x21-0x10ffff}",
225     Regexp::PerlClasses | Regexp::NeverNL | Regexp::FoldCase },
226 
227   // Bug in Regexp::ToString() that emitted [^], which
228   // would (obviously) fail to parse when fed back in.
229   { "[\\s\\S]", "cc{0-0x10ffff}" },
230 
231   // As per https://github.com/google/re2/issues/477,
232   // there were long-standing bugs involving Latin-1.
233   // Here, we exercise it WITHOUT case folding...
234   { "\xa5\x64\xd1", "str{\xa5""d\xd1}", Regexp::Latin1 },
235   { "\xa5\xd1\x64", "str{\xa5\xd1""d}", Regexp::Latin1 },
236   { "\xa5\x64[\xd1\xd2]", "cat{str{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 },
237   { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}lit{d}}", Regexp::Latin1 },
238   { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
239   { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1}}", Regexp::Latin1 },
240   { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
241   { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x64 0xd1-0xd2}}", Regexp::Latin1 },
242   // Here, we exercise it WITH case folding...
243   // 0x64 should fold to 0x44, but neither 0xD1 nor 0xD2
244   // should fold to 0xF1 and 0xF2, respectively.
245   { "\xa5\x64\xd1", "strfold{\xa5""d\xd1}", Regexp::Latin1 | Regexp::FoldCase },
246   { "\xa5\xd1\x64", "strfold{\xa5\xd1""d}", Regexp::Latin1 | Regexp::FoldCase },
247   { "\xa5\x64[\xd1\xd2]", "cat{strfold{\xa5""d}cc{0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
248   { "\xa5[\xd1\xd2]\x64", "cat{lit{\xa5}cc{0xd1-0xd2}litfold{d}}", Regexp::Latin1 | Regexp::FoldCase },
249   { "\xa5\x64|\xa5\xd1", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
250   { "\xa5\xd1|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1}}", Regexp::Latin1 | Regexp::FoldCase },
251   { "\xa5\x64|\xa5[\xd1\xd2]", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
252   { "\xa5[\xd1\xd2]|\xa5\x64", "cat{lit{\xa5}cc{0x44 0x64 0xd1-0xd2}}", Regexp::Latin1 | Regexp::FoldCase },
253 };
254 
RegexpEqualTestingOnly(Regexp * a,Regexp * b)255 bool RegexpEqualTestingOnly(Regexp* a, Regexp* b) {
256   return Regexp::Equal(a, b);
257 }
258 
TestParse(const Test * tests,int ntests,Regexp::ParseFlags flags,const std::string & title)259 void TestParse(const Test* tests, int ntests, Regexp::ParseFlags flags,
260                const std::string& title) {
261   Regexp** re = new Regexp*[ntests];
262   for (int i = 0; i < ntests; i++) {
263     RegexpStatus status;
264     Regexp::ParseFlags f = flags;
265     if (tests[i].flags != 0) {
266       f = tests[i].flags & ~TestZeroFlags;
267     }
268     re[i] = Regexp::Parse(tests[i].regexp, f, &status);
269     ASSERT_TRUE(re[i] != NULL)
270       << " " << tests[i].regexp << " " << status.Text();
271     std::string s = re[i]->Dump();
272     EXPECT_EQ(std::string(tests[i].parse), s)
273         << "Regexp: " << tests[i].regexp
274         << "\nparse: " << std::string(tests[i].parse)
275         << " s: " << s << " flag=" << f;
276   }
277 
278   for (int i = 0; i < ntests; i++) {
279     for (int j = 0; j < ntests; j++) {
280       EXPECT_EQ(std::string(tests[i].parse) == std::string(tests[j].parse),
281                 RegexpEqualTestingOnly(re[i], re[j]))
282         << "Regexp: " << tests[i].regexp << " " << tests[j].regexp;
283     }
284   }
285 
286   for (int i = 0; i < ntests; i++)
287     re[i]->Decref();
288   delete[] re;
289 }
290 
291 // Test that regexps parse to expected structures.
TEST(TestParse,SimpleRegexps)292 TEST(TestParse, SimpleRegexps) {
293   TestParse(tests, ABSL_ARRAYSIZE(tests), kTestFlags, "simple");
294 }
295 
296 Test foldcase_tests[] = {
297   { "AbCdE", "strfold{abcde}" },
298   { "[Aa]", "litfold{a}" },
299   { "a", "litfold{a}" },
300 
301   // 0x17F is an old English long s (looks like an f) and folds to s.
302   // 0x212A is the Kelvin symbol and folds to k.
303   { "A[F-g]", "cat{litfold{a}cc{0x41-0x7a 0x17f 0x212a}}" },  // [Aa][A-z...]
304   { "[[:upper:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
305   { "[[:lower:]]", "cc{0x41-0x5a 0x61-0x7a 0x17f 0x212a}" },
306 };
307 
308 // Test that parsing with FoldCase works.
TEST(TestParse,FoldCase)309 TEST(TestParse, FoldCase) {
310   TestParse(foldcase_tests, ABSL_ARRAYSIZE(foldcase_tests), Regexp::FoldCase, "foldcase");
311 }
312 
313 Test literal_tests[] = {
314   { "(|)^$.[*+?]{5,10},\\", "str{(|)^$.[*+?]{5,10},\\}" },
315 };
316 
317 // Test that parsing with Literal works.
TEST(TestParse,Literal)318 TEST(TestParse, Literal) {
319   TestParse(literal_tests, ABSL_ARRAYSIZE(literal_tests), Regexp::Literal, "literal");
320 }
321 
322 Test matchnl_tests[] = {
323   { ".", "dot{}" },
324   { "\n", "lit{\n}" },
325   { "[^a]", "cc{0-0x60 0x62-0x10ffff}" },
326   { "[a\\n]", "cc{0xa 0x61}" },
327 };
328 
329 // Test that parsing with MatchNL works.
330 // (Also tested above during simple cases.)
TEST(TestParse,MatchNL)331 TEST(TestParse, MatchNL) {
332   TestParse(matchnl_tests, ABSL_ARRAYSIZE(matchnl_tests), Regexp::MatchNL, "with MatchNL");
333 }
334 
335 Test nomatchnl_tests[] = {
336   { ".", "cc{0-0x9 0xb-0x10ffff}" },
337   { "\n", "lit{\n}" },
338   { "[^a]", "cc{0-0x9 0xb-0x60 0x62-0x10ffff}" },
339   { "[a\\n]", "cc{0xa 0x61}" },
340 };
341 
342 // Test that parsing without MatchNL works.
TEST(TestParse,NoMatchNL)343 TEST(TestParse, NoMatchNL) {
344   TestParse(nomatchnl_tests, ABSL_ARRAYSIZE(nomatchnl_tests), Regexp::NoParseFlags, "without MatchNL");
345 }
346 
347 Test prefix_tests[] = {
348   { "abc|abd", "cat{str{ab}cc{0x63-0x64}}" },
349   { "a(?:b)c|abd", "cat{str{ab}cc{0x63-0x64}}" },
350   { "abc|abd|aef|bcx|bcy",
351     "alt{cat{lit{a}alt{cat{lit{b}cc{0x63-0x64}}str{ef}}}"
352       "cat{str{bc}cc{0x78-0x79}}}" },
353   { "abc|x|abd", "alt{str{abc}lit{x}str{abd}}" },
354   { "(?i)abc|ABD", "cat{strfold{ab}cc{0x43-0x44 0x63-0x64}}" },
355   { "[ab]c|[ab]d", "cat{cc{0x61-0x62}cc{0x63-0x64}}" },
356   { ".c|.d", "cat{cc{0-0x9 0xb-0x10ffff}cc{0x63-0x64}}" },
357   { "\\Cc|\\Cd", "cat{byte{}cc{0x63-0x64}}" },
358   { "x{2}|x{2}[0-9]",
359     "cat{rep{2,2 lit{x}}alt{emp{}cc{0x30-0x39}}}" },
360   { "x{2}y|x{2}[0-9]y",
361     "cat{rep{2,2 lit{x}}alt{lit{y}cat{cc{0x30-0x39}lit{y}}}}" },
362   { "n|r|rs",
363     "alt{lit{n}cat{lit{r}alt{emp{}lit{s}}}}" },
364   { "n|rs|r",
365     "alt{lit{n}cat{lit{r}alt{lit{s}emp{}}}}" },
366   { "r|rs|n",
367     "alt{cat{lit{r}alt{emp{}lit{s}}}lit{n}}" },
368   { "rs|r|n",
369     "alt{cat{lit{r}alt{lit{s}emp{}}}lit{n}}" },
370   { "a\\C*?c|a\\C*?b",
371     "cat{lit{a}alt{cat{nstar{byte{}}lit{c}}cat{nstar{byte{}}lit{b}}}}" },
372   { "^/a/bc|^/a/de",
373     "cat{bol{}cat{str{/a/}alt{str{bc}str{de}}}}" },
374   // In the past, factoring was limited to kFactorAlternationMaxDepth (8).
375   { "a|aa|aaa|aaaa|aaaaa|aaaaaa|aaaaaaa|aaaaaaaa|aaaaaaaaa|aaaaaaaaaa",
376     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
377     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
378     "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}" "cat{lit{a}alt{emp{}"
379     "lit{a}}}}}}}}}}}}}}}}}}}" },
380   { "a|aardvark|aardvarks|abaci|aback|abacus|abacuses|abaft|abalone|abalones",
381     "cat{lit{a}alt{emp{}cat{str{ardvark}alt{emp{}lit{s}}}"
382     "cat{str{ba}alt{cat{lit{c}alt{cc{0x69 0x6b}cat{str{us}alt{emp{}str{es}}}}}"
383     "str{ft}cat{str{lone}alt{emp{}lit{s}}}}}}}" },
384   // As per https://github.com/google/re2/issues/467,
385   // these should factor identically, but they didn't
386   // because AddFoldedRange() terminated prematurely.
387   { "0A|0[aA]", "cat{lit{0}cc{0x41 0x61}}" },
388   { "0a|0[aA]", "cat{lit{0}cc{0x41 0x61}}" },
389   { "0[aA]|0A", "cat{lit{0}cc{0x41 0x61}}" },
390   { "0[aA]|0a", "cat{lit{0}cc{0x41 0x61}}" },
391 };
392 
393 // Test that prefix factoring works.
TEST(TestParse,Prefix)394 TEST(TestParse, Prefix) {
395   TestParse(prefix_tests, ABSL_ARRAYSIZE(prefix_tests), Regexp::PerlX, "prefix");
396 }
397 
398 Test nested_tests[] = {
399   { "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}))",
400     "cap{cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}}}}}}}}" },
401   { "((((((((((x{1}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
402     "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{1,1 lit{x}}}}}}}}}}}}}}}}}}}}}" },
403   { "((((((((((x{0}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
404     "cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 cap{rep{0,0 lit{x}}}}}}}}}}}}}}}}}}}}}" },
405   { "((((((x{2}){2}){2}){5}){5}){5})",
406     "cap{rep{5,5 cap{rep{5,5 cap{rep{5,5 cap{rep{2,2 cap{rep{2,2 cap{rep{2,2 lit{x}}}}}}}}}}}}}" },
407 };
408 
409 // Test that nested repetition works.
TEST(TestParse,Nested)410 TEST(TestParse, Nested) {
411   TestParse(nested_tests, ABSL_ARRAYSIZE(nested_tests), Regexp::PerlX, "nested");
412 }
413 
414 // Invalid regular expressions
415 const char* badtests[] = {
416   "(",
417   ")",
418   "(a",
419   "(a|b|",
420   "(a|b",
421   "[a-z",
422   "([a-z)",
423   "x{1001}",
424   "\xff",      // Invalid UTF-8
425   "[\xff]",
426   "[\\\xff]",
427   "\\\xff",
428   "(?P<name>a",
429   "(?P<name>",
430   "(?P<name",
431   "(?P<x y>a)",
432   "(?P<>a)",
433   "(?<name>a",
434   "(?<name>",
435   "(?<name",
436   "(?<x y>a)",
437   "(?<>a)",
438   "[a-Z]",
439   "(?i)[a-Z]",
440   "a{100000}",
441   "a{100000,}",
442   "((((((((((x{2}){2}){2}){2}){2}){2}){2}){2}){2}){2})",
443   "(((x{7}){11}){13})",
444   "\\Q\\E*",
445 };
446 
447 // Valid in Perl, bad in POSIX
448 const char* only_perl[] = {
449  "[a-b-c]",
450  "\\Qabc\\E",
451  "\\Q*+?{[\\E",
452  "\\Q\\\\E",
453  "\\Q\\\\\\E",
454  "\\Q\\\\\\\\E",
455  "\\Q\\\\\\\\\\E",
456  "(?:a)",
457  "(?P<name>a)",
458  "(?<name>a)",
459 };
460 
461 // Valid in POSIX, bad in Perl.
462 const char* only_posix[] = {
463   "a++",
464   "a**",
465   "a?*",
466   "a+*",
467   "a{1}*",
468 };
469 
470 // Test that parser rejects bad regexps.
TEST(TestParse,InvalidRegexps)471 TEST(TestParse, InvalidRegexps) {
472   for (size_t i = 0; i < ABSL_ARRAYSIZE(badtests); i++) {
473     ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::PerlX, NULL) == NULL)
474       << " " << badtests[i];
475     ASSERT_TRUE(Regexp::Parse(badtests[i], Regexp::NoParseFlags, NULL) == NULL)
476       << " " << badtests[i];
477   }
478   for (size_t i = 0; i < ABSL_ARRAYSIZE(only_posix); i++) {
479     ASSERT_TRUE(Regexp::Parse(only_posix[i], Regexp::PerlX, NULL) == NULL)
480       << " " << only_posix[i];
481     Regexp* re = Regexp::Parse(only_posix[i], Regexp::NoParseFlags, NULL);
482     ASSERT_TRUE(re != NULL) << " " << only_posix[i];
483     re->Decref();
484   }
485   for (size_t i = 0; i < ABSL_ARRAYSIZE(only_perl); i++) {
486     ASSERT_TRUE(Regexp::Parse(only_perl[i], Regexp::NoParseFlags, NULL) == NULL)
487       << " " << only_perl[i];
488     Regexp* re = Regexp::Parse(only_perl[i], Regexp::PerlX, NULL);
489     ASSERT_TRUE(re != NULL) << " " << only_perl[i];
490     re->Decref();
491   }
492 }
493 
494 // Test that ToString produces original regexp or equivalent one.
TEST(TestToString,EquivalentParse)495 TEST(TestToString, EquivalentParse) {
496   for (size_t i = 0; i < ABSL_ARRAYSIZE(tests); i++) {
497     RegexpStatus status;
498     Regexp::ParseFlags f = kTestFlags;
499     if (tests[i].flags != 0) {
500       f = tests[i].flags & ~TestZeroFlags;
501     }
502     Regexp* re = Regexp::Parse(tests[i].regexp, f, &status);
503     ASSERT_TRUE(re != NULL) << " " << tests[i].regexp << " " << status.Text();
504     std::string s = re->Dump();
505     EXPECT_EQ(std::string(tests[i].parse), s)
506         << "Regexp: " << tests[i].regexp
507         << "\nparse: " << std::string(tests[i].parse)
508         << " s: " << s << " flag=" << f;
509     std::string t = re->ToString();
510     if (t != tests[i].regexp) {
511       // If ToString didn't return the original regexp,
512       // it must have found one with fewer parens.
513       // Unfortunately we can't check the length here, because
514       // ToString produces "\\{" for a literal brace,
515       // but "{" is a shorter equivalent.
516       // ASSERT_LT(t.size(), strlen(tests[i].regexp))
517       //     << " t=" << t << " regexp=" << tests[i].regexp;
518 
519       // Test that if we parse the new regexp we get the same structure.
520       Regexp* nre = Regexp::Parse(t, f, &status);
521       ASSERT_TRUE(nre != NULL) << " reparse " << t << " " << status.Text();
522       std::string ss = nre->Dump();
523       std::string tt = nre->ToString();
524       if (s != ss || t != tt)
525         ABSL_LOG(INFO) << "ToString(" << tests[i].regexp << ") = " << t;
526       EXPECT_EQ(s, ss);
527       EXPECT_EQ(t, tt);
528       nre->Decref();
529     }
530     re->Decref();
531   }
532 }
533 
534 // Test that capture error args are correct.
TEST(NamedCaptures,ErrorArgs)535 TEST(NamedCaptures, ErrorArgs) {
536   RegexpStatus status;
537   Regexp* re;
538 
539   re = Regexp::Parse("test(?P<name", Regexp::LikePerl, &status);
540   EXPECT_TRUE(re == NULL);
541   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
542   EXPECT_EQ(status.error_arg(), "(?P<name");
543 
544   re = Regexp::Parse("test(?P<space bar>z)", Regexp::LikePerl, &status);
545   EXPECT_TRUE(re == NULL);
546   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
547   EXPECT_EQ(status.error_arg(), "(?P<space bar>");
548 
549   re = Regexp::Parse("test(?<name", Regexp::LikePerl, &status);
550   EXPECT_TRUE(re == NULL);
551   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
552   EXPECT_EQ(status.error_arg(), "(?<name");
553 
554   re = Regexp::Parse("test(?<space bar>z)", Regexp::LikePerl, &status);
555   EXPECT_TRUE(re == NULL);
556   EXPECT_EQ(status.code(), kRegexpBadNamedCapture);
557   EXPECT_EQ(status.error_arg(), "(?<space bar>");
558 }
559 
560 // Test that look-around error args are correct.
TEST(LookAround,ErrorArgs)561 TEST(LookAround, ErrorArgs) {
562   RegexpStatus status;
563   Regexp* re;
564 
565   re = Regexp::Parse("(?=foo).*", Regexp::LikePerl, &status);
566   EXPECT_TRUE(re == NULL);
567   EXPECT_EQ(status.code(), kRegexpBadPerlOp);
568   EXPECT_EQ(status.error_arg(), "(?=");
569 
570   re = Regexp::Parse("(?!foo).*", Regexp::LikePerl, &status);
571   EXPECT_TRUE(re == NULL);
572   EXPECT_EQ(status.code(), kRegexpBadPerlOp);
573   EXPECT_EQ(status.error_arg(), "(?!");
574 
575   re = Regexp::Parse("(?<=foo).*", Regexp::LikePerl, &status);
576   EXPECT_TRUE(re == NULL);
577   EXPECT_EQ(status.code(), kRegexpBadPerlOp);
578   EXPECT_EQ(status.error_arg(), "(?<=");
579 
580   re = Regexp::Parse("(?<!foo).*", Regexp::LikePerl, &status);
581   EXPECT_TRUE(re == NULL);
582   EXPECT_EQ(status.code(), kRegexpBadPerlOp);
583   EXPECT_EQ(status.error_arg(), "(?<!");
584 }
585 
586 }  // namespace re2
587