1 // -*- coding: utf-8 -*-
2 // Copyright 2002-2009 The RE2 Authors. All Rights Reserved.
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file.
5
6 // TODO: Test extractions for PartialMatch/Consume
7
8 #include <errno.h>
9 #include <stddef.h>
10 #include <stdint.h>
11 #include <string.h>
12 #include <map>
13 #include <string>
14 #include <utility>
15 #include <vector>
16 #if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
17 #include <sys/mman.h>
18 #include <unistd.h> /* for sysconf */
19 #endif
20
21 #include "util/test.h"
22 #include "util/logging.h"
23 #include "util/strutil.h"
24 #include "re2/re2.h"
25 #include "re2/regexp.h"
26
27 namespace re2 {
28
TEST(RE2, HexTests) {
// ASSERT_HEX(type, value): `value` is a bare run of hex digits with an optional
// integer suffix. Its spelling is parsed into a `type` twice -- once through
// RE2::Hex and once, with a "0x" prefix, through RE2::CRadix -- and each
// result is compared with the literal 0x<value>.
#define ASSERT_HEX(type, value) \
  do { \
    type parsed; \
    ASSERT_TRUE(RE2::FullMatch(#value, "([0-9a-fA-F]+)[uUlL]*", \
                               RE2::Hex(&parsed))); \
    ASSERT_EQ(parsed, 0x##value); \
    ASSERT_TRUE(RE2::FullMatch("0x" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
                               RE2::CRadix(&parsed))); \
    ASSERT_EQ(parsed, 0x##value); \
  } while (0)

  // One case per built-in integer type, signed and unsigned.
  ASSERT_HEX(short, 2bad);
  ASSERT_HEX(unsigned short, 2badU);
  ASSERT_HEX(int, dead);
  ASSERT_HEX(unsigned int, deadU);
  ASSERT_HEX(long, 7eadbeefL);
  ASSERT_HEX(unsigned long, deadbeefUL);
  ASSERT_HEX(long long, 12345678deadbeefLL);
  ASSERT_HEX(unsigned long long, cafebabedeadbeefULL);

#undef ASSERT_HEX
}
52
TEST(RE2, OctalTests) {
// ASSERT_OCTAL(type, value): `value` is a bare run of octal digits with an
// optional integer suffix. Its spelling is parsed into a `type` twice -- once
// through RE2::Octal and once, with a leading "0", through RE2::CRadix -- and
// each result is compared with the literal 0<value>.
#define ASSERT_OCTAL(type, value) \
  do { \
    type parsed; \
    ASSERT_TRUE( \
        RE2::FullMatch(#value, "([0-7]+)[uUlL]*", RE2::Octal(&parsed))); \
    ASSERT_EQ(parsed, 0##value); \
    ASSERT_TRUE(RE2::FullMatch("0" #value, "([0-9a-fA-FxX]+)[uUlL]*", \
                               RE2::CRadix(&parsed))); \
    ASSERT_EQ(parsed, 0##value); \
  } while (0)

  // One case per built-in integer type, signed and unsigned.
  ASSERT_OCTAL(short, 77777);
  ASSERT_OCTAL(unsigned short, 177777U);
  ASSERT_OCTAL(int, 17777777777);
  ASSERT_OCTAL(unsigned int, 37777777777U);
  ASSERT_OCTAL(long, 17777777777L);
  ASSERT_OCTAL(unsigned long, 37777777777UL);
  ASSERT_OCTAL(long long, 777777777777777777777LL);
  ASSERT_OCTAL(unsigned long long, 1777777777777777777777ULL);

#undef ASSERT_OCTAL
}
75
TEST(RE2, DecimalTests) {
// ASSERT_DECIMAL(type, value): `value` is a decimal literal with an optional
// sign and integer suffix. Its spelling is parsed into a `type` twice -- once
// through the plain pointer argument and once through RE2::CRadix -- and each
// result is compared with the literal itself.
#define ASSERT_DECIMAL(type, value) \
  do { \
    type parsed; \
    ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9]+)[uUlL]*", &parsed)); \
    ASSERT_EQ(parsed, value); \
    ASSERT_TRUE(RE2::FullMatch(#value, "(-?[0-9a-fA-FxX]+)[uUlL]*", \
                               RE2::CRadix(&parsed))); \
    ASSERT_EQ(parsed, value); \
  } while (0)

  // One case per built-in integer type, signed and unsigned.
  ASSERT_DECIMAL(short, -1);
  ASSERT_DECIMAL(unsigned short, 9999);
  ASSERT_DECIMAL(int, -1000);
  ASSERT_DECIMAL(unsigned int, 12345U);
  ASSERT_DECIMAL(long, -10000000L);
  ASSERT_DECIMAL(unsigned long, 3083324652U);
  ASSERT_DECIMAL(long long, -100000000000000LL);
  ASSERT_DECIMAL(unsigned long long, 1234567890987654321ULL);

#undef ASSERT_DECIMAL
}
98
TEST(RE2, Replace) {
  // One row per scenario: the same pattern and rewrite are applied to `input`
  // through RE2::Replace and through RE2::GlobalReplace.
  struct ReplaceCase {
    const char* pattern;
    const char* rewrite;
    const char* input;
    const char* after_replace;         // expected text after RE2::Replace
    const char* after_global_replace;  // expected text after RE2::GlobalReplace
    int global_result;                 // expected RE2::GlobalReplace return value
  };
  static const ReplaceCase kCases[] = {
    { "(qu|[b-df-hj-np-tv-z]*)([a-z]+)",
      "\\2\\1ay",
      "the quick brown fox jumps over the lazy dogs.",
      "ethay quick brown fox jumps over the lazy dogs.",
      "ethay ickquay ownbray oxfay umpsjay overay ethay azylay ogsday.",
      9 },
    { "\\w+",
      "\\0-NOSPAM",
      "abcd.efghi@google.com",
      "abcd-NOSPAM.efghi@google.com",
      "abcd-NOSPAM.efghi-NOSPAM@google-NOSPAM.com-NOSPAM",
      4 },
    { "^",
      "(START)",
      "foo",
      "(START)foo",
      "(START)foo",
      1 },
    { "^",
      "(START)",
      "",
      "(START)",
      "(START)",
      1 },
    { "$",
      "(END)",
      "",
      "(END)",
      "(END)",
      1 },
    { "b",
      "bb",
      "ababababab",
      "abbabababab",
      "abbabbabbabbabb",
      5 },
    { "b",
      "bb",
      "bbbbbb",
      "bbbbbbb",
      "bbbbbbbbbbbb",
      6 },
    { "b+",
      "bb",
      "bbbbbb",
      "bb",
      "bb",
      1 },
    { "b*",
      "bb",
      "bbbbbb",
      "bb",
      "bb",
      1 },
    { "b*",
      "bb",
      "aaaaa",
      "bbaaaaa",
      "bbabbabbabbabbabb",
      6 },
    // Check newline handling
    { "a.*a",
      "(\\0)",
      "aba\naba",
      "(aba)\naba",
      "(aba)\n(aba)",
      2 },
    // Sentinel row: a NULL input terminates the loop below.
    { "", NULL, NULL, NULL, NULL, 0 }
  };

  for (size_t i = 0; kCases[i].input != NULL; ++i) {
    const ReplaceCase& c = kCases[i];

    // Single replacement.
    std::string replaced_once(c.input);
    ASSERT_TRUE(RE2::Replace(&replaced_once, c.pattern, c.rewrite));
    ASSERT_EQ(replaced_once, c.after_replace);

    // Global replacement, including its return value.
    std::string replaced_all(c.input);
    ASSERT_EQ(RE2::GlobalReplace(&replaced_all, c.pattern, c.rewrite),
              c.global_result)
        << "Got: " << replaced_all;
    ASSERT_EQ(replaced_all, c.after_global_replace);
  }
}
189
TestCheckRewriteString(const char * regexp,const char * rewrite,bool expect_ok)190 static void TestCheckRewriteString(const char* regexp, const char* rewrite,
191 bool expect_ok) {
192 std::string error;
193 RE2 exp(regexp);
194 bool actual_ok = exp.CheckRewriteString(rewrite, &error);
195 EXPECT_EQ(expect_ok, actual_ok) << " for " << rewrite << " error: " << error;
196 }
197
TEST(CheckRewriteString, all) {
  // Each row is forwarded to TestCheckRewriteString in table order.
  struct RewriteCase {
    const char* regexp;
    const char* rewrite;
    bool expect_ok;
  };
  static const RewriteCase kCases[] = {
    // Pattern without capturing groups.
    { "abc", "foo", true },
    { "abc", "foo\\", false },
    { "abc", "foo\\0bar", true },

    // Pattern with one capturing group.
    { "a(b)c", "foo", true },
    { "a(b)c", "foo\\0bar", true },
    { "a(b)c", "foo\\1bar", true },
    { "a(b)c", "foo\\2bar", false },
    { "a(b)c", "f\\\\2o\\1o", true },

    // Pattern with two capturing groups.
    { "a(b)(c)", "foo\\12", true },
    { "a(b)(c)", "f\\2o\\1o", true },
    { "a(b)(c)", "f\\oo\\1", false },
  };

  for (const RewriteCase& c : kCases)
    TestCheckRewriteString(c.regexp, c.rewrite, c.expect_ok);
}
213
TEST(RE2, Extract) {
  std::string out;

  // Rewrite built from two capture groups.
  ASSERT_TRUE(
      RE2::Extract("boris@kremvax.ru", "(.*)@([^.]*)", "\\2!\\1", &out));
  ASSERT_EQ(out, "kremvax!boris");

  // Rewrite built from the whole match.
  ASSERT_TRUE(RE2::Extract("foo", ".*", "'\\0'", &out));
  ASSERT_EQ(out, "'foo'");

  // A failed extraction must leave the previous output untouched.
  ASSERT_FALSE(RE2::Extract("baz", "bar", "'\\0'", &out));
  ASSERT_EQ(out, "'foo'");
}
226
TEST(RE2, MaxSubmatchTooLarge) {
  // The rewrite refers to \2 although the pattern has a single group, so
  // every rewriting entry point is expected to report failure.
  static const char kPattern[] = "f(o+)";
  static const char kRewrite[] = "\\1\\2";

  std::string text;
  ASSERT_FALSE(RE2::Extract("foo", kPattern, kRewrite, &text));

  text = "foo";
  ASSERT_FALSE(RE2::Replace(&text, kPattern, kRewrite));

  text = "foo";
  ASSERT_FALSE(RE2::GlobalReplace(&text, kPattern, kRewrite));
}
235
TEST(RE2, Consume) {
  // Matches a word, possibly preceded by whitespace.
  RE2 word_re("\\s*(\\w+)");

  std::string text(" aaa b!@#$@#$cccc");
  StringPiece remaining(text);
  std::string captured;

  // The first two words are consumed in order.
  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "aaa") << " input: " << remaining;
  ASSERT_TRUE(RE2::Consume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "b") << " input: " << remaining;

  // The third attempt is expected to fail.
  ASSERT_FALSE(RE2::Consume(&remaining, word_re, &captured))
      << " input: " << remaining;
}
249
TEST(RE2, ConsumeN) {
  static const char kWord[] = "\\s*(\\w+)";

  const std::string text(" one two three 4");
  StringPiece remaining(text);

  // Argument slots, filled in as the test progresses.
  RE2::Arg slots[2];
  const RE2::Arg* const args[2] = { &slots[0], &slots[1] };

  // Zero arguments.
  EXPECT_TRUE(RE2::ConsumeN(&remaining, kWord, args, 0));  // Skips "one".

  // One argument.
  std::string captured_word;
  slots[0] = &captured_word;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, kWord, args, 1));
  EXPECT_EQ("two", captured_word);

  // Two arguments of different types.
  int captured_number;
  slots[1] = &captured_number;
  EXPECT_TRUE(RE2::ConsumeN(&remaining, "\\s*(\\w+)\\s*(\\d+)", args, 2));
  EXPECT_EQ("three", captured_word);
  EXPECT_EQ(4, captured_number);
}
273
TEST(RE2, FindAndConsume) {
  RE2 word_re("(\\w+)");  // matches a word

  std::string text(" aaa b!@#$@#$cccc");
  StringPiece remaining(text);
  std::string captured;

  // Each call finds the next word, wherever it starts.
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "aaa");
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "b");
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, word_re, &captured));
  ASSERT_EQ(captured, "cccc");
  ASSERT_FALSE(RE2::FindAndConsume(&remaining, word_re, &captured));

  // FindAndConsume must also work with no submatches at all; an earlier
  // version used uninitialized data for the length to consume.
  remaining = "aaa";
  ASSERT_TRUE(RE2::FindAndConsume(&remaining, "aaa"));
  ASSERT_EQ(remaining, "");
}
296
TEST(RE2, FindAndConsumeN) {
  static const char kWord[] = "(\\w+)";

  const std::string text(" one two three 4");
  StringPiece remaining(text);

  // Argument slots, filled in as the test progresses.
  RE2::Arg slots[2];
  const RE2::Arg* const args[2] = { &slots[0], &slots[1] };

  // Zero arguments.
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, kWord, args, 0));  // Skips "one".

  // One argument.
  std::string captured_word;
  slots[0] = &captured_word;
  EXPECT_TRUE(RE2::FindAndConsumeN(&remaining, kWord, args, 1));
  EXPECT_EQ("two", captured_word);

  // Two arguments of different types.
  int captured_number;
  slots[1] = &captured_number;
  EXPECT_TRUE(
      RE2::FindAndConsumeN(&remaining, "(\\w+)\\s*(\\d+)", args, 2));
  EXPECT_EQ("three", captured_word);
  EXPECT_EQ(4, captured_number);
}
320
TEST(RE2, MatchNumberPeculiarity) {
  // Three alternatives, each with its own capture group. Only the group of
  // the alternative that matched is expected to be non-empty.
  RE2 alternatives("(foo)|(bar)|(baz)");
  std::string first;
  std::string second;
  std::string third;

  ASSERT_TRUE(RE2::PartialMatch("foo", alternatives, &first, &second, &third));
  ASSERT_EQ(first, "foo");
  ASSERT_EQ(second, "");
  ASSERT_EQ(third, "");

  ASSERT_TRUE(RE2::PartialMatch("bar", alternatives, &first, &second, &third));
  ASSERT_EQ(first, "");
  ASSERT_EQ(second, "bar");
  ASSERT_EQ(third, "");

  ASSERT_TRUE(RE2::PartialMatch("baz", alternatives, &first, &second, &third));
  ASSERT_EQ(first, "");
  ASSERT_EQ(second, "");
  ASSERT_EQ(third, "baz");

  ASSERT_FALSE(RE2::PartialMatch("f", alternatives, &first, &second, &third));

  // A group in an alternative that did not take part yields an empty string.
  std::string unused_group;
  ASSERT_TRUE(RE2::FullMatch("hello", "(foo)|hello", &unused_group));
  ASSERT_EQ(unused_group, "");
}
345
TEST(RE2, Match) {
  RE2 host_port("((\\w+):([0-9]+))");  // extracts host and port
  StringPiece submatch[4];

  // Text without a host:port pair does not match.
  StringPiece text = "zyzzyva";
  ASSERT_FALSE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                               submatch, arraysize(submatch)));

  // Text with a pair matches; submatch[0] is the overall match and the
  // remaining entries are the capture groups.
  text = "a chrisr:9000 here";
  ASSERT_TRUE(host_port.Match(text, 0, text.size(), RE2::UNANCHORED,
                              submatch, arraysize(submatch)));
  ASSERT_EQ(submatch[0], "chrisr:9000");
  ASSERT_EQ(submatch[1], "chrisr:9000");
  ASSERT_EQ(submatch[2], "chrisr");
  ASSERT_EQ(submatch[3], "9000");

  // The same extraction through PartialMatch with typed outputs.
  std::string whole, hostname;
  int port_number;
  ASSERT_TRUE(RE2::PartialMatch("a chrisr:9000 here", host_port, &whole,
                                &hostname, &port_number));
  ASSERT_EQ(whole, "chrisr:9000");
  ASSERT_EQ(hostname, "chrisr");
  ASSERT_EQ(port_number, 9000);
}
371
TestRecursion(int size,const char * pattern)372 static void TestRecursion(int size, const char* pattern) {
373 // Fill up a string repeating the pattern given
374 std::string domain;
375 domain.resize(size);
376 size_t patlen = strlen(pattern);
377 for (int i = 0; i < size; i++) {
378 domain[i] = pattern[i % patlen];
379 }
380 // Just make sure it doesn't crash due to too much recursion.
381 RE2 re("([a-zA-Z0-9]|-)+(\\.([a-zA-Z0-9]|-)+)*(\\.)?", RE2::Quiet);
382 RE2::FullMatch(domain, re);
383 }
384
385 // A meta-quoted string, interpreted as a pattern, should always match
386 // the original unquoted string.
TestQuoteMeta(const std::string & unquoted,const RE2::Options & options=RE2::DefaultOptions)387 static void TestQuoteMeta(const std::string& unquoted,
388 const RE2::Options& options = RE2::DefaultOptions) {
389 std::string quoted = RE2::QuoteMeta(unquoted);
390 RE2 re(quoted, options);
391 EXPECT_TRUE(RE2::FullMatch(unquoted, re))
392 << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
393 }
394
// A meta-quoted string, interpreted as a pattern, should not match
// a given string that differs from the original unquoted string.
NegativeTestQuoteMeta(const std::string & unquoted,const std::string & should_not_match,const RE2::Options & options=RE2::DefaultOptions)397 static void NegativeTestQuoteMeta(
398 const std::string& unquoted, const std::string& should_not_match,
399 const RE2::Options& options = RE2::DefaultOptions) {
400 std::string quoted = RE2::QuoteMeta(unquoted);
401 RE2 re(quoted, options);
402 EXPECT_FALSE(RE2::FullMatch(should_not_match, re))
403 << "Unquoted='" << unquoted << "', quoted='" << quoted << "'.";
404 }
405
406 // Tests that quoted meta characters match their original strings,
407 // and that a few things that shouldn't match indeed do not.
TEST(QuoteMeta, Simple) {
  // Every entry is escaped and matched against itself, in table order.
  static const char* const kInputs[] = {
    "foo",
    "foo.bar",
    "foo\\.bar",
    "[1-9]",
    "1.5-2.0?",
    "\\d",
    "Who doesn't like ice cream?",
    "((a|b)c?d*e+[f-h]i)",
    "((?!)xxx).*yyy",
    "([",
  };

  for (const char* input : kInputs)
    TestQuoteMeta(input);
}
TEST(QuoteMeta, SimpleNegative) {
  // Each row pairs a text to escape with a different text that the escaped
  // pattern must reject.
  struct NegativeCase {
    const char* unquoted;
    const char* should_not_match;
  };
  static const NegativeCase kCases[] = {
    { "foo", "bar" },
    { "...", "bar" },
    { "\\.", "." },
    { "\\.", ".." },
    { "(a)", "a" },
    { "(a|b)", "a" },
    { "(a|b)", "(a)" },
    { "(a|b)", "a|b" },
    { "[0-9]", "0" },
    { "[0-9]", "0-9" },
    { "[0-9]", "[9]" },
    { "((?!)xxx)", "xxx" },
  };

  for (const NegativeCase& c : kCases)
    NegativeTestQuoteMeta(c.unquoted, c.should_not_match);
}
434
TEST(QuoteMeta, Latin1) {
  // Text containing the single byte 0xb2, checked with the Latin-1 options.
  const std::string latin1_text("3\xb2 = 9");
  TestQuoteMeta(latin1_text, RE2::Latin1);
}
438
TEST(QuoteMeta, UTF8) {
  // Every entry is escaped and matched against itself, in table order.
  static const char* const kInputs[] = {
    "Plácido Domingo",
    "xyz",                  // No fancy utf8.
    "\xc2\xb0",             // 2-byte utf8 -- a degree symbol.
    "27\xc2\xb0 degrees",   // As a middle character.
    "\xe2\x80\xb3",         // 3-byte utf8 -- a double prime.
    "\xf0\x9d\x85\x9f",     // 4-byte utf8 -- a music note.
    "27\xc2\xb0",           // Interpreted as Latin-1, this should
                            // still work.
  };
  for (const char* input : kInputs)
    TestQuoteMeta(input);

  // The 2-byte degree symbol must not match a variant in which each of its
  // bytes is preceded by a backslash.
  NegativeTestQuoteMeta("27\xc2\xb0",
                        "27\\\xc2\\\xb0");
}
451
TEST(QuoteMeta, HasNull) {
  // Start with a string that holds exactly one NUL character.
  std::string has_null(1, '\0');
  TestQuoteMeta(has_null);
  NegativeTestQuoteMeta(has_null, "");

  // Append '1': NUL followed by '1' must not be interpreted as '\01'.
  has_null.push_back('1');
  TestQuoteMeta(has_null);
  NegativeTestQuoteMeta(has_null, "\1");
}
465
TEST(ProgramSize, BigProgram) {
  // Three patterns whose reported program sizes are expected to be strictly
  // increasing, for both the forward and the reverse program.
  RE2 simple("simple regexp");
  RE2 medium("medium.*regexp");
  RE2 complex("complex.{1,128}regexp");

  // Forward programs.
  ASSERT_GT(simple.ProgramSize(), 0);
  ASSERT_GT(medium.ProgramSize(), simple.ProgramSize());
  ASSERT_GT(complex.ProgramSize(), medium.ProgramSize());

  // Reverse programs.
  ASSERT_GT(simple.ReverseProgramSize(), 0);
  ASSERT_GT(medium.ReverseProgramSize(), simple.ReverseProgramSize());
  ASSERT_GT(complex.ReverseProgramSize(), medium.ReverseProgramSize());
}
479
TEST(ProgramFanout, BigProgram) {
  // The same nested shape with repetition counts of 1, 10, 100 and 1000.
  RE2 rep1("(?:(?:(?:(?:(?:.)?){1})*)+)");
  RE2 rep10("(?:(?:(?:(?:(?:.)?){10})*)+)");
  RE2 rep100("(?:(?:(?:(?:(?:.)?){100})*)+)");
  RE2 rep1000("(?:(?:(?:(?:(?:.)?){1000})*)+)");

  // Reused for every call below; each assertion pair checks the returned
  // index of the largest non-empty bucket and that bucket's element count.
  std::vector<int> buckets;

  // Forward programs.

  // 3 is the largest non-empty bucket and has 1 element.
  ASSERT_EQ(3, rep1.ProgramFanout(&buckets));
  ASSERT_EQ(1, buckets[3]);

  // 6 is the largest non-empty bucket and has 10 elements.
  ASSERT_EQ(6, rep10.ProgramFanout(&buckets));
  ASSERT_EQ(10, buckets[6]);

  // 9 is the largest non-empty bucket and has 100 elements.
  ASSERT_EQ(9, rep100.ProgramFanout(&buckets));
  ASSERT_EQ(100, buckets[9]);

  // 13 is the largest non-empty bucket and has 1000 elements.
  ASSERT_EQ(13, rep1000.ProgramFanout(&buckets));
  ASSERT_EQ(1000, buckets[13]);

  // Reverse programs.

  // 2 is the largest non-empty bucket and has 1 element.
  ASSERT_EQ(2, rep1.ReverseProgramFanout(&buckets));
  ASSERT_EQ(1, buckets[2]);

  // 5 is the largest non-empty bucket and has 10 elements.
  ASSERT_EQ(5, rep10.ReverseProgramFanout(&buckets));
  ASSERT_EQ(10, buckets[5]);

  // 9 is the largest non-empty bucket and has 100 elements.
  ASSERT_EQ(9, rep100.ReverseProgramFanout(&buckets));
  ASSERT_EQ(100, buckets[9]);

  // 12 is the largest non-empty bucket and has 1000 elements.
  ASSERT_EQ(12, rep1000.ReverseProgramFanout(&buckets));
  ASSERT_EQ(1000, buckets[12]);
}
520
521 // Issue 956519: handling empty character sets was
522 // causing NULL dereference. This tests a few empty character sets.
523 // (The way to get an empty character set is to negate a full one.)
TEST(EmptyCharset, Fuzz) {
  // Character classes that negate a full set and are therefore empty.
  static const char* const kEmptyClasses[] = {
    "[^\\S\\s]",
    "[^\\S[:space:]]",
    "[^\\D\\d]",
    "[^\\D[:digit:]]"
  };

  // None of them may match anywhere in "abc".
  for (const char* pattern : kEmptyClasses) {
    ASSERT_FALSE(RE2(pattern).Match("abc", 0, 3, RE2::UNANCHORED, NULL, 0));
  }
}
534
535 // Bitstate assumes that kInstFail instructions in
536 // alternations or capture groups have been "compiled away".
TEST(EmptyCharset, BitstateAssumptions) {
  // Captures trigger use of Bitstate. Each pattern is five nested empty
  // groups followed by an optional or alternated empty character class.
  static const char* const kNopEmpties[] = {
    "((((()))))" "[^\\S\\s]?",
    "((((()))))" "([^\\S\\s])?",
    "((((()))))" "([^\\S\\s]|[^\\S\\s])?",
    "((((()))))" "(([^\\S\\s]|[^\\S\\s])|)"
  };

  // Every pattern must match the empty string while six submatches are
  // requested.
  StringPiece submatch[6];
  for (const char* pattern : kNopEmpties) {
    ASSERT_TRUE(RE2(pattern).Match("", 0, 0, RE2::UNANCHORED, submatch, 6));
  }
}
549
550 // Test that named groups work correctly.
TEST(Capture, NamedGroups) {
  // A pattern with only an anonymous group reports no named groups.
  {
    RE2 re("(hello world)");
    ASSERT_EQ(re.NumberOfCapturingGroups(), 1);
    const std::map<std::string, int>& m = re.NamedCapturingGroups();
    ASSERT_EQ(m.size(), 0u);
  }

  // A pattern mixing named and anonymous groups: each name maps to the
  // index of its group.
  {
    RE2 re("(?P<A>expr(?P<B>expr)(?P<C>expr))((expr)(?P<D>expr))");
    ASSERT_EQ(re.NumberOfCapturingGroups(), 6);
    const std::map<std::string, int>& m = re.NamedCapturingGroups();
    ASSERT_EQ(m.size(), 4u);

    struct ExpectedGroup {
      const char* name;
      int index;
    };
    static const ExpectedGroup kExpected[] = {
      { "A", 1 },
      { "B", 2 },
      { "C", 3 },
      { "D", 6 },  // $4 and $5 are anonymous
    };
    for (const ExpectedGroup& expected : kExpected) {
      const std::map<std::string, int>::const_iterator it =
          m.find(expected.name);
      // Check the lookup before dereferencing: a missing name would
      // otherwise dereference end().
      ASSERT_TRUE(it != m.end()) << "missing named group " << expected.name;
      ASSERT_EQ(it->second, expected.index);
    }
  }
}
570
TEST(RE2, CapturedGroupTest) {
  RE2 re("directions from (?P<S>.*) to (?P<D>.*)");
  int num_groups = re.NumberOfCapturingGroups();
  // Fatal on mismatch: num_groups is passed to FullMatchN below as the number
  // of entries to use from the 4-element `matches` array.
  ASSERT_EQ(2, num_groups);
  std::string args[4];
  RE2::Arg arg0(&args[0]);
  RE2::Arg arg1(&args[1]);
  RE2::Arg arg2(&args[2]);
  RE2::Arg arg3(&args[3]);

  const RE2::Arg* const matches[4] = {&arg0, &arg1, &arg2, &arg3};
  EXPECT_TRUE(RE2::FullMatchN("directions from mountain view to san jose",
                              re, matches, num_groups));
  const std::map<std::string, int>& named_groups = re.NamedCapturingGroups();

  // Fatal on failure: the iterators are dereferenced right after, and
  // dereferencing end() is undefined behavior.
  const std::map<std::string, int>::const_iterator source_it =
      named_groups.find("S");
  const std::map<std::string, int>::const_iterator destination_it =
      named_groups.find("D");
  ASSERT_TRUE(source_it != named_groups.end());
  ASSERT_TRUE(destination_it != named_groups.end());

  // The named group index is 1-based.
  int source_group_index = source_it->second;
  int destination_group_index = destination_it->second;
  // Fatal on mismatch: the indices are used to subscript `args` below.
  ASSERT_EQ(1, source_group_index);
  ASSERT_EQ(2, destination_group_index);

  // The args is zero-based.
  EXPECT_EQ("mountain view", args[source_group_index - 1]);
  EXPECT_EQ("san jose", args[destination_group_index - 1]);
}
598
TEST(RE2, FullMatchWithNoArgs) {
  static const char kHThroughO[] = "h.*o";

  // Whole-text matches succeed.
  ASSERT_TRUE(RE2::FullMatch("h", "h"));
  ASSERT_TRUE(RE2::FullMatch("hello", "hello"));
  ASSERT_TRUE(RE2::FullMatch("hello", kHThroughO));

  // Extra text on either side makes the full match fail.
  ASSERT_FALSE(RE2::FullMatch("othello", kHThroughO));  // Must be anchored at front
  ASSERT_FALSE(RE2::FullMatch("hello!", kHThroughO));   // Must be anchored at end
}
606
TEST(RE2, PartialMatch) {
  static const char kHThroughO[] = "h.*o";

  ASSERT_TRUE(RE2::PartialMatch("x", "x"));

  // Unlike a full match, extra text before or after is accepted.
  ASSERT_TRUE(RE2::PartialMatch("hello", kHThroughO));
  ASSERT_TRUE(RE2::PartialMatch("othello", kHThroughO));
  ASSERT_TRUE(RE2::PartialMatch("hello!", kHThroughO));

  // A single character wrapped in many nested capture groups.
  ASSERT_TRUE(RE2::PartialMatch(
      "x", "((((((((((((((((((((x))))))))))))))))))))"));
}
614
TEST(RE2, PartialMatchN) {
  static const char kDigits[] = "(\\d+)";

  // Argument slots, filled in as the test progresses.
  RE2::Arg slots[2];
  const RE2::Arg* const args[2] = { &slots[0], &slots[1] };

  // Zero arguments.
  EXPECT_TRUE(RE2::PartialMatchN("hello", "e.*o", args, 0));
  EXPECT_FALSE(RE2::PartialMatchN("othello", "a.*o", args, 0));

  // One argument.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::PartialMatchN("1001 nights", kDigits, args, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::PartialMatchN("three", kDigits, args, 1));

  // Two arguments of different types.
  std::string word;
  slots[1] = &word;
  EXPECT_TRUE(
      RE2::PartialMatchN("answer: 42:life", "(\\d+):(\\w+)", args, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::PartialMatchN("hi1", "(\\w+)(1)", args, 2));
}
638
TEST(RE2, FullMatchZeroArg) {
  // A full match that requests no captured values.
  const bool all_digits = RE2::FullMatch("1001", "\\d+");
  ASSERT_TRUE(all_digits);
}
643
TEST(RE2, FullMatchOneArg) {
  int number;

  // A captured run of digits is parsed into the int.
  ASSERT_TRUE(RE2::FullMatch("1001", "(\\d+)", &number));
  ASSERT_EQ(number, 1001);

  // A leading minus sign is accepted as well.
  ASSERT_TRUE(RE2::FullMatch("-123", "(-?\\d+)", &number));
  ASSERT_EQ(number, -123);

  // An empty capture group fails when an int is requested.
  ASSERT_FALSE(RE2::FullMatch("10", "()\\d+", &number));

  // A 40-digit number fails when an int is requested.
  ASSERT_FALSE(RE2::FullMatch("1234567890123456789012345678901234567890",
                              "(\\d+)", &number));
}
656
TEST(RE2, FullMatchIntegerArg) {
  int number;

  // Only the captured digits are parsed, even though the surrounding text
  // consists of digits too.
  ASSERT_TRUE(RE2::FullMatch("1234", "1(\\d*)4", &number));
  ASSERT_EQ(number, 23);

  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)\\d+", &number));
  ASSERT_EQ(number, 1);

  ASSERT_TRUE(RE2::FullMatch("-1234", "(-\\d)\\d+", &number));
  ASSERT_EQ(number, -1);

  // The same holds for partial matches.
  ASSERT_TRUE(RE2::PartialMatch("1234", "(\\d)", &number));
  ASSERT_EQ(number, 1);

  ASSERT_TRUE(RE2::PartialMatch("-1234", "(-\\d)", &number));
  ASSERT_EQ(number, -1);
}
672
TEST(RE2, FullMatchStringArg) {
  // The capture group is copied into a std::string argument.
  const std::string expected("ell");
  std::string captured;
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", &captured));
  ASSERT_EQ(captured, expected);
}
679
TEST(RE2, FullMatchStringPieceArg) {
  // First group goes to a StringPiece, second group to an int.
  StringPiece name;
  int number;
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));

  // The StringPiece must cover exactly the four bytes "ruby".
  ASSERT_EQ(name.size(), 4);
  ASSERT_TRUE(memcmp(name.data(), "ruby", 4) == 0);
  ASSERT_EQ(number, 1234);
}
689
TEST(RE2, FullMatchMultiArg) {
  // Two capture groups extracted into arguments of different types.
  std::string name;
  int number;
  ASSERT_TRUE(RE2::FullMatch("ruby:1234", "(\\w+):(\\d+)", &name, &number));
  ASSERT_EQ(name, std::string("ruby"));
  ASSERT_EQ(number, 1234);
}
698
TEST(RE2, FullMatchN) {
  static const char kHThroughO[] = "h.*o";
  static const char kDigits[] = "(\\d+)";

  // Argument slots, filled in as the test progresses.
  RE2::Arg slots[2];
  const RE2::Arg* const args[2] = { &slots[0], &slots[1] };

  // Zero arguments.
  EXPECT_TRUE(RE2::FullMatchN("hello", kHThroughO, args, 0));
  EXPECT_FALSE(RE2::FullMatchN("othello", kHThroughO, args, 0));

  // One argument.
  int number;
  slots[0] = &number;
  EXPECT_TRUE(RE2::FullMatchN("1001", kDigits, args, 1));
  EXPECT_EQ(1001, number);
  EXPECT_FALSE(RE2::FullMatchN("three", kDigits, args, 1));

  // Two arguments of different types.
  std::string word;
  slots[1] = &word;
  EXPECT_TRUE(RE2::FullMatchN("42:life", "(\\d+):(\\w+)", args, 2));
  EXPECT_EQ(42, number);
  EXPECT_EQ("life", word);
  EXPECT_FALSE(RE2::FullMatchN("hi1", "(\\w+)(1)", args, 2));
}
722
TEST(RE2, FullMatchIgnoredArg) {
  // Three groups; the middle one is passed a null argument each time.
  static const char kThreeGroups[] = "(\\w+)(:)(\\d+)";
  std::string name;
  int number;

  // Old-school NULL should be ignored.
  ASSERT_TRUE(
      RE2::FullMatch("ruby:1234", kThreeGroups, &name, (void*)NULL, &number));
  ASSERT_EQ(name, std::string("ruby"));
  ASSERT_EQ(number, 1234);

  // C++11 nullptr should also be ignored.
  ASSERT_TRUE(
      RE2::FullMatch("rubz:1235", kThreeGroups, &name, nullptr, &number));
  ASSERT_EQ(name, std::string("rubz"));
  ASSERT_EQ(number, 1235);
}
738
TEST(RE2, FullMatchTypedNullArg) {
  static const char kEverything[] = "(.*)";
  std::string captured;

  // A typed null pointer is accepted when the captured text is valid for
  // that type.
  ASSERT_TRUE(RE2::FullMatch("hello", "he(.*)lo", (char*)NULL));
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (std::string*)NULL));
  ASSERT_TRUE(RE2::FullMatch("hello", "h(.*)o", (StringPiece*)NULL));
  ASSERT_TRUE(RE2::FullMatch("1234", kEverything, (int*)NULL));
  ASSERT_TRUE(
      RE2::FullMatch("1234567890123456", kEverything, (long long*)NULL));
  ASSERT_TRUE(
      RE2::FullMatch("123.4567890123456", kEverything, (double*)NULL));
  ASSERT_TRUE(
      RE2::FullMatch("123.4567890123456", kEverything, (float*)NULL));

  // Fail on non-void* NULL arg if the match doesn't parse for the given type.
  ASSERT_FALSE(RE2::FullMatch("hello", "h(.*)lo", &captured, (char*)NULL));
  ASSERT_FALSE(RE2::FullMatch("hello", kEverything, (int*)NULL));
  ASSERT_FALSE(RE2::FullMatch("1234567890123456", kEverything, (int*)NULL));
  ASSERT_FALSE(RE2::FullMatch("hello", kEverything, (double*)NULL));
  ASSERT_FALSE(RE2::FullMatch("hello", kEverything, (float*)NULL));
}
758
759 // Check that numeric parsing code does not read past the end of
760 // the number being parsed.
761 // This implementation requires mmap(2) et al. and thus cannot
762 // be used unless they are available.
TEST(RE2, NULTerminated) {
#if defined(_POSIX_MAPPED_FILES) && _POSIX_MAPPED_FILES > 0
  // Layout: two adjacent pages are mapped, then the second one is unmapped.
  // The digit under test is stored in the last byte of the first page, so a
  // parser that reads past the end of the number touches unmapped memory.
  const long pagesize = sysconf(_SC_PAGE_SIZE);
  // sysconf() reports failure as -1; the size feeds the pointer arithmetic
  // below, so stop right away if it is not usable.
  ASSERT_GT(pagesize, 0);

#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
  void* const mapping = mmap(NULL, 2*pagesize, PROT_READ|PROT_WRITE,
                             MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
  ASSERT_TRUE(mapping != MAP_FAILED);
  char* const v = static_cast<char*>(mapping);
  LOG(INFO) << "Memory at " << mapping;
  ASSERT_EQ(munmap(v + pagesize, pagesize), 0) << " error " << errno;
  v[pagesize - 1] = '1';

  int x = 0;
  const bool matched =
      RE2::FullMatch(StringPiece(v + pagesize - 1, 1), "(.*)", &x);

  // Release the remaining page before asserting on the outcome: a failed
  // ASSERT returns from the test immediately and would leave it mapped.
  const int unmap_result = munmap(v, pagesize);
  const int unmap_errno = errno;

  ASSERT_TRUE(matched);
  ASSERT_EQ(x, 1);
  ASSERT_EQ(unmap_result, 0) << " error " << unmap_errno;
#endif
}
784
TEST(RE2, FullMatchTypeTests) {
  // Patterns shared by the integer sections below.
  static const char kSigned[] = "(-?\\d+)";
  static const char kUnsigned[] = "(\\d+)";

  // A long run of zeros used as padding in front of boundary values.
  const std::string zeros(1000, '0');

  // char: a single captured character.
  {
    char ch;
    ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &ch));
    ASSERT_EQ(ch, 'H');
  }

  // unsigned char: a single captured character.
  {
    unsigned char ch;
    ASSERT_TRUE(RE2::FullMatch("Hello", "(H)ello", &ch));
    ASSERT_EQ(ch, static_cast<unsigned char>('H'));
  }

  // int16_t: values inside the range parse, values just outside fail.
  {
    int16_t parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("32767", kSigned, &parsed));
    ASSERT_EQ(parsed, 32767);
    ASSERT_TRUE(RE2::FullMatch("-32768", kSigned, &parsed));
    ASSERT_EQ(parsed, -32768);
    ASSERT_FALSE(RE2::FullMatch("-32769", kSigned, &parsed));
    ASSERT_FALSE(RE2::FullMatch("32768", kSigned, &parsed));
  }

  // uint16_t.
  {
    uint16_t parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("32767", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 32767);
    ASSERT_TRUE(RE2::FullMatch("65535", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 65535);
    ASSERT_FALSE(RE2::FullMatch("65536", kUnsigned, &parsed));
  }

  // int32_t, including zero-padded boundary values and CRadix parsing.
  {
    int32_t parsed;
    static const int32_t kMax = INT32_C(0x7fffffff);
    static const int32_t kMin = -kMax - 1;
    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("2147483647", kSigned, &parsed));
    ASSERT_EQ(parsed, kMax);
    ASSERT_TRUE(RE2::FullMatch("-2147483648", kSigned, &parsed));
    ASSERT_EQ(parsed, kMin);
    ASSERT_FALSE(RE2::FullMatch("-2147483649", kSigned, &parsed));
    ASSERT_FALSE(RE2::FullMatch("2147483648", kSigned, &parsed));

    ASSERT_TRUE(RE2::FullMatch(zeros + "2147483647", kSigned, &parsed));
    ASSERT_EQ(parsed, kMax);
    ASSERT_TRUE(RE2::FullMatch("-" + zeros + "2147483648", kSigned, &parsed));
    ASSERT_EQ(parsed, kMin);

    ASSERT_FALSE(
        RE2::FullMatch("-" + zeros + "2147483649", kSigned, &parsed));
    ASSERT_TRUE(RE2::FullMatch("0x7fffffff", "(.*)", RE2::CRadix(&parsed)));
    ASSERT_EQ(parsed, kMax);
    ASSERT_FALSE(
        RE2::FullMatch("000x7fffffff", "(.*)", RE2::CRadix(&parsed)));
  }

  // uint32_t.
  {
    uint32_t parsed;
    static const uint32_t kMax = UINT32_C(0xffffffff);
    ASSERT_TRUE(RE2::FullMatch("100", kUnsigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("4294967295", kUnsigned, &parsed));
    ASSERT_EQ(parsed, kMax);
    ASSERT_FALSE(RE2::FullMatch("4294967296", kUnsigned, &parsed));
    ASSERT_FALSE(RE2::FullMatch("-1", kUnsigned, &parsed));

    ASSERT_TRUE(RE2::FullMatch(zeros + "4294967295", kUnsigned, &parsed));
    ASSERT_EQ(parsed, kMax);
  }

  // int64_t: the boundary spellings are produced with std::to_string.
  {
    int64_t parsed;
    static const int64_t kMax = INT64_C(0x7fffffffffffffff);
    static const int64_t kMin = -kMax - 1;
    std::string text;

    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed));
    ASSERT_EQ(parsed, -100);

    text = std::to_string(kMax);
    ASSERT_TRUE(RE2::FullMatch(text, kSigned, &parsed));
    ASSERT_EQ(parsed, kMax);

    text = std::to_string(kMin);
    ASSERT_TRUE(RE2::FullMatch(text, kSigned, &parsed));
    ASSERT_EQ(parsed, kMin);

    // Bump the last digit of the maximum to go one past the range.
    text = std::to_string(kMax);
    ASSERT_NE(text.back(), '9');
    text.back()++;
    ASSERT_FALSE(RE2::FullMatch(text, kSigned, &parsed));

    // Same for the minimum.
    text = std::to_string(kMin);
    ASSERT_NE(text.back(), '9');
    text.back()++;
    ASSERT_FALSE(RE2::FullMatch(text, kSigned, &parsed));
  }

  // uint64_t, with a signed 64-bit value for the negative input.
  {
    uint64_t parsed;
    int64_t parsed_signed;
    static const uint64_t kMax = UINT64_C(0xffffffffffffffff);
    std::string text;

    ASSERT_TRUE(RE2::FullMatch("100", kSigned, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100", kSigned, &parsed_signed));
    ASSERT_EQ(parsed_signed, -100);

    text = std::to_string(kMax);
    ASSERT_TRUE(RE2::FullMatch(text, kSigned, &parsed));
    ASSERT_EQ(parsed, kMax);

    // Bump the last digit of the maximum to go one past the range.
    ASSERT_NE(text.back(), '9');
    text.back()++;
    ASSERT_FALSE(RE2::FullMatch(text, kSigned, &parsed));
  }
}
887
TEST(RE2, FloatingPointFullMatchTypes) {
  static const char kEverything[] = "(.*)";

  // A long run of zeros used as padding in front of a value.
  const std::string zeros(1000, '0');

  // float
  {
    float parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kEverything, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", kEverything, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", kEverything, &parsed));
    ASSERT_EQ(parsed, static_cast<float>(1e23));
    ASSERT_TRUE(RE2::FullMatch(" 100", kEverything, &parsed));
    ASSERT_EQ(parsed, 100);

    ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", kEverything, &parsed));
    ASSERT_EQ(parsed, static_cast<float>(1e23));

    // 6700000000081920.1 is an edge case.
    // 6700000000081920 is exactly halfway between
    // two float32s, so the .1 should make it round up.
    // However, the .1 is outside the precision possible with
    // a float64: the nearest float64 is 6700000000081920.
    // So if the code uses strtod and then converts to float32,
    // round-to-even will make it round down instead of up.
    // To pass the test, the parser must call strtof directly.
    // This test case is carefully chosen to use only a 17-digit
    // number, since C does not guarantee to get the correctly
    // rounded answer for strtod and strtof unless the input is
    // short.
    //
    // This is known to fail on Cygwin and MinGW due to a broken
    // implementation of strtof(3). And apparently MSVC too. Sigh.
#if !defined(_MSC_VER) && !defined(__CYGWIN__) && !defined(__MINGW32__)
    ASSERT_TRUE(RE2::FullMatch("0.1", kEverything, &parsed));
    ASSERT_EQ(parsed, 0.1f) << StringPrintf("%.8g != %.8g", parsed, 0.1f);
    ASSERT_TRUE(RE2::FullMatch("6700000000081920.1", kEverything, &parsed));
    ASSERT_EQ(parsed, 6700000000081920.1f)
        << StringPrintf("%.8g != %.8g", parsed, 6700000000081920.1f);
#endif
  }

  // double
  {
    double parsed;
    ASSERT_TRUE(RE2::FullMatch("100", kEverything, &parsed));
    ASSERT_EQ(parsed, 100);
    ASSERT_TRUE(RE2::FullMatch("-100.", kEverything, &parsed));
    ASSERT_EQ(parsed, -100);
    ASSERT_TRUE(RE2::FullMatch("1e23", kEverything, &parsed));
    ASSERT_EQ(parsed, 1e23);
    ASSERT_TRUE(RE2::FullMatch(zeros + "1e23", kEverything, &parsed));
    ASSERT_EQ(parsed, static_cast<double>(1e23));

    ASSERT_TRUE(RE2::FullMatch("0.1", kEverything, &parsed));
    ASSERT_EQ(parsed, 0.1) << StringPrintf("%.17g != %.17g", parsed, 0.1);
    ASSERT_TRUE(RE2::FullMatch("1.00000005960464485", kEverything, &parsed));
    ASSERT_EQ(parsed, 1.0000000596046448)
        << StringPrintf("%.17g != %.17g", parsed, 1.0000000596046448);
  }
}
938
TEST(RE2, FullMatchAnchored) {
  // FullMatch must be anchored at both ends of the text.
  int number;

  // Stray characters before or after the digits make the match fail.
  ASSERT_FALSE(RE2::FullMatch("x1001", "(\\d+)", &number));
  ASSERT_FALSE(RE2::FullMatch("1001x", "(\\d+)", &number));

  // Once the pattern accounts for the extra character, the digits are
  // captured and converted.
  ASSERT_TRUE(RE2::FullMatch("x1001", "x(\\d+)", &number));
  ASSERT_EQ(number, 1001);
  ASSERT_TRUE(RE2::FullMatch("1001x", "(\\d+)x", &number));
  ASSERT_EQ(number, 1001);
}
947
TEST(RE2, FullMatchBraces) {
  // Counted repetition with an open upper bound: at least five characters
  // drawn from the class.
  const char* const kAtLeastFive = "[0-9a-f+.-]{5,}";

  ASSERT_TRUE(RE2::FullMatch("0abcd", kAtLeastFive));   // exactly five
  ASSERT_TRUE(RE2::FullMatch("0abcde", kAtLeastFive));  // more than five
  ASSERT_FALSE(RE2::FullMatch("0abc", kAtLeastFive));   // only four
}
954
TEST(RE2, Complicated) {
  // Alternation mixing two literals with a single-character class.
  const char* const kAlternation = "foo|bar|[A-Z]";

  ASSERT_TRUE(RE2::FullMatch("foo", kAlternation));
  ASSERT_TRUE(RE2::FullMatch("bar", kAlternation));
  ASSERT_TRUE(RE2::FullMatch("X", kAlternation));
  // Two uppercase letters are not covered by any single alternative.
  ASSERT_FALSE(RE2::FullMatch("XY", kAlternation));
}
962
TEST(RE2, FullMatchEnd) {
  // Full-match handling: internally a '$' has to be tacked onto the pattern.
  const char* const kAlternatives = "fo|foo";
  const char* const kAlternativesWithDollar = "fo|foo$";

  ASSERT_TRUE(RE2::FullMatch("fo", kAlternatives));
  ASSERT_TRUE(RE2::FullMatch("foo", kAlternatives));
  ASSERT_TRUE(RE2::FullMatch("fo", kAlternativesWithDollar));
  ASSERT_TRUE(RE2::FullMatch("foo", kAlternativesWithDollar));
  ASSERT_TRUE(RE2::FullMatch("foo", "foo$"));
  ASSERT_FALSE(RE2::FullMatch("foo$bar", "foo\\$"));
  ASSERT_FALSE(RE2::FullMatch("fox", "fo|bar"));

  // Deliberately disabled. Enable this if the handling of '$' is ever
  // changed so that it no longer matches before a trailing newline.
  const bool kDollarRejectsTrailingNewline = false;
  if (kDollarRejectsTrailingNewline) {
    // Guards against pcre's special case where '$' matches before a '\n'
    // at the end of the string.
    ASSERT_FALSE(RE2::PartialMatch("foo\n", "foo$"));
  }
}
981
TEST(RE2, FullMatchArgCount) {
  // Exercises FullMatch with 0, 1, 2, ..., 7 and 16 output arguments.
  // Each sub-case matches a string of single digits against one "(\\d)"
  // group per digit and checks that every argument received its digit.
  int a[16];
  ASSERT_TRUE(RE2::FullMatch("", ""));

  // The whole array is zeroed before each sub-case so that a value left
  // over from the previous sub-case cannot satisfy an assertion. Note that
  // the size must be sizeof(a); sizeof(0) is just sizeof(int) and would
  // reset only a[0].
  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1", "(\\d){1}", &a[0]));
  ASSERT_EQ(a[0], 1);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("12", "(\\d)(\\d)", &a[0], &a[1]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("123", "(\\d)(\\d)(\\d)", &a[0], &a[1], &a[2]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1234", "(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
                             &a[2], &a[3]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("12345", "(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0], &a[1],
                             &a[2], &a[3], &a[4]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("123456", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)", &a[0],
                             &a[1], &a[2], &a[3], &a[4], &a[5]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);
  ASSERT_EQ(a[5], 6);

  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1234567", "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
                             &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);
  ASSERT_EQ(a[5], 6);
  ASSERT_EQ(a[6], 7);

  // Sixteen arguments; the tenth digit of the subject is '0', so a[9] is
  // expected to be 0.
  memset(a, 0, sizeof(a));
  ASSERT_TRUE(RE2::FullMatch("1234567890123456",
                             "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)"
                             "(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)(\\d)",
                             &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6],
                             &a[7], &a[8], &a[9], &a[10], &a[11], &a[12],
                             &a[13], &a[14], &a[15]));
  ASSERT_EQ(a[0], 1);
  ASSERT_EQ(a[1], 2);
  ASSERT_EQ(a[2], 3);
  ASSERT_EQ(a[3], 4);
  ASSERT_EQ(a[4], 5);
  ASSERT_EQ(a[5], 6);
  ASSERT_EQ(a[6], 7);
  ASSERT_EQ(a[7], 8);
  ASSERT_EQ(a[8], 9);
  ASSERT_EQ(a[9], 0);
  ASSERT_EQ(a[10], 1);
  ASSERT_EQ(a[11], 2);
  ASSERT_EQ(a[12], 3);
  ASSERT_EQ(a[13], 4);
  ASSERT_EQ(a[14], 5);
  ASSERT_EQ(a[15], 6);
}
1064
TEST(RE2, Accessors) {
  // pattern() must hand back exactly the string the RE2 was built from.
  {
    const std::string source = "http://([^/]+)/.*";
    const RE2 compiled(source);
    ASSERT_EQ(source, compiled.pattern());
  }

  // A valid pattern must report no error through any of the error accessors.
  {
    RE2 valid("foo");
    ASSERT_TRUE(valid.error().empty());
    ASSERT_TRUE(valid.ok());
    ASSERT_EQ(valid.error_code(), RE2::NoError);
  }
}
1081
TEST(RE2, UTF8) {
  // Subject text: three Japanese characters (nihongo) as raw UTF-8 bytes.
  // The string literal supplies the terminating NUL.
  const char utf8_string[] =
      "\xe6\x97\xa5"   // U+65E5
      "\xe6\x9c\xac"   // U+672C
      "\xe8\xaa\x9e";  // U+8A9E
  // Pattern: the middle character with a '.' on either side.
  const char utf8_pattern[] =
      "."
      "\xe6\x9c\xac"  // U+672C
      ".";

  // The text is nine bytes in Latin-1 mode and three characters in UTF-8
  // mode; both views must match in full.
  RE2 nine_dots_latin1(".........", RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, nine_dots_latin1));
  RE2 three_dots_utf8("...");
  ASSERT_TRUE(RE2::FullMatch(utf8_string, three_dots_utf8));

  // '.' consumes a single byte in Latin-1 mode and a whole UTF-8 character
  // otherwise.
  std::string first;
  RE2 one_dot_latin1("(.)", RE2::Latin1);
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_dot_latin1, &first));
  ASSERT_EQ(first, std::string("\xe6"));
  RE2 one_dot_utf8("(.)");
  ASSERT_TRUE(RE2::PartialMatch(utf8_string, one_dot_utf8, &first));
  ASSERT_EQ(first, std::string("\xe6\x97\xa5"));

  // Used as its own pattern, the text matches itself in either mode.
  RE2 self_latin1(utf8_string, RE2::Latin1);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_latin1));
  RE2 self_utf8(utf8_string);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, self_utf8));

  // The dotted pattern matches the text only in UTF-8 mode.
  RE2 dotted_latin1(utf8_pattern, RE2::Latin1);
  ASSERT_FALSE(RE2::FullMatch(utf8_string, dotted_latin1));
  RE2 dotted_utf8(utf8_pattern);
  ASSERT_TRUE(RE2::FullMatch(utf8_string, dotted_utf8));
}
1126
TEST(RE2, UngreedyUTF8) {
  // Ungreedy regular expressions in UTF-8 mode must not match text they
  // should reject -- see bug 82246. The subject contains a space, which
  // \w cannot match, so a full match must fail in every configuration.

  // Greedy baseline; this variant always worked.
  {
    const char* greedy = "\\w+X";
    const std::string subject = "a aX";
    RE2 latin1_re(greedy, RE2::Latin1);
    RE2 utf8_re(greedy);

    ASSERT_FALSE(RE2::FullMatch(subject, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(subject, utf8_re));
  }

  // Same expression with the (?U) ungreedy flag.
  {
    const char* ungreedy = "(?U)\\w+X";
    const std::string subject = "a aX";
    RE2 latin1_re(ungreedy, RE2::Latin1);
    // The flag itself must be accepted without an error.
    ASSERT_EQ(latin1_re.error(), "");
    RE2 utf8_re(ungreedy);

    ASSERT_FALSE(RE2::FullMatch(subject, latin1_re));
    ASSERT_FALSE(RE2::FullMatch(subject, utf8_re));
  }
}
1151
TEST(RE2, Rejects) {
  // Each of these patterns must fail to compile. RE2::Quiet is passed to
  // every constructor.
  static const char* const kBadPatterns[] = {
      "a\\1",
      "a[x",
      "a[z-a]",
      "a[[:foobar:]]",
      "a(b",
      "a\\",
  };
  const size_t count = sizeof(kBadPatterns) / sizeof(kBadPatterns[0]);

  for (size_t idx = 0; idx < count; idx++) {
    RE2 re(kBadPatterns[idx], RE2::Quiet);
    ASSERT_FALSE(re.ok());
  }
}
1177
TEST(RE2, NoCrash) {
  // Matching with a regexp that failed to compile must simply report
  // "no match" rather than crash.
  {
    RE2 broken("a\\", RE2::Quiet);
    ASSERT_FALSE(broken.ok());
    ASSERT_FALSE(RE2::PartialMatch("a\\b", broken));
  }

  // An enormous regexp is rejected, and using it afterwards must not crash
  // either.
  {
    RE2 enormous("(((.{100}){100}){100}){100}", RE2::Quiet);
    ASSERT_FALSE(enormous.ok());
    ASSERT_FALSE(RE2::PartialMatch("aaa", enormous));
  }

  // A large but reasonable counted repetition still compiles and matches.
  {
    RE2 counted(".{512}x", RE2::Quiet);
    ASSERT_TRUE(counted.ok());
    // 515 filler characters followed by the terminating "x".
    std::string subject(515, 'c');
    subject += "x";
    ASSERT_TRUE(RE2::PartialMatch(subject, counted));
  }
}
1203
TEST(RE2, Recursion) {
  // Checks that recursion is stopped. This is a PCRE-legacy test: RE2
  // itself does not recurse.
  const int bytes = 15 * 1024;  // enough to crash PCRE

  // Patterns handed to TestRecursion, in order.
  static const char* const kPatterns[] = {".", "a", "a.", "ab.", "abc."};
  const size_t count = sizeof(kPatterns) / sizeof(kPatterns[0]);

  for (size_t idx = 0; idx < count; idx++)
    TestRecursion(bytes, kPatterns[idx]);
}
1214
TEST(RE2, BigCountedRepetition) {
  // Counted repetition must work when the regexp is given plenty of memory.
  RE2::Options roomy;
  roomy.set_max_mem(256<<20);

  RE2 counted(".{512}x", roomy);
  ASSERT_TRUE(counted.ok());

  // 515 filler characters followed by the terminating "x".
  std::string subject(515, 'c');
  subject += "x";
  ASSERT_TRUE(RE2::PartialMatch(subject, counted));
}
1227
TEST(RE2, DeepRecursion) {
  // Deep stack recursion check. Before pcre was patched, this input caused
  // a segmentation violation from stack overflow. Like the Recursion test,
  // it is a PCRE legacy: RE2 does not recurse.

  // Subject: "x*", then 131072 'a' characters, then "*x".
  const std::string comment = "x*" + std::string(131072, 'a') + "*x";

  RE2 re("((?:\\s|xx.*\n|x[*](?:\n|.)*?[*]x)*)");
  ASSERT_TRUE(RE2::FullMatch(comment, re));
}
1240
1241 // Suggested by Josh Hyman. Failed when SearchOnePass was
1242 // not implementing case-folding.
TEST(CaseInsensitive,MatchAndConsume)1243 TEST(CaseInsensitive, MatchAndConsume) {
1244 std::string text = "A fish named *Wanda*";
1245 StringPiece sp(text);
1246 StringPiece result;
1247 EXPECT_TRUE(RE2::PartialMatch(text, "(?i)([wand]{5})", &result));
1248 EXPECT_TRUE(RE2::FindAndConsume(&sp, "(?i)([wand]{5})", &result));
1249 }
1250
1251 // RE2 should permit implicit conversions from string, StringPiece, const char*,
1252 // and C string literals.
TEST(RE2,ImplicitConversions)1253 TEST(RE2, ImplicitConversions) {
1254 std::string re_string(".");
1255 StringPiece re_stringpiece(".");
1256 const char* re_cstring = ".";
1257 EXPECT_TRUE(RE2::PartialMatch("e", re_string));
1258 EXPECT_TRUE(RE2::PartialMatch("e", re_stringpiece));
1259 EXPECT_TRUE(RE2::PartialMatch("e", re_cstring));
1260 EXPECT_TRUE(RE2::PartialMatch("e", "."));
1261 }
1262
1263 // Bugs introduced by 8622304
TEST(RE2,CL8622304)1264 TEST(RE2, CL8622304) {
1265 // reported by ingow
1266 std::string dir;
1267 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])")); // ok
1268 EXPECT_TRUE(RE2::FullMatch("D", "([^\\\\])", &dir)); // fails
1269
1270 // reported by jacobsa
1271 std::string key, val;
1272 EXPECT_TRUE(RE2::PartialMatch("bar:1,0x2F,030,4,5;baz:true;fooby:false,true",
1273 "(\\w+)(?::((?:[^;\\\\]|\\\\.)*))?;?",
1274 &key,
1275 &val));
1276 EXPECT_EQ(key, "bar");
1277 EXPECT_EQ(val, "1,0x2F,030,4,5");
1278 }
1279
1280
// Table for the ErrorArgs test: check that RE2 returns the correct piece
// of the regexp on error. In particular, make sure it returns whole runes
// and that it always reports invalid UTF-8.
// Also check that the Perl error flag piece is big enough.
static struct ErrorTest {
  const char *regexp;  // pattern that must fail to compile
  const char *error;   // expected value of RE2::error_arg()
} error_tests[] = {
  // Valid UTF-8 input: the reported piece ends on a rune boundary.
  { "ab\\αcd", "\\α" },
  { "ef\\x☺01", "\\x☺0" },
  { "gh\\x1☺01", "\\x1☺" },
  { "ij\\x1", "\\x1" },
  { "kl\\x", "\\x" },
  { "uv\\x{0000☺}", "\\x{0000☺" },
  { "wx\\p{ABC", "\\p{ABC" },
  { "yz(?smiUX:abc)", "(?smiUX" },  // used to return (?s but the error is X
  { "aa(?sm☺i", "(?sm☺" },
  { "bb[abc", "[abc" },

  // Patterns containing the invalid UTF-8 byte \377.
  { "mn\\x1\377", "" },  // no argument string returned for invalid UTF-8
  { "op\377qr", "" },
  { "st\\x{00000\377", "" },
  { "zz\\p{\377}", "" },
  { "zz\\x{00\377}", "" },
  { "zz(?P<name\377>abc)", "" },
};
TEST(RE2, ErrorArgs) {
  // Every entry of error_tests must fail to compile and report the
  // expected error argument; the full error text is attached to a failure.
  const size_t count = arraysize(error_tests);
  for (size_t idx = 0; idx < count; idx++) {
    const ErrorTest& tc = error_tests[idx];
    RE2 re(tc.regexp, RE2::Quiet);
    EXPECT_FALSE(re.ok());
    EXPECT_EQ(re.error_arg(), tc.error) << re.error();
  }
}
1314
// Table for the NeverNewline test: check that "never match \n" mode never
// matches \n.
static struct NeverTest {
  const char* regexp;  // pattern compiled with never_nl enabled
  const char* text;    // subject text
  const char* match;   // expected first capture, or NULL if no match is expected
} never_tests[] = {
  { "(.*)", "abc\ndef\nghi\n", "abc" },
  { "(?s)(abc.*def)", "abc\ndef\n", NULL },
  { "(abc(.|\n)*def)", "abc\ndef\n", NULL },
  { "(abc[^x]*def)", "abc\ndef\n", NULL },
  { "(abc[^x]*def)", "abczzzdef\ndef\n", "abczzzdef" },
};
TEST(RE2, NeverNewline) {
  // Runs every never_tests entry with never_nl enabled. Entries whose
  // expected match is NULL must not match; the others must capture exactly
  // the expected text.
  RE2::Options opt;
  opt.set_never_nl(true);

  const size_t count = arraysize(never_tests);
  for (size_t idx = 0; idx < count; idx++) {
    const NeverTest& tc = never_tests[idx];
    RE2 re(tc.regexp, opt);
    if (tc.match != NULL) {
      StringPiece captured;
      EXPECT_TRUE(RE2::PartialMatch(tc.text, re, &captured));
      EXPECT_EQ(captured, tc.match);
    } else {
      EXPECT_FALSE(RE2::PartialMatch(tc.text, re));
    }
  }
}
1342
1343 // Check that dot_nl option works.
TEST(RE2,DotNL)1344 TEST(RE2, DotNL) {
1345 RE2::Options opt;
1346 opt.set_dot_nl(true);
1347 EXPECT_TRUE(RE2::PartialMatch("\n", RE2(".", opt)));
1348 EXPECT_FALSE(RE2::PartialMatch("\n", RE2("(?-s).", opt)));
1349 opt.set_never_nl(true);
1350 EXPECT_FALSE(RE2::PartialMatch("\n", RE2(".", opt)));
1351 }
1352
1353 // Check that there are no capturing groups in "never capture" mode.
TEST(RE2,NeverCapture)1354 TEST(RE2, NeverCapture) {
1355 RE2::Options opt;
1356 opt.set_never_capture(true);
1357 RE2 re("(r)(e)", opt);
1358 EXPECT_EQ(0, re.NumberOfCapturingGroups());
1359 }
1360
1361 // Bitstate bug was looking at submatch[0] even if nsubmatch == 0.
1362 // Triggered by a failed DFA search falling back to Bitstate when
1363 // using Match with a NULL submatch set. Bitstate tried to read
1364 // the submatch[0] entry even if nsubmatch was 0.
TEST(RE2,BitstateCaptureBug)1365 TEST(RE2, BitstateCaptureBug) {
1366 RE2::Options opt;
1367 opt.set_max_mem(20000);
1368 RE2 re("(_________$)", opt);
1369 StringPiece s = "xxxxxxxxxxxxxxxxxxxxxxxxxx_________x";
1370 EXPECT_FALSE(re.Match(s, 0, s.size(), RE2::UNANCHORED, NULL, 0));
1371 }
1372
1373 // C++ version of bug 609710.
TEST(RE2,UnicodeClasses)1374 TEST(RE2, UnicodeClasses) {
1375 const std::string str = "ABCDEFGHI譚永鋒";
1376 std::string a, b, c;
1377
1378 EXPECT_TRUE(RE2::FullMatch("A", "\\p{L}"));
1379 EXPECT_TRUE(RE2::FullMatch("A", "\\p{Lu}"));
1380 EXPECT_FALSE(RE2::FullMatch("A", "\\p{Ll}"));
1381 EXPECT_FALSE(RE2::FullMatch("A", "\\P{L}"));
1382 EXPECT_FALSE(RE2::FullMatch("A", "\\P{Lu}"));
1383 EXPECT_TRUE(RE2::FullMatch("A", "\\P{Ll}"));
1384
1385 EXPECT_TRUE(RE2::FullMatch("譚", "\\p{L}"));
1386 EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Lu}"));
1387 EXPECT_FALSE(RE2::FullMatch("譚", "\\p{Ll}"));
1388 EXPECT_FALSE(RE2::FullMatch("譚", "\\P{L}"));
1389 EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Lu}"));
1390 EXPECT_TRUE(RE2::FullMatch("譚", "\\P{Ll}"));
1391
1392 EXPECT_TRUE(RE2::FullMatch("永", "\\p{L}"));
1393 EXPECT_FALSE(RE2::FullMatch("永", "\\p{Lu}"));
1394 EXPECT_FALSE(RE2::FullMatch("永", "\\p{Ll}"));
1395 EXPECT_FALSE(RE2::FullMatch("永", "\\P{L}"));
1396 EXPECT_TRUE(RE2::FullMatch("永", "\\P{Lu}"));
1397 EXPECT_TRUE(RE2::FullMatch("永", "\\P{Ll}"));
1398
1399 EXPECT_TRUE(RE2::FullMatch("鋒", "\\p{L}"));
1400 EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Lu}"));
1401 EXPECT_FALSE(RE2::FullMatch("鋒", "\\p{Ll}"));
1402 EXPECT_FALSE(RE2::FullMatch("鋒", "\\P{L}"));
1403 EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Lu}"));
1404 EXPECT_TRUE(RE2::FullMatch("鋒", "\\P{Ll}"));
1405
1406 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?(.).*?(.)", &a, &b, &c));
1407 EXPECT_EQ("A", a);
1408 EXPECT_EQ("B", b);
1409 EXPECT_EQ("C", c);
1410
1411 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{L}]).*?(.)", &a, &b, &c));
1412 EXPECT_EQ("A", a);
1413 EXPECT_EQ("B", b);
1414 EXPECT_EQ("C", c);
1415
1416 EXPECT_FALSE(RE2::PartialMatch(str, "\\P{L}"));
1417
1418 EXPECT_TRUE(RE2::PartialMatch(str, "(.).*?([\\p{Lu}]).*?(.)", &a, &b, &c));
1419 EXPECT_EQ("A", a);
1420 EXPECT_EQ("B", b);
1421 EXPECT_EQ("C", c);
1422
1423 EXPECT_FALSE(RE2::PartialMatch(str, "[^\\p{Lu}\\p{Lo}]"));
1424
1425 EXPECT_TRUE(RE2::PartialMatch(str, ".*(.).*?([\\p{Lu}\\p{Lo}]).*?(.)", &a, &b, &c));
1426 EXPECT_EQ("譚", a);
1427 EXPECT_EQ("永", b);
1428 EXPECT_EQ("鋒", c);
1429 }
1430
TEST(RE2, LazyRE2) {
  // One LazyRE2 initialised with just a pattern and one with explicit
  // options.
  static LazyRE2 pattern_only = {"a"};
  static LazyRE2 with_latin1 = {"b", RE2::Latin1};

  // Without options the encoding is reported as UTF-8.
  EXPECT_EQ("a", pattern_only->pattern());
  EXPECT_EQ(RE2::Options::EncodingUTF8, pattern_only->options().encoding());

  // The explicitly requested Latin-1 encoding must be reported back.
  EXPECT_EQ("b", with_latin1->pattern());
  EXPECT_EQ(RE2::Options::EncodingLatin1, with_latin1->options().encoding());
}
1442
1443 // Bug reported by saito. 2009/02/17
TEST(RE2,NullVsEmptyString)1444 TEST(RE2, NullVsEmptyString) {
1445 RE2 re(".*");
1446 EXPECT_TRUE(re.ok());
1447
1448 StringPiece null;
1449 EXPECT_TRUE(RE2::FullMatch(null, re));
1450
1451 StringPiece empty("");
1452 EXPECT_TRUE(RE2::FullMatch(empty, re));
1453 }
1454
1455 // Similar to the previous test, check that the null string and the empty
1456 // string both match, but also that the null string can only provide null
1457 // submatches whereas the empty string can also provide empty submatches.
TEST(RE2,NullVsEmptyStringSubmatches)1458 TEST(RE2, NullVsEmptyStringSubmatches) {
1459 RE2 re("()|(foo)");
1460 EXPECT_TRUE(re.ok());
1461
1462 // matches[0] is overall match, [1] is (), [2] is (foo), [3] is nonexistent.
1463 StringPiece matches[4];
1464
1465 for (size_t i = 0; i < arraysize(matches); i++)
1466 matches[i] = "bar";
1467
1468 StringPiece null;
1469 EXPECT_TRUE(re.Match(null, 0, null.size(), RE2::UNANCHORED,
1470 matches, arraysize(matches)));
1471 for (size_t i = 0; i < arraysize(matches); i++) {
1472 EXPECT_TRUE(matches[i].data() == NULL); // always null
1473 EXPECT_TRUE(matches[i].empty());
1474 }
1475
1476 for (size_t i = 0; i < arraysize(matches); i++)
1477 matches[i] = "bar";
1478
1479 StringPiece empty("");
1480 EXPECT_TRUE(re.Match(empty, 0, empty.size(), RE2::UNANCHORED,
1481 matches, arraysize(matches)));
1482 EXPECT_TRUE(matches[0].data() != NULL); // empty, not null
1483 EXPECT_TRUE(matches[0].empty());
1484 EXPECT_TRUE(matches[1].data() != NULL); // empty, not null
1485 EXPECT_TRUE(matches[1].empty());
1486 EXPECT_TRUE(matches[2].data() == NULL);
1487 EXPECT_TRUE(matches[2].empty());
1488 EXPECT_TRUE(matches[3].data() == NULL);
1489 EXPECT_TRUE(matches[3].empty());
1490 }
1491
1492 // Issue 1816809
TEST(RE2,Bug1816809)1493 TEST(RE2, Bug1816809) {
1494 RE2 re("(((((llx((-3)|(4)))(;(llx((-3)|(4))))*))))");
1495 StringPiece piece("llx-3;llx4");
1496 std::string x;
1497 EXPECT_TRUE(RE2::Consume(&piece, re, &x));
1498 }
1499
1500 // Issue 3061120
TEST(RE2,Bug3061120)1501 TEST(RE2, Bug3061120) {
1502 RE2 re("(?i)\\W");
1503 EXPECT_FALSE(RE2::PartialMatch("x", re)); // always worked
1504 EXPECT_FALSE(RE2::PartialMatch("k", re)); // broke because of kelvin
1505 EXPECT_FALSE(RE2::PartialMatch("s", re)); // broke because of latin long s
1506 }
1507
TEST(RE2, CapturingGroupNames) {
  // Each capturing '(' is annotated with its group ID:
  //      12    3        45   6         7
  RE2 re("((abc)(?P<G2>)|((e+)(?P<G2>.*)(?P<G1>u+)))");
  EXPECT_TRUE(re.ok());

  // Only the named groups are expected in the map, keyed by group ID.
  // "G2" is used twice, so it appears under both 3 and 6.
  std::map<int, std::string> expected;
  expected[3] = "G2";
  expected[6] = "G2";
  expected[7] = "G1";

  const std::map<int, std::string>& actual = re.CapturingGroupNames();
  EXPECT_EQ(expected, actual);
}
1520
TEST(RE2, RegexpToStringLossOfAnchor) {
  // Converting the parsed regexp back to a string must keep the anchor:
  // verbatim with RE2::POSIX, wrapped in (?-m:...) with default options.

  // Leading '^'.
  RE2 posix_begin("^[a-c]at", RE2::POSIX);
  EXPECT_EQ(posix_begin.Regexp()->ToString(), "^[a-c]at");
  RE2 default_begin("^[a-c]at");
  EXPECT_EQ(default_begin.Regexp()->ToString(), "(?-m:^)[a-c]at");

  // Trailing '$'.
  RE2 posix_end("ca[t-z]$", RE2::POSIX);
  EXPECT_EQ(posix_end.Regexp()->ToString(), "ca[t-z]$");
  RE2 default_end("ca[t-z]$");
  EXPECT_EQ(default_end.Regexp()->ToString(), "ca[t-z](?-m:$)");
}
1527
1528 // Issue 10131674
TEST(RE2,Bug10131674)1529 TEST(RE2, Bug10131674) {
1530 // Some of these escapes describe values that do not fit in a byte.
1531 RE2 re("\\140\\440\\174\\271\\150\\656\\106\\201\\004\\332", RE2::Latin1);
1532 EXPECT_FALSE(re.ok());
1533 EXPECT_FALSE(RE2::FullMatch("hello world", re));
1534 }
1535
TEST(RE2, Bug18391750) {
  // Stray write past the end of match_ in nfa.cc, found by fuzzing with
  // address sanitizer. The bytes below serve as both pattern and text; the
  // string literal supplies the terminating NUL.
  const char t[] =
      "\x28\x28\xfc\xfc\x08\x08"
      "\x26\x26\x28\xc2\x9b\xc5"
      "\xc5\xd4\x8f\x8f\x69\x69"
      "\xe7\x29\x7b\x37\x31\x31"
      "\x7d\xae\x7c\x7c\xf3\x29"
      "\xae\xae\x2e\x2a\x29";

  RE2::Options fuzz_opt;
  fuzz_opt.set_encoding(RE2::Options::EncodingLatin1);
  fuzz_opt.set_longest_match(true);
  fuzz_opt.set_dot_nl(true);
  fuzz_opt.set_case_sensitive(false);

  RE2 re(t, fuzz_opt);
  ASSERT_TRUE(re.ok());
  // The result is deliberately ignored; the call only has to run.
  RE2::PartialMatch(t, re);
}
1555
TEST(RE2, Bug18458852) {
  // The parser used to accept an invalid (too large) rune, which made the
  // compiler fail a DCHECK in the UTF-8 character class code. The pattern
  // must now be rejected. The string literal supplies the terminating NUL.
  const char b[] =
      "\x28\x05\x05\x41\x41\x28"
      "\x24\x5b\x5e\xf5\x87\x87"
      "\x90\x29\x5d\x29\x29";

  RE2 re(b);
  ASSERT_FALSE(re.ok());
}
1568
TEST(RE2, Bug18523943) {
  // BitState bug: case kFailInst failed the match entirely.

  // Subject text and pattern as raw bytes; the string literals supply the
  // terminating NULs.
  const char a[] = "\x29\x29\x24";
  const char b[] = "\x28\x0a\x2a\x2a\x29";

  RE2::Options opt;
  opt.set_log_errors(false);
  opt.set_encoding(RE2::Options::EncodingLatin1);
  opt.set_posix_syntax(true);
  opt.set_longest_match(true);
  opt.set_literal(false);
  opt.set_never_nl(true);

  RE2 re(b, opt);
  ASSERT_TRUE(re.ok());

  std::string captured;
  ASSERT_TRUE(RE2::PartialMatch(a, re, &captured));
}
1591
TEST(RE2, Bug21371806) {
  // The parser used to accept Unicode groups in Latin-1 mode in a way that
  // made the compiler fail a DCHECK in prog.cc. Constructing the regexp
  // must now succeed.
  RE2::Options latin1;
  latin1.set_encoding(RE2::Options::EncodingLatin1);

  RE2 unicode_group("g\\p{Zl}]", latin1);
  ASSERT_TRUE(unicode_group.ok());
}
1602
TEST(RE2, Bug26356109) {
  // Parser bug caused by factoring common prefixes out of alternations.
  //
  // The pattern below used to be factored to "a\\C*?[bc]". Unanchored, the
  // automaton then consumed "ab" and stopped, whereas first-match semantics
  // require it to consume all of "abc".
  RE2 re("a\\C*?c|a\\C*?b");
  ASSERT_TRUE(re.ok());

  const std::string subject = "abc";
  StringPiece whole;

  // Unanchored search.
  ASSERT_TRUE(re.Match(subject, 0, subject.size(), RE2::UNANCHORED,
                       &whole, 1));
  ASSERT_EQ(whole, subject) << " (UNANCHORED) got m='" << whole
                            << "', want '" << subject << "'";

  // Search anchored at both ends.
  ASSERT_TRUE(re.Match(subject, 0, subject.size(), RE2::ANCHOR_BOTH,
                       &whole, 1));
  ASSERT_EQ(whole, subject) << " (ANCHOR_BOTH) got m='" << whole
                            << "', want '" << subject << "'";
}
1621
TEST(RE2, Issue104) {
  // RE2::GlobalReplace used to advance by one byte after an empty match,
  // clobbering any rune longer than one byte. In each case the pattern
  // matches only the empty string, so three replacements are expected:
  // before, between and after the two characters.

  // One-byte characters.
  std::string ascii = "bc";
  ASSERT_EQ(3, RE2::GlobalReplace(&ascii, "a*", "d"));
  ASSERT_EQ("dbdcd", ascii);

  // Two-byte characters.
  std::string two_byte = "ąć";
  ASSERT_EQ(3, RE2::GlobalReplace(&two_byte, "Ć*", "Ĉ"));
  ASSERT_EQ("ĈąĈćĈ", two_byte);

  // Three-byte characters.
  std::string three_byte = "人类";
  ASSERT_EQ(3, RE2::GlobalReplace(&three_byte, "大*", "小"));
  ASSERT_EQ("小人小类小", three_byte);
}
1638
1639 } // namespace re2
1640