• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2008 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 
29 #include <stdlib.h>
30 
31 #include "v8.h"
32 
33 #include "string-stream.h"
34 #include "cctest.h"
35 #include "zone-inl.h"
36 #include "parser.h"
37 #include "ast.h"
38 #include "jsregexp.h"
39 #include "regexp-macro-assembler.h"
40 #include "regexp-macro-assembler-irregexp.h"
41 #ifdef V8_INTERPRETED_REGEXP
42 #include "interpreter-irregexp.h"
43 #else  // V8_INTERPRETED_REGEXP
44 #ifdef V8_TARGET_ARCH_ARM
45 #include "arm/macro-assembler-arm.h"
46 #include "arm/regexp-macro-assembler-arm.h"
47 #endif
48 #ifdef V8_TARGET_ARCH_MIPS
49 #include "mips/macro-assembler-mips.h"
50 #include "mips/regexp-macro-assembler-mips.h"
51 #endif
52 #ifdef V8_TARGET_ARCH_X64
53 #include "x64/macro-assembler-x64.h"
54 #include "x64/regexp-macro-assembler-x64.h"
55 #endif
56 #ifdef V8_TARGET_ARCH_IA32
57 #include "ia32/macro-assembler-ia32.h"
58 #include "ia32/regexp-macro-assembler-ia32.h"
59 #endif
60 #endif  // V8_INTERPRETED_REGEXP
61 
62 using namespace v8::internal;
63 
64 
CheckParse(const char * input)65 static bool CheckParse(const char* input) {
66   V8::Initialize(NULL);
67   v8::HandleScope scope;
68   ZoneScope zone_scope(DELETE_ON_EXIT);
69   FlatStringReader reader(Isolate::Current(), CStrVector(input));
70   RegExpCompileData result;
71   return v8::internal::RegExpParser::ParseRegExp(&reader, false, &result);
72 }
73 
74 
Parse(const char * input)75 static SmartPointer<const char> Parse(const char* input) {
76   V8::Initialize(NULL);
77   v8::HandleScope scope;
78   ZoneScope zone_scope(DELETE_ON_EXIT);
79   FlatStringReader reader(Isolate::Current(), CStrVector(input));
80   RegExpCompileData result;
81   CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
82   CHECK(result.tree != NULL);
83   CHECK(result.error.is_null());
84   SmartPointer<const char> output = result.tree->ToString();
85   return output;
86 }
87 
CheckSimple(const char * input)88 static bool CheckSimple(const char* input) {
89   V8::Initialize(NULL);
90   v8::HandleScope scope;
91   unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
92   ZoneScope zone_scope(DELETE_ON_EXIT);
93   FlatStringReader reader(Isolate::Current(), CStrVector(input));
94   RegExpCompileData result;
95   CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
96   CHECK(result.tree != NULL);
97   CHECK(result.error.is_null());
98   return result.simple;
99 }
100 
101 struct MinMaxPair {
102   int min_match;
103   int max_match;
104 };
105 
CheckMinMaxMatch(const char * input)106 static MinMaxPair CheckMinMaxMatch(const char* input) {
107   V8::Initialize(NULL);
108   v8::HandleScope scope;
109   unibrow::Utf8InputBuffer<> buffer(input, StrLength(input));
110   ZoneScope zone_scope(DELETE_ON_EXIT);
111   FlatStringReader reader(Isolate::Current(), CStrVector(input));
112   RegExpCompileData result;
113   CHECK(v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
114   CHECK(result.tree != NULL);
115   CHECK(result.error.is_null());
116   int min_match = result.tree->min_match();
117   int max_match = result.tree->max_match();
118   MinMaxPair pair = { min_match, max_match };
119   return pair;
120 }
121 
122 
123 #define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
124 #define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
125 #define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input));
126 #define CHECK_MIN_MAX(input, min, max)                                         \
127   { MinMaxPair min_max = CheckMinMaxMatch(input);                              \
128     CHECK_EQ(min, min_max.min_match);                                          \
129     CHECK_EQ(max, min_max.max_match);                                          \
130   }
131 
TEST(Parser)132 TEST(Parser) {
133   V8::Initialize(NULL);
134 
135   CHECK_PARSE_ERROR("?");
136 
137   CHECK_PARSE_EQ("abc", "'abc'");
138   CHECK_PARSE_EQ("", "%");
139   CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')");
140   CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
141   CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)");
142   CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
143   CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
144   CHECK_PARSE_EQ("a*", "(# 0 - g 'a')");
145   CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')");
146   CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))");
147   CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))");
148   CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))");
149   CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))");
150   CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
151   CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
152   CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
153   CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
154   CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
155   CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
156   CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
157   CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
158   CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
159   CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
160   CHECK_PARSE_EQ("(?:foo)", "'foo'");
161   CHECK_PARSE_EQ("(?: foo )", "' foo '");
162   CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
163   CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
164   CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
165   CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
166   CHECK_PARSE_EQ("()", "(^ %)");
167   CHECK_PARSE_EQ("(?=)", "(-> + %)");
168   CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]");   // Doesn't compile on windows
169   CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]");   // \uffff isn't in codepage 1252
170   CHECK_PARSE_EQ("[x]", "[x]");
171   CHECK_PARSE_EQ("[xyz]", "[x y z]");
172   CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
173   CHECK_PARSE_EQ("[-123]", "[- 1 2 3]");
174   CHECK_PARSE_EQ("[^123]", "^[1 2 3]");
175   CHECK_PARSE_EQ("]", "']'");
176   CHECK_PARSE_EQ("}", "'}'");
177   CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
178   CHECK_PARSE_EQ("[\\d]", "[0-9]");
179   CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
180   CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
181   CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
182   CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
183   // Control character outside character class.
184   CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
185                  "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
186   CHECK_PARSE_EQ("\\c!", "'\\c!'");
187   CHECK_PARSE_EQ("\\c_", "'\\c_'");
188   CHECK_PARSE_EQ("\\c~", "'\\c~'");
189   CHECK_PARSE_EQ("\\c1", "'\\c1'");
190   // Control character inside character class.
191   CHECK_PARSE_EQ("[\\c!]", "[\\ c !]");
192   CHECK_PARSE_EQ("[\\c_]", "[\\x1f]");
193   CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]");
194   CHECK_PARSE_EQ("[\\ca]", "[\\x01]");
195   CHECK_PARSE_EQ("[\\cz]", "[\\x1a]");
196   CHECK_PARSE_EQ("[\\cA]", "[\\x01]");
197   CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]");
198   CHECK_PARSE_EQ("[\\c1]", "[\\x11]");
199 
200   CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
201   CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
202   CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #  ]");
203   CHECK_PARSE_EQ("\\0", "'\\x00'");
204   CHECK_PARSE_EQ("\\8", "'8'");
205   CHECK_PARSE_EQ("\\9", "'9'");
206   CHECK_PARSE_EQ("\\11", "'\\x09'");
207   CHECK_PARSE_EQ("\\11a", "'\\x09a'");
208   CHECK_PARSE_EQ("\\011", "'\\x09'");
209   CHECK_PARSE_EQ("\\00011", "'\\x0011'");
210   CHECK_PARSE_EQ("\\118", "'\\x098'");
211   CHECK_PARSE_EQ("\\111", "'I'");
212   CHECK_PARSE_EQ("\\1111", "'I1'");
213   CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
214   CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
215   CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
216   CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
217   CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')"
218                                " (# 0 - g (<- 1)))");
219   CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')"
220                                " (# 0 - g (<- 2)))");
221   CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')"
222                                " (# 0 - g (<- 3)))");
223   CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')"
224                                " (# 0 - g '\\x04'))");
225   CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
226               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
227               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
228   CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
229               "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
230               " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
231   CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
232   CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
233   CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
234   CHECK_PARSE_EQ("(?=a)?a", "'a'");
235   CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
236   CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
237   CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
238   CHECK_PARSE_EQ("(?!a)?a", "'a'");
239   CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
240   CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
241   CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
242   CHECK_PARSE_EQ("[\\0]", "[\\x00]");
243   CHECK_PARSE_EQ("[\\11]", "[\\x09]");
244   CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
245   CHECK_PARSE_EQ("[\\011]", "[\\x09]");
246   CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]");
247   CHECK_PARSE_EQ("[\\118]", "[\\x09 8]");
248   CHECK_PARSE_EQ("[\\111]", "[I]");
249   CHECK_PARSE_EQ("[\\1111]", "[I 1]");
250   CHECK_PARSE_EQ("\\x34", "'\x34'");
251   CHECK_PARSE_EQ("\\x60", "'\x60'");
252   CHECK_PARSE_EQ("\\x3z", "'x3z'");
253   CHECK_PARSE_EQ("\\c", "'\\c'");
254   CHECK_PARSE_EQ("\\u0034", "'\x34'");
255   CHECK_PARSE_EQ("\\u003z", "'u003z'");
256   CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");
257 
258   CHECK_SIMPLE("a", true);
259   CHECK_SIMPLE("a|b", false);
260   CHECK_SIMPLE("a\\n", false);
261   CHECK_SIMPLE("^a", false);
262   CHECK_SIMPLE("a$", false);
263   CHECK_SIMPLE("a\\b!", false);
264   CHECK_SIMPLE("a\\Bb", false);
265   CHECK_SIMPLE("a*", false);
266   CHECK_SIMPLE("a*?", false);
267   CHECK_SIMPLE("a?", false);
268   CHECK_SIMPLE("a??", false);
269   CHECK_SIMPLE("a{0,1}?", false);
270   CHECK_SIMPLE("a{1,1}?", false);
271   CHECK_SIMPLE("a{1,2}?", false);
272   CHECK_SIMPLE("a+?", false);
273   CHECK_SIMPLE("(a)", false);
274   CHECK_SIMPLE("(a)\\1", false);
275   CHECK_SIMPLE("(\\1a)", false);
276   CHECK_SIMPLE("\\1(a)", false);
277   CHECK_SIMPLE("a\\s", false);
278   CHECK_SIMPLE("a\\S", false);
279   CHECK_SIMPLE("a\\d", false);
280   CHECK_SIMPLE("a\\D", false);
281   CHECK_SIMPLE("a\\w", false);
282   CHECK_SIMPLE("a\\W", false);
283   CHECK_SIMPLE("a.", false);
284   CHECK_SIMPLE("a\\q", false);
285   CHECK_SIMPLE("a[a]", false);
286   CHECK_SIMPLE("a[^a]", false);
287   CHECK_SIMPLE("a[a-z]", false);
288   CHECK_SIMPLE("a[\\q]", false);
289   CHECK_SIMPLE("a(?:b)", false);
290   CHECK_SIMPLE("a(?=b)", false);
291   CHECK_SIMPLE("a(?!b)", false);
292   CHECK_SIMPLE("\\x60", false);
293   CHECK_SIMPLE("\\u0060", false);
294   CHECK_SIMPLE("\\cA", false);
295   CHECK_SIMPLE("\\q", false);
296   CHECK_SIMPLE("\\1112", false);
297   CHECK_SIMPLE("\\0", false);
298   CHECK_SIMPLE("(a)\\1", false);
299   CHECK_SIMPLE("(?=a)?a", false);
300   CHECK_SIMPLE("(?!a)?a\\1", false);
301   CHECK_SIMPLE("(?:(?=a))a\\1", false);
302 
303   CHECK_PARSE_EQ("a{}", "'a{}'");
304   CHECK_PARSE_EQ("a{,}", "'a{,}'");
305   CHECK_PARSE_EQ("a{", "'a{'");
306   CHECK_PARSE_EQ("a{z}", "'a{z}'");
307   CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
308   CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
309   CHECK_PARSE_EQ("a{12,", "'a{12,'");
310   CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
311   CHECK_PARSE_EQ("{}", "'{}'");
312   CHECK_PARSE_EQ("{,}", "'{,}'");
313   CHECK_PARSE_EQ("{", "'{'");
314   CHECK_PARSE_EQ("{z}", "'{z}'");
315   CHECK_PARSE_EQ("{1z}", "'{1z}'");
316   CHECK_PARSE_EQ("{12z}", "'{12z}'");
317   CHECK_PARSE_EQ("{12,", "'{12,'");
318   CHECK_PARSE_EQ("{12,3b", "'{12,3b'");
319 
320   CHECK_MIN_MAX("a", 1, 1);
321   CHECK_MIN_MAX("abc", 3, 3);
322   CHECK_MIN_MAX("a[bc]d", 3, 3);
323   CHECK_MIN_MAX("a|bc", 1, 2);
324   CHECK_MIN_MAX("ab|c", 1, 2);
325   CHECK_MIN_MAX("a||bc", 0, 2);
326   CHECK_MIN_MAX("|", 0, 0);
327   CHECK_MIN_MAX("(?:ab)", 2, 2);
328   CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
329   CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
330   CHECK_MIN_MAX("(ab)", 2, 2);
331   CHECK_MIN_MAX("(ab|cde)", 2, 3);
332   CHECK_MIN_MAX("(ab)\\1", 2, 4);
333   CHECK_MIN_MAX("(ab|cde)\\1", 2, 6);
334   CHECK_MIN_MAX("(?:ab)?", 0, 2);
335   CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
336   CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
337   CHECK_MIN_MAX("a?", 0, 1);
338   CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
339   CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
340   CHECK_MIN_MAX("a??", 0, 1);
341   CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
342   CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
343   CHECK_MIN_MAX("(?:a?)?", 0, 1);
344   CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
345   CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
346   CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
347   CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
348   CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
349   CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
350   CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
351   CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
352   CHECK_MIN_MAX("a{0}", 0, 0);
353   CHECK_MIN_MAX("(?:a+){0}", 0, 0);
354   CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
355   CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
356   CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
357   CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
358   CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
359   CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
360   CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
361   CHECK_MIN_MAX("a\\bc", 2, 2);
362   CHECK_MIN_MAX("a\\Bc", 2, 2);
363   CHECK_MIN_MAX("a\\sc", 3, 3);
364   CHECK_MIN_MAX("a\\Sc", 3, 3);
365   CHECK_MIN_MAX("a(?=b)c", 2, 2);
366   CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
367   CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
368 }
369 
TEST(ParserRegression)370 TEST(ParserRegression) {
371   CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
372   CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
373   CHECK_PARSE_EQ("{", "'{'");
374   CHECK_PARSE_EQ("a|", "(| 'a' %)");
375 }
376 
ExpectError(const char * input,const char * expected)377 static void ExpectError(const char* input,
378                         const char* expected) {
379   V8::Initialize(NULL);
380   v8::HandleScope scope;
381   ZoneScope zone_scope(DELETE_ON_EXIT);
382   FlatStringReader reader(Isolate::Current(), CStrVector(input));
383   RegExpCompileData result;
384   CHECK(!v8::internal::RegExpParser::ParseRegExp(&reader, false, &result));
385   CHECK(result.tree == NULL);
386   CHECK(!result.error.is_null());
387   SmartPointer<char> str = result.error->ToCString(ALLOW_NULLS);
388   CHECK_EQ(expected, *str);
389 }
390 
391 
TEST(Errors)392 TEST(Errors) {
393   V8::Initialize(NULL);
394   const char* kEndBackslash = "\\ at end of pattern";
395   ExpectError("\\", kEndBackslash);
396   const char* kUnterminatedGroup = "Unterminated group";
397   ExpectError("(foo", kUnterminatedGroup);
398   const char* kInvalidGroup = "Invalid group";
399   ExpectError("(?", kInvalidGroup);
400   const char* kUnterminatedCharacterClass = "Unterminated character class";
401   ExpectError("[", kUnterminatedCharacterClass);
402   ExpectError("[a-", kUnterminatedCharacterClass);
403   const char* kNothingToRepeat = "Nothing to repeat";
404   ExpectError("*", kNothingToRepeat);
405   ExpectError("?", kNothingToRepeat);
406   ExpectError("+", kNothingToRepeat);
407   ExpectError("{1}", kNothingToRepeat);
408   ExpectError("{1,2}", kNothingToRepeat);
409   ExpectError("{1,}", kNothingToRepeat);
410 
411   // Check that we don't allow more than kMaxCapture captures
412   const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
413   const char* kTooManyCaptures = "Too many captures";
414   HeapStringAllocator allocator;
415   StringStream accumulator(&allocator);
416   for (int i = 0; i <= kMaxCaptures; i++) {
417     accumulator.Add("()");
418   }
419   SmartPointer<const char> many_captures(accumulator.ToCString());
420   ExpectError(*many_captures, kTooManyCaptures);
421 }
422 
423 
IsDigit(uc16 c)424 static bool IsDigit(uc16 c) {
425   return ('0' <= c && c <= '9');
426 }
427 
428 
NotDigit(uc16 c)429 static bool NotDigit(uc16 c) {
430   return !IsDigit(c);
431 }
432 
433 
IsWhiteSpace(uc16 c)434 static bool IsWhiteSpace(uc16 c) {
435   switch (c) {
436     case 0x09:
437     case 0x0A:
438     case 0x0B:
439     case 0x0C:
440     case 0x0d:
441     case 0x20:
442     case 0xA0:
443     case 0x2028:
444     case 0x2029:
445       return true;
446     default:
447       return unibrow::Space::Is(c);
448   }
449 }
450 
451 
NotWhiteSpace(uc16 c)452 static bool NotWhiteSpace(uc16 c) {
453   return !IsWhiteSpace(c);
454 }
455 
456 
NotWord(uc16 c)457 static bool NotWord(uc16 c) {
458   return !IsRegExpWord(c);
459 }
460 
461 
TestCharacterClassEscapes(uc16 c,bool (pred)(uc16 c))462 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
463   ZoneScope scope(DELETE_ON_EXIT);
464   ZoneList<CharacterRange>* ranges = new ZoneList<CharacterRange>(2);
465   CharacterRange::AddClassEscape(c, ranges);
466   for (unsigned i = 0; i < (1 << 16); i++) {
467     bool in_class = false;
468     for (int j = 0; !in_class && j < ranges->length(); j++) {
469       CharacterRange& range = ranges->at(j);
470       in_class = (range.from() <= i && i <= range.to());
471     }
472     CHECK_EQ(pred(i), in_class);
473   }
474 }
475 
476 
TEST(CharacterClassEscapes)477 TEST(CharacterClassEscapes) {
478   v8::internal::V8::Initialize(NULL);
479   TestCharacterClassEscapes('.', IsRegExpNewline);
480   TestCharacterClassEscapes('d', IsDigit);
481   TestCharacterClassEscapes('D', NotDigit);
482   TestCharacterClassEscapes('s', IsWhiteSpace);
483   TestCharacterClassEscapes('S', NotWhiteSpace);
484   TestCharacterClassEscapes('w', IsRegExpWord);
485   TestCharacterClassEscapes('W', NotWord);
486 }
487 
488 
Compile(const char * input,bool multiline,bool is_ascii)489 static RegExpNode* Compile(const char* input, bool multiline, bool is_ascii) {
490   V8::Initialize(NULL);
491   Isolate* isolate = Isolate::Current();
492   FlatStringReader reader(isolate, CStrVector(input));
493   RegExpCompileData compile_data;
494   if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline,
495                                                &compile_data))
496     return NULL;
497   Handle<String> pattern = isolate->factory()->
498       NewStringFromUtf8(CStrVector(input));
499   RegExpEngine::Compile(&compile_data, false, multiline, pattern, is_ascii);
500   return compile_data.node;
501 }
502 
503 
Execute(const char * input,bool multiline,bool is_ascii,bool dot_output=false)504 static void Execute(const char* input,
505                     bool multiline,
506                     bool is_ascii,
507                     bool dot_output = false) {
508   v8::HandleScope scope;
509   ZoneScope zone_scope(DELETE_ON_EXIT);
510   RegExpNode* node = Compile(input, multiline, is_ascii);
511   USE(node);
512 #ifdef DEBUG
513   if (dot_output) {
514     RegExpEngine::DotPrint(input, node, false);
515     exit(0);
516   }
517 #endif  // DEBUG
518 }
519 
520 
521 class TestConfig {
522  public:
523   typedef int Key;
524   typedef int Value;
525   static const int kNoKey;
526   static const int kNoValue;
Compare(int a,int b)527   static inline int Compare(int a, int b) {
528     if (a < b)
529       return -1;
530     else if (a > b)
531       return 1;
532     else
533       return 0;
534   }
535 };
536 
537 
538 const int TestConfig::kNoKey = 0;
539 const int TestConfig::kNoValue = 0;
540 
541 
PseudoRandom(int i,int j)542 static unsigned PseudoRandom(int i, int j) {
543   return ~(~((i * 781) ^ (j * 329)));
544 }
545 
546 
TEST(SplayTreeSimple)547 TEST(SplayTreeSimple) {
548   v8::internal::V8::Initialize(NULL);
549   static const unsigned kLimit = 1000;
550   ZoneScope zone_scope(DELETE_ON_EXIT);
551   ZoneSplayTree<TestConfig> tree;
552   bool seen[kLimit];
553   for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
554 #define CHECK_MAPS_EQUAL() do {                                      \
555     for (unsigned k = 0; k < kLimit; k++)                            \
556       CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
557   } while (false)
558   for (int i = 0; i < 50; i++) {
559     for (int j = 0; j < 50; j++) {
560       unsigned next = PseudoRandom(i, j) % kLimit;
561       if (seen[next]) {
562         // We've already seen this one.  Check the value and remove
563         // it.
564         ZoneSplayTree<TestConfig>::Locator loc;
565         CHECK(tree.Find(next, &loc));
566         CHECK_EQ(next, loc.key());
567         CHECK_EQ(3 * next, loc.value());
568         tree.Remove(next);
569         seen[next] = false;
570         CHECK_MAPS_EQUAL();
571       } else {
572         // Check that it wasn't there already and then add it.
573         ZoneSplayTree<TestConfig>::Locator loc;
574         CHECK(!tree.Find(next, &loc));
575         CHECK(tree.Insert(next, &loc));
576         CHECK_EQ(next, loc.key());
577         loc.set_value(3 * next);
578         seen[next] = true;
579         CHECK_MAPS_EQUAL();
580       }
581       int val = PseudoRandom(j, i) % kLimit;
582       if (seen[val]) {
583         ZoneSplayTree<TestConfig>::Locator loc;
584         CHECK(tree.FindGreatestLessThan(val, &loc));
585         CHECK_EQ(loc.key(), val);
586         break;
587       }
588       val = PseudoRandom(i + j, i - j) % kLimit;
589       if (seen[val]) {
590         ZoneSplayTree<TestConfig>::Locator loc;
591         CHECK(tree.FindLeastGreaterThan(val, &loc));
592         CHECK_EQ(loc.key(), val);
593         break;
594       }
595     }
596   }
597 }
598 
599 
TEST(DispatchTableConstruction)600 TEST(DispatchTableConstruction) {
601   v8::internal::V8::Initialize(NULL);
602   // Initialize test data.
603   static const int kLimit = 1000;
604   static const int kRangeCount = 8;
605   static const int kRangeSize = 16;
606   uc16 ranges[kRangeCount][2 * kRangeSize];
607   for (int i = 0; i < kRangeCount; i++) {
608     Vector<uc16> range(ranges[i], 2 * kRangeSize);
609     for (int j = 0; j < 2 * kRangeSize; j++) {
610       range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
611     }
612     range.Sort();
613     for (int j = 1; j < 2 * kRangeSize; j++) {
614       CHECK(range[j-1] <= range[j]);
615     }
616   }
617   // Enter test data into dispatch table.
618   ZoneScope zone_scope(DELETE_ON_EXIT);
619   DispatchTable table;
620   for (int i = 0; i < kRangeCount; i++) {
621     uc16* range = ranges[i];
622     for (int j = 0; j < 2 * kRangeSize; j += 2)
623       table.AddRange(CharacterRange(range[j], range[j + 1]), i);
624   }
625   // Check that the table looks as we would expect
626   for (int p = 0; p < kLimit; p++) {
627     OutSet* outs = table.Get(p);
628     for (int j = 0; j < kRangeCount; j++) {
629       uc16* range = ranges[j];
630       bool is_on = false;
631       for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
632         is_on = (range[k] <= p && p <= range[k + 1]);
633       CHECK_EQ(is_on, outs->Get(j));
634     }
635   }
636 }
637 
638 // Test of debug-only syntax.
639 #ifdef DEBUG
640 
TEST(ParsePossessiveRepetition)641 TEST(ParsePossessiveRepetition) {
642   bool old_flag_value = FLAG_regexp_possessive_quantifier;
643 
644   // Enable possessive quantifier syntax.
645   FLAG_regexp_possessive_quantifier = true;
646 
647   CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
648   CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
649   CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
650   CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
651   CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
652 
653   // Disable possessive quantifier syntax.
654   FLAG_regexp_possessive_quantifier = false;
655 
656   CHECK_PARSE_ERROR("a*+");
657   CHECK_PARSE_ERROR("a++");
658   CHECK_PARSE_ERROR("a?+");
659   CHECK_PARSE_ERROR("a{10,20}+");
660   CHECK_PARSE_ERROR("a{10,20}+b");
661 
662   FLAG_regexp_possessive_quantifier = old_flag_value;
663 }
664 
665 #endif
666 
667 // Tests of interpreter.
668 
669 
670 #ifndef V8_INTERPRETED_REGEXP
671 
672 #if V8_TARGET_ARCH_IA32
673 typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
674 #elif V8_TARGET_ARCH_X64
675 typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
676 #elif V8_TARGET_ARCH_ARM
677 typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
678 #elif V8_TARGET_ARCH_MIPS
679 typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
680 #endif
681 
682 class ContextInitializer {
683  public:
ContextInitializer()684   ContextInitializer()
685       : env_(), scope_(), zone_(DELETE_ON_EXIT) {
686     env_ = v8::Context::New();
687     env_->Enter();
688   }
~ContextInitializer()689   ~ContextInitializer() {
690     env_->Exit();
691     env_.Dispose();
692   }
693  private:
694   v8::Persistent<v8::Context> env_;
695   v8::HandleScope scope_;
696   v8::internal::ZoneScope zone_;
697 };
698 
699 
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * captures)700 static ArchRegExpMacroAssembler::Result Execute(Code* code,
701                                                 String* input,
702                                                 int start_offset,
703                                                 const byte* input_start,
704                                                 const byte* input_end,
705                                                 int* captures) {
706   return NativeRegExpMacroAssembler::Execute(
707       code,
708       input,
709       start_offset,
710       input_start,
711       input_end,
712       captures,
713       Isolate::Current());
714 }
715 
716 
TEST(MacroAssemblerNativeSuccess)717 TEST(MacroAssemblerNativeSuccess) {
718   v8::V8::Initialize();
719   ContextInitializer initializer;
720   Factory* factory = Isolate::Current()->factory();
721 
722   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
723 
724   m.Succeed();
725 
726   Handle<String> source = factory->NewStringFromAscii(CStrVector(""));
727   Handle<Object> code_object = m.GetCode(source);
728   Handle<Code> code = Handle<Code>::cast(code_object);
729 
730   int captures[4] = {42, 37, 87, 117};
731   Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo"));
732   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
733   const byte* start_adr =
734       reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
735 
736   NativeRegExpMacroAssembler::Result result =
737       Execute(*code,
738               *input,
739               0,
740               start_adr,
741               start_adr + seq_input->length(),
742               captures);
743 
744   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
745   CHECK_EQ(-1, captures[0]);
746   CHECK_EQ(-1, captures[1]);
747   CHECK_EQ(-1, captures[2]);
748   CHECK_EQ(-1, captures[3]);
749 }
750 
751 
TEST(MacroAssemblerNativeSimple)752 TEST(MacroAssemblerNativeSimple) {
753   v8::V8::Initialize();
754   ContextInitializer initializer;
755   Factory* factory = Isolate::Current()->factory();
756 
757   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
758 
759   uc16 foo_chars[3] = {'f', 'o', 'o'};
760   Vector<const uc16> foo(foo_chars, 3);
761 
762   Label fail;
763   m.CheckCharacters(foo, 0, &fail, true);
764   m.WriteCurrentPositionToRegister(0, 0);
765   m.AdvanceCurrentPosition(3);
766   m.WriteCurrentPositionToRegister(1, 0);
767   m.Succeed();
768   m.Bind(&fail);
769   m.Fail();
770 
771   Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo"));
772   Handle<Object> code_object = m.GetCode(source);
773   Handle<Code> code = Handle<Code>::cast(code_object);
774 
775   int captures[4] = {42, 37, 87, 117};
776   Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo"));
777   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
778   Address start_adr = seq_input->GetCharsAddress();
779 
780   NativeRegExpMacroAssembler::Result result =
781       Execute(*code,
782               *input,
783               0,
784               start_adr,
785               start_adr + input->length(),
786               captures);
787 
788   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
789   CHECK_EQ(0, captures[0]);
790   CHECK_EQ(3, captures[1]);
791   CHECK_EQ(-1, captures[2]);
792   CHECK_EQ(-1, captures[3]);
793 
794   input = factory->NewStringFromAscii(CStrVector("barbarbar"));
795   seq_input = Handle<SeqAsciiString>::cast(input);
796   start_adr = seq_input->GetCharsAddress();
797 
798   result = Execute(*code,
799                    *input,
800                    0,
801                    start_adr,
802                    start_adr + input->length(),
803                    captures);
804 
805   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
806 }
807 
808 
TEST(MacroAssemblerNativeSimpleUC16)809 TEST(MacroAssemblerNativeSimpleUC16) {
810   v8::V8::Initialize();
811   ContextInitializer initializer;
812   Factory* factory = Isolate::Current()->factory();
813 
814   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4);
815 
816   uc16 foo_chars[3] = {'f', 'o', 'o'};
817   Vector<const uc16> foo(foo_chars, 3);
818 
819   Label fail;
820   m.CheckCharacters(foo, 0, &fail, true);
821   m.WriteCurrentPositionToRegister(0, 0);
822   m.AdvanceCurrentPosition(3);
823   m.WriteCurrentPositionToRegister(1, 0);
824   m.Succeed();
825   m.Bind(&fail);
826   m.Fail();
827 
828   Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo"));
829   Handle<Object> code_object = m.GetCode(source);
830   Handle<Code> code = Handle<Code>::cast(code_object);
831 
832   int captures[4] = {42, 37, 87, 117};
833   const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o', '\xa0'};
834   Handle<String> input =
835       factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
836   Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
837   Address start_adr = seq_input->GetCharsAddress();
838 
839   NativeRegExpMacroAssembler::Result result =
840       Execute(*code,
841               *input,
842               0,
843               start_adr,
844               start_adr + input->length(),
845               captures);
846 
847   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
848   CHECK_EQ(0, captures[0]);
849   CHECK_EQ(3, captures[1]);
850   CHECK_EQ(-1, captures[2]);
851   CHECK_EQ(-1, captures[3]);
852 
853   const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a', '\xa0'};
854   input = factory->NewStringFromTwoByte(Vector<const uc16>(input_data2, 9));
855   seq_input = Handle<SeqTwoByteString>::cast(input);
856   start_adr = seq_input->GetCharsAddress();
857 
858   result = Execute(*code,
859                    *input,
860                    0,
861                    start_adr,
862                    start_adr + input->length() * 2,
863                    captures);
864 
865   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
866 }
867 
868 
TEST(MacroAssemblerNativeBacktrack)869 TEST(MacroAssemblerNativeBacktrack) {
870   v8::V8::Initialize();
871   ContextInitializer initializer;
872   Factory* factory = Isolate::Current()->factory();
873 
874   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
875 
876   Label fail;
877   Label backtrack;
878   m.LoadCurrentCharacter(10, &fail);
879   m.Succeed();
880   m.Bind(&fail);
881   m.PushBacktrack(&backtrack);
882   m.LoadCurrentCharacter(10, NULL);
883   m.Succeed();
884   m.Bind(&backtrack);
885   m.Fail();
886 
887   Handle<String> source = factory->NewStringFromAscii(CStrVector(".........."));
888   Handle<Object> code_object = m.GetCode(source);
889   Handle<Code> code = Handle<Code>::cast(code_object);
890 
891   Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo"));
892   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
893   Address start_adr = seq_input->GetCharsAddress();
894 
895   NativeRegExpMacroAssembler::Result result =
896       Execute(*code,
897               *input,
898               0,
899               start_adr,
900               start_adr + input->length(),
901               NULL);
902 
903   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
904 }
905 
906 
TEST(MacroAssemblerNativeBackReferenceASCII)907 TEST(MacroAssemblerNativeBackReferenceASCII) {
908   v8::V8::Initialize();
909   ContextInitializer initializer;
910   Factory* factory = Isolate::Current()->factory();
911 
912   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
913 
914   m.WriteCurrentPositionToRegister(0, 0);
915   m.AdvanceCurrentPosition(2);
916   m.WriteCurrentPositionToRegister(1, 0);
917   Label nomatch;
918   m.CheckNotBackReference(0, &nomatch);
919   m.Fail();
920   m.Bind(&nomatch);
921   m.AdvanceCurrentPosition(2);
922   Label missing_match;
923   m.CheckNotBackReference(0, &missing_match);
924   m.WriteCurrentPositionToRegister(2, 0);
925   m.Succeed();
926   m.Bind(&missing_match);
927   m.Fail();
928 
929   Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1"));
930   Handle<Object> code_object = m.GetCode(source);
931   Handle<Code> code = Handle<Code>::cast(code_object);
932 
933   Handle<String> input = factory->NewStringFromAscii(CStrVector("fooofo"));
934   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
935   Address start_adr = seq_input->GetCharsAddress();
936 
937   int output[4];
938   NativeRegExpMacroAssembler::Result result =
939       Execute(*code,
940               *input,
941               0,
942               start_adr,
943               start_adr + input->length(),
944               output);
945 
946   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
947   CHECK_EQ(0, output[0]);
948   CHECK_EQ(2, output[1]);
949   CHECK_EQ(6, output[2]);
950   CHECK_EQ(-1, output[3]);
951 }
952 
953 
TEST(MacroAssemblerNativeBackReferenceUC16)954 TEST(MacroAssemblerNativeBackReferenceUC16) {
955   v8::V8::Initialize();
956   ContextInitializer initializer;
957   Factory* factory = Isolate::Current()->factory();
958 
959   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4);
960 
961   m.WriteCurrentPositionToRegister(0, 0);
962   m.AdvanceCurrentPosition(2);
963   m.WriteCurrentPositionToRegister(1, 0);
964   Label nomatch;
965   m.CheckNotBackReference(0, &nomatch);
966   m.Fail();
967   m.Bind(&nomatch);
968   m.AdvanceCurrentPosition(2);
969   Label missing_match;
970   m.CheckNotBackReference(0, &missing_match);
971   m.WriteCurrentPositionToRegister(2, 0);
972   m.Succeed();
973   m.Bind(&missing_match);
974   m.Fail();
975 
976   Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1"));
977   Handle<Object> code_object = m.GetCode(source);
978   Handle<Code> code = Handle<Code>::cast(code_object);
979 
980   const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
981   Handle<String> input =
982       factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
983   Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
984   Address start_adr = seq_input->GetCharsAddress();
985 
986   int output[4];
987   NativeRegExpMacroAssembler::Result result =
988       Execute(*code,
989                   *input,
990                   0,
991                   start_adr,
992                   start_adr + input->length() * 2,
993                   output);
994 
995   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
996   CHECK_EQ(0, output[0]);
997   CHECK_EQ(2, output[1]);
998   CHECK_EQ(6, output[2]);
999   CHECK_EQ(-1, output[3]);
1000 }
1001 
1002 
1003 
TEST(MacroAssemblernativeAtStart)1004 TEST(MacroAssemblernativeAtStart) {
1005   v8::V8::Initialize();
1006   ContextInitializer initializer;
1007   Factory* factory = Isolate::Current()->factory();
1008 
1009   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
1010 
1011   Label not_at_start, newline, fail;
1012   m.CheckNotAtStart(&not_at_start);
1013   // Check that prevchar = '\n' and current = 'f'.
1014   m.CheckCharacter('\n', &newline);
1015   m.Bind(&fail);
1016   m.Fail();
1017   m.Bind(&newline);
1018   m.LoadCurrentCharacter(0, &fail);
1019   m.CheckNotCharacter('f', &fail);
1020   m.Succeed();
1021 
1022   m.Bind(&not_at_start);
1023   // Check that prevchar = 'o' and current = 'b'.
1024   Label prevo;
1025   m.CheckCharacter('o', &prevo);
1026   m.Fail();
1027   m.Bind(&prevo);
1028   m.LoadCurrentCharacter(0, &fail);
1029   m.CheckNotCharacter('b', &fail);
1030   m.Succeed();
1031 
1032   Handle<String> source = factory->NewStringFromAscii(CStrVector("(^f|ob)"));
1033   Handle<Object> code_object = m.GetCode(source);
1034   Handle<Code> code = Handle<Code>::cast(code_object);
1035 
1036   Handle<String> input = factory->NewStringFromAscii(CStrVector("foobar"));
1037   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1038   Address start_adr = seq_input->GetCharsAddress();
1039 
1040   NativeRegExpMacroAssembler::Result result =
1041       Execute(*code,
1042               *input,
1043               0,
1044               start_adr,
1045               start_adr + input->length(),
1046               NULL);
1047 
1048   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1049 
1050   result = Execute(*code,
1051                    *input,
1052                    3,
1053                    start_adr + 3,
1054                    start_adr + input->length(),
1055                    NULL);
1056 
1057   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1058 }
1059 
1060 
TEST(MacroAssemblerNativeBackRefNoCase)1061 TEST(MacroAssemblerNativeBackRefNoCase) {
1062   v8::V8::Initialize();
1063   ContextInitializer initializer;
1064   Factory* factory = Isolate::Current()->factory();
1065 
1066   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4);
1067 
1068   Label fail, succ;
1069 
1070   m.WriteCurrentPositionToRegister(0, 0);
1071   m.WriteCurrentPositionToRegister(2, 0);
1072   m.AdvanceCurrentPosition(3);
1073   m.WriteCurrentPositionToRegister(3, 0);
1074   m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "AbC".
1075   m.CheckNotBackReferenceIgnoreCase(2, &fail);  // Match "ABC".
1076   Label expected_fail;
1077   m.CheckNotBackReferenceIgnoreCase(2, &expected_fail);
1078   m.Bind(&fail);
1079   m.Fail();
1080 
1081   m.Bind(&expected_fail);
1082   m.AdvanceCurrentPosition(3);  // Skip "xYz"
1083   m.CheckNotBackReferenceIgnoreCase(2, &succ);
1084   m.Fail();
1085 
1086   m.Bind(&succ);
1087   m.WriteCurrentPositionToRegister(1, 0);
1088   m.Succeed();
1089 
1090   Handle<String> source =
1091       factory->NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)"));
1092   Handle<Object> code_object = m.GetCode(source);
1093   Handle<Code> code = Handle<Code>::cast(code_object);
1094 
1095   Handle<String> input =
1096       factory->NewStringFromAscii(CStrVector("aBcAbCABCxYzab"));
1097   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1098   Address start_adr = seq_input->GetCharsAddress();
1099 
1100   int output[4];
1101   NativeRegExpMacroAssembler::Result result =
1102       Execute(*code,
1103               *input,
1104               0,
1105               start_adr,
1106               start_adr + input->length(),
1107               output);
1108 
1109   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1110   CHECK_EQ(0, output[0]);
1111   CHECK_EQ(12, output[1]);
1112   CHECK_EQ(0, output[2]);
1113   CHECK_EQ(3, output[3]);
1114 }
1115 
1116 
1117 
TEST(MacroAssemblerNativeRegisters)1118 TEST(MacroAssemblerNativeRegisters) {
1119   v8::V8::Initialize();
1120   ContextInitializer initializer;
1121   Factory* factory = Isolate::Current()->factory();
1122 
1123   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6);
1124 
1125   uc16 foo_chars[3] = {'f', 'o', 'o'};
1126   Vector<const uc16> foo(foo_chars, 3);
1127 
1128   enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
1129   Label fail;
1130   Label backtrack;
1131   m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
1132   m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1133   m.PushBacktrack(&backtrack);
1134   m.WriteStackPointerToRegister(sp);
1135   // Fill stack and registers
1136   m.AdvanceCurrentPosition(2);
1137   m.WriteCurrentPositionToRegister(out1, 0);
1138   m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1139   m.PushBacktrack(&fail);
1140   // Drop backtrack stack frames.
1141   m.ReadStackPointerFromRegister(sp);
1142   // And take the first backtrack (to &backtrack)
1143   m.Backtrack();
1144 
1145   m.PushCurrentPosition();
1146   m.AdvanceCurrentPosition(2);
1147   m.PopCurrentPosition();
1148 
1149   m.Bind(&backtrack);
1150   m.PopRegister(out1);
1151   m.ReadCurrentPositionFromRegister(out1);
1152   m.AdvanceCurrentPosition(3);
1153   m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]
1154 
1155   Label loop;
1156   m.SetRegister(loop_cnt, 0);  // loop counter
1157   m.Bind(&loop);
1158   m.AdvanceRegister(loop_cnt, 1);
1159   m.AdvanceCurrentPosition(1);
1160   m.IfRegisterLT(loop_cnt, 3, &loop);
1161   m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]
1162 
1163   Label loop2;
1164   m.SetRegister(loop_cnt, 2);  // loop counter
1165   m.Bind(&loop2);
1166   m.AdvanceRegister(loop_cnt, -1);
1167   m.AdvanceCurrentPosition(1);
1168   m.IfRegisterGE(loop_cnt, 0, &loop2);
1169   m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]
1170 
1171   Label loop3;
1172   Label exit_loop3;
1173   m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1174   m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1175   m.ReadCurrentPositionFromRegister(out3);
1176   m.Bind(&loop3);
1177   m.AdvanceCurrentPosition(1);
1178   m.CheckGreedyLoop(&exit_loop3);
1179   m.GoTo(&loop3);
1180   m.Bind(&exit_loop3);
1181   m.PopCurrentPosition();
1182   m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]
1183 
1184   m.Succeed();
1185 
1186   m.Bind(&fail);
1187   m.Fail();
1188 
1189   Handle<String> source =
1190       factory->NewStringFromAscii(CStrVector("<loop test>"));
1191   Handle<Object> code_object = m.GetCode(source);
1192   Handle<Code> code = Handle<Code>::cast(code_object);
1193 
1194   // String long enough for test (content doesn't matter).
1195   Handle<String> input =
1196       factory->NewStringFromAscii(CStrVector("foofoofoofoofoo"));
1197   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1198   Address start_adr = seq_input->GetCharsAddress();
1199 
1200   int output[6];
1201   NativeRegExpMacroAssembler::Result result =
1202       Execute(*code,
1203               *input,
1204               0,
1205               start_adr,
1206               start_adr + input->length(),
1207               output);
1208 
1209   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1210   CHECK_EQ(0, output[0]);
1211   CHECK_EQ(3, output[1]);
1212   CHECK_EQ(6, output[2]);
1213   CHECK_EQ(9, output[3]);
1214   CHECK_EQ(9, output[4]);
1215   CHECK_EQ(-1, output[5]);
1216 }
1217 
1218 
TEST(MacroAssemblerStackOverflow)1219 TEST(MacroAssemblerStackOverflow) {
1220   v8::V8::Initialize();
1221   ContextInitializer initializer;
1222   Isolate* isolate = Isolate::Current();
1223   Factory* factory = isolate->factory();
1224 
1225   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0);
1226 
1227   Label loop;
1228   m.Bind(&loop);
1229   m.PushBacktrack(&loop);
1230   m.GoTo(&loop);
1231 
1232   Handle<String> source =
1233       factory->NewStringFromAscii(CStrVector("<stack overflow test>"));
1234   Handle<Object> code_object = m.GetCode(source);
1235   Handle<Code> code = Handle<Code>::cast(code_object);
1236 
1237   // String long enough for test (content doesn't matter).
1238   Handle<String> input =
1239       factory->NewStringFromAscii(CStrVector("dummy"));
1240   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1241   Address start_adr = seq_input->GetCharsAddress();
1242 
1243   NativeRegExpMacroAssembler::Result result =
1244       Execute(*code,
1245               *input,
1246               0,
1247               start_adr,
1248               start_adr + input->length(),
1249               NULL);
1250 
1251   CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
1252   CHECK(isolate->has_pending_exception());
1253   isolate->clear_pending_exception();
1254 }
1255 
1256 
TEST(MacroAssemblerNativeLotsOfRegisters)1257 TEST(MacroAssemblerNativeLotsOfRegisters) {
1258   v8::V8::Initialize();
1259   ContextInitializer initializer;
1260   Isolate* isolate = Isolate::Current();
1261   Factory* factory = isolate->factory();
1262 
1263   ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2);
1264 
1265   // At least 2048, to ensure the allocated space for registers
1266   // span one full page.
1267   const int large_number = 8000;
1268   m.WriteCurrentPositionToRegister(large_number, 42);
1269   m.WriteCurrentPositionToRegister(0, 0);
1270   m.WriteCurrentPositionToRegister(1, 1);
1271   Label done;
1272   m.CheckNotBackReference(0, &done);  // Performs a system-stack push.
1273   m.Bind(&done);
1274   m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
1275   m.PopRegister(1);
1276   m.Succeed();
1277 
1278   Handle<String> source =
1279       factory->NewStringFromAscii(CStrVector("<huge register space test>"));
1280   Handle<Object> code_object = m.GetCode(source);
1281   Handle<Code> code = Handle<Code>::cast(code_object);
1282 
1283   // String long enough for test (content doesn't matter).
1284   Handle<String> input =
1285       factory->NewStringFromAscii(CStrVector("sample text"));
1286   Handle<SeqAsciiString> seq_input = Handle<SeqAsciiString>::cast(input);
1287   Address start_adr = seq_input->GetCharsAddress();
1288 
1289   int captures[2];
1290   NativeRegExpMacroAssembler::Result result =
1291       Execute(*code,
1292               *input,
1293               0,
1294               start_adr,
1295               start_adr + input->length(),
1296               captures);
1297 
1298   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1299   CHECK_EQ(0, captures[0]);
1300   CHECK_EQ(42, captures[1]);
1301 
1302   isolate->clear_pending_exception();
1303 }
1304 
1305 #else  // V8_INTERPRETED_REGEXP
1306 
TEST(MacroAssembler)1307 TEST(MacroAssembler) {
1308   V8::Initialize(NULL);
1309   byte codes[1024];
1310   RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024));
1311   // ^f(o)o.
1312   Label fail, fail2, start;
1313   uc16 foo_chars[3];
1314   foo_chars[0] = 'f';
1315   foo_chars[1] = 'o';
1316   foo_chars[2] = 'o';
1317   Vector<const uc16> foo(foo_chars, 3);
1318   m.SetRegister(4, 42);
1319   m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
1320   m.AdvanceRegister(4, 42);
1321   m.GoTo(&start);
1322   m.Fail();
1323   m.Bind(&start);
1324   m.PushBacktrack(&fail2);
1325   m.CheckCharacters(foo, 0, &fail, true);
1326   m.WriteCurrentPositionToRegister(0, 0);
1327   m.PushCurrentPosition();
1328   m.AdvanceCurrentPosition(3);
1329   m.WriteCurrentPositionToRegister(1, 0);
1330   m.PopCurrentPosition();
1331   m.AdvanceCurrentPosition(1);
1332   m.WriteCurrentPositionToRegister(2, 0);
1333   m.AdvanceCurrentPosition(1);
1334   m.WriteCurrentPositionToRegister(3, 0);
1335   m.Succeed();
1336 
1337   m.Bind(&fail);
1338   m.Backtrack();
1339   m.Succeed();
1340 
1341   m.Bind(&fail2);
1342   m.PopRegister(0);
1343   m.Fail();
1344 
1345   Isolate* isolate = Isolate::Current();
1346   Factory* factory = isolate->factory();
1347   HandleScope scope(isolate);
1348 
1349   Handle<String> source = factory->NewStringFromAscii(CStrVector("^f(o)o"));
1350   Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
1351   int captures[5];
1352 
1353   const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
1354   Handle<String> f1_16 =
1355       factory->NewStringFromTwoByte(Vector<const uc16>(str1, 6));
1356 
1357   CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0));
1358   CHECK_EQ(0, captures[0]);
1359   CHECK_EQ(3, captures[1]);
1360   CHECK_EQ(1, captures[2]);
1361   CHECK_EQ(2, captures[3]);
1362   CHECK_EQ(84, captures[4]);
1363 
1364   const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
1365   Handle<String> f2_16 =
1366       factory->NewStringFromTwoByte(Vector<const uc16>(str2, 6));
1367 
1368   CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0));
1369   CHECK_EQ(42, captures[0]);
1370 }
1371 
1372 #endif  // V8_INTERPRETED_REGEXP
1373 
1374 
TEST(AddInverseToTable)1375 TEST(AddInverseToTable) {
1376   v8::internal::V8::Initialize(NULL);
1377   static const int kLimit = 1000;
1378   static const int kRangeCount = 16;
1379   for (int t = 0; t < 10; t++) {
1380     ZoneScope zone_scope(DELETE_ON_EXIT);
1381     ZoneList<CharacterRange>* ranges =
1382         new ZoneList<CharacterRange>(kRangeCount);
1383     for (int i = 0; i < kRangeCount; i++) {
1384       int from = PseudoRandom(t + 87, i + 25) % kLimit;
1385       int to = from + (PseudoRandom(i + 87, t + 25) % (kLimit / 20));
1386       if (to > kLimit) to = kLimit;
1387       ranges->Add(CharacterRange(from, to));
1388     }
1389     DispatchTable table;
1390     DispatchTableConstructor cons(&table, false);
1391     cons.set_choice_index(0);
1392     cons.AddInverse(ranges);
1393     for (int i = 0; i < kLimit; i++) {
1394       bool is_on = false;
1395       for (int j = 0; !is_on && j < kRangeCount; j++)
1396         is_on = ranges->at(j).Contains(i);
1397       OutSet* set = table.Get(i);
1398       CHECK_EQ(is_on, set->Get(0) == false);
1399     }
1400   }
1401   ZoneScope zone_scope(DELETE_ON_EXIT);
1402   ZoneList<CharacterRange>* ranges =
1403           new ZoneList<CharacterRange>(1);
1404   ranges->Add(CharacterRange(0xFFF0, 0xFFFE));
1405   DispatchTable table;
1406   DispatchTableConstructor cons(&table, false);
1407   cons.set_choice_index(0);
1408   cons.AddInverse(ranges);
1409   CHECK(!table.Get(0xFFFE)->Get(0));
1410   CHECK(table.Get(0xFFFF)->Get(0));
1411 }
1412 
1413 
canonicalize(uc32 c)1414 static uc32 canonicalize(uc32 c) {
1415   unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1416   int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1417   if (count == 0) {
1418     return c;
1419   } else {
1420     CHECK_EQ(1, count);
1421     return canon[0];
1422   }
1423 }
1424 
1425 
TEST(LatinCanonicalize)1426 TEST(LatinCanonicalize) {
1427   unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1428   for (char lower = 'a'; lower <= 'z'; lower++) {
1429     char upper = lower + ('A' - 'a');
1430     CHECK_EQ(canonicalize(lower), canonicalize(upper));
1431     unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1432     int length = un_canonicalize.get(lower, '\0', uncanon);
1433     CHECK_EQ(2, length);
1434     CHECK_EQ(upper, uncanon[0]);
1435     CHECK_EQ(lower, uncanon[1]);
1436   }
1437   for (uc32 c = 128; c < (1 << 21); c++)
1438     CHECK_GE(canonicalize(c), 128);
1439   unibrow::Mapping<unibrow::ToUppercase> to_upper;
1440   // Canonicalization is only defined for the Basic Multilingual Plane.
1441   for (uc32 c = 0; c < (1 << 16); c++) {
1442     unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
1443     int length = to_upper.get(c, '\0', upper);
1444     if (length == 0) {
1445       length = 1;
1446       upper[0] = c;
1447     }
1448     uc32 u = upper[0];
1449     if (length > 1 || (c >= 128 && u < 128))
1450       u = c;
1451     CHECK_EQ(u, canonicalize(c));
1452   }
1453 }
1454 
1455 
CanonRangeEnd(uc32 c)1456 static uc32 CanonRangeEnd(uc32 c) {
1457   unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1458   int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1459   if (count == 0) {
1460     return c;
1461   } else {
1462     CHECK_EQ(1, count);
1463     return canon[0];
1464   }
1465 }
1466 
1467 
TEST(RangeCanonicalization)1468 TEST(RangeCanonicalization) {
1469   // Check that we arrive at the same result when using the basic
1470   // range canonicalization primitives as when using immediate
1471   // canonicalization.
1472   unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1473   int block_start = 0;
1474   while (block_start <= 0xFFFF) {
1475     uc32 block_end = CanonRangeEnd(block_start);
1476     unsigned block_length = block_end - block_start + 1;
1477     if (block_length > 1) {
1478       unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1479       int first_length = un_canonicalize.get(block_start, '\0', first);
1480       for (unsigned i = 1; i < block_length; i++) {
1481         unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1482         int succ_length = un_canonicalize.get(block_start + i, '\0', succ);
1483         CHECK_EQ(first_length, succ_length);
1484         for (int j = 0; j < succ_length; j++) {
1485           int calc = first[j] + i;
1486           int found = succ[j];
1487           CHECK_EQ(calc, found);
1488         }
1489       }
1490     }
1491     block_start = block_start + block_length;
1492   }
1493 }
1494 
1495 
TEST(UncanonicalizeEquivalence)1496 TEST(UncanonicalizeEquivalence) {
1497   unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
1498   unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1499   for (int i = 0; i < (1 << 16); i++) {
1500     int length = un_canonicalize.get(i, '\0', chars);
1501     for (int j = 0; j < length; j++) {
1502       unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
1503       int length2 = un_canonicalize.get(chars[j], '\0', chars2);
1504       CHECK_EQ(length, length2);
1505       for (int k = 0; k < length; k++)
1506         CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
1507     }
1508   }
1509 }
1510 
1511 
TestRangeCaseIndependence(CharacterRange input,Vector<CharacterRange> expected)1512 static void TestRangeCaseIndependence(CharacterRange input,
1513                                       Vector<CharacterRange> expected) {
1514   ZoneScope zone_scope(DELETE_ON_EXIT);
1515   int count = expected.length();
1516   ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(count);
1517   input.AddCaseEquivalents(list, false);
1518   CHECK_EQ(count, list->length());
1519   for (int i = 0; i < list->length(); i++) {
1520     CHECK_EQ(expected[i].from(), list->at(i).from());
1521     CHECK_EQ(expected[i].to(), list->at(i).to());
1522   }
1523 }
1524 
1525 
TestSimpleRangeCaseIndependence(CharacterRange input,CharacterRange expected)1526 static void TestSimpleRangeCaseIndependence(CharacterRange input,
1527                                             CharacterRange expected) {
1528   EmbeddedVector<CharacterRange, 1> vector;
1529   vector[0] = expected;
1530   TestRangeCaseIndependence(input, vector);
1531 }
1532 
1533 
TEST(CharacterRangeCaseIndependence)1534 TEST(CharacterRangeCaseIndependence) {
1535   v8::internal::V8::Initialize(NULL);
1536   TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'),
1537                                   CharacterRange::Singleton('A'));
1538   TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'),
1539                                   CharacterRange::Singleton('Z'));
1540   TestSimpleRangeCaseIndependence(CharacterRange('a', 'z'),
1541                                   CharacterRange('A', 'Z'));
1542   TestSimpleRangeCaseIndependence(CharacterRange('c', 'f'),
1543                                   CharacterRange('C', 'F'));
1544   TestSimpleRangeCaseIndependence(CharacterRange('a', 'b'),
1545                                   CharacterRange('A', 'B'));
1546   TestSimpleRangeCaseIndependence(CharacterRange('y', 'z'),
1547                                   CharacterRange('Y', 'Z'));
1548   TestSimpleRangeCaseIndependence(CharacterRange('a' - 1, 'z' + 1),
1549                                   CharacterRange('A', 'Z'));
1550   TestSimpleRangeCaseIndependence(CharacterRange('A', 'Z'),
1551                                   CharacterRange('a', 'z'));
1552   TestSimpleRangeCaseIndependence(CharacterRange('C', 'F'),
1553                                   CharacterRange('c', 'f'));
1554   TestSimpleRangeCaseIndependence(CharacterRange('A' - 1, 'Z' + 1),
1555                                   CharacterRange('a', 'z'));
1556   // Here we need to add [l-z] to complete the case independence of
1557   // [A-Za-z] but we expect [a-z] to be added since we always add a
1558   // whole block at a time.
1559   TestSimpleRangeCaseIndependence(CharacterRange('A', 'k'),
1560                                   CharacterRange('a', 'z'));
1561 }
1562 
1563 
InClass(uc16 c,ZoneList<CharacterRange> * ranges)1564 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
1565   if (ranges == NULL)
1566     return false;
1567   for (int i = 0; i < ranges->length(); i++) {
1568     CharacterRange range = ranges->at(i);
1569     if (range.from() <= c && c <= range.to())
1570       return true;
1571   }
1572   return false;
1573 }
1574 
1575 
TEST(CharClassDifference)1576 TEST(CharClassDifference) {
1577   v8::internal::V8::Initialize(NULL);
1578   ZoneScope zone_scope(DELETE_ON_EXIT);
1579   ZoneList<CharacterRange>* base = new ZoneList<CharacterRange>(1);
1580   base->Add(CharacterRange::Everything());
1581   Vector<const uc16> overlay = CharacterRange::GetWordBounds();
1582   ZoneList<CharacterRange>* included = NULL;
1583   ZoneList<CharacterRange>* excluded = NULL;
1584   CharacterRange::Split(base, overlay, &included, &excluded);
1585   for (int i = 0; i < (1 << 16); i++) {
1586     bool in_base = InClass(i, base);
1587     if (in_base) {
1588       bool in_overlay = false;
1589       for (int j = 0; !in_overlay && j < overlay.length(); j += 2) {
1590         if (overlay[j] <= i && i <= overlay[j+1])
1591           in_overlay = true;
1592       }
1593       CHECK_EQ(in_overlay, InClass(i, included));
1594       CHECK_EQ(!in_overlay, InClass(i, excluded));
1595     } else {
1596       CHECK(!InClass(i, included));
1597       CHECK(!InClass(i, excluded));
1598     }
1599   }
1600 }
1601 
1602 
TEST(CanonicalizeCharacterSets)1603 TEST(CanonicalizeCharacterSets) {
1604   v8::internal::V8::Initialize(NULL);
1605   ZoneScope scope(DELETE_ON_EXIT);
1606   ZoneList<CharacterRange>* list = new ZoneList<CharacterRange>(4);
1607   CharacterSet set(list);
1608 
1609   list->Add(CharacterRange(10, 20));
1610   list->Add(CharacterRange(30, 40));
1611   list->Add(CharacterRange(50, 60));
1612   set.Canonicalize();
1613   ASSERT_EQ(3, list->length());
1614   ASSERT_EQ(10, list->at(0).from());
1615   ASSERT_EQ(20, list->at(0).to());
1616   ASSERT_EQ(30, list->at(1).from());
1617   ASSERT_EQ(40, list->at(1).to());
1618   ASSERT_EQ(50, list->at(2).from());
1619   ASSERT_EQ(60, list->at(2).to());
1620 
1621   list->Rewind(0);
1622   list->Add(CharacterRange(10, 20));
1623   list->Add(CharacterRange(50, 60));
1624   list->Add(CharacterRange(30, 40));
1625   set.Canonicalize();
1626   ASSERT_EQ(3, list->length());
1627   ASSERT_EQ(10, list->at(0).from());
1628   ASSERT_EQ(20, list->at(0).to());
1629   ASSERT_EQ(30, list->at(1).from());
1630   ASSERT_EQ(40, list->at(1).to());
1631   ASSERT_EQ(50, list->at(2).from());
1632   ASSERT_EQ(60, list->at(2).to());
1633 
1634   list->Rewind(0);
1635   list->Add(CharacterRange(30, 40));
1636   list->Add(CharacterRange(10, 20));
1637   list->Add(CharacterRange(25, 25));
1638   list->Add(CharacterRange(100, 100));
1639   list->Add(CharacterRange(1, 1));
1640   set.Canonicalize();
1641   ASSERT_EQ(5, list->length());
1642   ASSERT_EQ(1, list->at(0).from());
1643   ASSERT_EQ(1, list->at(0).to());
1644   ASSERT_EQ(10, list->at(1).from());
1645   ASSERT_EQ(20, list->at(1).to());
1646   ASSERT_EQ(25, list->at(2).from());
1647   ASSERT_EQ(25, list->at(2).to());
1648   ASSERT_EQ(30, list->at(3).from());
1649   ASSERT_EQ(40, list->at(3).to());
1650   ASSERT_EQ(100, list->at(4).from());
1651   ASSERT_EQ(100, list->at(4).to());
1652 
1653   list->Rewind(0);
1654   list->Add(CharacterRange(10, 19));
1655   list->Add(CharacterRange(21, 30));
1656   list->Add(CharacterRange(20, 20));
1657   set.Canonicalize();
1658   ASSERT_EQ(1, list->length());
1659   ASSERT_EQ(10, list->at(0).from());
1660   ASSERT_EQ(30, list->at(0).to());
1661 }
1662 
1663 // Checks whether a character is in the set represented by a list of ranges.
CharacterInSet(ZoneList<CharacterRange> * set,uc16 value)1664 static bool CharacterInSet(ZoneList<CharacterRange>* set, uc16 value) {
1665   for (int i = 0; i < set->length(); i++) {
1666     CharacterRange range = set->at(i);
1667     if (range.from() <= value && value <= range.to()) {
1668       return true;
1669     }
1670   }
1671   return false;
1672 }
1673 
TEST(CharacterRangeMerge)1674 TEST(CharacterRangeMerge) {
1675   v8::internal::V8::Initialize(NULL);
1676   ZoneScope zone_scope(DELETE_ON_EXIT);
1677   ZoneList<CharacterRange> l1(4);
1678   ZoneList<CharacterRange> l2(4);
1679   // Create all combinations of intersections of ranges, both singletons and
1680   // longer.
1681 
1682   int offset = 0;
1683 
1684   // The five kinds of singleton intersections:
1685   //     X
1686   //   Y      - outside before
1687   //    Y     - outside touching start
1688   //     Y    - overlap
1689   //      Y   - outside touching end
1690   //       Y  - outside after
1691 
1692   for (int i = 0; i < 5; i++) {
1693     l1.Add(CharacterRange::Singleton(offset + 2));
1694     l2.Add(CharacterRange::Singleton(offset + i));
1695     offset += 6;
1696   }
1697 
1698   // The seven kinds of singleton/non-singleton intersections:
1699   //    XXX
1700   //  Y        - outside before
1701   //   Y       - outside touching start
1702   //    Y      - inside touching start
1703   //     Y     - entirely inside
1704   //      Y    - inside touching end
1705   //       Y   - outside touching end
1706   //        Y  - disjoint after
1707 
1708   for (int i = 0; i < 7; i++) {
1709     l1.Add(CharacterRange::Range(offset + 2, offset + 4));
1710     l2.Add(CharacterRange::Singleton(offset + i));
1711     offset += 8;
1712   }
1713 
1714   // The eleven kinds of non-singleton intersections:
1715   //
1716   //       XXXXXXXX
1717   // YYYY                  - outside before.
1718   //   YYYY                - outside touching start.
1719   //     YYYY              - overlapping start
1720   //       YYYY            - inside touching start
1721   //         YYYY          - entirely inside
1722   //           YYYY        - inside touching end
1723   //             YYYY      - overlapping end
1724   //               YYYY    - outside touching end
1725   //                 YYYY  - outside after
1726   //       YYYYYYYY        - identical
1727   //     YYYYYYYYYYYY      - containing entirely.
1728 
1729   for (int i = 0; i < 9; i++) {
1730     l1.Add(CharacterRange::Range(offset + 6, offset + 15));  // Length 8.
1731     l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3));
1732     offset += 22;
1733   }
1734   l1.Add(CharacterRange::Range(offset + 6, offset + 15));
1735   l2.Add(CharacterRange::Range(offset + 6, offset + 15));
1736   offset += 22;
1737   l1.Add(CharacterRange::Range(offset + 6, offset + 15));
1738   l2.Add(CharacterRange::Range(offset + 4, offset + 17));
1739   offset += 22;
1740 
1741   // Different kinds of multi-range overlap:
1742   // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
1743   //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y
1744 
1745   l1.Add(CharacterRange::Range(offset, offset + 21));
1746   l1.Add(CharacterRange::Range(offset + 31, offset + 52));
1747   for (int i = 0; i < 6; i++) {
1748     l2.Add(CharacterRange::Range(offset + 2, offset + 5));
1749     l2.Add(CharacterRange::Singleton(offset + 8));
1750     offset += 9;
1751   }
1752 
1753   ASSERT(CharacterRange::IsCanonical(&l1));
1754   ASSERT(CharacterRange::IsCanonical(&l2));
1755 
1756   ZoneList<CharacterRange> first_only(4);
1757   ZoneList<CharacterRange> second_only(4);
1758   ZoneList<CharacterRange> both(4);
1759 
1760   // Merge one direction.
1761   CharacterRange::Merge(&l1, &l2, &first_only, &second_only, &both);
1762 
1763   CHECK(CharacterRange::IsCanonical(&first_only));
1764   CHECK(CharacterRange::IsCanonical(&second_only));
1765   CHECK(CharacterRange::IsCanonical(&both));
1766 
1767   for (uc16 i = 0; i < offset; i++) {
1768     bool in_first = CharacterInSet(&l1, i);
1769     bool in_second = CharacterInSet(&l2, i);
1770     CHECK((in_first && !in_second) == CharacterInSet(&first_only, i));
1771     CHECK((!in_first && in_second) == CharacterInSet(&second_only, i));
1772     CHECK((in_first && in_second) == CharacterInSet(&both, i));
1773   }
1774 
1775   first_only.Clear();
1776   second_only.Clear();
1777   both.Clear();
1778 
1779   // Merge other direction.
1780   CharacterRange::Merge(&l2, &l1, &second_only, &first_only, &both);
1781 
1782   CHECK(CharacterRange::IsCanonical(&first_only));
1783   CHECK(CharacterRange::IsCanonical(&second_only));
1784   CHECK(CharacterRange::IsCanonical(&both));
1785 
1786   for (uc16 i = 0; i < offset; i++) {
1787     bool in_first = CharacterInSet(&l1, i);
1788     bool in_second = CharacterInSet(&l2, i);
1789     CHECK((in_first && !in_second) == CharacterInSet(&first_only, i));
1790     CHECK((!in_first && in_second) == CharacterInSet(&second_only, i));
1791     CHECK((in_first && in_second) == CharacterInSet(&both, i));
1792   }
1793 
1794   first_only.Clear();
1795   second_only.Clear();
1796   both.Clear();
1797 
1798   // Merge but don't record all combinations.
1799   CharacterRange::Merge(&l1, &l2, NULL, NULL, &both);
1800 
1801   CHECK(CharacterRange::IsCanonical(&both));
1802 
1803   for (uc16 i = 0; i < offset; i++) {
1804     bool in_first = CharacterInSet(&l1, i);
1805     bool in_second = CharacterInSet(&l2, i);
1806     CHECK((in_first && in_second) == CharacterInSet(&both, i));
1807   }
1808 
1809   // Merge into same set.
1810   ZoneList<CharacterRange> all(4);
1811   CharacterRange::Merge(&l1, &l2, &all, &all, &all);
1812 
1813   CHECK(CharacterRange::IsCanonical(&all));
1814 
1815   for (uc16 i = 0; i < offset; i++) {
1816     bool in_first = CharacterInSet(&l1, i);
1817     bool in_second = CharacterInSet(&l2, i);
1818     CHECK((in_first || in_second) == CharacterInSet(&all, i));
1819   }
1820 }
1821 
1822 
TEST(Graph)1823 TEST(Graph) {
1824   V8::Initialize(NULL);
1825   Execute("\\b\\w+\\b", false, true, true);
1826 }
1827