• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 //     * Redistributions of source code must retain the above copyright
7 //       notice, this list of conditions and the following disclaimer.
8 //     * Redistributions in binary form must reproduce the above
9 //       copyright notice, this list of conditions and the following
10 //       disclaimer in the documentation and/or other materials provided
11 //       with the distribution.
12 //     * Neither the name of Google Inc. nor the names of its
13 //       contributors may be used to endorse or promote products derived
14 //       from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27 
28 #include <cstdlib>
29 #include <sstream>
30 
31 #include "include/v8.h"
32 #include "src/v8.h"
33 
34 #include "src/ast/ast.h"
35 #include "src/char-predicates-inl.h"
36 #include "src/ostreams.h"
37 #include "src/regexp/jsregexp.h"
38 #include "src/regexp/regexp-macro-assembler.h"
39 #include "src/regexp/regexp-macro-assembler-irregexp.h"
40 #include "src/regexp/regexp-parser.h"
41 #include "src/splay-tree-inl.h"
42 #include "src/string-stream.h"
43 #ifdef V8_INTERPRETED_REGEXP
44 #include "src/regexp/interpreter-irregexp.h"
45 #else  // V8_INTERPRETED_REGEXP
46 #include "src/macro-assembler.h"
47 #if V8_TARGET_ARCH_ARM
48 #include "src/arm/assembler-arm.h"  // NOLINT
49 #include "src/arm/macro-assembler-arm.h"
50 #include "src/regexp/arm/regexp-macro-assembler-arm.h"
51 #endif
52 #if V8_TARGET_ARCH_ARM64
53 #include "src/arm64/assembler-arm64.h"
54 #include "src/arm64/macro-assembler-arm64.h"
55 #include "src/regexp/arm64/regexp-macro-assembler-arm64.h"
56 #endif
57 #if V8_TARGET_ARCH_PPC
58 #include "src/ppc/assembler-ppc.h"
59 #include "src/ppc/macro-assembler-ppc.h"
60 #include "src/regexp/ppc/regexp-macro-assembler-ppc.h"
61 #endif
62 #if V8_TARGET_ARCH_MIPS
63 #include "src/mips/assembler-mips.h"
64 #include "src/mips/macro-assembler-mips.h"
65 #include "src/regexp/mips/regexp-macro-assembler-mips.h"
66 #endif
67 #if V8_TARGET_ARCH_MIPS64
68 #include "src/mips64/assembler-mips64.h"
69 #include "src/mips64/macro-assembler-mips64.h"
70 #include "src/regexp/mips64/regexp-macro-assembler-mips64.h"
71 #endif
72 #if V8_TARGET_ARCH_X64
73 #include "src/regexp/x64/regexp-macro-assembler-x64.h"
74 #include "src/x64/assembler-x64.h"
75 #include "src/x64/macro-assembler-x64.h"
76 #endif
77 #if V8_TARGET_ARCH_IA32
78 #include "src/ia32/assembler-ia32.h"
79 #include "src/ia32/macro-assembler-ia32.h"
80 #include "src/regexp/ia32/regexp-macro-assembler-ia32.h"
81 #endif
82 #if V8_TARGET_ARCH_X87
83 #include "src/regexp/x87/regexp-macro-assembler-x87.h"
84 #include "src/x87/assembler-x87.h"
85 #include "src/x87/macro-assembler-x87.h"
86 #endif
87 #endif  // V8_INTERPRETED_REGEXP
88 #include "test/cctest/cctest.h"
89 
90 using namespace v8::internal;
91 
92 
CheckParse(const char * input)93 static bool CheckParse(const char* input) {
94   v8::HandleScope scope(CcTest::isolate());
95   Zone zone;
96   FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
97   RegExpCompileData result;
98   return v8::internal::RegExpParser::ParseRegExp(
99       CcTest::i_isolate(), &zone, &reader, false, false, &result);
100 }
101 
102 
CheckParseEq(const char * input,const char * expected,bool unicode=false)103 static void CheckParseEq(const char* input, const char* expected,
104                          bool unicode = false) {
105   v8::HandleScope scope(CcTest::isolate());
106   Zone zone;
107   FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
108   RegExpCompileData result;
109   CHECK(v8::internal::RegExpParser::ParseRegExp(
110       CcTest::i_isolate(), &zone, &reader, false, unicode, &result));
111   CHECK(result.tree != NULL);
112   CHECK(result.error.is_null());
113   std::ostringstream os;
114   result.tree->Print(os, &zone);
115   if (strcmp(expected, os.str().c_str()) != 0) {
116     printf("%s | %s\n", expected, os.str().c_str());
117   }
118   CHECK_EQ(0, strcmp(expected, os.str().c_str()));
119 }
120 
121 
CheckSimple(const char * input)122 static bool CheckSimple(const char* input) {
123   v8::HandleScope scope(CcTest::isolate());
124   Zone zone;
125   FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
126   RegExpCompileData result;
127   CHECK(v8::internal::RegExpParser::ParseRegExp(
128       CcTest::i_isolate(), &zone, &reader, false, false, &result));
129   CHECK(result.tree != NULL);
130   CHECK(result.error.is_null());
131   return result.simple;
132 }
133 
// Result holder for CheckMinMaxMatch(): the min_match() and max_match() values
// reported by a parsed regexp tree.
134 struct MinMaxPair {
135   int min_match;
136   int max_match;
137 };
138 
139 
CheckMinMaxMatch(const char * input)140 static MinMaxPair CheckMinMaxMatch(const char* input) {
141   v8::HandleScope scope(CcTest::isolate());
142   Zone zone;
143   FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
144   RegExpCompileData result;
145   CHECK(v8::internal::RegExpParser::ParseRegExp(
146       CcTest::i_isolate(), &zone, &reader, false, false, &result));
147   CHECK(result.tree != NULL);
148   CHECK(result.error.is_null());
149   int min_match = result.tree->min_match();
150   int max_match = result.tree->max_match();
151   MinMaxPair pair = { min_match, max_match };
152   return pair;
153 }
154 
155 
// Asserts that the regexp parser rejects |input|.
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))

// Asserts that parsing |input| succeeds and that the parser's "simple" flag
// equals |simple|. The macro deliberately carries no trailing semicolon; the
// caller supplies it, which keeps the macro usable as a single statement.
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input))

// Asserts that the tree parsed from |input| reports |min| and |max| as its
// min_match()/max_match(). Wrapped in do { } while (false) so that the
// expansion plus the caller's semicolon forms exactly one statement (safe
// inside an unbraced if/else).
#define CHECK_MIN_MAX(input, min, max)                                         \
  do {                                                                         \
    MinMaxPair min_max = CheckMinMaxMatch(input);                              \
    CHECK_EQ(min, min_max.min_match);                                          \
    CHECK_EQ(max, min_max.max_match);                                          \
  } while (false)
163 
164 
// Shared body of the parser tests. Writes |lookbehind| to
// FLAG_harmony_regexp_lookbehind and sets FLAG_harmony_unicode_regexps to
// true, then checks, for a long list of patterns:
//   - the printed parse tree (CheckParseEq),
//   - the parser's "simple" flag (CHECK_SIMPLE),
//   - the tree's min/max match lengths (CHECK_MIN_MAX).
// When |lookbehind| is false the lookbehind patterns are expected to be
// rejected instead. Neither flag is restored before returning.
TestRegExpParser(bool lookbehind)165 void TestRegExpParser(bool lookbehind) {
166   FLAG_harmony_regexp_lookbehind = lookbehind;
167   FLAG_harmony_unicode_regexps = true;
168 
169   CHECK_PARSE_ERROR("?");
170 
  // Basic atoms, alternation, anchors, quantifiers, groups and lookarounds.
171   CheckParseEq("abc", "'abc'");
172   CheckParseEq("", "%");
173   CheckParseEq("abc|def", "(| 'abc' 'def')");
174   CheckParseEq("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
175   CheckParseEq("^xxx$", "(: @^i 'xxx' @$i)");
176   CheckParseEq("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
177   CheckParseEq("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
178   CheckParseEq("a*", "(# 0 - g 'a')");
179   CheckParseEq("a*?", "(# 0 - n 'a')");
180   CheckParseEq("abc+", "(: 'ab' (# 1 - g 'c'))");
181   CheckParseEq("abc+?", "(: 'ab' (# 1 - n 'c'))");
182   CheckParseEq("xyz?", "(: 'xy' (# 0 1 g 'z'))");
183   CheckParseEq("xyz??", "(: 'xy' (# 0 1 n 'z'))");
184   CheckParseEq("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
185   CheckParseEq("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
186   CheckParseEq("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
187   CheckParseEq("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
188   CheckParseEq("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
189   CheckParseEq("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
190   CheckParseEq("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
191   CheckParseEq("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
192   CheckParseEq("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
193   CheckParseEq("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
194   CheckParseEq("(?:foo)", "'foo'");
195   CheckParseEq("(?: foo )", "' foo '");
196   CheckParseEq("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
197   CheckParseEq("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
198   CheckParseEq("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
199   CheckParseEq("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
  // Lookbehind syntax parses only when the flag is on; otherwise it must be
  // rejected.
200   if (lookbehind) {
201     CheckParseEq("foo(?<=bar)baz", "(: 'foo' (<- + 'bar') 'baz')");
202     CheckParseEq("foo(?<!bar)baz", "(: 'foo' (<- - 'bar') 'baz')");
203   } else {
204     CHECK_PARSE_ERROR("foo(?<=bar)baz");
205     CHECK_PARSE_ERROR("foo(?<!bar)baz");
206   }
207   CheckParseEq("()", "(^ %)");
208   CheckParseEq("(?=)", "(-> + %)");
  // Character classes.
209   CheckParseEq("[]", "^[\\x00-\\uffff]");  // Doesn't compile on windows
210   CheckParseEq("[^]", "[\\x00-\\uffff]");  // \uffff isn't in codepage 1252
211   CheckParseEq("[x]", "[x]");
212   CheckParseEq("[xyz]", "[x y z]");
213   CheckParseEq("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
214   CheckParseEq("[-123]", "[- 1 2 3]");
215   CheckParseEq("[^123]", "^[1 2 3]");
216   CheckParseEq("]", "']'");
217   CheckParseEq("}", "'}'");
218   CheckParseEq("[a-b-c]", "[a-b - c]");
219   CheckParseEq("[\\d]", "[0-9]");
220   CheckParseEq("[x\\dz]", "[x 0-9 z]");
221   CheckParseEq("[\\d-z]", "[0-9 - z]");
222   CheckParseEq("[\\d-\\d]", "[0-9 - 0-9]");
223   CheckParseEq("[z-\\d]", "[z - 0-9]");
224   // Control character outside character class.
225   CheckParseEq("\\cj\\cJ\\ci\\cI\\ck\\cK", "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
226   CheckParseEq("\\c!", "'\\c!'");
227   CheckParseEq("\\c_", "'\\c_'");
228   CheckParseEq("\\c~", "'\\c~'");
229   CheckParseEq("\\c1", "'\\c1'");
230   // Control character inside character class.
231   CheckParseEq("[\\c!]", "[\\ c !]");
232   CheckParseEq("[\\c_]", "[\\x1f]");
233   CheckParseEq("[\\c~]", "[\\ c ~]");
234   CheckParseEq("[\\ca]", "[\\x01]");
235   CheckParseEq("[\\cz]", "[\\x1a]");
236   CheckParseEq("[\\cA]", "[\\x01]");
237   CheckParseEq("[\\cZ]", "[\\x1a]");
238   CheckParseEq("[\\c1]", "[\\x11]");
239 
  // Escaped punctuation, numeric escapes and back references.
240   CheckParseEq("[a\\]c]", "[a ] c]");
241   CheckParseEq("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
242   CheckParseEq("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ #  ]");
243   CheckParseEq("\\0", "'\\x00'");
244   CheckParseEq("\\8", "'8'");
245   CheckParseEq("\\9", "'9'");
246   CheckParseEq("\\11", "'\\x09'");
247   CheckParseEq("\\11a", "'\\x09a'");
248   CheckParseEq("\\011", "'\\x09'");
249   CheckParseEq("\\00011", "'\\x0011'");
250   CheckParseEq("\\118", "'\\x098'");
251   CheckParseEq("\\111", "'I'");
252   CheckParseEq("\\1111", "'I1'");
253   CheckParseEq("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
254   CheckParseEq("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
255   CheckParseEq("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
256   CheckParseEq("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
257   CheckParseEq("(x)(x)(x)\\1*",
258                "(: (^ 'x') (^ 'x') (^ 'x')"
259                " (# 0 - g (<- 1)))");
260   CheckParseEq("(x)(x)(x)\\2*",
261                "(: (^ 'x') (^ 'x') (^ 'x')"
262                " (# 0 - g (<- 2)))");
263   CheckParseEq("(x)(x)(x)\\3*",
264                "(: (^ 'x') (^ 'x') (^ 'x')"
265                " (# 0 - g (<- 3)))");
266   CheckParseEq("(x)(x)(x)\\4*",
267                "(: (^ 'x') (^ 'x') (^ 'x')"
268                " (# 0 - g '\\x04'))");
269   CheckParseEq("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
270                "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
271                " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
272   CheckParseEq("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
273                "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
274                " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
275   CheckParseEq("(a)\\1", "(: (^ 'a') (<- 1))");
276   CheckParseEq("(a\\1)", "(^ 'a')");
277   CheckParseEq("(\\1a)", "(^ 'a')");
278   CheckParseEq("(\\2)(\\1)", "(: (^ (<- 2)) (^ (<- 1)))");
279   CheckParseEq("(?=a)?a", "'a'");
280   CheckParseEq("(?=a){0,10}a", "'a'");
281   CheckParseEq("(?=a){1,10}a", "(: (-> + 'a') 'a')");
282   CheckParseEq("(?=a){9,10}a", "(: (-> + 'a') 'a')");
283   CheckParseEq("(?!a)?a", "'a'");
284   CheckParseEq("\\1(a)", "(: (<- 1) (^ 'a'))");
285   CheckParseEq("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
286   CheckParseEq("(?!\\1(a\\1)\\1)\\1",
287                "(: (-> - (: (<- 1) (^ 'a') (<- 1))) (<- 1))");
288   CheckParseEq("\\1\\2(a(?:\\1(b\\1\\2))\\2)\\1",
289                "(: (<- 1) (<- 2) (^ (: 'a' (^ 'b') (<- 2))) (<- 1))");
290   if (lookbehind) {
291     CheckParseEq("\\1\\2(a(?<=\\1(b\\1\\2))\\2)\\1",
292                  "(: (<- 1) (<- 2) (^ (: 'a' (<- + (^ 'b')) (<- 2))) (<- 1))");
293   }
294   CheckParseEq("[\\0]", "[\\x00]");
295   CheckParseEq("[\\11]", "[\\x09]");
296   CheckParseEq("[\\11a]", "[\\x09 a]");
297   CheckParseEq("[\\011]", "[\\x09]");
298   CheckParseEq("[\\00011]", "[\\x00 1 1]");
299   CheckParseEq("[\\118]", "[\\x09 8]");
300   CheckParseEq("[\\111]", "[I]");
301   CheckParseEq("[\\1111]", "[I 1]");
302   CheckParseEq("\\x34", "'\x34'");
303   CheckParseEq("\\x60", "'\x60'");
304   CheckParseEq("\\x3z", "'x3z'");
305   CheckParseEq("\\c", "'\\c'");
306   CheckParseEq("\\u0034", "'\x34'");
307   CheckParseEq("\\u003z", "'u003z'");
308   CheckParseEq("foo[z]*", "(: 'foo' (# 0 - g [z]))");
309 
310   // Unicode regexps
311   CheckParseEq("\\u{12345}", "'\\ud808\\udf45'", true);
312   CheckParseEq("\\u{12345}\\u{23456}", "(! '\\ud808\\udf45' '\\ud84d\\udc56')",
313                true);
314   CheckParseEq("\\u{12345}|\\u{23456}", "(| '\\ud808\\udf45' '\\ud84d\\udc56')",
315                true);
316   CheckParseEq("\\u{12345}{3}", "(# 3 3 g '\\ud808\\udf45')", true);
317   CheckParseEq("\\u{12345}*", "(# 0 - g '\\ud808\\udf45')", true);
318 
  // The parser's "simple" flag: of the patterns below only "a" is expected to
  // be flagged as simple.
319   CHECK_SIMPLE("", false);
320   CHECK_SIMPLE("a", true);
321   CHECK_SIMPLE("a|b", false);
322   CHECK_SIMPLE("a\\n", false);
323   CHECK_SIMPLE("^a", false);
324   CHECK_SIMPLE("a$", false);
325   CHECK_SIMPLE("a\\b!", false);
326   CHECK_SIMPLE("a\\Bb", false);
327   CHECK_SIMPLE("a*", false);
328   CHECK_SIMPLE("a*?", false);
329   CHECK_SIMPLE("a?", false);
330   CHECK_SIMPLE("a??", false);
331   CHECK_SIMPLE("a{0,1}?", false);
332   CHECK_SIMPLE("a{1,1}?", false);
333   CHECK_SIMPLE("a{1,2}?", false);
334   CHECK_SIMPLE("a+?", false);
335   CHECK_SIMPLE("(a)", false);
336   CHECK_SIMPLE("(a)\\1", false);
337   CHECK_SIMPLE("(\\1a)", false);
338   CHECK_SIMPLE("\\1(a)", false);
339   CHECK_SIMPLE("a\\s", false);
340   CHECK_SIMPLE("a\\S", false);
341   CHECK_SIMPLE("a\\d", false);
342   CHECK_SIMPLE("a\\D", false);
343   CHECK_SIMPLE("a\\w", false);
344   CHECK_SIMPLE("a\\W", false);
345   CHECK_SIMPLE("a.", false);
346   CHECK_SIMPLE("a\\q", false);
347   CHECK_SIMPLE("a[a]", false);
348   CHECK_SIMPLE("a[^a]", false);
349   CHECK_SIMPLE("a[a-z]", false);
350   CHECK_SIMPLE("a[\\q]", false);
351   CHECK_SIMPLE("a(?:b)", false);
352   CHECK_SIMPLE("a(?=b)", false);
353   CHECK_SIMPLE("a(?!b)", false);
354   CHECK_SIMPLE("\\x60", false);
355   CHECK_SIMPLE("\\u0060", false);
356   CHECK_SIMPLE("\\cA", false);
357   CHECK_SIMPLE("\\q", false);
358   CHECK_SIMPLE("\\1112", false);
359   CHECK_SIMPLE("\\0", false);
360   CHECK_SIMPLE("(a)\\1", false);
361   CHECK_SIMPLE("(?=a)?a", false);
362   CHECK_SIMPLE("(?!a)?a\\1", false);
363   CHECK_SIMPLE("(?:(?=a))a\\1", false);
364 
  // Brace sequences that do not form a valid quantifier are expected to parse
  // as literal text.
365   CheckParseEq("a{}", "'a{}'");
366   CheckParseEq("a{,}", "'a{,}'");
367   CheckParseEq("a{", "'a{'");
368   CheckParseEq("a{z}", "'a{z}'");
369   CheckParseEq("a{1z}", "'a{1z}'");
370   CheckParseEq("a{12z}", "'a{12z}'");
371   CheckParseEq("a{12,", "'a{12,'");
372   CheckParseEq("a{12,3b", "'a{12,3b'");
373   CheckParseEq("{}", "'{}'");
374   CheckParseEq("{,}", "'{,}'");
375   CheckParseEq("{", "'{'");
376   CheckParseEq("{z}", "'{z}'");
377   CheckParseEq("{1z}", "'{1z}'");
378   CheckParseEq("{12z}", "'{12z}'");
379   CheckParseEq("{12,", "'{12,'");
380   CheckParseEq("{12,3b", "'{12,3b'");
381 
  // Minimum / maximum match lengths reported by the parsed tree.
382   CHECK_MIN_MAX("a", 1, 1);
383   CHECK_MIN_MAX("abc", 3, 3);
384   CHECK_MIN_MAX("a[bc]d", 3, 3);
385   CHECK_MIN_MAX("a|bc", 1, 2);
386   CHECK_MIN_MAX("ab|c", 1, 2);
387   CHECK_MIN_MAX("a||bc", 0, 2);
388   CHECK_MIN_MAX("|", 0, 0);
389   CHECK_MIN_MAX("(?:ab)", 2, 2);
390   CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
391   CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
392   CHECK_MIN_MAX("(ab)", 2, 2);
393   CHECK_MIN_MAX("(ab|cde)", 2, 3);
394   CHECK_MIN_MAX("(ab)\\1", 2, RegExpTree::kInfinity);
395   CHECK_MIN_MAX("(ab|cde)\\1", 2, RegExpTree::kInfinity);
396   CHECK_MIN_MAX("(?:ab)?", 0, 2);
397   CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
398   CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
399   CHECK_MIN_MAX("a?", 0, 1);
400   CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
401   CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
402   CHECK_MIN_MAX("a??", 0, 1);
403   CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
404   CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
405   CHECK_MIN_MAX("(?:a?)?", 0, 1);
406   CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
407   CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
408   CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
409   CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
410   CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
411   CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
412   CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
413   CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
414   CHECK_MIN_MAX("a{0}", 0, 0);
415   CHECK_MIN_MAX("(?:a+){0}", 0, 0);
416   CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
417   CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
418   CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
419   CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
420   CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
421   CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
422   CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
423   CHECK_MIN_MAX("a\\bc", 2, 2);
424   CHECK_MIN_MAX("a\\Bc", 2, 2);
425   CHECK_MIN_MAX("a\\sc", 3, 3);
426   CHECK_MIN_MAX("a\\Sc", 3, 3);
427   CHECK_MIN_MAX("a(?=b)c", 2, 2);
428   CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
429   CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
430 }
431 
432 
// Runs the shared parser checks with the lookbehind flag switched on, so the
// lookbehind patterns are expected to parse.
TEST(ParserWithLookbehind)433 TEST(ParserWithLookbehind) {
434   TestRegExpParser(true);  // Lookbehind enabled.
435 }
436 
437 
// Runs the shared parser checks with the lookbehind flag switched off. In this
// mode TestRegExpParser expects the lookbehind patterns ("(?<=...)" and
// "(?<!...)") to be rejected as parse errors.
TEST(ParserWithoutLookbehind) {
  TestRegExpParser(false);  // Lookbehind disabled.
}
441 
442 
// Pins the printed parse tree for a handful of individual patterns: a class
// ending in '-', a malformed quantifier containing '*', a lone '{', and an
// alternation with an empty right-hand side.
TEST(ParserRegression)443 TEST(ParserRegression) {
444   CheckParseEq("[A-Z$-][x]", "(! [A-Z $ -] [x])");
445   CheckParseEq("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
446   CheckParseEq("{", "'{'");
447   CheckParseEq("a|", "(| 'a' %)");
448 }
449 
ExpectError(const char * input,const char * expected)450 static void ExpectError(const char* input,
451                         const char* expected) {
452   v8::HandleScope scope(CcTest::isolate());
453   Zone zone;
454   FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
455   RegExpCompileData result;
456   CHECK(!v8::internal::RegExpParser::ParseRegExp(
457             CcTest::i_isolate(), &zone, &reader, false, false, &result));
458   CHECK(result.tree == NULL);
459   CHECK(!result.error.is_null());
460   v8::base::SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
461   CHECK_EQ(0, strcmp(expected, str.get()));
462 }
463 
464 
// Checks the error message the parser produces for a set of malformed
// patterns, finishing with a pattern that has one capture group more than
// kMaxCaptures.
TEST(Errors) {
  // Expected messages.
  const char* kEndBackslash = "\\ at end of pattern";
  const char* kUnterminatedGroup = "Unterminated group";
  const char* kInvalidGroup = "Invalid group";
  const char* kUnterminatedCharacterClass = "Unterminated character class";
  const char* kNothingToRepeat = "Nothing to repeat";
  const char* kTooManyCaptures = "Too many captures";

  ExpectError("\\", kEndBackslash);
  ExpectError("(foo", kUnterminatedGroup);
  ExpectError("(?", kInvalidGroup);
  ExpectError("[", kUnterminatedCharacterClass);
  ExpectError("[a-", kUnterminatedCharacterClass);

  // Quantifiers with no preceding atom.
  static const char* const kBareQuantifiers[] = {"*",   "?",     "+",
                                                 "{1}", "{1,2}", "{1,}"};
  const size_t bare_count =
      sizeof(kBareQuantifiers) / sizeof(kBareQuantifiers[0]);
  for (size_t q = 0; q < bare_count; q++) {
    ExpectError(kBareQuantifiers[q], kNothingToRepeat);
  }

  // Check that we don't allow more than kMaxCaptures captures: build a
  // pattern of kMaxCaptures + 1 empty groups.
  const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
  std::ostringstream pattern;
  for (int remaining = kMaxCaptures + 1; remaining > 0; remaining--) {
    pattern << "()";
  }
  ExpectError(pattern.str().c_str(), kTooManyCaptures);
}
492 
493 
IsDigit(uc16 c)494 static bool IsDigit(uc16 c) {
495   return ('0' <= c && c <= '9');
496 }
497 
498 
NotDigit(uc16 c)499 static bool NotDigit(uc16 c) {
500   return !IsDigit(c);
501 }
502 
503 
IsWhiteSpaceOrLineTerminator(uc16 c)504 static bool IsWhiteSpaceOrLineTerminator(uc16 c) {
505   // According to ECMA 5.1, 15.10.2.12 the CharacterClassEscape \s includes
506   // WhiteSpace (7.2) and LineTerminator (7.3) values.
507   return v8::internal::WhiteSpaceOrLineTerminator::Is(c);
508 }
509 
510 
NotWhiteSpaceNorLineTermiantor(uc16 c)511 static bool NotWhiteSpaceNorLineTermiantor(uc16 c) {
512   return !IsWhiteSpaceOrLineTerminator(c);
513 }
514 
515 
NotWord(uc16 c)516 static bool NotWord(uc16 c) {
517   return !IsRegExpWord(c);
518 }
519 
520 
TestCharacterClassEscapes(uc16 c,bool (pred)(uc16 c))521 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
522   Zone zone;
523   ZoneList<CharacterRange>* ranges =
524       new(&zone) ZoneList<CharacterRange>(2, &zone);
525   CharacterRange::AddClassEscape(c, ranges, &zone);
526   for (unsigned i = 0; i < (1 << 16); i++) {
527     bool in_class = false;
528     for (int j = 0; !in_class && j < ranges->length(); j++) {
529       CharacterRange& range = ranges->at(j);
530       in_class = (range.from() <= i && i <= range.to());
531     }
532     CHECK_EQ(pred(i), in_class);
533   }
534 }
535 
536 
// Checks each supported class escape character against its reference
// predicate: '.', 'd'/'D', 's'/'S' and 'w'/'W'.
TEST(CharacterClassEscapes)537 TEST(CharacterClassEscapes) {
538   TestCharacterClassEscapes('.', IsRegExpNewline);
539   TestCharacterClassEscapes('d', IsDigit);
540   TestCharacterClassEscapes('D', NotDigit);
541   TestCharacterClassEscapes('s', IsWhiteSpaceOrLineTerminator);
542   TestCharacterClassEscapes('S', NotWhiteSpaceNorLineTermiantor);
543   TestCharacterClassEscapes('w', IsRegExpWord);
544   TestCharacterClassEscapes('W', NotWord);
545 }
546 
547 
Compile(const char * input,bool multiline,bool unicode,bool is_one_byte,Zone * zone)548 static RegExpNode* Compile(const char* input, bool multiline, bool unicode,
549                            bool is_one_byte, Zone* zone) {
550   Isolate* isolate = CcTest::i_isolate();
551   FlatStringReader reader(isolate, CStrVector(input));
552   RegExpCompileData compile_data;
553   if (!v8::internal::RegExpParser::ParseRegExp(CcTest::i_isolate(), zone,
554                                                &reader, multiline, unicode,
555                                                &compile_data))
556     return NULL;
557   Handle<String> pattern = isolate->factory()
558                                ->NewStringFromUtf8(CStrVector(input))
559                                .ToHandleChecked();
560   Handle<String> sample_subject =
561       isolate->factory()->NewStringFromUtf8(CStrVector("")).ToHandleChecked();
562   RegExpEngine::Compile(isolate, zone, &compile_data, false, false, multiline,
563                         false, pattern, sample_subject, is_one_byte);
564   return compile_data.node;
565 }
566 
567 
Execute(const char * input,bool multiline,bool unicode,bool is_one_byte,bool dot_output=false)568 static void Execute(const char* input, bool multiline, bool unicode,
569                     bool is_one_byte, bool dot_output = false) {
570   v8::HandleScope scope(CcTest::isolate());
571   Zone zone;
572   RegExpNode* node = Compile(input, multiline, unicode, is_one_byte, &zone);
573   USE(node);
574 #ifdef DEBUG
575   if (dot_output) {
576     RegExpEngine::DotPrint(input, node, false);
577   }
578 #endif  // DEBUG
579 }
580 
581 
// Configuration type handed to ZoneSplayTree in the splay tree test: int keys
// and int values, with 0 serving as both the "no key" and "no value" marker.
class TestConfig {
 public:
  typedef int Key;
  typedef int Value;
  static const int kNoKey;
  static int NoValue() { return 0; }
  // Three-way comparison: -1 when a < b, 1 when a > b, 0 when equal.
  static inline int Compare(int a, int b) {
    if (a == b) return 0;
    return (a < b) ? -1 : 1;
  }
};


const int TestConfig::kNoKey = 0;
600 
601 
// Deterministic mixing function used to generate test data: XORs i * 781 with
// j * 329 and returns the result converted to unsigned.
static unsigned PseudoRandom(int i, int j) {
  const int scaled_i = i * 781;
  const int scaled_j = j * 329;
  const int mixed = scaled_i ^ scaled_j;
  return mixed;
}
605 
606 
// Drives a ZoneSplayTree<TestConfig> with pseudo-random inserts and removes
// and cross-checks it against the plain boolean array |seen|, which records
// the keys expected to be present. Every inserted key is stored with the
// value 3 * key.
TEST(SplayTreeSimple)607 TEST(SplayTreeSimple) {
608   static const unsigned kLimit = 1000;
609   Zone zone;
610   ZoneSplayTree<TestConfig> tree(&zone);
611   bool seen[kLimit];
612   for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
  // Verifies that tree.Find() agrees with |seen| for every key below kLimit.
  // The macro uses a variable named |loc| from the scope it is expanded in.
613 #define CHECK_MAPS_EQUAL() do {                                      \
614     for (unsigned k = 0; k < kLimit; k++)                            \
615       CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
616   } while (false)
617   for (int i = 0; i < 50; i++) {
618     for (int j = 0; j < 50; j++) {
619       int next = PseudoRandom(i, j) % kLimit;
620       if (seen[next]) {
621         // We've already seen this one.  Check the value and remove
622         // it.
623         ZoneSplayTree<TestConfig>::Locator loc;
624         CHECK(tree.Find(next, &loc));
625         CHECK_EQ(next, loc.key());
626         CHECK_EQ(3 * next, loc.value());
627         tree.Remove(next);
628         seen[next] = false;
629         CHECK_MAPS_EQUAL();
630       } else {
631         // Check that it wasn't there already and then add it.
632         ZoneSplayTree<TestConfig>::Locator loc;
633         CHECK(!tree.Find(next, &loc));
634         CHECK(tree.Insert(next, &loc));
635         CHECK_EQ(next, loc.key());
636         loc.set_value(3 * next);
637         seen[next] = true;
638         CHECK_MAPS_EQUAL();
639       }
      // For a key known to be present, FindGreatestLessThan is expected to
      // locate that key itself. The break ends the inner j loop.
640       int val = PseudoRandom(j, i) % kLimit;
641       if (seen[val]) {
642         ZoneSplayTree<TestConfig>::Locator loc;
643         CHECK(tree.FindGreatestLessThan(val, &loc));
644         CHECK_EQ(loc.key(), val);
645         break;
646       }
      // Same expectation for FindLeastGreaterThan on a present key.
647       val = PseudoRandom(i + j, i - j) % kLimit;
648       if (seen[val]) {
649         ZoneSplayTree<TestConfig>::Locator loc;
650         CHECK(tree.FindLeastGreaterThan(val, &loc));
651         CHECK_EQ(loc.key(), val);
652         break;
653       }
654     }
655   }
656 }
657 
658 
// Fills a DispatchTable with kRangeCount sets of pseudo-random character
// ranges and then verifies, for every position below kLimit, that the OutSet
// returned by the table contains exactly the sets that have a range covering
// that position.
TEST(DispatchTableConstruction) {
  static const int kLimit = 1000;
  static const int kRangeCount = 8;
  static const int kRangeSize = 16;
  // Each range contributes two endpoints (from, to).
  static const int kEndpointCount = 2 * kRangeSize;

  // Generate the endpoints for each set and sort them so that consecutive
  // pairs form valid (from <= to) ranges.
  uc16 ranges[kRangeCount][kEndpointCount];
  for (int set = 0; set < kRangeCount; set++) {
    Vector<uc16> endpoints(ranges[set], kEndpointCount);
    for (int e = 0; e < kEndpointCount; e++) {
      endpoints[e] = PseudoRandom(set + 25, e + 87) % kLimit;
    }
    endpoints.Sort();
    for (int e = 1; e < kEndpointCount; e++) {
      CHECK(endpoints[e - 1] <= endpoints[e]);
    }
  }

  // Enter the ranges into the dispatch table, tagged with their set index.
  Zone zone;
  DispatchTable table(&zone);
  for (int set = 0; set < kRangeCount; set++) {
    uc16* endpoints = ranges[set];
    for (int e = 0; e < kEndpointCount; e += 2) {
      table.AddRange(CharacterRange(endpoints[e], endpoints[e + 1]), set,
                     &zone);
    }
  }

  // Compare the table against a brute-force scan of the raw ranges.
  for (int p = 0; p < kLimit; p++) {
    OutSet* outs = table.Get(p);
    for (int set = 0; set < kRangeCount; set++) {
      uc16* endpoints = ranges[set];
      bool covered = false;
      for (int e = 0; e < kEndpointCount; e += 2) {
        if (endpoints[e] <= p && p <= endpoints[e + 1]) {
          covered = true;
          break;
        }
      }
      CHECK_EQ(covered, outs->Get(set));
    }
  }
}
695 
696 
697 // Test of debug-only syntax.
698 #ifdef DEBUG
699 
// Checks possessive quantifier syntax ("*+", "++", "?+", "{m,n}+"): with
// FLAG_regexp_possessive_quantifier on, the patterns parse to quantifier nodes
// printed with 'p'; with the flag off, the same syntax is a parse error. The
// flag's previous value is saved on entry and restored at the end.
TEST(ParsePossessiveRepetition)700 TEST(ParsePossessiveRepetition) {
701   bool old_flag_value = FLAG_regexp_possessive_quantifier;
702 
703   // Enable possessive quantifier syntax.
704   FLAG_regexp_possessive_quantifier = true;
705 
706   CheckParseEq("a*+", "(# 0 - p 'a')");
707   CheckParseEq("a++", "(# 1 - p 'a')");
708   CheckParseEq("a?+", "(# 0 1 p 'a')");
709   CheckParseEq("a{10,20}+", "(# 10 20 p 'a')");
710   CheckParseEq("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");
711 
712   // Disable possessive quantifier syntax.
713   FLAG_regexp_possessive_quantifier = false;
714 
715   CHECK_PARSE_ERROR("a*+");
716   CHECK_PARSE_ERROR("a++");
717   CHECK_PARSE_ERROR("a?+");
718   CHECK_PARSE_ERROR("a{10,20}+");
719   CHECK_PARSE_ERROR("a{10,20}+b");
720 
721   FLAG_regexp_possessive_quantifier = old_flag_value;
722 }
723 
724 #endif
725 
726 // Tests of interpreter.
727 
728 
729 #ifndef V8_INTERPRETED_REGEXP
730 
// Selects the native regexp macro assembler class for the target architecture
// and exposes it under the common name ArchRegExpMacroAssembler.
// NOTE(review): the MIPS64 branch aliases RegExpMacroAssemblerMIPS; presumably
// the mips64 header declares its class under that name -- confirm.
731 #if V8_TARGET_ARCH_IA32
732 typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
733 #elif V8_TARGET_ARCH_X64
734 typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
735 #elif V8_TARGET_ARCH_ARM
736 typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
737 #elif V8_TARGET_ARCH_ARM64
738 typedef RegExpMacroAssemblerARM64 ArchRegExpMacroAssembler;
739 #elif V8_TARGET_ARCH_PPC
740 typedef RegExpMacroAssemblerPPC ArchRegExpMacroAssembler;
741 #elif V8_TARGET_ARCH_MIPS
742 typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
743 #elif V8_TARGET_ARCH_MIPS64
744 typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
745 #elif V8_TARGET_ARCH_X87
746 typedef RegExpMacroAssemblerX87 ArchRegExpMacroAssembler;
747 #endif
748 
// Scope helper for the native macro assembler tests: the constructor opens a
// handle scope, creates a new v8::Context and enters it; the destructor exits
// that context.
749 class ContextInitializer {
750  public:
ContextInitializer()751   ContextInitializer()
752       : scope_(CcTest::isolate()),
753         env_(v8::Context::New(CcTest::isolate())) {
754     env_->Enter();
755   }
~ContextInitializer()756   ~ContextInitializer() {
757     env_->Exit();
758   }
759  private:
  // scope_ is declared before env_, so it is constructed first.
760   v8::HandleScope scope_;
761   v8::Local<v8::Context> env_;
762 };
763 
764 
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * captures)765 static ArchRegExpMacroAssembler::Result Execute(Code* code,
766                                                 String* input,
767                                                 int start_offset,
768                                                 const byte* input_start,
769                                                 const byte* input_end,
770                                                 int* captures) {
771   return NativeRegExpMacroAssembler::Execute(
772       code,
773       input,
774       start_offset,
775       input_start,
776       input_end,
777       captures,
778       0,
779       CcTest::i_isolate());
780 }
781 
782 
// Generates native code for a program consisting only of Succeed(), runs it
// on the one-byte string "foofoo", and checks that the result is SUCCESS and
// that all four capture slots (pre-filled with arbitrary values) read -1
// afterwards.
TEST(MacroAssemblerNativeSuccess) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 4);
  masm.Succeed();

  Handle<String> source = factory->NewStringFromStaticChars("");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(source));

  // Arbitrary initial values so that writes by the generated code are visible.
  int captures[4] = {42, 37, 87, 117};
  Handle<String> input = factory->NewStringFromStaticChars("foofoo");
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  const byte* input_begin =
      reinterpret_cast<const byte*>(seq_input->GetCharsAddress());
  const byte* input_limit = input_begin + seq_input->length();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *input, 0, input_begin, input_limit, captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(-1, captures[0]);
  CHECK_EQ(-1, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);
}
819 
820 
TEST(MacroAssemblerNativeSimple)821 TEST(MacroAssemblerNativeSimple) {
822   v8::V8::Initialize();
823   ContextInitializer initializer;
824   Isolate* isolate = CcTest::i_isolate();
825   Factory* factory = isolate->factory();
826   Zone zone;
827 
828   ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
829                              4);
830 
831   Label fail, backtrack;
832   m.PushBacktrack(&fail);
833   m.CheckNotAtStart(0, NULL);
834   m.LoadCurrentCharacter(2, NULL);
835   m.CheckNotCharacter('o', NULL);
836   m.LoadCurrentCharacter(1, NULL, false);
837   m.CheckNotCharacter('o', NULL);
838   m.LoadCurrentCharacter(0, NULL, false);
839   m.CheckNotCharacter('f', NULL);
840   m.WriteCurrentPositionToRegister(0, 0);
841   m.WriteCurrentPositionToRegister(1, 3);
842   m.AdvanceCurrentPosition(3);
843   m.PushBacktrack(&backtrack);
844   m.Succeed();
845   m.Bind(&backtrack);
846   m.Backtrack();
847   m.Bind(&fail);
848   m.Fail();
849 
850   Handle<String> source = factory->NewStringFromStaticChars("^foo");
851   Handle<Object> code_object = m.GetCode(source);
852   Handle<Code> code = Handle<Code>::cast(code_object);
853 
854   int captures[4] = {42, 37, 87, 117};
855   Handle<String> input = factory->NewStringFromStaticChars("foofoo");
856   Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
857   Address start_adr = seq_input->GetCharsAddress();
858 
859   NativeRegExpMacroAssembler::Result result =
860       Execute(*code,
861               *input,
862               0,
863               start_adr,
864               start_adr + input->length(),
865               captures);
866 
867   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
868   CHECK_EQ(0, captures[0]);
869   CHECK_EQ(3, captures[1]);
870   CHECK_EQ(-1, captures[2]);
871   CHECK_EQ(-1, captures[3]);
872 
873   input = factory->NewStringFromStaticChars("barbarbar");
874   seq_input = Handle<SeqOneByteString>::cast(input);
875   start_adr = seq_input->GetCharsAddress();
876 
877   result = Execute(*code,
878                    *input,
879                    0,
880                    start_adr,
881                    start_adr + input->length(),
882                    captures);
883 
884   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
885 }
886 
887 
TEST(MacroAssemblerNativeSimpleUC16)888 TEST(MacroAssemblerNativeSimpleUC16) {
889   v8::V8::Initialize();
890   ContextInitializer initializer;
891   Isolate* isolate = CcTest::i_isolate();
892   Factory* factory = isolate->factory();
893   Zone zone;
894 
895   ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::UC16,
896                              4);
897 
898   Label fail, backtrack;
899   m.PushBacktrack(&fail);
900   m.CheckNotAtStart(0, NULL);
901   m.LoadCurrentCharacter(2, NULL);
902   m.CheckNotCharacter('o', NULL);
903   m.LoadCurrentCharacter(1, NULL, false);
904   m.CheckNotCharacter('o', NULL);
905   m.LoadCurrentCharacter(0, NULL, false);
906   m.CheckNotCharacter('f', NULL);
907   m.WriteCurrentPositionToRegister(0, 0);
908   m.WriteCurrentPositionToRegister(1, 3);
909   m.AdvanceCurrentPosition(3);
910   m.PushBacktrack(&backtrack);
911   m.Succeed();
912   m.Bind(&backtrack);
913   m.Backtrack();
914   m.Bind(&fail);
915   m.Fail();
916 
917   Handle<String> source = factory->NewStringFromStaticChars("^foo");
918   Handle<Object> code_object = m.GetCode(source);
919   Handle<Code> code = Handle<Code>::cast(code_object);
920 
921   int captures[4] = {42, 37, 87, 117};
922   const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o',
923                               static_cast<uc16>(0x2603)};
924   Handle<String> input = factory->NewStringFromTwoByte(
925       Vector<const uc16>(input_data, 6)).ToHandleChecked();
926   Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
927   Address start_adr = seq_input->GetCharsAddress();
928 
929   NativeRegExpMacroAssembler::Result result =
930       Execute(*code,
931               *input,
932               0,
933               start_adr,
934               start_adr + input->length(),
935               captures);
936 
937   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
938   CHECK_EQ(0, captures[0]);
939   CHECK_EQ(3, captures[1]);
940   CHECK_EQ(-1, captures[2]);
941   CHECK_EQ(-1, captures[3]);
942 
943   const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a',
944                                static_cast<uc16>(0x2603)};
945   input = factory->NewStringFromTwoByte(
946       Vector<const uc16>(input_data2, 9)).ToHandleChecked();
947   seq_input = Handle<SeqTwoByteString>::cast(input);
948   start_adr = seq_input->GetCharsAddress();
949 
950   result = Execute(*code,
951                    *input,
952                    0,
953                    start_adr,
954                    start_adr + input->length() * 2,
955                    captures);
956 
957   CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
958 }
959 
960 
// Loads a character at offset 10 of a six-character subject, first with an
// explicit failure label and then with a NULL label after pushing a backtrack
// target. The run is expected to end in the Fail() bound to that target.
TEST(MacroAssemblerNativeBacktrack) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 0);

  Label fail;
  Label backtrack;
  masm.LoadCurrentCharacter(10, &fail);
  masm.Succeed();
  masm.Bind(&fail);
  masm.PushBacktrack(&backtrack);
  masm.LoadCurrentCharacter(10, NULL);
  masm.Succeed();
  masm.Bind(&backtrack);
  masm.Fail();

  Handle<String> pattern = factory->NewStringFromStaticChars("..........");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  Handle<String> subject = factory->NewStringFromStaticChars("foofoo");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *subject, 0, chars, chars + subject->length(), NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
}
1000 
1001 
// Back-reference check on a Latin1 subject. Registers 0/1 capture the first
// two characters of "fooofo" ("fo"); the back reference must not match at
// position 2 ("oo") but must match at position 4 ("fo"), leaving the
// position at 6.
TEST(MacroAssemblerNativeBackReferenceLATIN1) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 4);

  // Capture [0, 2).
  masm.WriteCurrentPositionToRegister(0, 0);
  masm.AdvanceCurrentPosition(2);
  masm.WriteCurrentPositionToRegister(1, 0);
  // A match right after the capture would be wrong.
  Label nomatch;
  masm.CheckNotBackReference(0, false, &nomatch);
  masm.Fail();
  masm.Bind(&nomatch);
  // Two characters further on the back reference has to match.
  masm.AdvanceCurrentPosition(2);
  Label missing_match;
  masm.CheckNotBackReference(0, false, &missing_match);
  masm.WriteCurrentPositionToRegister(2, 0);
  masm.Succeed();
  masm.Bind(&missing_match);
  masm.Fail();

  Handle<String> pattern = factory->NewStringFromStaticChars("^(..)..\1");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  Handle<String> subject = factory->NewStringFromStaticChars("fooofo");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  int output[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *subject, 0, chars, chars + subject->length(), output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(2, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(-1, output[3]);
}
1050 
1051 
// Two-byte variant of the back-reference test. The subject is
// {'f', U+2028, 'o', 'o', 'f', U+2028}: the capture [0, 2) must not match at
// position 2 but must match at position 4, leaving the position at 6.
TEST(MacroAssemblerNativeBackReferenceUC16) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::UC16, 4);

  // Capture [0, 2).
  masm.WriteCurrentPositionToRegister(0, 0);
  masm.AdvanceCurrentPosition(2);
  masm.WriteCurrentPositionToRegister(1, 0);
  // A match right after the capture would be wrong.
  Label nomatch;
  masm.CheckNotBackReference(0, false, &nomatch);
  masm.Fail();
  masm.Bind(&nomatch);
  // Two characters further on the back reference has to match.
  masm.AdvanceCurrentPosition(2);
  Label missing_match;
  masm.CheckNotBackReference(0, false, &missing_match);
  masm.WriteCurrentPositionToRegister(2, 0);
  masm.Succeed();
  masm.Bind(&missing_match);
  masm.Fail();

  Handle<String> pattern = factory->NewStringFromStaticChars("^(..)..\1");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
  Handle<String> subject = factory->NewStringFromTwoByte(
      Vector<const uc16>(input_data, 6)).ToHandleChecked();
  Handle<SeqTwoByteString> seq_subject =
      Handle<SeqTwoByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  // Two bytes per character, hence the doubled length for the end address.
  int output[4];
  NativeRegExpMacroAssembler::Result result = Execute(
      *code, *subject, 0, chars, chars + subject->length() * 2, output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(2, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(-1, output[3]);
}
1102 
1103 
1104 
// Exercises CheckNotAtStart by running the same code twice over "foobar":
// once from offset 0 and once from offset 3. Both runs must succeed.
TEST(MacroAssemblernativeAtStart) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 0);

  Label not_at_start;
  Label newline;
  Label fail;
  masm.CheckNotAtStart(0, &not_at_start);
  // Check that prevchar = '\n' and current = 'f'.
  masm.CheckCharacter('\n', &newline);
  masm.Bind(&fail);
  masm.Fail();
  masm.Bind(&newline);
  masm.LoadCurrentCharacter(0, &fail);
  masm.CheckNotCharacter('f', &fail);
  masm.Succeed();

  masm.Bind(&not_at_start);
  // Check that prevchar = 'o' and current = 'b'.
  Label prevo;
  masm.CheckCharacter('o', &prevo);
  masm.Fail();
  masm.Bind(&prevo);
  masm.LoadCurrentCharacter(0, &fail);
  masm.CheckNotCharacter('b', &fail);
  masm.Succeed();

  Handle<String> pattern = factory->NewStringFromStaticChars("(^f|ob)");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  Handle<String> subject = factory->NewStringFromStaticChars("foobar");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  // First run: start of the subject.
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *subject, 0, chars, chars + subject->length(), NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);

  // Second run: start offset 3, i.e. between "foo" and "bar".
  result = Execute(*code, *subject, 3, chars + 3, chars + subject->length(),
                   NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
}
1163 
1164 
// Case-insensitive back references over "aBcAbCABCxYzab". Registers 2/3
// capture "aBc"; the back reference must match "AbC" and "ABC", must not
// match "xYz", and must not match the trailing "ab" either. The run ends at
// position 12.
TEST(MacroAssemblerNativeBackRefNoCase) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 4);

  Label fail;
  Label succ;

  masm.WriteCurrentPositionToRegister(0, 0);
  masm.WriteCurrentPositionToRegister(2, 0);
  masm.AdvanceCurrentPosition(3);
  masm.WriteCurrentPositionToRegister(3, 0);
  masm.CheckNotBackReferenceIgnoreCase(2, false, &fail);  // Match "AbC".
  masm.CheckNotBackReferenceIgnoreCase(2, false, &fail);  // Match "ABC".
  Label expected_fail;
  masm.CheckNotBackReferenceIgnoreCase(2, false, &expected_fail);
  masm.Bind(&fail);
  masm.Fail();

  masm.Bind(&expected_fail);
  masm.AdvanceCurrentPosition(3);  // Skip "xYz"
  masm.CheckNotBackReferenceIgnoreCase(2, false, &succ);
  masm.Fail();

  masm.Bind(&succ);
  masm.WriteCurrentPositionToRegister(1, 0);
  masm.Succeed();

  Handle<String> pattern =
      factory->NewStringFromStaticChars("^(abc)\1\1(?!\1)...(?!\1)");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  Handle<String> subject =
      factory->NewStringFromStaticChars("aBcAbCABCxYzab");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  int output[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *subject, 0, chars, chars + subject->length(), output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(12, output[1]);
  CHECK_EQ(0, output[2]);
  CHECK_EQ(3, output[3]);
}
1221 
1222 
1223 
TEST(MacroAssemblerNativeRegisters)1224 TEST(MacroAssemblerNativeRegisters) {
1225   v8::V8::Initialize();
1226   ContextInitializer initializer;
1227   Isolate* isolate = CcTest::i_isolate();
1228   Factory* factory = isolate->factory();
1229   Zone zone;
1230 
1231   ArchRegExpMacroAssembler m(isolate, &zone, NativeRegExpMacroAssembler::LATIN1,
1232                              6);
1233 
1234   uc16 foo_chars[3] = {'f', 'o', 'o'};
1235   Vector<const uc16> foo(foo_chars, 3);
1236 
1237   enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
1238   Label fail;
1239   Label backtrack;
1240   m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
1241   m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1242   m.PushBacktrack(&backtrack);
1243   m.WriteStackPointerToRegister(sp);
1244   // Fill stack and registers
1245   m.AdvanceCurrentPosition(2);
1246   m.WriteCurrentPositionToRegister(out1, 0);
1247   m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
1248   m.PushBacktrack(&fail);
1249   // Drop backtrack stack frames.
1250   m.ReadStackPointerFromRegister(sp);
1251   // And take the first backtrack (to &backtrack)
1252   m.Backtrack();
1253 
1254   m.PushCurrentPosition();
1255   m.AdvanceCurrentPosition(2);
1256   m.PopCurrentPosition();
1257 
1258   m.Bind(&backtrack);
1259   m.PopRegister(out1);
1260   m.ReadCurrentPositionFromRegister(out1);
1261   m.AdvanceCurrentPosition(3);
1262   m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]
1263 
1264   Label loop;
1265   m.SetRegister(loop_cnt, 0);  // loop counter
1266   m.Bind(&loop);
1267   m.AdvanceRegister(loop_cnt, 1);
1268   m.AdvanceCurrentPosition(1);
1269   m.IfRegisterLT(loop_cnt, 3, &loop);
1270   m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]
1271 
1272   Label loop2;
1273   m.SetRegister(loop_cnt, 2);  // loop counter
1274   m.Bind(&loop2);
1275   m.AdvanceRegister(loop_cnt, -1);
1276   m.AdvanceCurrentPosition(1);
1277   m.IfRegisterGE(loop_cnt, 0, &loop2);
1278   m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]
1279 
1280   Label loop3;
1281   Label exit_loop3;
1282   m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1283   m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
1284   m.ReadCurrentPositionFromRegister(out3);
1285   m.Bind(&loop3);
1286   m.AdvanceCurrentPosition(1);
1287   m.CheckGreedyLoop(&exit_loop3);
1288   m.GoTo(&loop3);
1289   m.Bind(&exit_loop3);
1290   m.PopCurrentPosition();
1291   m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]
1292 
1293   m.Succeed();
1294 
1295   m.Bind(&fail);
1296   m.Fail();
1297 
1298   Handle<String> source = factory->NewStringFromStaticChars("<loop test>");
1299   Handle<Object> code_object = m.GetCode(source);
1300   Handle<Code> code = Handle<Code>::cast(code_object);
1301 
1302   // String long enough for test (content doesn't matter).
1303   Handle<String> input = factory->NewStringFromStaticChars("foofoofoofoofoo");
1304   Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
1305   Address start_adr = seq_input->GetCharsAddress();
1306 
1307   int output[6];
1308   NativeRegExpMacroAssembler::Result result =
1309       Execute(*code,
1310               *input,
1311               0,
1312               start_adr,
1313               start_adr + input->length(),
1314               output);
1315 
1316   CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
1317   CHECK_EQ(0, output[0]);
1318   CHECK_EQ(3, output[1]);
1319   CHECK_EQ(6, output[2]);
1320   CHECK_EQ(9, output[3]);
1321   CHECK_EQ(9, output[4]);
1322   CHECK_EQ(-1, output[5]);
1323 }
1324 
1325 
// Generates code that pushes backtrack entries in an endless loop. Executing
// it must return EXCEPTION and leave a pending exception on the isolate,
// which the test clears again before returning.
TEST(MacroAssemblerStackOverflow) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 0);

  // loop: push(loop); goto loop;
  Label loop;
  masm.Bind(&loop);
  masm.PushBacktrack(&loop);
  masm.GoTo(&loop);

  Handle<String> pattern =
      factory->NewStringFromStaticChars("<stack overflow test>");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  // String long enough for test (content doesn't matter).
  Handle<String> subject = factory->NewStringFromStaticChars("dummy");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *subject, 0, chars, chars + subject->length(), NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
  CHECK(isolate->has_pending_exception());
  isolate->clear_pending_exception();
}
1363 
1364 
// Uses a very high register index to force a large register area. The value
// written to that register (current position + 42) is pushed, popped into
// register 1 and must show up as the second capture.
TEST(MacroAssemblerNativeLotsOfRegisters) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone;

  ArchRegExpMacroAssembler masm(isolate, &zone,
                                NativeRegExpMacroAssembler::LATIN1, 2);

  // Must be at least 2048 so that the space allocated for registers spans
  // one full page.
  const int large_number = 8000;
  masm.WriteCurrentPositionToRegister(large_number, 42);
  masm.WriteCurrentPositionToRegister(0, 0);
  masm.WriteCurrentPositionToRegister(1, 1);
  Label done;
  // Performs a system-stack push.
  masm.CheckNotBackReference(0, false, &done);
  masm.Bind(&done);
  masm.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
  masm.PopRegister(1);
  masm.Succeed();

  Handle<String> pattern =
      factory->NewStringFromStaticChars("<huge register space test>");
  Handle<Code> code = Handle<Code>::cast(masm.GetCode(pattern));

  // String long enough for test (content doesn't matter).
  Handle<String> subject = factory->NewStringFromStaticChars("sample text");
  Handle<SeqOneByteString> seq_subject =
      Handle<SeqOneByteString>::cast(subject);
  Address chars = seq_subject->GetCharsAddress();

  int captures[2];
  NativeRegExpMacroAssembler::Result result = Execute(
      *code, *subject, 0, chars, chars + subject->length(), captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(42, captures[1]);

  isolate->clear_pending_exception();
}
1413 
1414 #else  // V8_INTERPRETED_REGEXP
1415 
TEST(MacroAssembler)1416 TEST(MacroAssembler) {
1417   byte codes[1024];
1418   Zone zone;
1419   RegExpMacroAssemblerIrregexp m(CcTest::i_isolate(), Vector<byte>(codes, 1024),
1420                                  &zone);
1421   // ^f(o)o.
1422   Label start, fail, backtrack;
1423 
1424   m.SetRegister(4, 42);
1425   m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
1426   m.AdvanceRegister(4, 42);
1427   m.GoTo(&start);
1428   m.Fail();
1429   m.Bind(&start);
1430   m.PushBacktrack(&fail);
1431   m.CheckNotAtStart(0, NULL);
1432   m.LoadCurrentCharacter(0, NULL);
1433   m.CheckNotCharacter('f', NULL);
1434   m.LoadCurrentCharacter(1, NULL);
1435   m.CheckNotCharacter('o', NULL);
1436   m.LoadCurrentCharacter(2, NULL);
1437   m.CheckNotCharacter('o', NULL);
1438   m.WriteCurrentPositionToRegister(0, 0);
1439   m.WriteCurrentPositionToRegister(1, 3);
1440   m.WriteCurrentPositionToRegister(2, 1);
1441   m.WriteCurrentPositionToRegister(3, 2);
1442   m.AdvanceCurrentPosition(3);
1443   m.PushBacktrack(&backtrack);
1444   m.Succeed();
1445   m.Bind(&backtrack);
1446   m.ClearRegisters(2, 3);
1447   m.Backtrack();
1448   m.Bind(&fail);
1449   m.PopRegister(0);
1450   m.Fail();
1451 
1452   Isolate* isolate = CcTest::i_isolate();
1453   Factory* factory = isolate->factory();
1454   HandleScope scope(isolate);
1455 
1456   Handle<String> source = factory->NewStringFromStaticChars("^f(o)o");
1457   Handle<ByteArray> array = Handle<ByteArray>::cast(m.GetCode(source));
1458   int captures[5];
1459 
1460   const uc16 str1[] = {'f', 'o', 'o', 'b', 'a', 'r'};
1461   Handle<String> f1_16 = factory->NewStringFromTwoByte(
1462       Vector<const uc16>(str1, 6)).ToHandleChecked();
1463 
1464   CHECK(IrregexpInterpreter::Match(isolate, array, f1_16, captures, 0));
1465   CHECK_EQ(0, captures[0]);
1466   CHECK_EQ(3, captures[1]);
1467   CHECK_EQ(1, captures[2]);
1468   CHECK_EQ(2, captures[3]);
1469   CHECK_EQ(84, captures[4]);
1470 
1471   const uc16 str2[] = {'b', 'a', 'r', 'f', 'o', 'o'};
1472   Handle<String> f2_16 = factory->NewStringFromTwoByte(
1473       Vector<const uc16>(str2, 6)).ToHandleChecked();
1474 
1475   CHECK(!IrregexpInterpreter::Match(isolate, array, f2_16, captures, 0));
1476   CHECK_EQ(42, captures[0]);
1477 }
1478 
1479 #endif  // V8_INTERPRETED_REGEXP
1480 
1481 
// Checks DispatchTableConstructor::AddInverse: after adding the inverse of a
// set of ranges for choice index 0, a character's out-set must contain
// index 0 exactly when the character lies in none of the ranges. Runs ten
// pseudo-random rounds followed by one fixed range near the top of the
// 16-bit space.
TEST(AddInverseToTable) {
  static const int kLimit = 1000;
  static const int kRangeCount = 16;
  for (int t = 0; t < 10; t++) {
    Zone zone;
    ZoneList<CharacterRange>* ranges =
        new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone);
    for (int i = 0; i < kRangeCount; i++) {
      // Each range starts below kLimit and has a bounded width; its end is
      // clamped to kLimit.
      const int from = PseudoRandom(t + 87, i + 25) % kLimit;
      const int width = PseudoRandom(i + 87, t + 25) % (kLimit / 20);
      const int to = (from + width > kLimit) ? kLimit : from + width;
      ranges->Add(CharacterRange(from, to), &zone);
    }
    DispatchTable table(&zone);
    DispatchTableConstructor cons(&table, false, &zone);
    cons.set_choice_index(0);
    cons.AddInverse(ranges);
    for (int i = 0; i < kLimit; i++) {
      // is_on: does any of the original ranges contain character i?
      bool is_on = false;
      for (int j = 0; j < kRangeCount; j++) {
        if (ranges->at(j).Contains(i)) {
          is_on = true;
          break;
        }
      }
      OutSet* set = table.Get(i);
      CHECK_EQ(is_on, set->Get(0) == false);
    }
  }

  // Fixed case: the inverse of [0xFFF0, 0xFFFE] excludes 0xFFFE but includes
  // 0xFFFF.
  Zone zone;
  ZoneList<CharacterRange>* ranges =
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  ranges->Add(CharacterRange(0xFFF0, 0xFFFE), &zone);
  DispatchTable table(&zone);
  DispatchTableConstructor cons(&table, false, &zone);
  cons.set_choice_index(0);
  cons.AddInverse(ranges);
  CHECK(!table.Get(0xFFFE)->Get(0));
  CHECK(table.Get(0xFFFF)->Get(0));
}
1518 
1519 
canonicalize(uc32 c)1520 static uc32 canonicalize(uc32 c) {
1521   unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1522   int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1523   if (count == 0) {
1524     return c;
1525   } else {
1526     CHECK_EQ(1, count);
1527     return canon[0];
1528   }
1529 }
1530 
1531 
// Sanity checks for canonicalize():
//  - ASCII letters of either case canonicalize to the same value, and
//    un-canonicalizing a lower-case letter yields {upper, lower};
//  - no character >= 128 canonicalizes into the ASCII range;
//  - on the Basic Multilingual Plane the result equals the single-character
//    upper-case mapping, with the character itself used when the mapping is
//    multi-character or would move a non-ASCII character into ASCII.
TEST(LatinCanonicalize) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> case_variants;
  for (unibrow::uchar lower = 'a'; lower <= 'z'; lower++) {
    unibrow::uchar upper = lower + ('A' - 'a');
    CHECK_EQ(canonicalize(lower), canonicalize(upper));
    unibrow::uchar uncanon[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    int length = case_variants.get(lower, '\0', uncanon);
    CHECK_EQ(2, length);
    CHECK_EQ(upper, uncanon[0]);
    CHECK_EQ(lower, uncanon[1]);
  }

  for (uc32 c = 128; c < (1 << 21); c++) {
    CHECK_GE(canonicalize(c), 128);
  }

  unibrow::Mapping<unibrow::ToUppercase> upper_mapping;
  // Canonicalization is only defined for the Basic Multilingual Plane.
  for (uc32 c = 0; c < (1 << 16); c++) {
    unibrow::uchar mapped[unibrow::ToUppercase::kMaxWidth];
    const int length = upper_mapping.get(c, '\0', mapped);
    // Default to the character itself; only a single-character mapping that
    // does not turn a non-ASCII character into an ASCII one is used.
    uc32 u = c;
    if (length == 1) {
      const uc32 candidate = mapped[0];
      if (!(c >= 128 && candidate < 128)) u = candidate;
    }
    CHECK_EQ(u, canonicalize(c));
  }
}
1560 
1561 
CanonRangeEnd(uc32 c)1562 static uc32 CanonRangeEnd(uc32 c) {
1563   unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1564   int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1565   if (count == 0) {
1566     return c;
1567   } else {
1568     CHECK_EQ(1, count);
1569     return canon[0];
1570   }
1571 }
1572 
1573 
// Check that we arrive at the same result when using the basic range
// canonicalization primitives as when using immediate canonicalization:
// within one block, the un-canonicalization of the character at offset i
// must equal the un-canonicalization of the block's first character with i
// added to every element.
TEST(RangeCanonicalization) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> case_variants;
  for (int block_start = 0; block_start <= 0xFFFF;) {
    const uc32 block_end = CanonRangeEnd(block_start);
    const unsigned block_length = block_end - block_start + 1;
    if (block_length > 1) {
      unibrow::uchar first[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int first_length = case_variants.get(block_start, '\0', first);
      for (unsigned offset = 1; offset < block_length; offset++) {
        unibrow::uchar succ[unibrow::Ecma262UnCanonicalize::kMaxWidth];
        int succ_length = case_variants.get(block_start + offset, '\0', succ);
        CHECK_EQ(first_length, succ_length);
        for (int j = 0; j < succ_length; j++) {
          int calc = first[j] + offset;
          int found = succ[j];
          CHECK_EQ(calc, found);
        }
      }
    }
    // Continue with the character right after this block.
    block_start += block_length;
  }
}
1600 
1601 
// Every character returned by un-canonicalizing a 16-bit code point must
// itself un-canonicalize to exactly the same list of characters.
TEST(UncanonicalizeEquivalence) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> case_variants;
  unibrow::uchar chars[unibrow::Ecma262UnCanonicalize::kMaxWidth];
  for (int code_point = 0; code_point < (1 << 16); code_point++) {
    int length = case_variants.get(code_point, '\0', chars);
    for (int member = 0; member < length; member++) {
      unibrow::uchar chars2[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int length2 = case_variants.get(chars[member], '\0', chars2);
      CHECK_EQ(length, length2);
      for (int k = 0; k < length; k++) {
        CHECK_EQ(static_cast<int>(chars[k]), static_cast<int>(chars2[k]));
      }
    }
  }
}
1616 
1617 
TestRangeCaseIndependence(Isolate * isolate,CharacterRange input,Vector<CharacterRange> expected)1618 static void TestRangeCaseIndependence(Isolate* isolate, CharacterRange input,
1619                                       Vector<CharacterRange> expected) {
1620   Zone zone;
1621   int count = expected.length();
1622   ZoneList<CharacterRange>* list =
1623       new(&zone) ZoneList<CharacterRange>(count, &zone);
1624   input.AddCaseEquivalents(isolate, &zone, list, false);
1625   CHECK_EQ(count, list->length());
1626   for (int i = 0; i < list->length(); i++) {
1627     CHECK_EQ(expected[i].from(), list->at(i).from());
1628     CHECK_EQ(expected[i].to(), list->at(i).to());
1629   }
1630 }
1631 
1632 
TestSimpleRangeCaseIndependence(Isolate * isolate,CharacterRange input,CharacterRange expected)1633 static void TestSimpleRangeCaseIndependence(Isolate* isolate,
1634                                             CharacterRange input,
1635                                             CharacterRange expected) {
1636   EmbeddedVector<CharacterRange, 1> vector;
1637   vector[0] = expected;
1638   TestRangeCaseIndependence(isolate, input, vector);
1639 }
1640 
1641 
// Table-driven check of the case equivalents added for ASCII letter ranges.
// Each row pairs an input range with the single range expected to be added.
TEST(CharacterRangeCaseIndependence) {
  Isolate* isolate = CcTest::i_isolate();
  const struct {
    CharacterRange input;
    CharacterRange expected;
  } kCases[] = {
      {CharacterRange::Singleton('a'), CharacterRange::Singleton('A')},
      {CharacterRange::Singleton('z'), CharacterRange::Singleton('Z')},
      {CharacterRange('a', 'z'), CharacterRange('A', 'Z')},
      {CharacterRange('c', 'f'), CharacterRange('C', 'F')},
      {CharacterRange('a', 'b'), CharacterRange('A', 'B')},
      {CharacterRange('y', 'z'), CharacterRange('Y', 'Z')},
      {CharacterRange('a' - 1, 'z' + 1), CharacterRange('A', 'Z')},
      {CharacterRange('A', 'Z'), CharacterRange('a', 'z')},
      {CharacterRange('C', 'F'), CharacterRange('c', 'f')},
      {CharacterRange('A' - 1, 'Z' + 1), CharacterRange('a', 'z')},
      // Here we need to add [l-z] to complete the case independence of
      // [A-Za-z] but we expect [a-z] to be added since we always add a
      // whole block at a time.
      {CharacterRange('A', 'k'), CharacterRange('a', 'z')},
  };
  const int case_count = static_cast<int>(sizeof(kCases) / sizeof(kCases[0]));
  for (int i = 0; i < case_count; i++) {
    TestSimpleRangeCaseIndependence(isolate, kCases[i].input,
                                    kCases[i].expected);
  }
}
1670 
1671 
InClass(uc16 c,ZoneList<CharacterRange> * ranges)1672 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
1673   if (ranges == NULL)
1674     return false;
1675   for (int i = 0; i < ranges->length(); i++) {
1676     CharacterRange range = ranges->at(i);
1677     if (range.from() <= c && c <= range.to())
1678       return true;
1679   }
1680   return false;
1681 }
1682 
1683 
// Splits a list holding CharacterRange::Everything() against the bounds
// returned by CharacterRange::GetWordBounds() and verifies, for every value in
// [0, 1 << 16), that it ends up in exactly the expected output list.
TEST(CharClassDifference) {
  Zone zone;
  ZoneList<CharacterRange>* base =
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  base->Add(CharacterRange::Everything(), &zone);
  Vector<const int> overlay = CharacterRange::GetWordBounds();
  ZoneList<CharacterRange>* included = NULL;
  ZoneList<CharacterRange>* excluded = NULL;
  CharacterRange::Split(base, overlay, &included, &excluded, &zone);

  const int kCodeUnitLimit = 1 << 16;
  for (int i = 0; i < kCodeUnitLimit; i++) {
    // Characters outside the base class must not show up in either result.
    if (!InClass(i, base)) {
      CHECK(!InClass(i, included));
      CHECK(!InClass(i, excluded));
      continue;
    }

    // |overlay| is read as consecutive [start, end) pairs: the lower bound is
    // inclusive and the upper bound is exclusive.
    bool in_overlay = false;
    for (int pair = 0; pair < overlay.length(); pair += 2) {
      if (overlay[pair] <= i && i < overlay[pair + 1]) {
        in_overlay = true;
        break;
      }
    }
    CHECK_EQ(in_overlay, InClass(i, included));
    CHECK_EQ(!in_overlay, InClass(i, excluded));
  }
}
1709 
1710 
// Feeds several range lists through CharacterSet::Canonicalize() and checks
// the resulting contents of the underlying list after each call.
TEST(CanonicalizeCharacterSets) {
  Zone zone;
  ZoneList<CharacterRange>* list =
      new(&zone) ZoneList<CharacterRange>(4, &zone);
  CharacterSet set(list);

  // Scenario 1: already sorted, disjoint ranges come back unchanged.
  list->Add(CharacterRange::Range(10, 20), &zone);
  list->Add(CharacterRange::Range(30, 40), &zone);
  list->Add(CharacterRange::Range(50, 60), &zone);
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Scenario 2: the same ranges added out of order end up sorted.
  list->Rewind(0);
  list->Add(CharacterRange::Range(10, 20), &zone);
  list->Add(CharacterRange::Range(50, 60), &zone);
  list->Add(CharacterRange::Range(30, 40), &zone);
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Scenario 3: unordered mix of ranges and single characters, none of which
  // touch each other; all five entries survive, sorted.
  list->Rewind(0);
  list->Add(CharacterRange::Range(30, 40), &zone);
  list->Add(CharacterRange::Range(10, 20), &zone);
  list->Add(CharacterRange::Singleton(25), &zone);
  list->Add(CharacterRange::Singleton(100), &zone);
  list->Add(CharacterRange::Singleton(1), &zone);
  set.Canonicalize();
  CHECK_EQ(5, list->length());
  CHECK_EQ(1, list->at(0).from());
  CHECK_EQ(1, list->at(0).to());
  CHECK_EQ(10, list->at(1).from());
  CHECK_EQ(20, list->at(1).to());
  CHECK_EQ(25, list->at(2).from());
  CHECK_EQ(25, list->at(2).to());
  CHECK_EQ(30, list->at(3).from());
  CHECK_EQ(40, list->at(3).to());
  CHECK_EQ(100, list->at(4).from());
  CHECK_EQ(100, list->at(4).to());

  // Scenario 4: [10-19] and [21-30] are bridged by the single character 20,
  // so the three entries collapse into the one range [10-30].
  list->Rewind(0);
  list->Add(CharacterRange::Range(10, 19), &zone);
  list->Add(CharacterRange::Range(21, 30), &zone);
  list->Add(CharacterRange::Singleton(20), &zone);
  set.Canonicalize();
  CHECK_EQ(1, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(30, list->at(0).to());
}
1770 
1771 
TEST(CharacterRangeMerge)1772 TEST(CharacterRangeMerge) {
1773   Zone zone;
1774   ZoneList<CharacterRange> l1(4, &zone);
1775   ZoneList<CharacterRange> l2(4, &zone);
1776   // Create all combinations of intersections of ranges, both singletons and
1777   // longer.
1778 
1779   int offset = 0;
1780 
1781   // The five kinds of singleton intersections:
1782   //     X
1783   //   Y      - outside before
1784   //    Y     - outside touching start
1785   //     Y    - overlap
1786   //      Y   - outside touching end
1787   //       Y  - outside after
1788 
1789   for (int i = 0; i < 5; i++) {
1790     l1.Add(CharacterRange::Singleton(offset + 2), &zone);
1791     l2.Add(CharacterRange::Singleton(offset + i), &zone);
1792     offset += 6;
1793   }
1794 
1795   // The seven kinds of singleton/non-singleton intersections:
1796   //    XXX
1797   //  Y        - outside before
1798   //   Y       - outside touching start
1799   //    Y      - inside touching start
1800   //     Y     - entirely inside
1801   //      Y    - inside touching end
1802   //       Y   - outside touching end
1803   //        Y  - disjoint after
1804 
1805   for (int i = 0; i < 7; i++) {
1806     l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone);
1807     l2.Add(CharacterRange::Singleton(offset + i), &zone);
1808     offset += 8;
1809   }
1810 
1811   // The eleven kinds of non-singleton intersections:
1812   //
1813   //       XXXXXXXX
1814   // YYYY                  - outside before.
1815   //   YYYY                - outside touching start.
1816   //     YYYY              - overlapping start
1817   //       YYYY            - inside touching start
1818   //         YYYY          - entirely inside
1819   //           YYYY        - inside touching end
1820   //             YYYY      - overlapping end
1821   //               YYYY    - outside touching end
1822   //                 YYYY  - outside after
1823   //       YYYYYYYY        - identical
1824   //     YYYYYYYYYYYY      - containing entirely.
1825 
1826   for (int i = 0; i < 9; i++) {
1827     l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);  // Length 8.
1828     l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone);
1829     offset += 22;
1830   }
1831   l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1832   l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1833   offset += 22;
1834   l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
1835   l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone);
1836   offset += 22;
1837 
1838   // Different kinds of multi-range overlap:
1839   // XXXXXXXXXXXXXXXXXXXXXX         XXXXXXXXXXXXXXXXXXXXXX
1840   //   YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y  YYYY  Y
1841 
1842   l1.Add(CharacterRange::Range(offset, offset + 21), &zone);
1843   l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone);
1844   for (int i = 0; i < 6; i++) {
1845     l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone);
1846     l2.Add(CharacterRange::Singleton(offset + 8), &zone);
1847     offset += 9;
1848   }
1849 
1850   CHECK(CharacterRange::IsCanonical(&l1));
1851   CHECK(CharacterRange::IsCanonical(&l2));
1852 
1853   ZoneList<CharacterRange> first_only(4, &zone);
1854   ZoneList<CharacterRange> second_only(4, &zone);
1855   ZoneList<CharacterRange> both(4, &zone);
1856 }
1857 
1858 
// Passes a word-between-word-boundaries pattern to Execute().
TEST(Graph) {
  const char* const kWordPattern = "\\b\\w+\\b";
  Execute(kWordPattern, false, true, true);
}
1862 
1863 
1864 namespace {
1865 
1866 int* global_use_counts = NULL;
1867 
MockUseCounterCallback(v8::Isolate * isolate,v8::Isolate::UseCounterFeature feature)1868 void MockUseCounterCallback(v8::Isolate* isolate,
1869                             v8::Isolate::UseCounterFeature feature) {
1870   ++global_use_counts[feature];
1871 }
1872 }
1873 
1874 
1875 // Test that ES2015 RegExp compatibility fixes are in place, that they
1876 // are not overly broad, and the appropriate UseCounters are incremented
TEST(UseCountRegExp)1877 TEST(UseCountRegExp) {
1878   i::FLAG_harmony_regexps = true;
1879   v8::Isolate* isolate = CcTest::isolate();
1880   v8::HandleScope scope(isolate);
1881   LocalContext env;
1882   int use_counts[v8::Isolate::kUseCounterFeatureCount] = {};
1883   global_use_counts = use_counts;
1884   CcTest::isolate()->SetUseCounterCallback(MockUseCounterCallback);
1885 
1886   // Compat fix: RegExp.prototype.sticky == undefined; UseCounter tracks it
1887   v8::Local<v8::Value> resultSticky = CompileRun("RegExp.prototype.sticky");
1888   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1889   CHECK_EQ(0, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1890   CHECK(resultSticky->IsUndefined());
1891 
1892   // re.sticky has approriate value and doesn't touch UseCounter
1893   v8::Local<v8::Value> resultReSticky = CompileRun("/a/.sticky");
1894   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1895   CHECK_EQ(0, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1896   CHECK(resultReSticky->IsFalse());
1897 
1898   // When the getter is caleld on another object, throw an exception
1899   // and don't increment the UseCounter
1900   v8::Local<v8::Value> resultStickyError = CompileRun(
1901       "var exception;"
1902       "try { "
1903       "  Object.getOwnPropertyDescriptor(RegExp.prototype, 'sticky')"
1904       "      .get.call(null);"
1905       "} catch (e) {"
1906       "  exception = e;"
1907       "}"
1908       "exception");
1909   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1910   CHECK_EQ(0, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1911   CHECK(resultStickyError->IsObject());
1912 
1913   // RegExp.prototype.toString() returns '/(?:)/' as a compatibility fix;
1914   // a UseCounter is incremented to track it.
1915   v8::Local<v8::Value> resultToString =
1916       CompileRun("RegExp.prototype.toString().length");
1917   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1918   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1919   CHECK(resultToString->IsInt32());
1920   CHECK_EQ(6,
1921            resultToString->Int32Value(isolate->GetCurrentContext()).FromJust());
1922 
1923   // .toString() works on normal RegExps
1924   v8::Local<v8::Value> resultReToString = CompileRun("/a/.toString().length");
1925   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1926   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1927   CHECK(resultReToString->IsInt32());
1928   CHECK_EQ(
1929       3, resultReToString->Int32Value(isolate->GetCurrentContext()).FromJust());
1930 
1931   // .toString() throws on non-RegExps that aren't RegExp.prototype
1932   v8::Local<v8::Value> resultToStringError = CompileRun(
1933       "var exception;"
1934       "try { RegExp.prototype.toString.call(null) }"
1935       "catch (e) { exception = e; }"
1936       "exception");
1937   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeStickyGetter]);
1938   CHECK_EQ(1, use_counts[v8::Isolate::kRegExpPrototypeToString]);
1939   CHECK(resultToStringError->IsObject());
1940 }
1941