1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Redistribution and use in source and binary forms, with or without
3 // modification, are permitted provided that the following conditions are
4 // met:
5 //
6 // * Redistributions of source code must retain the above copyright
7 // notice, this list of conditions and the following disclaimer.
8 // * Redistributions in binary form must reproduce the above
9 // copyright notice, this list of conditions and the following
10 // disclaimer in the documentation and/or other materials provided
11 // with the distribution.
12 // * Neither the name of Google Inc. nor the names of its
13 // contributors may be used to endorse or promote products derived
14 // from this software without specific prior written permission.
15 //
16 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17 // "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18 // LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19 // A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
20 // OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
21 // SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
22 // LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23 // DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24 // THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25 // (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
26 // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27
28
29 #include <stdlib.h>
30
31 #include "v8.h"
32
33 #include "ast.h"
34 #include "char-predicates-inl.h"
35 #include "cctest.h"
36 #include "jsregexp.h"
37 #include "parser.h"
38 #include "regexp-macro-assembler.h"
39 #include "regexp-macro-assembler-irregexp.h"
40 #include "string-stream.h"
41 #include "zone-inl.h"
42 #ifdef V8_INTERPRETED_REGEXP
43 #include "interpreter-irregexp.h"
44 #else // V8_INTERPRETED_REGEXP
45 #include "macro-assembler.h"
46 #include "code.h"
47 #if V8_TARGET_ARCH_ARM
48 #include "arm/assembler-arm.h"
49 #include "arm/macro-assembler-arm.h"
50 #include "arm/regexp-macro-assembler-arm.h"
51 #endif
52 #if V8_TARGET_ARCH_MIPS
53 #include "mips/assembler-mips.h"
54 #include "mips/macro-assembler-mips.h"
55 #include "mips/regexp-macro-assembler-mips.h"
56 #endif
57 #if V8_TARGET_ARCH_X64
58 #include "x64/assembler-x64.h"
59 #include "x64/macro-assembler-x64.h"
60 #include "x64/regexp-macro-assembler-x64.h"
61 #endif
62 #if V8_TARGET_ARCH_IA32
63 #include "ia32/assembler-ia32.h"
64 #include "ia32/macro-assembler-ia32.h"
65 #include "ia32/regexp-macro-assembler-ia32.h"
66 #endif
67 #endif // V8_INTERPRETED_REGEXP
68
69 using namespace v8::internal;
70
71
CheckParse(const char * input)72 static bool CheckParse(const char* input) {
73 V8::Initialize(NULL);
74 v8::HandleScope scope(CcTest::isolate());
75 Zone zone(CcTest::i_isolate());
76 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
77 RegExpCompileData result;
78 return v8::internal::RegExpParser::ParseRegExp(
79 &reader, false, &result, &zone);
80 }
81
82
Parse(const char * input)83 static SmartArrayPointer<const char> Parse(const char* input) {
84 V8::Initialize(NULL);
85 v8::HandleScope scope(CcTest::isolate());
86 Zone zone(CcTest::i_isolate());
87 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
88 RegExpCompileData result;
89 CHECK(v8::internal::RegExpParser::ParseRegExp(
90 &reader, false, &result, &zone));
91 CHECK(result.tree != NULL);
92 CHECK(result.error.is_null());
93 SmartArrayPointer<const char> output = result.tree->ToString(&zone);
94 return output;
95 }
96
97
CheckSimple(const char * input)98 static bool CheckSimple(const char* input) {
99 V8::Initialize(NULL);
100 v8::HandleScope scope(CcTest::isolate());
101 Zone zone(CcTest::i_isolate());
102 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
103 RegExpCompileData result;
104 CHECK(v8::internal::RegExpParser::ParseRegExp(
105 &reader, false, &result, &zone));
106 CHECK(result.tree != NULL);
107 CHECK(result.error.is_null());
108 return result.simple;
109 }
110
// Result of CheckMinMaxMatch(): the values reported by min_match() and
// max_match() on a parsed regexp tree.  Field order matters because the
// struct is brace-initialized as { min, max }.
struct MinMaxPair {
  int min_match;  // Value of tree->min_match().
  int max_match;  // Value of tree->max_match().
};
115
116
CheckMinMaxMatch(const char * input)117 static MinMaxPair CheckMinMaxMatch(const char* input) {
118 V8::Initialize(NULL);
119 v8::HandleScope scope(CcTest::isolate());
120 Zone zone(CcTest::i_isolate());
121 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
122 RegExpCompileData result;
123 CHECK(v8::internal::RegExpParser::ParseRegExp(
124 &reader, false, &result, &zone));
125 CHECK(result.tree != NULL);
126 CHECK(result.error.is_null());
127 int min_match = result.tree->min_match();
128 int max_match = result.tree->max_match();
129 MinMaxPair pair = { min_match, max_match };
130 return pair;
131 }
132
133
// Assertion helpers for the parser tests.  Each macro expands to a single
// statement WITHOUT a trailing semicolon, so that `MACRO(...);` is exactly one
// statement and the macros stay safe inside unbraced if/else bodies.
//
// CHECK_PARSE_ERROR(input):       |input| must be rejected by the parser.
// CHECK_PARSE_EQ(input, expected): Parse(input) must print as |expected|.
// CHECK_SIMPLE(input, simple):     CheckSimple(input) must equal |simple|.
// CHECK_MIN_MAX(input, min, max):  CheckMinMaxMatch(input) must report
//                                  exactly |min| and |max|.
#define CHECK_PARSE_ERROR(input) CHECK(!CheckParse(input))
#define CHECK_PARSE_EQ(input, expected) CHECK_EQ(expected, *Parse(input))
#define CHECK_SIMPLE(input, simple) CHECK_EQ(simple, CheckSimple(input))
#define CHECK_MIN_MAX(input, min, max)                                         \
  do {                                                                         \
    MinMaxPair min_max = CheckMinMaxMatch(input);                              \
    CHECK_EQ(min, min_max.min_match);                                          \
    CHECK_EQ(max, min_max.max_match);                                          \
  } while (false)
142
// Exercises the regexp parser on a broad set of patterns.  Four kinds of
// expectation are checked through the helper macros:
//   - CHECK_PARSE_ERROR: the pattern must be rejected;
//   - CHECK_PARSE_EQ:    the parsed tree must print as the given string;
//   - CHECK_SIMPLE:      the parser's |simple| flag must have the given value;
//   - CHECK_MIN_MAX:     the tree's min_match()/max_match() must be as given.
TEST(Parser) {
  V8::Initialize(NULL);

  CHECK_PARSE_ERROR("?");

  // Literals, alternation, assertions and quantifiers.
  CHECK_PARSE_EQ("abc", "'abc'");
  CHECK_PARSE_EQ("", "%");
  CHECK_PARSE_EQ("abc|def", "(| 'abc' 'def')");
  CHECK_PARSE_EQ("abc|def|ghi", "(| 'abc' 'def' 'ghi')");
  CHECK_PARSE_EQ("^xxx$", "(: @^i 'xxx' @$i)");
  CHECK_PARSE_EQ("ab\\b\\d\\bcd", "(: 'ab' @b [0-9] @b 'cd')");
  CHECK_PARSE_EQ("\\w|\\d", "(| [0-9 A-Z _ a-z] [0-9])");
  CHECK_PARSE_EQ("a*", "(# 0 - g 'a')");
  CHECK_PARSE_EQ("a*?", "(# 0 - n 'a')");
  CHECK_PARSE_EQ("abc+", "(: 'ab' (# 1 - g 'c'))");
  CHECK_PARSE_EQ("abc+?", "(: 'ab' (# 1 - n 'c'))");
  CHECK_PARSE_EQ("xyz?", "(: 'xy' (# 0 1 g 'z'))");
  CHECK_PARSE_EQ("xyz??", "(: 'xy' (# 0 1 n 'z'))");
  CHECK_PARSE_EQ("xyz{0,1}", "(: 'xy' (# 0 1 g 'z'))");
  CHECK_PARSE_EQ("xyz{0,1}?", "(: 'xy' (# 0 1 n 'z'))");
  CHECK_PARSE_EQ("xyz{93}", "(: 'xy' (# 93 93 g 'z'))");
  CHECK_PARSE_EQ("xyz{93}?", "(: 'xy' (# 93 93 n 'z'))");
  CHECK_PARSE_EQ("xyz{1,32}", "(: 'xy' (# 1 32 g 'z'))");
  CHECK_PARSE_EQ("xyz{1,32}?", "(: 'xy' (# 1 32 n 'z'))");
  CHECK_PARSE_EQ("xyz{1,}", "(: 'xy' (# 1 - g 'z'))");
  CHECK_PARSE_EQ("xyz{1,}?", "(: 'xy' (# 1 - n 'z'))");
  CHECK_PARSE_EQ("a\\fb\\nc\\rd\\te\\vf", "'a\\x0cb\\x0ac\\x0dd\\x09e\\x0bf'");
  CHECK_PARSE_EQ("a\\nb\\bc", "(: 'a\\x0ab' @b 'c')");
  // Groups, captures and lookaheads.
  CHECK_PARSE_EQ("(?:foo)", "'foo'");
  CHECK_PARSE_EQ("(?: foo )", "' foo '");
  CHECK_PARSE_EQ("(foo|bar|baz)", "(^ (| 'foo' 'bar' 'baz'))");
  CHECK_PARSE_EQ("foo|(bar|baz)|quux", "(| 'foo' (^ (| 'bar' 'baz')) 'quux')");
  CHECK_PARSE_EQ("foo(?=bar)baz", "(: 'foo' (-> + 'bar') 'baz')");
  CHECK_PARSE_EQ("foo(?!bar)baz", "(: 'foo' (-> - 'bar') 'baz')");
  CHECK_PARSE_EQ("()", "(^ %)");
  CHECK_PARSE_EQ("(?=)", "(-> + %)");
  // Character classes.
  CHECK_PARSE_EQ("[]", "^[\\x00-\\uffff]");  // Doesn't compile on windows
  CHECK_PARSE_EQ("[^]", "[\\x00-\\uffff]");  // \uffff isn't in codepage 1252
  CHECK_PARSE_EQ("[x]", "[x]");
  CHECK_PARSE_EQ("[xyz]", "[x y z]");
  CHECK_PARSE_EQ("[a-zA-Z0-9]", "[a-z A-Z 0-9]");
  CHECK_PARSE_EQ("[-123]", "[- 1 2 3]");
  CHECK_PARSE_EQ("[^123]", "^[1 2 3]");
  CHECK_PARSE_EQ("]", "']'");
  CHECK_PARSE_EQ("}", "'}'");
  CHECK_PARSE_EQ("[a-b-c]", "[a-b - c]");
  CHECK_PARSE_EQ("[\\d]", "[0-9]");
  CHECK_PARSE_EQ("[x\\dz]", "[x 0-9 z]");
  CHECK_PARSE_EQ("[\\d-z]", "[0-9 - z]");
  CHECK_PARSE_EQ("[\\d-\\d]", "[0-9 - 0-9]");
  CHECK_PARSE_EQ("[z-\\d]", "[z - 0-9]");
  // Control character outside character class.
  CHECK_PARSE_EQ("\\cj\\cJ\\ci\\cI\\ck\\cK",
                 "'\\x0a\\x0a\\x09\\x09\\x0b\\x0b'");
  CHECK_PARSE_EQ("\\c!", "'\\c!'");
  CHECK_PARSE_EQ("\\c_", "'\\c_'");
  CHECK_PARSE_EQ("\\c~", "'\\c~'");
  CHECK_PARSE_EQ("\\c1", "'\\c1'");
  // Control character inside character class.
  CHECK_PARSE_EQ("[\\c!]", "[\\ c !]");
  CHECK_PARSE_EQ("[\\c_]", "[\\x1f]");
  CHECK_PARSE_EQ("[\\c~]", "[\\ c ~]");
  CHECK_PARSE_EQ("[\\ca]", "[\\x01]");
  CHECK_PARSE_EQ("[\\cz]", "[\\x1a]");
  CHECK_PARSE_EQ("[\\cA]", "[\\x01]");
  CHECK_PARSE_EQ("[\\cZ]", "[\\x1a]");
  CHECK_PARSE_EQ("[\\c1]", "[\\x11]");

  // Escaped punctuation, numeric escapes and back references.
  CHECK_PARSE_EQ("[a\\]c]", "[a ] c]");
  CHECK_PARSE_EQ("\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ", "'[]{}()%^# '");
  CHECK_PARSE_EQ("[\\[\\]\\{\\}\\(\\)\\%\\^\\#\\ ]", "[[ ] { } ( ) % ^ # ]");
  CHECK_PARSE_EQ("\\0", "'\\x00'");
  CHECK_PARSE_EQ("\\8", "'8'");
  CHECK_PARSE_EQ("\\9", "'9'");
  CHECK_PARSE_EQ("\\11", "'\\x09'");
  CHECK_PARSE_EQ("\\11a", "'\\x09a'");
  CHECK_PARSE_EQ("\\011", "'\\x09'");
  CHECK_PARSE_EQ("\\00011", "'\\x0011'");
  CHECK_PARSE_EQ("\\118", "'\\x098'");
  CHECK_PARSE_EQ("\\111", "'I'");
  CHECK_PARSE_EQ("\\1111", "'I1'");
  CHECK_PARSE_EQ("(x)(x)(x)\\1", "(: (^ 'x') (^ 'x') (^ 'x') (<- 1))");
  CHECK_PARSE_EQ("(x)(x)(x)\\2", "(: (^ 'x') (^ 'x') (^ 'x') (<- 2))");
  CHECK_PARSE_EQ("(x)(x)(x)\\3", "(: (^ 'x') (^ 'x') (^ 'x') (<- 3))");
  CHECK_PARSE_EQ("(x)(x)(x)\\4", "(: (^ 'x') (^ 'x') (^ 'x') '\\x04')");
  CHECK_PARSE_EQ("(x)(x)(x)\\1*", "(: (^ 'x') (^ 'x') (^ 'x')"
                 " (# 0 - g (<- 1)))");
  CHECK_PARSE_EQ("(x)(x)(x)\\2*", "(: (^ 'x') (^ 'x') (^ 'x')"
                 " (# 0 - g (<- 2)))");
  CHECK_PARSE_EQ("(x)(x)(x)\\3*", "(: (^ 'x') (^ 'x') (^ 'x')"
                 " (# 0 - g (<- 3)))");
  CHECK_PARSE_EQ("(x)(x)(x)\\4*", "(: (^ 'x') (^ 'x') (^ 'x')"
                 " (# 0 - g '\\x04'))");
  CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\10",
                 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
                 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') (<- 10))");
  CHECK_PARSE_EQ("(x)(x)(x)(x)(x)(x)(x)(x)(x)(x)\\11",
                 "(: (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x') (^ 'x')"
                 " (^ 'x') (^ 'x') (^ 'x') (^ 'x') '\\x09')");
  CHECK_PARSE_EQ("(a)\\1", "(: (^ 'a') (<- 1))");
  CHECK_PARSE_EQ("(a\\1)", "(^ 'a')");
  CHECK_PARSE_EQ("(\\1a)", "(^ 'a')");
  CHECK_PARSE_EQ("(?=a)?a", "'a'");
  CHECK_PARSE_EQ("(?=a){0,10}a", "'a'");
  CHECK_PARSE_EQ("(?=a){1,10}a", "(: (-> + 'a') 'a')");
  CHECK_PARSE_EQ("(?=a){9,10}a", "(: (-> + 'a') 'a')");
  CHECK_PARSE_EQ("(?!a)?a", "'a'");
  CHECK_PARSE_EQ("\\1(a)", "(^ 'a')");
  CHECK_PARSE_EQ("(?!(a))\\1", "(: (-> - (^ 'a')) (<- 1))");
  CHECK_PARSE_EQ("(?!\\1(a\\1)\\1)\\1", "(: (-> - (: (^ 'a') (<- 1))) (<- 1))");
  CHECK_PARSE_EQ("[\\0]", "[\\x00]");
  CHECK_PARSE_EQ("[\\11]", "[\\x09]");
  CHECK_PARSE_EQ("[\\11a]", "[\\x09 a]");
  CHECK_PARSE_EQ("[\\011]", "[\\x09]");
  CHECK_PARSE_EQ("[\\00011]", "[\\x00 1 1]");
  CHECK_PARSE_EQ("[\\118]", "[\\x09 8]");
  CHECK_PARSE_EQ("[\\111]", "[I]");
  CHECK_PARSE_EQ("[\\1111]", "[I 1]");
  CHECK_PARSE_EQ("\\x34", "'\x34'");
  CHECK_PARSE_EQ("\\x60", "'\x60'");
  CHECK_PARSE_EQ("\\x3z", "'x3z'");
  CHECK_PARSE_EQ("\\c", "'\\c'");
  CHECK_PARSE_EQ("\\u0034", "'\x34'");
  CHECK_PARSE_EQ("\\u003z", "'u003z'");
  CHECK_PARSE_EQ("foo[z]*", "(: 'foo' (# 0 - g [z]))");

  // Expected values of the parser's |simple| flag.  Of the patterns listed
  // here only the plain literal "a" is expected to be simple.
  CHECK_SIMPLE("", false);
  CHECK_SIMPLE("a", true);
  CHECK_SIMPLE("a|b", false);
  CHECK_SIMPLE("a\\n", false);
  CHECK_SIMPLE("^a", false);
  CHECK_SIMPLE("a$", false);
  CHECK_SIMPLE("a\\b!", false);
  CHECK_SIMPLE("a\\Bb", false);
  CHECK_SIMPLE("a*", false);
  CHECK_SIMPLE("a*?", false);
  CHECK_SIMPLE("a?", false);
  CHECK_SIMPLE("a??", false);
  CHECK_SIMPLE("a{0,1}?", false);
  CHECK_SIMPLE("a{1,1}?", false);
  CHECK_SIMPLE("a{1,2}?", false);
  CHECK_SIMPLE("a+?", false);
  CHECK_SIMPLE("(a)", false);
  CHECK_SIMPLE("(a)\\1", false);
  CHECK_SIMPLE("(\\1a)", false);
  CHECK_SIMPLE("\\1(a)", false);
  CHECK_SIMPLE("a\\s", false);
  CHECK_SIMPLE("a\\S", false);
  CHECK_SIMPLE("a\\d", false);
  CHECK_SIMPLE("a\\D", false);
  CHECK_SIMPLE("a\\w", false);
  CHECK_SIMPLE("a\\W", false);
  CHECK_SIMPLE("a.", false);
  CHECK_SIMPLE("a\\q", false);
  CHECK_SIMPLE("a[a]", false);
  CHECK_SIMPLE("a[^a]", false);
  CHECK_SIMPLE("a[a-z]", false);
  CHECK_SIMPLE("a[\\q]", false);
  CHECK_SIMPLE("a(?:b)", false);
  CHECK_SIMPLE("a(?=b)", false);
  CHECK_SIMPLE("a(?!b)", false);
  CHECK_SIMPLE("\\x60", false);
  CHECK_SIMPLE("\\u0060", false);
  CHECK_SIMPLE("\\cA", false);
  CHECK_SIMPLE("\\q", false);
  CHECK_SIMPLE("\\1112", false);
  CHECK_SIMPLE("\\0", false);
  CHECK_SIMPLE("(a)\\1", false);
  CHECK_SIMPLE("(?=a)?a", false);
  CHECK_SIMPLE("(?!a)?a\\1", false);
  CHECK_SIMPLE("(?:(?=a))a\\1", false);

  // Brace sequences that do not form a valid quantifier are expected to
  // parse as literal text.
  CHECK_PARSE_EQ("a{}", "'a{}'");
  CHECK_PARSE_EQ("a{,}", "'a{,}'");
  CHECK_PARSE_EQ("a{", "'a{'");
  CHECK_PARSE_EQ("a{z}", "'a{z}'");
  CHECK_PARSE_EQ("a{1z}", "'a{1z}'");
  CHECK_PARSE_EQ("a{12z}", "'a{12z}'");
  CHECK_PARSE_EQ("a{12,", "'a{12,'");
  CHECK_PARSE_EQ("a{12,3b", "'a{12,3b'");
  CHECK_PARSE_EQ("{}", "'{}'");
  CHECK_PARSE_EQ("{,}", "'{,}'");
  CHECK_PARSE_EQ("{", "'{'");
  CHECK_PARSE_EQ("{z}", "'{z}'");
  CHECK_PARSE_EQ("{1z}", "'{1z}'");
  CHECK_PARSE_EQ("{12z}", "'{12z}'");
  CHECK_PARSE_EQ("{12,", "'{12,'");
  CHECK_PARSE_EQ("{12,3b", "'{12,3b'");

  // Expected min_match()/max_match() values; RegExpTree::kInfinity is the
  // expected maximum for the unbounded repetitions listed here.
  CHECK_MIN_MAX("a", 1, 1);
  CHECK_MIN_MAX("abc", 3, 3);
  CHECK_MIN_MAX("a[bc]d", 3, 3);
  CHECK_MIN_MAX("a|bc", 1, 2);
  CHECK_MIN_MAX("ab|c", 1, 2);
  CHECK_MIN_MAX("a||bc", 0, 2);
  CHECK_MIN_MAX("|", 0, 0);
  CHECK_MIN_MAX("(?:ab)", 2, 2);
  CHECK_MIN_MAX("(?:ab|cde)", 2, 3);
  CHECK_MIN_MAX("(?:ab)|cde", 2, 3);
  CHECK_MIN_MAX("(ab)", 2, 2);
  CHECK_MIN_MAX("(ab|cde)", 2, 3);
  CHECK_MIN_MAX("(ab)\\1", 2, 4);
  CHECK_MIN_MAX("(ab|cde)\\1", 2, 6);
  CHECK_MIN_MAX("(?:ab)?", 0, 2);
  CHECK_MIN_MAX("(?:ab)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:ab)+", 2, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a?", 0, 1);
  CHECK_MIN_MAX("a*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a??", 0, 1);
  CHECK_MIN_MAX("a*?", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+?", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a?)?", 0, 1);
  CHECK_MIN_MAX("(?:a*)?", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a+)?", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a?)+", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a*)+", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a+)+", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a?)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a*)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a+)*", 0, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a{0}", 0, 0);
  CHECK_MIN_MAX("(?:a+){0}", 0, 0);
  CHECK_MIN_MAX("(?:a+){0,0}", 0, 0);
  CHECK_MIN_MAX("a*b", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+b", 2, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a*b|c", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("a+b|c", 1, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:a{5,1000000}){3,1000000}", 15, RegExpTree::kInfinity);
  CHECK_MIN_MAX("(?:ab){4,7}", 8, 14);
  CHECK_MIN_MAX("a\\bc", 2, 2);
  CHECK_MIN_MAX("a\\Bc", 2, 2);
  CHECK_MIN_MAX("a\\sc", 3, 3);
  CHECK_MIN_MAX("a\\Sc", 3, 3);
  CHECK_MIN_MAX("a(?=b)c", 2, 2);
  CHECK_MIN_MAX("a(?=bbb|bb)c", 2, 2);
  CHECK_MIN_MAX("a(?!bbb|bb)c", 2, 2);
}
381
382
// A few additional parse expectations kept as regression checks: a class
// followed by another class, a malformed quantifier body, a lone '{', and an
// alternation with an empty right-hand side.
TEST(ParserRegression) {
  CHECK_PARSE_EQ("[A-Z$-][x]", "(! [A-Z $ -] [x])");
  CHECK_PARSE_EQ("a{3,4*}", "(: 'a{3,' (# 0 - g '4') '}')");
  CHECK_PARSE_EQ("{", "'{'");
  CHECK_PARSE_EQ("a|", "(| 'a' %)");
}
389
ExpectError(const char * input,const char * expected)390 static void ExpectError(const char* input,
391 const char* expected) {
392 V8::Initialize(NULL);
393 v8::HandleScope scope(CcTest::isolate());
394 Zone zone(CcTest::i_isolate());
395 FlatStringReader reader(CcTest::i_isolate(), CStrVector(input));
396 RegExpCompileData result;
397 CHECK(!v8::internal::RegExpParser::ParseRegExp(
398 &reader, false, &result, &zone));
399 CHECK(result.tree == NULL);
400 CHECK(!result.error.is_null());
401 SmartArrayPointer<char> str = result.error->ToCString(ALLOW_NULLS);
402 CHECK_EQ(expected, *str);
403 }
404
405
// Checks the error message reported for a set of malformed patterns.
TEST(Errors) {
  // Expected parser messages.
  const char* kEndBackslash = "\\ at end of pattern";
  const char* kUnterminatedGroup = "Unterminated group";
  const char* kInvalidGroup = "Invalid group";
  const char* kUnterminatedCharacterClass = "Unterminated character class";
  const char* kNothingToRepeat = "Nothing to repeat";
  const char* kTooManyCaptures = "Too many captures";

  ExpectError("\\", kEndBackslash);
  ExpectError("(foo", kUnterminatedGroup);
  ExpectError("(?", kInvalidGroup);
  ExpectError("[", kUnterminatedCharacterClass);
  ExpectError("[a-", kUnterminatedCharacterClass);

  // Quantifiers with nothing in front of them.
  const char* const kBareQuantifiers[] = {
    "*", "?", "+", "{1}", "{1,2}", "{1,}"
  };
  const int kBareQuantifierCount =
      sizeof(kBareQuantifiers) / sizeof(kBareQuantifiers[0]);
  for (int index = 0; index < kBareQuantifierCount; index++) {
    ExpectError(kBareQuantifiers[index], kNothingToRepeat);
  }

  // A pattern made of kMaxCaptures + 1 empty capture groups must be rejected.
  const int kMaxCaptures = 1 << 16;  // Must match RegExpParser::kMaxCaptures.
  HeapStringAllocator allocator;
  StringStream accumulator(&allocator);
  int groups_written = 0;
  while (groups_written <= kMaxCaptures) {
    accumulator.Add("()");
    groups_written++;
  }
  SmartArrayPointer<const char> many_captures(accumulator.ToCString());
  ExpectError(*many_captures, kTooManyCaptures);
}
435
436
IsDigit(uc16 c)437 static bool IsDigit(uc16 c) {
438 return ('0' <= c && c <= '9');
439 }
440
441
NotDigit(uc16 c)442 static bool NotDigit(uc16 c) {
443 return !IsDigit(c);
444 }
445
446
IsWhiteSpace(uc16 c)447 static bool IsWhiteSpace(uc16 c) {
448 switch (c) {
449 case 0x09:
450 case 0x0A:
451 case 0x0B:
452 case 0x0C:
453 case 0x0d:
454 case 0x20:
455 case 0xA0:
456 case 0x2028:
457 case 0x2029:
458 case 0xFEFF:
459 return true;
460 default:
461 return unibrow::Space::Is(c);
462 }
463 }
464
465
NotWhiteSpace(uc16 c)466 static bool NotWhiteSpace(uc16 c) {
467 return !IsWhiteSpace(c);
468 }
469
470
NotWord(uc16 c)471 static bool NotWord(uc16 c) {
472 return !IsRegExpWord(c);
473 }
474
475
TestCharacterClassEscapes(uc16 c,bool (pred)(uc16 c))476 static void TestCharacterClassEscapes(uc16 c, bool (pred)(uc16 c)) {
477 Zone zone(CcTest::i_isolate());
478 ZoneList<CharacterRange>* ranges =
479 new(&zone) ZoneList<CharacterRange>(2, &zone);
480 CharacterRange::AddClassEscape(c, ranges, &zone);
481 for (unsigned i = 0; i < (1 << 16); i++) {
482 bool in_class = false;
483 for (int j = 0; !in_class && j < ranges->length(); j++) {
484 CharacterRange& range = ranges->at(j);
485 in_class = (range.from() <= i && i <= range.to());
486 }
487 CHECK_EQ(pred(i), in_class);
488 }
489 }
490
491
// Runs TestCharacterClassEscapes for each class escape character paired with
// the predicate its range list is expected to agree with.
TEST(CharacterClassEscapes) {
  v8::internal::V8::Initialize(NULL);
  struct EscapeCase {
    uc16 escape;
    bool (*predicate)(uc16 c);
  };
  const EscapeCase kCases[] = {
    { '.', IsRegExpNewline },
    { 'd', IsDigit },
    { 'D', NotDigit },
    { 's', IsWhiteSpace },
    { 'S', NotWhiteSpace },
    { 'w', IsRegExpWord },
    { 'W', NotWord }
  };
  const int kCaseCount = sizeof(kCases) / sizeof(kCases[0]);
  for (int index = 0; index < kCaseCount; index++) {
    TestCharacterClassEscapes(kCases[index].escape, kCases[index].predicate);
  }
}
502
503
Compile(const char * input,bool multiline,bool is_ascii,Zone * zone)504 static RegExpNode* Compile(const char* input,
505 bool multiline,
506 bool is_ascii,
507 Zone* zone) {
508 V8::Initialize(NULL);
509 Isolate* isolate = CcTest::i_isolate();
510 FlatStringReader reader(isolate, CStrVector(input));
511 RegExpCompileData compile_data;
512 if (!v8::internal::RegExpParser::ParseRegExp(&reader, multiline,
513 &compile_data, zone))
514 return NULL;
515 Handle<String> pattern = isolate->factory()->
516 NewStringFromUtf8(CStrVector(input));
517 Handle<String> sample_subject =
518 isolate->factory()->NewStringFromUtf8(CStrVector(""));
519 RegExpEngine::Compile(&compile_data,
520 false,
521 false,
522 multiline,
523 pattern,
524 sample_subject,
525 is_ascii,
526 zone);
527 return compile_data.node;
528 }
529
530
Execute(const char * input,bool multiline,bool is_ascii,bool dot_output=false)531 static void Execute(const char* input,
532 bool multiline,
533 bool is_ascii,
534 bool dot_output = false) {
535 v8::HandleScope scope(CcTest::isolate());
536 Zone zone(CcTest::i_isolate());
537 RegExpNode* node = Compile(input, multiline, is_ascii, &zone);
538 USE(node);
539 #ifdef DEBUG
540 if (dot_output) {
541 RegExpEngine::DotPrint(input, node, false);
542 exit(0);
543 }
544 #endif // DEBUG
545 }
546
547
// Configuration type handed to ZoneSplayTree in the splay tree test: integer
// keys and values, with a three-way integer comparison.
class TestConfig {
 public:
  typedef int Key;
  typedef int Value;
  static const int kNoKey;
  static int NoValue() { return 0; }
  // Returns -1, 0 or 1 when |a| is less than, equal to or greater than |b|.
  static inline int Compare(int a, int b) {
    if (a == b) return 0;
    return (a < b) ? -1 : 1;
  }
};


const int TestConfig::kNoKey = 0;
566
567
// Deterministic mixing of two integers used as a pseudo-random source by the
// tests: XORs i * 781 with j * 329 and returns the result as unsigned.
static unsigned PseudoRandom(int i, int j) {
  const int scaled_i = i * 781;
  const int scaled_j = j * 329;
  const int mixed = scaled_i ^ scaled_j;
  return mixed;
}
571
572
// Drives a ZoneSplayTree<TestConfig> with a pseudo-random sequence of
// insertions and removals, mirroring the expected contents in the |seen|
// array and comparing the two after every mutation.
TEST(SplayTreeSimple) {
  v8::internal::V8::Initialize(NULL);
  static const unsigned kLimit = 1000;
  Zone zone(CcTest::i_isolate());
  ZoneSplayTree<TestConfig> tree(&zone);
  // seen[k] is true exactly when key k is expected to be in the tree.
  bool seen[kLimit];
  for (unsigned i = 0; i < kLimit; i++) seen[i] = false;
// Compares tree membership with |seen| for every key below kLimit.  Relies on
// a Locator named |loc| being in scope at the point of use.
#define CHECK_MAPS_EQUAL() do {                                      \
    for (unsigned k = 0; k < kLimit; k++)                            \
      CHECK_EQ(seen[k], tree.Find(k, &loc));                         \
  } while (false)
  for (int i = 0; i < 50; i++) {
    for (int j = 0; j < 50; j++) {
      unsigned next = PseudoRandom(i, j) % kLimit;
      if (seen[next]) {
        // We've already seen this one.  Check the value and remove
        // it.
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.Find(next, &loc));
        CHECK_EQ(next, loc.key());
        CHECK_EQ(3 * next, loc.value());
        tree.Remove(next);
        seen[next] = false;
        CHECK_MAPS_EQUAL();
      } else {
        // Check that it wasn't there already and then add it.
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(!tree.Find(next, &loc));
        CHECK(tree.Insert(next, &loc));
        CHECK_EQ(next, loc.key());
        // Values are stored as three times the key; checked on removal above.
        loc.set_value(3 * next);
        seen[next] = true;
        CHECK_MAPS_EQUAL();
      }
      // For a key known to be present, both neighbour queries below are
      // expected to return that key itself.
      int val = PseudoRandom(j, i) % kLimit;
      if (seen[val]) {
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.FindGreatestLessThan(val, &loc));
        CHECK_EQ(loc.key(), val);
        // Leaves the inner (j) loop; the outer loop continues with the next i.
        break;
      }
      val = PseudoRandom(i + j, i - j) % kLimit;
      if (seen[val]) {
        ZoneSplayTree<TestConfig>::Locator loc;
        CHECK(tree.FindLeastGreaterThan(val, &loc));
        CHECK_EQ(loc.key(), val);
        // Leaves the inner (j) loop as well.
        break;
      }
    }
  }
}
624
625
// Fills a DispatchTable with kRangeCount sets of pseudo-random character
// ranges and then checks, for every value below kLimit, that the OutSet
// returned by the table contains exactly the sets whose ranges cover it.
TEST(DispatchTableConstruction) {
  v8::internal::V8::Initialize(NULL);
  // Initialize test data.
  static const int kLimit = 1000;
  static const int kRangeCount = 8;
  static const int kRangeSize = 16;
  // Each row holds 2 * kRangeSize sorted end points; consecutive pairs
  // (row[2n], row[2n + 1]) are used as one range below.
  uc16 ranges[kRangeCount][2 * kRangeSize];
  for (int i = 0; i < kRangeCount; i++) {
    Vector<uc16> range(ranges[i], 2 * kRangeSize);
    for (int j = 0; j < 2 * kRangeSize; j++) {
      range[j] = PseudoRandom(i + 25, j + 87) % kLimit;
    }
    range.Sort();
    // Sanity check that the end points really are in non-decreasing order.
    for (int j = 1; j < 2 * kRangeSize; j++) {
      CHECK(range[j-1] <= range[j]);
    }
  }
  // Enter test data into dispatch table.
  Zone zone(CcTest::i_isolate());
  DispatchTable table(&zone);
  for (int i = 0; i < kRangeCount; i++) {
    uc16* range = ranges[i];
    for (int j = 0; j < 2 * kRangeSize; j += 2)
      table.AddRange(CharacterRange(range[j], range[j + 1]), i, &zone);
  }
  // Check that the table looks as we would expect
  for (int p = 0; p < kLimit; p++) {
    OutSet* outs = table.Get(p);
    for (int j = 0; j < kRangeCount; j++) {
      uc16* range = ranges[j];
      // Recompute by brute force whether any range of set j covers p.
      bool is_on = false;
      for (int k = 0; !is_on && (k < 2 * kRangeSize); k += 2)
        is_on = (range[k] <= p && p <= range[k + 1]);
      CHECK_EQ(is_on, outs->Get(j));
    }
  }
}
663
664
665 // Test of debug-only syntax.
666 #ifdef DEBUG
667
// Checks that possessive quantifiers (a quantifier followed by '+') parse
// only while FLAG_regexp_possessive_quantifier is set.  The flag's previous
// value is saved on entry and restored at the end of the test.
TEST(ParsePossessiveRepetition) {
  bool old_flag_value = FLAG_regexp_possessive_quantifier;

  // Enable possessive quantifier syntax.
  FLAG_regexp_possessive_quantifier = true;

  CHECK_PARSE_EQ("a*+", "(# 0 - p 'a')");
  CHECK_PARSE_EQ("a++", "(# 1 - p 'a')");
  CHECK_PARSE_EQ("a?+", "(# 0 1 p 'a')");
  CHECK_PARSE_EQ("a{10,20}+", "(# 10 20 p 'a')");
  CHECK_PARSE_EQ("za{10,20}+b", "(: 'z' (# 10 20 p 'a') 'b')");

  // Disable possessive quantifier syntax.
  FLAG_regexp_possessive_quantifier = false;

  // With the flag off the same kind of pattern must be rejected.
  CHECK_PARSE_ERROR("a*+");
  CHECK_PARSE_ERROR("a++");
  CHECK_PARSE_ERROR("a?+");
  CHECK_PARSE_ERROR("a{10,20}+");
  CHECK_PARSE_ERROR("a{10,20}+b");

  FLAG_regexp_possessive_quantifier = old_flag_value;
}
691
692 #endif
693
694 // Tests of interpreter.
695
696
697 #ifndef V8_INTERPRETED_REGEXP
698
// Pick the regexp macro assembler class matching the target architecture the
// file is compiled for; the native tests below use it under the single name
// ArchRegExpMacroAssembler.
#if V8_TARGET_ARCH_IA32
typedef RegExpMacroAssemblerIA32 ArchRegExpMacroAssembler;
#elif V8_TARGET_ARCH_X64
typedef RegExpMacroAssemblerX64 ArchRegExpMacroAssembler;
#elif V8_TARGET_ARCH_ARM
typedef RegExpMacroAssemblerARM ArchRegExpMacroAssembler;
#elif V8_TARGET_ARCH_MIPS
typedef RegExpMacroAssemblerMIPS ArchRegExpMacroAssembler;
#endif
708
// Scoped helper for the native tests: on construction it opens a handle
// scope, creates a new v8::Context on the test isolate and enters it; on
// destruction it exits that context again.
class ContextInitializer {
 public:
  ContextInitializer()
      : scope_(CcTest::isolate()),
        env_(v8::Context::New(CcTest::isolate())) {
    env_->Enter();
  }
  ~ContextInitializer() {
    env_->Exit();
  }
 private:
  // Declared before env_, so the handle scope is constructed first and
  // destroyed last.
  v8::HandleScope scope_;
  v8::Handle<v8::Context> env_;
};
723
724
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * captures)725 static ArchRegExpMacroAssembler::Result Execute(Code* code,
726 String* input,
727 int start_offset,
728 const byte* input_start,
729 const byte* input_end,
730 int* captures) {
731 return NativeRegExpMacroAssembler::Execute(
732 code,
733 input,
734 start_offset,
735 input_start,
736 input_end,
737 captures,
738 0,
739 CcTest::i_isolate());
740 }
741
742
// Generates native code whose only emitted operation is Succeed(), runs it
// over the string "foofoo" and expects SUCCESS with all four capture slots
// set to -1.
TEST(MacroAssemblerNativeSuccess) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);

  m.Succeed();

  Handle<String> source = factory->NewStringFromAscii(CStrVector(""));
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // Seed the capture array with arbitrary values so the checks below can
  // tell that every slot was overwritten.
  int captures[4] = {42, 37, 87, 117};
  Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo"));
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  const byte* start_adr =
      reinterpret_cast<const byte*>(seq_input->GetCharsAddress());

  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + seq_input->length(),
              captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(-1, captures[0]);
  CHECK_EQ(-1, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);
}
778
779
// Hand-assembles native code for the source string "^foo" in ASCII mode and
// runs it twice: on "foofoo", expecting SUCCESS with captures {0, 3, -1, -1},
// and on "barbarbar", expecting FAILURE.
TEST(MacroAssemblerNativeSimple) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);

  // The emitted program: the characters at offsets 2, 1 and 0 are compared
  // against 'o', 'o' and 'f'; registers 0 and 1 are written with offsets 0
  // and 3 before the position is advanced by 3.
  Label fail, backtrack;
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(1, NULL, false);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(0, NULL, false);
  m.CheckNotCharacter('f', NULL);
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 3);
  m.AdvanceCurrentPosition(3);
  m.PushBacktrack(&backtrack);
  m.Succeed();
  m.Bind(&backtrack);
  m.Backtrack();
  m.Bind(&fail);
  m.Fail();

  Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo"));
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // Arbitrary initial values, so the checks below detect overwritten slots.
  int captures[4] = {42, 37, 87, 117};
  Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo"));
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);

  // Second run: a subject that does not start with "foo".
  input = factory->NewStringFromAscii(CStrVector("barbarbar"));
  seq_input = Handle<SeqOneByteString>::cast(input);
  start_adr = seq_input->GetCharsAddress();

  result = Execute(*code,
                   *input,
                   0,
                   start_adr,
                   start_adr + input->length(),
                   captures);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
}
844
845
// Two-byte (UC16) variant of the simple native test: hand-assembles code for
// the source string "^foo" and runs it on two two-byte subjects, expecting
// SUCCESS with captures {0, 3, -1, -1} for the first and FAILURE for the
// second.  Each subject ends in 0x2603 so that it is not a one-byte string.
TEST(MacroAssemblerNativeSimpleUC16) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone);

  // The emitted program: the characters at offsets 2, 1 and 0 are compared
  // against 'o', 'o' and 'f'; registers 0 and 1 are written with offsets 0
  // and 3 before the position is advanced by 3.
  Label fail, backtrack;
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(1, NULL, false);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(0, NULL, false);
  m.CheckNotCharacter('f', NULL);
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 3);
  m.AdvanceCurrentPosition(3);
  m.PushBacktrack(&backtrack);
  m.Succeed();
  m.Bind(&backtrack);
  m.Backtrack();
  m.Bind(&fail);
  m.Fail();

  Handle<String> source = factory->NewStringFromAscii(CStrVector("^foo"));
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // Arbitrary initial values, so the checks below detect overwritten slots.
  int captures[4] = {42, 37, 87, 117};
  const uc16 input_data[6] = {'f', 'o', 'o', 'f', 'o',
                              static_cast<uc16>(0x2603)};
  Handle<String> input =
      factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
  Handle<SeqTwoByteString> seq_input = Handle<SeqTwoByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  // The end address is a byte address, and every character of a two-byte
  // string occupies two bytes, so the subject ends at length() * 2 bytes past
  // the start (the second run below computes it the same way).
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length() * 2,
              captures);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(-1, captures[2]);
  CHECK_EQ(-1, captures[3]);

  // Second run: a two-byte subject that does not start with "foo".
  const uc16 input_data2[9] = {'b', 'a', 'r', 'b', 'a', 'r', 'b', 'a',
                               static_cast<uc16>(0x2603)};
  input = factory->NewStringFromTwoByte(Vector<const uc16>(input_data2, 9));
  seq_input = Handle<SeqTwoByteString>::cast(input);
  start_adr = seq_input->GetCharsAddress();

  result = Execute(*code,
                   *input,
                   0,
                   start_adr,
                   start_adr + input->length() * 2,
                   captures);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
}
915
916
// Hand-assembles native code that tries to load the character at offset 10,
// which lies beyond the end of the six-character subject "foofoo", and
// expects the run to end in FAILURE.  No capture registers are used, so NULL
// is passed for the capture array.
TEST(MacroAssemblerNativeBacktrack) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone);

  Label fail;
  Label backtrack;
  // First attempt names an explicit label (&fail) for the load at offset 10.
  m.LoadCurrentCharacter(10, &fail);
  m.Succeed();
  m.Bind(&fail);
  // Second attempt passes NULL instead, after pushing &backtrack, which is
  // bound to the Fail() below.
  m.PushBacktrack(&backtrack);
  m.LoadCurrentCharacter(10, NULL);
  m.Succeed();
  m.Bind(&backtrack);
  m.Fail();

  Handle<String> source = factory->NewStringFromAscii(CStrVector(".........."));
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  Handle<String> input = factory->NewStringFromAscii(CStrVector("foofoo"));
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::FAILURE, result);
}
955
956
// Exercises CheckNotBackReference on the one-byte subject "fooofo".
// Registers 0/1 delimit the first two characters; the back reference must
// mismatch at position 2 and match at position 4, leaving position 6 in
// register 2.
TEST(MacroAssemblerNativeBackReferenceASCII) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);

  Label first_mismatch;
  Label second_mismatch;

  // Record the capture [0, 2).
  m.WriteCurrentPositionToRegister(0, 0);
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(1, 0);
  // First comparison is expected to take the mismatch branch.
  m.CheckNotBackReference(0, &first_mismatch);
  m.Fail();
  m.Bind(&first_mismatch);
  m.AdvanceCurrentPosition(2);
  // Second comparison is expected to fall through (i.e. to match).
  m.CheckNotBackReference(0, &second_mismatch);
  m.WriteCurrentPositionToRegister(2, 0);
  m.Succeed();
  m.Bind(&second_mismatch);
  m.Fail();

  Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1"));
  Handle<Code> code = Handle<Code>::cast(m.GetCode(source));

  Handle<String> input = factory->NewStringFromAscii(CStrVector("fooofo"));
  Address input_start =
      Handle<SeqOneByteString>::cast(input)->GetCharsAddress();
  Address input_end = input_start + input->length();

  int registers_out[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *input, 0, input_start, input_end, registers_out);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, registers_out[0]);
  CHECK_EQ(2, registers_out[1]);
  CHECK_EQ(6, registers_out[2]);
  CHECK_EQ(-1, registers_out[3]);
}
1004
1005
// Two-byte counterpart of the back-reference test: the subject contains
// U+2028, the assembler is created in UC16 mode, and the end address is
// computed with two bytes per character.
TEST(MacroAssemblerNativeBackReferenceUC16) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::UC16, 4, &zone);

  Label first_mismatch;
  Label second_mismatch;

  // Record the capture [0, 2).
  m.WriteCurrentPositionToRegister(0, 0);
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(1, 0);
  // First comparison is expected to take the mismatch branch.
  m.CheckNotBackReference(0, &first_mismatch);
  m.Fail();
  m.Bind(&first_mismatch);
  m.AdvanceCurrentPosition(2);
  // Second comparison is expected to fall through (i.e. to match).
  m.CheckNotBackReference(0, &second_mismatch);
  m.WriteCurrentPositionToRegister(2, 0);
  m.Succeed();
  m.Bind(&second_mismatch);
  m.Fail();

  Handle<String> source = factory->NewStringFromAscii(CStrVector("^(..)..\1"));
  Handle<Code> code = Handle<Code>::cast(m.GetCode(source));

  const uc16 input_data[6] = {'f', 0x2028, 'o', 'o', 'f', 0x2028};
  Handle<String> input =
      factory->NewStringFromTwoByte(Vector<const uc16>(input_data, 6));
  Address input_start =
      Handle<SeqTwoByteString>::cast(input)->GetCharsAddress();
  // Two bytes per character in a two-byte string.
  Address input_end = input_start + input->length() * 2;

  int registers_out[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *input, 0, input_start, input_end, registers_out);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, registers_out[0]);
  CHECK_EQ(2, registers_out[1]);
  CHECK_EQ(6, registers_out[2]);
  CHECK_EQ(-1, registers_out[3]);
}
1055
1056
1057
// Exercises CheckNotAtStart in the native macro assembler. The same code is
// run twice over "foobar": once from offset 0 (the at-start branch, which
// expects the current character to be 'f') and once from offset 3 (the
// not-at-start branch, which expects previous 'o' and current 'b'). Both runs
// must succeed.
TEST(MacroAssemblernativeAtStart) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone);

  Label not_at_start, newline, fail;
  m.CheckNotAtStart(&not_at_start);
  // Check that prevchar = '\n' and current = 'f'.
  m.CheckCharacter('\n', &newline);
  m.Bind(&fail);
  m.Fail();
  m.Bind(&newline);
  m.LoadCurrentCharacter(0, &fail);
  m.CheckNotCharacter('f', &fail);
  m.Succeed();

  m.Bind(&not_at_start);
  // Check that prevchar = 'o' and current = 'b'.
  Label prevo;
  m.CheckCharacter('o', &prevo);
  m.Fail();
  m.Bind(&prevo);
  m.LoadCurrentCharacter(0, &fail);
  m.CheckNotCharacter('b', &fail);
  m.Succeed();

  Handle<String> source = factory->NewStringFromAscii(CStrVector("(^f|ob)"));
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  Handle<String> input = factory->NewStringFromAscii(CStrVector("foobar"));
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  // First run: start of the subject string.
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);

  // Second run: start offset 3, so the input pointer is advanced to match.
  result = Execute(*code,
                   *input,
                   3,
                   start_adr + 3,
                   start_adr + input->length(),
                   NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
}
1115
1116
// Exercises CheckNotBackReferenceIgnoreCase on "aBcAbCABCxYzab". Registers
// 2/3 delimit the first three characters; the capture must match
// case-insensitively twice, mismatch on "xYz", and mismatch again on the
// trailing "ab". Register 1 then records position 12.
TEST(MacroAssemblerNativeBackRefNoCase) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 4, &zone);

  Label unexpected_mismatch;
  Label third_mismatch;
  Label final_mismatch;

  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(2, 0);
  m.AdvanceCurrentPosition(3);
  m.WriteCurrentPositionToRegister(3, 0);
  m.CheckNotBackReferenceIgnoreCase(2, &unexpected_mismatch);  // Match "AbC".
  m.CheckNotBackReferenceIgnoreCase(2, &unexpected_mismatch);  // Match "ABC".
  // The third comparison is the one that is supposed to mismatch.
  m.CheckNotBackReferenceIgnoreCase(2, &third_mismatch);
  m.Bind(&unexpected_mismatch);
  m.Fail();

  m.Bind(&third_mismatch);
  m.AdvanceCurrentPosition(3);  // Skip "xYz"
  m.CheckNotBackReferenceIgnoreCase(2, &final_mismatch);
  m.Fail();

  m.Bind(&final_mismatch);
  m.WriteCurrentPositionToRegister(1, 0);
  m.Succeed();

  Handle<String> source =
      factory->NewStringFromAscii(CStrVector("^(abc)\1\1(?!\1)...(?!\1)"));
  Handle<Code> code = Handle<Code>::cast(m.GetCode(source));

  Handle<String> input =
      factory->NewStringFromAscii(CStrVector("aBcAbCABCxYzab"));
  Address input_start =
      Handle<SeqOneByteString>::cast(input)->GetCharsAddress();
  Address input_end = input_start + input->length();

  int registers_out[4];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *input, 0, input_start, input_end, registers_out);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, registers_out[0]);
  CHECK_EQ(12, registers_out[1]);
  CHECK_EQ(0, registers_out[2]);
  CHECK_EQ(3, registers_out[3]);
}
1173
1174
1175
// Exercises register and backtrack-stack operations of the native macro
// assembler: push/pop of registers, saving and restoring the stack pointer,
// counting loops driven by IfRegisterLT / IfRegisterGE, and CheckGreedyLoop.
// The six output registers are expected to end up as [0, 3, 6, 9, 9, -1].
TEST(MacroAssemblerNativeRegisters) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 6, &zone);

  // out1..out6 are the six output registers; sp and loop_cnt are scratch.
  enum registers { out1, out2, out3, out4, out5, out6, sp, loop_cnt };
  Label fail;
  Label backtrack;
  m.WriteCurrentPositionToRegister(out1, 0);  // Output: [0]
  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushBacktrack(&backtrack);
  m.WriteStackPointerToRegister(sp);
  // Fill stack and registers
  m.AdvanceCurrentPosition(2);
  m.WriteCurrentPositionToRegister(out1, 0);
  m.PushRegister(out1, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushBacktrack(&fail);
  // Drop backtrack stack frames.
  m.ReadStackPointerFromRegister(sp);
  // And take the first backtrack (to &backtrack)
  m.Backtrack();

  // Emitted directly after the unconditional Backtrack() and before the next
  // bound label, so no visible jump targets these three instructions.
  m.PushCurrentPosition();
  m.AdvanceCurrentPosition(2);
  m.PopCurrentPosition();

  m.Bind(&backtrack);
  m.PopRegister(out1);
  m.ReadCurrentPositionFromRegister(out1);
  m.AdvanceCurrentPosition(3);
  m.WriteCurrentPositionToRegister(out2, 0);  // [0,3]

  // Count up from 0: three iterations, one character each.
  Label loop;
  m.SetRegister(loop_cnt, 0);  // loop counter
  m.Bind(&loop);
  m.AdvanceRegister(loop_cnt, 1);
  m.AdvanceCurrentPosition(1);
  m.IfRegisterLT(loop_cnt, 3, &loop);
  m.WriteCurrentPositionToRegister(out3, 0);  // [0,3,6]

  // Count down from 2: three iterations, one character each.
  Label loop2;
  m.SetRegister(loop_cnt, 2);  // loop counter
  m.Bind(&loop2);
  m.AdvanceRegister(loop_cnt, -1);
  m.AdvanceCurrentPosition(1);
  m.IfRegisterGE(loop_cnt, 0, &loop2);
  m.WriteCurrentPositionToRegister(out4, 0);  // [0,3,6,9]

  Label loop3;
  Label exit_loop3;
  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PushRegister(out4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.ReadCurrentPositionFromRegister(out3);
  m.Bind(&loop3);
  m.AdvanceCurrentPosition(1);
  m.CheckGreedyLoop(&exit_loop3);
  m.GoTo(&loop3);
  m.Bind(&exit_loop3);
  m.PopCurrentPosition();
  m.WriteCurrentPositionToRegister(out5, 0);  // [0,3,6,9,9,-1]

  m.Succeed();

  m.Bind(&fail);
  m.Fail();

  Handle<String> source =
      factory->NewStringFromAscii(CStrVector("<loop test>"));
  Handle<Object> code_object = m.GetCode(source);
  Handle<Code> code = Handle<Code>::cast(code_object);

  // String long enough for test (content doesn't matter).
  Handle<String> input =
      factory->NewStringFromAscii(CStrVector("foofoofoofoofoo"));
  Handle<SeqOneByteString> seq_input = Handle<SeqOneByteString>::cast(input);
  Address start_adr = seq_input->GetCharsAddress();

  int output[6];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code,
              *input,
              0,
              start_adr,
              start_adr + input->length(),
              output);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, output[0]);
  CHECK_EQ(3, output[1]);
  CHECK_EQ(6, output[2]);
  CHECK_EQ(9, output[3]);
  CHECK_EQ(9, output[4]);
  CHECK_EQ(-1, output[5]);
}
1277
1278
// Generates code that pushes a backtrack location in an endless loop and
// checks that execution reports EXCEPTION and leaves a pending exception on
// the isolate, which the test then clears.
TEST(MacroAssemblerStackOverflow) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 0, &zone);

  // Unbounded loop: every iteration pushes one more backtrack entry.
  Label push_forever;
  m.Bind(&push_forever);
  m.PushBacktrack(&push_forever);
  m.GoTo(&push_forever);

  Handle<String> source =
      factory->NewStringFromAscii(CStrVector("<stack overflow test>"));
  Handle<Code> code = Handle<Code>::cast(m.GetCode(source));

  // String long enough for test (content doesn't matter).
  Handle<String> input = factory->NewStringFromAscii(CStrVector("dummy"));
  Address input_start =
      Handle<SeqOneByteString>::cast(input)->GetCharsAddress();
  Address input_end = input_start + input->length();

  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *input, 0, input_start, input_end, NULL);

  CHECK_EQ(NativeRegExpMacroAssembler::EXCEPTION, result);
  CHECK(isolate->has_pending_exception());
  isolate->clear_pending_exception();
}
1316
1317
// Uses a very high register index to check that a large register area works.
// Register 8000 is written with (position + 42) at position 0, pushed, and
// popped into register 1, so the two output registers must read 0 and 42.
TEST(MacroAssemblerNativeLotsOfRegisters) {
  v8::V8::Initialize();
  ContextInitializer initializer;
  Isolate* isolate = CcTest::i_isolate();
  Factory* factory = isolate->factory();
  Zone zone(isolate);

  ArchRegExpMacroAssembler m(NativeRegExpMacroAssembler::ASCII, 2, &zone);

  // At least 2048, to ensure the allocated space for registers
  // span one full page.
  const int large_number = 8000;
  Label after_backref;
  m.WriteCurrentPositionToRegister(large_number, 42);
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 1);
  // Both outcomes of the check continue at the same label.
  m.CheckNotBackReference(0, &after_backref);  // Performs a system-stack push.
  m.Bind(&after_backref);
  m.PushRegister(large_number, RegExpMacroAssembler::kNoStackLimitCheck);
  m.PopRegister(1);
  m.Succeed();

  Handle<String> source =
      factory->NewStringFromAscii(CStrVector("<huge register space test>"));
  Handle<Code> code = Handle<Code>::cast(m.GetCode(source));

  // String long enough for test (content doesn't matter).
  Handle<String> input =
      factory->NewStringFromAscii(CStrVector("sample text"));
  Address input_start =
      Handle<SeqOneByteString>::cast(input)->GetCharsAddress();
  Address input_end = input_start + input->length();

  int registers_out[2];
  NativeRegExpMacroAssembler::Result result =
      Execute(*code, *input, 0, input_start, input_end, registers_out);

  CHECK_EQ(NativeRegExpMacroAssembler::SUCCESS, result);
  CHECK_EQ(0, registers_out[0]);
  CHECK_EQ(42, registers_out[1]);

  isolate->clear_pending_exception();
}
1366
1367 #else // V8_INTERPRETED_REGEXP
1368
// Builds bytecode for ^f(o)o with the Irregexp bytecode macro assembler and
// runs it through the interpreter on a matching and a non-matching two-byte
// string. Register 4 is set to 42, pushed, then advanced by 42 (so it reads
// 84 on success); on the failure path the pushed 42 is popped into
// register 0.
TEST(MacroAssembler) {
  V8::Initialize(NULL);
  Isolate* isolate = CcTest::i_isolate();
  byte codes[1024];
  Zone zone(isolate);
  RegExpMacroAssemblerIrregexp m(Vector<byte>(codes, 1024), &zone);
  // ^f(o)o.
  Label start;
  Label fail;
  Label backtrack;

  m.SetRegister(4, 42);
  m.PushRegister(4, RegExpMacroAssembler::kNoStackLimitCheck);
  m.AdvanceRegister(4, 42);
  m.GoTo(&start);
  m.Fail();
  m.Bind(&start);
  m.PushBacktrack(&fail);
  m.CheckNotAtStart(NULL);
  // Compare the three characters 'f', 'o', 'o' at offsets 0..2.
  m.LoadCurrentCharacter(0, NULL);
  m.CheckNotCharacter('f', NULL);
  m.LoadCurrentCharacter(1, NULL);
  m.CheckNotCharacter('o', NULL);
  m.LoadCurrentCharacter(2, NULL);
  m.CheckNotCharacter('o', NULL);
  // Whole match in registers 0/1, the inner group in registers 2/3.
  m.WriteCurrentPositionToRegister(0, 0);
  m.WriteCurrentPositionToRegister(1, 3);
  m.WriteCurrentPositionToRegister(2, 1);
  m.WriteCurrentPositionToRegister(3, 2);
  m.AdvanceCurrentPosition(3);
  m.PushBacktrack(&backtrack);
  m.Succeed();
  m.Bind(&backtrack);
  m.ClearRegisters(2, 3);
  m.Backtrack();
  m.Bind(&fail);
  m.PopRegister(0);
  m.Fail();

  Factory* factory = isolate->factory();
  HandleScope scope(isolate);

  Handle<String> source = factory->NewStringFromAscii(CStrVector("^f(o)o"));
  Handle<ByteArray> bytecode = Handle<ByteArray>::cast(m.GetCode(source));
  int captures[5];

  const uc16 matching_chars[] = {'f', 'o', 'o', 'b', 'a', 'r'};
  Handle<String> matching =
      factory->NewStringFromTwoByte(Vector<const uc16>(matching_chars, 6));

  CHECK(IrregexpInterpreter::Match(isolate, bytecode, matching, captures, 0));
  CHECK_EQ(0, captures[0]);
  CHECK_EQ(3, captures[1]);
  CHECK_EQ(1, captures[2]);
  CHECK_EQ(2, captures[3]);
  CHECK_EQ(84, captures[4]);

  const uc16 failing_chars[] = {'b', 'a', 'r', 'f', 'o', 'o'};
  Handle<String> failing =
      factory->NewStringFromTwoByte(Vector<const uc16>(failing_chars, 6));

  CHECK(!IrregexpInterpreter::Match(isolate, bytecode, failing, captures, 0));
  CHECK_EQ(42, captures[0]);
}
1431
1432 #endif // V8_INTERPRETED_REGEXP
1433
1434
// Checks DispatchTableConstructor::AddInverse. For ten pseudo-random sets of
// sixteen ranges below kLimit, choice 0 must be set in the table exactly for
// the values not covered by any range. A final case checks the boundary
// around 0xFFFE/0xFFFF.
TEST(AddInverseToTable) {
  v8::internal::V8::Initialize(NULL);
  static const int kLimit = 1000;
  static const int kRangeCount = 16;
  for (int round = 0; round < 10; round++) {
    Zone zone(CcTest::i_isolate());
    ZoneList<CharacterRange>* ranges =
        new(&zone) ZoneList<CharacterRange>(kRangeCount, &zone);
    for (int r = 0; r < kRangeCount; r++) {
      int from = PseudoRandom(round + 87, r + 25) % kLimit;
      int to = from + (PseudoRandom(r + 87, round + 25) % (kLimit / 20));
      if (to > kLimit) to = kLimit;
      ranges->Add(CharacterRange(from, to), &zone);
    }
    DispatchTable table(&zone);
    DispatchTableConstructor cons(&table, false, &zone);
    cons.set_choice_index(0);
    cons.AddInverse(ranges);
    for (int value = 0; value < kLimit; value++) {
      // Linear scan that stops at the first range containing the value.
      bool covered = false;
      for (int r = 0; r < kRangeCount; r++) {
        if (ranges->at(r).Contains(value)) {
          covered = true;
          break;
        }
      }
      bool choice_absent = !table.Get(value)->Get(0);
      CHECK_EQ(covered, choice_absent);
    }
  }

  // Boundary case: a single range that ends one below 0xFFFF.
  Zone edge_zone(CcTest::i_isolate());
  ZoneList<CharacterRange>* edge_ranges =
      new(&edge_zone) ZoneList<CharacterRange>(1, &edge_zone);
  edge_ranges->Add(CharacterRange(0xFFF0, 0xFFFE), &edge_zone);
  DispatchTable edge_table(&edge_zone);
  DispatchTableConstructor edge_cons(&edge_table, false, &edge_zone);
  edge_cons.set_choice_index(0);
  edge_cons.AddInverse(edge_ranges);
  CHECK(!edge_table.Get(0xFFFE)->Get(0));
  CHECK(edge_table.Get(0xFFFF)->Get(0));
}
1472
1473
canonicalize(uc32 c)1474 static uc32 canonicalize(uc32 c) {
1475 unibrow::uchar canon[unibrow::Ecma262Canonicalize::kMaxWidth];
1476 int count = unibrow::Ecma262Canonicalize::Convert(c, '\0', canon, NULL);
1477 if (count == 0) {
1478 return c;
1479 } else {
1480 CHECK_EQ(1, count);
1481 return canon[0];
1482 }
1483 }
1484
1485
// Checks canonicalization of Latin letters and its agreement with ToUppercase:
//  - each ASCII lower/upper pair canonicalizes to the same value and
//    un-canonicalizes to {upper, lower};
//  - nothing at or above 128 canonicalizes into the range below 128;
//  - for every value below 2^16, canonicalize() equals the single-character
//    upper-case mapping, except that the character maps to itself when the
//    mapping is multi-character or would move a value >= 128 below 128.
TEST(LatinCanonicalize) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  const int kCaseDelta = 'A' - 'a';
  for (char small = 'a'; small <= 'z'; small++) {
    char capital = small + kCaseDelta;
    CHECK_EQ(canonicalize(small), canonicalize(capital));
    unibrow::uchar variants[unibrow::Ecma262UnCanonicalize::kMaxWidth];
    int variant_count = un_canonicalize.get(small, '\0', variants);
    CHECK_EQ(2, variant_count);
    CHECK_EQ(capital, variants[0]);
    CHECK_EQ(small, variants[1]);
  }

  for (uc32 c = 128; c < (1 << 21); c++) {
    CHECK_GE(canonicalize(c), 128);
  }

  unibrow::Mapping<unibrow::ToUppercase> to_upper;
  // Canonicalization is only defined for the Basic Multilingual Plane.
  for (uc32 c = 0; c < (1 << 16); c++) {
    unibrow::uchar upper[unibrow::ToUppercase::kMaxWidth];
    int length = to_upper.get(c, '\0', upper);
    if (length == 0) {
      // No mapping: treat the character as its own upper case.
      upper[0] = c;
      length = 1;
    }
    uc32 expected = upper[0];
    bool maps_to_itself = length > 1 || (c >= 128 && expected < 128);
    if (maps_to_itself) {
      expected = c;
    }
    CHECK_EQ(expected, canonicalize(c));
  }
}
1514
1515
CanonRangeEnd(uc32 c)1516 static uc32 CanonRangeEnd(uc32 c) {
1517 unibrow::uchar canon[unibrow::CanonicalizationRange::kMaxWidth];
1518 int count = unibrow::CanonicalizationRange::Convert(c, '\0', canon, NULL);
1519 if (count == 0) {
1520 return c;
1521 } else {
1522 CHECK_EQ(1, count);
1523 return canon[0];
1524 }
1525 }
1526
1527
// Walks the values 0..0xFFFF block by block, where each block ends at
// CanonRangeEnd(start). Within a block, every member's un-canonicalization
// must have the same length as the first member's, and each entry must equal
// the first member's entry plus the member's distance from the block start.
TEST(RangeCanonicalization) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  for (int start = 0; start <= 0xFFFF; ) {
    uc32 end = CanonRangeEnd(start);
    unsigned size = end - start + 1;
    if (size > 1) {
      unibrow::uchar head[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int head_length = un_canonicalize.get(start, '\0', head);
      for (unsigned delta = 1; delta < size; delta++) {
        unibrow::uchar member[unibrow::Ecma262UnCanonicalize::kMaxWidth];
        int member_length = un_canonicalize.get(start + delta, '\0', member);
        CHECK_EQ(head_length, member_length);
        for (int k = 0; k < member_length; k++) {
          int calc = head[k] + delta;
          int found = member[k];
          CHECK_EQ(calc, found);
        }
      }
    }
    // Continue with the value right after this block.
    start = start + size;
  }
}
1554
1555
// For every value below 2^16, un-canonicalizing any member of its result
// must give back the same list, with the same length and the same entries in
// the same order.
TEST(UncanonicalizeEquivalence) {
  unibrow::Mapping<unibrow::Ecma262UnCanonicalize> un_canonicalize;
  unibrow::uchar members[unibrow::Ecma262UnCanonicalize::kMaxWidth];
  for (int c = 0; c < (1 << 16); c++) {
    int member_count = un_canonicalize.get(c, '\0', members);
    for (int m = 0; m < member_count; m++) {
      unibrow::uchar again[unibrow::Ecma262UnCanonicalize::kMaxWidth];
      int again_count = un_canonicalize.get(members[m], '\0', again);
      CHECK_EQ(member_count, again_count);
      for (int k = 0; k < member_count; k++) {
        CHECK_EQ(static_cast<int>(members[k]), static_cast<int>(again[k]));
      }
    }
  }
}
1570
1571
TestRangeCaseIndependence(CharacterRange input,Vector<CharacterRange> expected)1572 static void TestRangeCaseIndependence(CharacterRange input,
1573 Vector<CharacterRange> expected) {
1574 Zone zone(CcTest::i_isolate());
1575 int count = expected.length();
1576 ZoneList<CharacterRange>* list =
1577 new(&zone) ZoneList<CharacterRange>(count, &zone);
1578 input.AddCaseEquivalents(list, false, &zone);
1579 CHECK_EQ(count, list->length());
1580 for (int i = 0; i < list->length(); i++) {
1581 CHECK_EQ(expected[i].from(), list->at(i).from());
1582 CHECK_EQ(expected[i].to(), list->at(i).to());
1583 }
1584 }
1585
1586
TestSimpleRangeCaseIndependence(CharacterRange input,CharacterRange expected)1587 static void TestSimpleRangeCaseIndependence(CharacterRange input,
1588 CharacterRange expected) {
1589 EmbeddedVector<CharacterRange, 1> vector;
1590 vector[0] = expected;
1591 TestRangeCaseIndependence(input, vector);
1592 }
1593
1594
// Checks which single range AddCaseEquivalents adds for a number of ASCII
// letter ranges, including ranges that extend one position beyond the
// alphabet and a range that spans both cases.
TEST(CharacterRangeCaseIndependence) {
  v8::internal::V8::Initialize(NULL);
  TestSimpleRangeCaseIndependence(CharacterRange::Singleton('a'),
                                  CharacterRange::Singleton('A'));
  TestSimpleRangeCaseIndependence(CharacterRange::Singleton('z'),
                                  CharacterRange::Singleton('Z'));

  // Each row is {input from, input to, expected from, expected to}.
  struct RangeCase {
    int input_from;
    int input_to;
    int expected_from;
    int expected_to;
  };
  const RangeCase cases[] = {
    {'a', 'z', 'A', 'Z'},
    {'c', 'f', 'C', 'F'},
    {'a', 'b', 'A', 'B'},
    {'y', 'z', 'Y', 'Z'},
    {'a' - 1, 'z' + 1, 'A', 'Z'},
    {'A', 'Z', 'a', 'z'},
    {'C', 'F', 'c', 'f'},
    {'A' - 1, 'Z' + 1, 'a', 'z'},
    // Here we need to add [l-z] to complete the case independence of
    // [A-Za-z] but we expect [a-z] to be added since we always add a
    // whole block at a time.
    {'A', 'k', 'a', 'z'}
  };
  const int case_count = static_cast<int>(sizeof(cases) / sizeof(cases[0]));
  for (int i = 0; i < case_count; i++) {
    const RangeCase& current = cases[i];
    TestSimpleRangeCaseIndependence(
        CharacterRange(current.input_from, current.input_to),
        CharacterRange(current.expected_from, current.expected_to));
  }
}
1623
1624
InClass(uc16 c,ZoneList<CharacterRange> * ranges)1625 static bool InClass(uc16 c, ZoneList<CharacterRange>* ranges) {
1626 if (ranges == NULL)
1627 return false;
1628 for (int i = 0; i < ranges->length(); i++) {
1629 CharacterRange range = ranges->at(i);
1630 if (range.from() <= c && c <= range.to())
1631 return true;
1632 }
1633 return false;
1634 }
1635
1636
// Splits the "everything" class against the word-character bounds and
// checks, for every value below 2^16, that it lands in |included| exactly
// when it lies in one of the half-open overlay intervals, and in |excluded|
// otherwise.
TEST(CharClassDifference) {
  v8::internal::V8::Initialize(NULL);
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange>* base =
      new(&zone) ZoneList<CharacterRange>(1, &zone);
  base->Add(CharacterRange::Everything(), &zone);
  Vector<const int> overlay = CharacterRange::GetWordBounds();
  ZoneList<CharacterRange>* included = NULL;
  ZoneList<CharacterRange>* excluded = NULL;
  CharacterRange::Split(base, overlay, &included, &excluded, &zone);
  for (int c = 0; c < (1 << 16); c++) {
    if (!InClass(c, base)) {
      // Values outside the base class must appear in neither output.
      CHECK(!InClass(c, included));
      CHECK(!InClass(c, excluded));
      continue;
    }
    // The overlay is read as consecutive [from, to) pairs.
    bool in_overlay = false;
    for (int pair = 0; pair < overlay.length(); pair += 2) {
      if (overlay[pair] <= c && c < overlay[pair + 1]) {
        in_overlay = true;
        break;
      }
    }
    CHECK_EQ(in_overlay, InClass(c, included));
    CHECK_EQ(!in_overlay, InClass(c, excluded));
  }
}
1663
1664
// Checks CharacterSet::Canonicalize on four inputs: already sorted ranges,
// unsorted ranges, unsorted ranges mixed with singletons, and three ranges
// that together cover 10..30 and must collapse into a single range.
//
// The expectations use CHECK_EQ, like the rest of this file.
// NOTE(review): ASSERT_EQ is understood to be debug-only in V8, which would
// leave a release build of this test checking nothing -- confirm in checks.h.
TEST(CanonicalizeCharacterSets) {
  v8::internal::V8::Initialize(NULL);
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange>* list =
      new(&zone) ZoneList<CharacterRange>(4, &zone);
  CharacterSet set(list);

  // Already sorted and disjoint: must be left unchanged.
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(30, 40), &zone);
  list->Add(CharacterRange(50, 60), &zone);
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Same ranges out of order: must come back sorted.
  list->Rewind(0);
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(50, 60), &zone);
  list->Add(CharacterRange(30, 40), &zone);
  set.Canonicalize();
  CHECK_EQ(3, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(20, list->at(0).to());
  CHECK_EQ(30, list->at(1).from());
  CHECK_EQ(40, list->at(1).to());
  CHECK_EQ(50, list->at(2).from());
  CHECK_EQ(60, list->at(2).to());

  // Unsorted mix of ranges and singletons, none adjacent: sorted, not merged.
  list->Rewind(0);
  list->Add(CharacterRange(30, 40), &zone);
  list->Add(CharacterRange(10, 20), &zone);
  list->Add(CharacterRange(25, 25), &zone);
  list->Add(CharacterRange(100, 100), &zone);
  list->Add(CharacterRange(1, 1), &zone);
  set.Canonicalize();
  CHECK_EQ(5, list->length());
  CHECK_EQ(1, list->at(0).from());
  CHECK_EQ(1, list->at(0).to());
  CHECK_EQ(10, list->at(1).from());
  CHECK_EQ(20, list->at(1).to());
  CHECK_EQ(25, list->at(2).from());
  CHECK_EQ(25, list->at(2).to());
  CHECK_EQ(30, list->at(3).from());
  CHECK_EQ(40, list->at(3).to());
  CHECK_EQ(100, list->at(4).from());
  CHECK_EQ(100, list->at(4).to());

  // Adjacent ranges (10-19, 20, 21-30): must merge into one range 10-30.
  list->Rewind(0);
  list->Add(CharacterRange(10, 19), &zone);
  list->Add(CharacterRange(21, 30), &zone);
  list->Add(CharacterRange(20, 20), &zone);
  set.Canonicalize();
  CHECK_EQ(1, list->length());
  CHECK_EQ(10, list->at(0).from());
  CHECK_EQ(30, list->at(0).to());
}
1725
1726
// Builds two range lists, l1 ("X") and l2 ("Y"), whose entries overlap in
// many different ways, and verifies that both lists are canonical.
//
// The canonical-form checks use CHECK, like the rest of this file.
// NOTE(review): ASSERT is understood to be debug-only in V8, which would
// leave a release build of this test checking nothing -- confirm in checks.h.
TEST(CharacterRangeMerge) {
  v8::internal::V8::Initialize(NULL);
  Zone zone(CcTest::i_isolate());
  ZoneList<CharacterRange> l1(4, &zone);
  ZoneList<CharacterRange> l2(4, &zone);
  // Create all combinations of intersections of ranges, both singletons and
  // longer.

  int offset = 0;

  // The five kinds of singleton intersections. X is the singleton at
  // offset + 2; Y is the singleton at offset + i for i = 0..4:
  //   i = 0 - outside before
  //   i = 1 - outside touching start
  //   i = 2 - overlap
  //   i = 3 - outside touching end
  //   i = 4 - outside after

  for (int i = 0; i < 5; i++) {
    l1.Add(CharacterRange::Singleton(offset + 2), &zone);
    l2.Add(CharacterRange::Singleton(offset + i), &zone);
    offset += 6;
  }

  // The seven kinds of singleton/non-singleton intersections. X is the range
  // [offset + 2, offset + 4]; Y is the singleton at offset + i for i = 0..6:
  //   i = 0 - outside before
  //   i = 1 - outside touching start
  //   i = 2 - inside touching start
  //   i = 3 - entirely inside
  //   i = 4 - inside touching end
  //   i = 5 - outside touching end
  //   i = 6 - disjoint after

  for (int i = 0; i < 7; i++) {
    l1.Add(CharacterRange::Range(offset + 2, offset + 4), &zone);
    l2.Add(CharacterRange::Singleton(offset + i), &zone);
    offset += 8;
  }

  // Non-singleton intersections. X is the range [offset + 6, offset + 15]
  // (10 values, bounds inclusive). In the loop, Y is the 4-wide range
  // [offset + 2 * i, offset + 2 * i + 3] for i = 0..8, sliding from entirely
  // before X (i = 0), across its start and interior, to just past its end
  // (i = 8, outside touching end). Two further cases follow the loop:
  //   - Y identical to X
  //   - Y containing X entirely

  for (int i = 0; i < 9; i++) {
    l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
    l2.Add(CharacterRange::Range(offset + 2 * i, offset + 2 * i + 3), &zone);
    offset += 22;
  }
  // Identical ranges.
  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  l2.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  offset += 22;
  // Y contains X entirely.
  l1.Add(CharacterRange::Range(offset + 6, offset + 15), &zone);
  l2.Add(CharacterRange::Range(offset + 4, offset + 17), &zone);
  offset += 22;

  // Multi-range overlap: two long X ranges, [offset, offset + 21] and
  // [offset + 31, offset + 52], against six repetitions of a 4-wide Y range
  // followed by a Y singleton, one repetition every 9 positions.

  l1.Add(CharacterRange::Range(offset, offset + 21), &zone);
  l1.Add(CharacterRange::Range(offset + 31, offset + 52), &zone);
  for (int i = 0; i < 6; i++) {
    l2.Add(CharacterRange::Range(offset + 2, offset + 5), &zone);
    l2.Add(CharacterRange::Singleton(offset + 8), &zone);
    offset += 9;
  }

  CHECK(CharacterRange::IsCanonical(&l1));
  CHECK(CharacterRange::IsCanonical(&l2));
}
1813
1814
// Runs the file's Execute helper on a word-boundary pattern. The test makes
// no assertions of its own.
TEST(Graph) {
  V8::Initialize(NULL);
  const char* word_pattern = "\\b\\w+\\b";
  Execute(word_pattern, false, true, true);
}
1819