1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 // A simple interpreter for the Irregexp byte code.
6
7
8 #include "src/v8.h"
9 #include "src/unicode.h"
10 #include "src/utils.h"
11 #include "src/ast.h"
12 #include "src/bytecodes-irregexp.h"
13 #include "src/interpreter-irregexp.h"
14 #include "src/jsregexp.h"
15 #include "src/regexp-macro-assembler.h"
16
17 namespace v8 {
18 namespace internal {
19
20
21 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
22
BackRefMatchesNoCase(Canonicalize * interp_canonicalize,int from,int current,int len,Vector<const uc16> subject)23 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
24 int from,
25 int current,
26 int len,
27 Vector<const uc16> subject) {
28 for (int i = 0; i < len; i++) {
29 unibrow::uchar old_char = subject[from++];
30 unibrow::uchar new_char = subject[current++];
31 if (old_char == new_char) continue;
32 unibrow::uchar old_string[1] = { old_char };
33 unibrow::uchar new_string[1] = { new_char };
34 interp_canonicalize->get(old_char, '\0', old_string);
35 interp_canonicalize->get(new_char, '\0', new_string);
36 if (old_string[0] != new_string[0]) {
37 return false;
38 }
39 }
40 return true;
41 }
42
43
BackRefMatchesNoCase(Canonicalize * interp_canonicalize,int from,int current,int len,Vector<const uint8_t> subject)44 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
45 int from,
46 int current,
47 int len,
48 Vector<const uint8_t> subject) {
49 for (int i = 0; i < len; i++) {
50 unsigned int old_char = subject[from++];
51 unsigned int new_char = subject[current++];
52 if (old_char == new_char) continue;
53 // Convert both characters to lower case.
54 old_char |= 0x20;
55 new_char |= 0x20;
56 if (old_char != new_char) return false;
57 // Not letters in the ASCII range and Latin-1 range.
58 if (!(old_char - 'a' <= 'z' - 'a') &&
59 !(old_char - 224 <= 254 - 224 && old_char != 247)) {
60 return false;
61 }
62 }
63 return true;
64 }
65
66
67 #ifdef DEBUG
TraceInterpreter(const byte * code_base,const byte * pc,int stack_depth,int current_position,uint32_t current_char,int bytecode_length,const char * bytecode_name)68 static void TraceInterpreter(const byte* code_base,
69 const byte* pc,
70 int stack_depth,
71 int current_position,
72 uint32_t current_char,
73 int bytecode_length,
74 const char* bytecode_name) {
75 if (FLAG_trace_regexp_bytecodes) {
76 bool printable = (current_char < 127 && current_char >= 32);
77 const char* format =
78 printable ?
79 "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
80 "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
81 PrintF(format,
82 pc - code_base,
83 stack_depth,
84 current_position,
85 current_char,
86 printable ? current_char : '.',
87 bytecode_name);
88 for (int i = 0; i < bytecode_length; i++) {
89 printf(", %02x", pc[i]);
90 }
91 printf(" ");
92 for (int i = 1; i < bytecode_length; i++) {
93 unsigned char b = pc[i];
94 if (b < 127 && b >= 32) {
95 printf("%c", b);
96 } else {
97 printf(".");
98 }
99 }
100 printf("\n");
101 }
102 }
103
104
105 #define BYTECODE(name) \
106 case BC_##name: \
107 TraceInterpreter(code_base, \
108 pc, \
109 static_cast<int>(backtrack_sp - backtrack_stack_base), \
110 current, \
111 current_char, \
112 BC_##name##_LENGTH, \
113 #name);
114 #else
115 #define BYTECODE(name) \
116 case BC_##name:
117 #endif
118
119
Load32Aligned(const byte * pc)120 static int32_t Load32Aligned(const byte* pc) {
121 ASSERT((reinterpret_cast<intptr_t>(pc) & 3) == 0);
122 return *reinterpret_cast<const int32_t *>(pc);
123 }
124
125
Load16Aligned(const byte * pc)126 static int32_t Load16Aligned(const byte* pc) {
127 ASSERT((reinterpret_cast<intptr_t>(pc) & 1) == 0);
128 return *reinterpret_cast<const uint16_t *>(pc);
129 }
130
131
132 // A simple abstraction over the backtracking stack used by the interpreter.
133 // This backtracking stack does not grow automatically, but it ensures that the
134 // the memory held by the stack is released or remembered in a cache if the
135 // matching terminates.
136 class BacktrackStack {
137 public:
BacktrackStack()138 explicit BacktrackStack() {
139 data_ = NewArray<int>(kBacktrackStackSize);
140 }
141
~BacktrackStack()142 ~BacktrackStack() {
143 DeleteArray(data_);
144 }
145
data() const146 int* data() const { return data_; }
147
max_size() const148 int max_size() const { return kBacktrackStackSize; }
149
150 private:
151 static const int kBacktrackStackSize = 10000;
152
153 int* data_;
154
155 DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
156 };
157
158
159 template <typename Char>
RawMatch(Isolate * isolate,const byte * code_base,Vector<const Char> subject,int * registers,int current,uint32_t current_char)160 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
161 const byte* code_base,
162 Vector<const Char> subject,
163 int* registers,
164 int current,
165 uint32_t current_char) {
166 const byte* pc = code_base;
167 // BacktrackStack ensures that the memory allocated for the backtracking stack
168 // is returned to the system or cached if there is no stack being cached at
169 // the moment.
170 BacktrackStack backtrack_stack;
171 int* backtrack_stack_base = backtrack_stack.data();
172 int* backtrack_sp = backtrack_stack_base;
173 int backtrack_stack_space = backtrack_stack.max_size();
174 #ifdef DEBUG
175 if (FLAG_trace_regexp_bytecodes) {
176 PrintF("\n\nStart bytecode interpreter\n\n");
177 }
178 #endif
179 while (true) {
180 int32_t insn = Load32Aligned(pc);
181 switch (insn & BYTECODE_MASK) {
182 BYTECODE(BREAK)
183 UNREACHABLE();
184 return RegExpImpl::RE_FAILURE;
185 BYTECODE(PUSH_CP)
186 if (--backtrack_stack_space < 0) {
187 return RegExpImpl::RE_EXCEPTION;
188 }
189 *backtrack_sp++ = current;
190 pc += BC_PUSH_CP_LENGTH;
191 break;
192 BYTECODE(PUSH_BT)
193 if (--backtrack_stack_space < 0) {
194 return RegExpImpl::RE_EXCEPTION;
195 }
196 *backtrack_sp++ = Load32Aligned(pc + 4);
197 pc += BC_PUSH_BT_LENGTH;
198 break;
199 BYTECODE(PUSH_REGISTER)
200 if (--backtrack_stack_space < 0) {
201 return RegExpImpl::RE_EXCEPTION;
202 }
203 *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
204 pc += BC_PUSH_REGISTER_LENGTH;
205 break;
206 BYTECODE(SET_REGISTER)
207 registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
208 pc += BC_SET_REGISTER_LENGTH;
209 break;
210 BYTECODE(ADVANCE_REGISTER)
211 registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
212 pc += BC_ADVANCE_REGISTER_LENGTH;
213 break;
214 BYTECODE(SET_REGISTER_TO_CP)
215 registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
216 pc += BC_SET_REGISTER_TO_CP_LENGTH;
217 break;
218 BYTECODE(SET_CP_TO_REGISTER)
219 current = registers[insn >> BYTECODE_SHIFT];
220 pc += BC_SET_CP_TO_REGISTER_LENGTH;
221 break;
222 BYTECODE(SET_REGISTER_TO_SP)
223 registers[insn >> BYTECODE_SHIFT] =
224 static_cast<int>(backtrack_sp - backtrack_stack_base);
225 pc += BC_SET_REGISTER_TO_SP_LENGTH;
226 break;
227 BYTECODE(SET_SP_TO_REGISTER)
228 backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
229 backtrack_stack_space = backtrack_stack.max_size() -
230 static_cast<int>(backtrack_sp - backtrack_stack_base);
231 pc += BC_SET_SP_TO_REGISTER_LENGTH;
232 break;
233 BYTECODE(POP_CP)
234 backtrack_stack_space++;
235 --backtrack_sp;
236 current = *backtrack_sp;
237 pc += BC_POP_CP_LENGTH;
238 break;
239 BYTECODE(POP_BT)
240 backtrack_stack_space++;
241 --backtrack_sp;
242 pc = code_base + *backtrack_sp;
243 break;
244 BYTECODE(POP_REGISTER)
245 backtrack_stack_space++;
246 --backtrack_sp;
247 registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
248 pc += BC_POP_REGISTER_LENGTH;
249 break;
250 BYTECODE(FAIL)
251 return RegExpImpl::RE_FAILURE;
252 BYTECODE(SUCCEED)
253 return RegExpImpl::RE_SUCCESS;
254 BYTECODE(ADVANCE_CP)
255 current += insn >> BYTECODE_SHIFT;
256 pc += BC_ADVANCE_CP_LENGTH;
257 break;
258 BYTECODE(GOTO)
259 pc = code_base + Load32Aligned(pc + 4);
260 break;
261 BYTECODE(ADVANCE_CP_AND_GOTO)
262 current += insn >> BYTECODE_SHIFT;
263 pc = code_base + Load32Aligned(pc + 4);
264 break;
265 BYTECODE(CHECK_GREEDY)
266 if (current == backtrack_sp[-1]) {
267 backtrack_sp--;
268 backtrack_stack_space++;
269 pc = code_base + Load32Aligned(pc + 4);
270 } else {
271 pc += BC_CHECK_GREEDY_LENGTH;
272 }
273 break;
274 BYTECODE(LOAD_CURRENT_CHAR) {
275 int pos = current + (insn >> BYTECODE_SHIFT);
276 if (pos >= subject.length()) {
277 pc = code_base + Load32Aligned(pc + 4);
278 } else {
279 current_char = subject[pos];
280 pc += BC_LOAD_CURRENT_CHAR_LENGTH;
281 }
282 break;
283 }
284 BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
285 int pos = current + (insn >> BYTECODE_SHIFT);
286 current_char = subject[pos];
287 pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
288 break;
289 }
290 BYTECODE(LOAD_2_CURRENT_CHARS) {
291 int pos = current + (insn >> BYTECODE_SHIFT);
292 if (pos + 2 > subject.length()) {
293 pc = code_base + Load32Aligned(pc + 4);
294 } else {
295 Char next = subject[pos + 1];
296 current_char =
297 (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
298 pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
299 }
300 break;
301 }
302 BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
303 int pos = current + (insn >> BYTECODE_SHIFT);
304 Char next = subject[pos + 1];
305 current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
306 pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
307 break;
308 }
309 BYTECODE(LOAD_4_CURRENT_CHARS) {
310 ASSERT(sizeof(Char) == 1);
311 int pos = current + (insn >> BYTECODE_SHIFT);
312 if (pos + 4 > subject.length()) {
313 pc = code_base + Load32Aligned(pc + 4);
314 } else {
315 Char next1 = subject[pos + 1];
316 Char next2 = subject[pos + 2];
317 Char next3 = subject[pos + 3];
318 current_char = (subject[pos] |
319 (next1 << 8) |
320 (next2 << 16) |
321 (next3 << 24));
322 pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
323 }
324 break;
325 }
326 BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
327 ASSERT(sizeof(Char) == 1);
328 int pos = current + (insn >> BYTECODE_SHIFT);
329 Char next1 = subject[pos + 1];
330 Char next2 = subject[pos + 2];
331 Char next3 = subject[pos + 3];
332 current_char = (subject[pos] |
333 (next1 << 8) |
334 (next2 << 16) |
335 (next3 << 24));
336 pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
337 break;
338 }
339 BYTECODE(CHECK_4_CHARS) {
340 uint32_t c = Load32Aligned(pc + 4);
341 if (c == current_char) {
342 pc = code_base + Load32Aligned(pc + 8);
343 } else {
344 pc += BC_CHECK_4_CHARS_LENGTH;
345 }
346 break;
347 }
348 BYTECODE(CHECK_CHAR) {
349 uint32_t c = (insn >> BYTECODE_SHIFT);
350 if (c == current_char) {
351 pc = code_base + Load32Aligned(pc + 4);
352 } else {
353 pc += BC_CHECK_CHAR_LENGTH;
354 }
355 break;
356 }
357 BYTECODE(CHECK_NOT_4_CHARS) {
358 uint32_t c = Load32Aligned(pc + 4);
359 if (c != current_char) {
360 pc = code_base + Load32Aligned(pc + 8);
361 } else {
362 pc += BC_CHECK_NOT_4_CHARS_LENGTH;
363 }
364 break;
365 }
366 BYTECODE(CHECK_NOT_CHAR) {
367 uint32_t c = (insn >> BYTECODE_SHIFT);
368 if (c != current_char) {
369 pc = code_base + Load32Aligned(pc + 4);
370 } else {
371 pc += BC_CHECK_NOT_CHAR_LENGTH;
372 }
373 break;
374 }
375 BYTECODE(AND_CHECK_4_CHARS) {
376 uint32_t c = Load32Aligned(pc + 4);
377 if (c == (current_char & Load32Aligned(pc + 8))) {
378 pc = code_base + Load32Aligned(pc + 12);
379 } else {
380 pc += BC_AND_CHECK_4_CHARS_LENGTH;
381 }
382 break;
383 }
384 BYTECODE(AND_CHECK_CHAR) {
385 uint32_t c = (insn >> BYTECODE_SHIFT);
386 if (c == (current_char & Load32Aligned(pc + 4))) {
387 pc = code_base + Load32Aligned(pc + 8);
388 } else {
389 pc += BC_AND_CHECK_CHAR_LENGTH;
390 }
391 break;
392 }
393 BYTECODE(AND_CHECK_NOT_4_CHARS) {
394 uint32_t c = Load32Aligned(pc + 4);
395 if (c != (current_char & Load32Aligned(pc + 8))) {
396 pc = code_base + Load32Aligned(pc + 12);
397 } else {
398 pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
399 }
400 break;
401 }
402 BYTECODE(AND_CHECK_NOT_CHAR) {
403 uint32_t c = (insn >> BYTECODE_SHIFT);
404 if (c != (current_char & Load32Aligned(pc + 4))) {
405 pc = code_base + Load32Aligned(pc + 8);
406 } else {
407 pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
408 }
409 break;
410 }
411 BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
412 uint32_t c = (insn >> BYTECODE_SHIFT);
413 uint32_t minus = Load16Aligned(pc + 4);
414 uint32_t mask = Load16Aligned(pc + 6);
415 if (c != ((current_char - minus) & mask)) {
416 pc = code_base + Load32Aligned(pc + 8);
417 } else {
418 pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
419 }
420 break;
421 }
422 BYTECODE(CHECK_CHAR_IN_RANGE) {
423 uint32_t from = Load16Aligned(pc + 4);
424 uint32_t to = Load16Aligned(pc + 6);
425 if (from <= current_char && current_char <= to) {
426 pc = code_base + Load32Aligned(pc + 8);
427 } else {
428 pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
429 }
430 break;
431 }
432 BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
433 uint32_t from = Load16Aligned(pc + 4);
434 uint32_t to = Load16Aligned(pc + 6);
435 if (from > current_char || current_char > to) {
436 pc = code_base + Load32Aligned(pc + 8);
437 } else {
438 pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
439 }
440 break;
441 }
442 BYTECODE(CHECK_BIT_IN_TABLE) {
443 int mask = RegExpMacroAssembler::kTableMask;
444 byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
445 int bit = (current_char & (kBitsPerByte - 1));
446 if ((b & (1 << bit)) != 0) {
447 pc = code_base + Load32Aligned(pc + 4);
448 } else {
449 pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
450 }
451 break;
452 }
453 BYTECODE(CHECK_LT) {
454 uint32_t limit = (insn >> BYTECODE_SHIFT);
455 if (current_char < limit) {
456 pc = code_base + Load32Aligned(pc + 4);
457 } else {
458 pc += BC_CHECK_LT_LENGTH;
459 }
460 break;
461 }
462 BYTECODE(CHECK_GT) {
463 uint32_t limit = (insn >> BYTECODE_SHIFT);
464 if (current_char > limit) {
465 pc = code_base + Load32Aligned(pc + 4);
466 } else {
467 pc += BC_CHECK_GT_LENGTH;
468 }
469 break;
470 }
471 BYTECODE(CHECK_REGISTER_LT)
472 if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
473 pc = code_base + Load32Aligned(pc + 8);
474 } else {
475 pc += BC_CHECK_REGISTER_LT_LENGTH;
476 }
477 break;
478 BYTECODE(CHECK_REGISTER_GE)
479 if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
480 pc = code_base + Load32Aligned(pc + 8);
481 } else {
482 pc += BC_CHECK_REGISTER_GE_LENGTH;
483 }
484 break;
485 BYTECODE(CHECK_REGISTER_EQ_POS)
486 if (registers[insn >> BYTECODE_SHIFT] == current) {
487 pc = code_base + Load32Aligned(pc + 4);
488 } else {
489 pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
490 }
491 break;
492 BYTECODE(CHECK_NOT_REGS_EQUAL)
493 if (registers[insn >> BYTECODE_SHIFT] ==
494 registers[Load32Aligned(pc + 4)]) {
495 pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
496 } else {
497 pc = code_base + Load32Aligned(pc + 8);
498 }
499 break;
500 BYTECODE(CHECK_NOT_BACK_REF) {
501 int from = registers[insn >> BYTECODE_SHIFT];
502 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
503 if (from < 0 || len <= 0) {
504 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
505 break;
506 }
507 if (current + len > subject.length()) {
508 pc = code_base + Load32Aligned(pc + 4);
509 break;
510 } else {
511 int i;
512 for (i = 0; i < len; i++) {
513 if (subject[from + i] != subject[current + i]) {
514 pc = code_base + Load32Aligned(pc + 4);
515 break;
516 }
517 }
518 if (i < len) break;
519 current += len;
520 }
521 pc += BC_CHECK_NOT_BACK_REF_LENGTH;
522 break;
523 }
524 BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
525 int from = registers[insn >> BYTECODE_SHIFT];
526 int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
527 if (from < 0 || len <= 0) {
528 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
529 break;
530 }
531 if (current + len > subject.length()) {
532 pc = code_base + Load32Aligned(pc + 4);
533 break;
534 } else {
535 if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
536 from, current, len, subject)) {
537 current += len;
538 pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
539 } else {
540 pc = code_base + Load32Aligned(pc + 4);
541 }
542 }
543 break;
544 }
545 BYTECODE(CHECK_AT_START)
546 if (current == 0) {
547 pc = code_base + Load32Aligned(pc + 4);
548 } else {
549 pc += BC_CHECK_AT_START_LENGTH;
550 }
551 break;
552 BYTECODE(CHECK_NOT_AT_START)
553 if (current == 0) {
554 pc += BC_CHECK_NOT_AT_START_LENGTH;
555 } else {
556 pc = code_base + Load32Aligned(pc + 4);
557 }
558 break;
559 BYTECODE(SET_CURRENT_POSITION_FROM_END) {
560 int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
561 if (subject.length() - current > by) {
562 current = subject.length() - by;
563 current_char = subject[current - 1];
564 }
565 pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
566 break;
567 }
568 default:
569 UNREACHABLE();
570 break;
571 }
572 }
573 }
574
575
Match(Isolate * isolate,Handle<ByteArray> code_array,Handle<String> subject,int * registers,int start_position)576 RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
577 Isolate* isolate,
578 Handle<ByteArray> code_array,
579 Handle<String> subject,
580 int* registers,
581 int start_position) {
582 ASSERT(subject->IsFlat());
583
584 DisallowHeapAllocation no_gc;
585 const byte* code_base = code_array->GetDataStartAddress();
586 uc16 previous_char = '\n';
587 String::FlatContent subject_content = subject->GetFlatContent();
588 if (subject_content.IsAscii()) {
589 Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
590 if (start_position != 0) previous_char = subject_vector[start_position - 1];
591 return RawMatch(isolate,
592 code_base,
593 subject_vector,
594 registers,
595 start_position,
596 previous_char);
597 } else {
598 ASSERT(subject_content.IsTwoByte());
599 Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
600 if (start_position != 0) previous_char = subject_vector[start_position - 1];
601 return RawMatch(isolate,
602 code_base,
603 subject_vector,
604 registers,
605 start_position,
606 previous_char);
607 }
608 }
609
610 } } // namespace v8::internal
611