• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // A simple interpreter for the Irregexp byte code.
6 
7 
8 #include "src/v8.h"
9 #include "src/unicode.h"
10 #include "src/utils.h"
11 #include "src/ast.h"
12 #include "src/bytecodes-irregexp.h"
13 #include "src/interpreter-irregexp.h"
14 #include "src/jsregexp.h"
15 #include "src/regexp-macro-assembler.h"
16 
17 namespace v8 {
18 namespace internal {
19 
20 
21 typedef unibrow::Mapping<unibrow::Ecma262Canonicalize> Canonicalize;
22 
BackRefMatchesNoCase(Canonicalize * interp_canonicalize,int from,int current,int len,Vector<const uc16> subject)23 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
24                                  int from,
25                                  int current,
26                                  int len,
27                                  Vector<const uc16> subject) {
28   for (int i = 0; i < len; i++) {
29     unibrow::uchar old_char = subject[from++];
30     unibrow::uchar new_char = subject[current++];
31     if (old_char == new_char) continue;
32     unibrow::uchar old_string[1] = { old_char };
33     unibrow::uchar new_string[1] = { new_char };
34     interp_canonicalize->get(old_char, '\0', old_string);
35     interp_canonicalize->get(new_char, '\0', new_string);
36     if (old_string[0] != new_string[0]) {
37       return false;
38     }
39   }
40   return true;
41 }
42 
43 
BackRefMatchesNoCase(Canonicalize * interp_canonicalize,int from,int current,int len,Vector<const uint8_t> subject)44 static bool BackRefMatchesNoCase(Canonicalize* interp_canonicalize,
45                                  int from,
46                                  int current,
47                                  int len,
48                                  Vector<const uint8_t> subject) {
49   for (int i = 0; i < len; i++) {
50     unsigned int old_char = subject[from++];
51     unsigned int new_char = subject[current++];
52     if (old_char == new_char) continue;
53     // Convert both characters to lower case.
54     old_char |= 0x20;
55     new_char |= 0x20;
56     if (old_char != new_char) return false;
57     // Not letters in the ASCII range and Latin-1 range.
58     if (!(old_char - 'a' <= 'z' - 'a') &&
59         !(old_char - 224 <= 254 - 224 && old_char != 247)) {
60       return false;
61     }
62   }
63   return true;
64 }
65 
66 
67 #ifdef DEBUG
TraceInterpreter(const byte * code_base,const byte * pc,int stack_depth,int current_position,uint32_t current_char,int bytecode_length,const char * bytecode_name)68 static void TraceInterpreter(const byte* code_base,
69                              const byte* pc,
70                              int stack_depth,
71                              int current_position,
72                              uint32_t current_char,
73                              int bytecode_length,
74                              const char* bytecode_name) {
75   if (FLAG_trace_regexp_bytecodes) {
76     bool printable = (current_char < 127 && current_char >= 32);
77     const char* format =
78         printable ?
79         "pc = %02x, sp = %d, curpos = %d, curchar = %08x (%c), bc = %s" :
80         "pc = %02x, sp = %d, curpos = %d, curchar = %08x .%c., bc = %s";
81     PrintF(format,
82            pc - code_base,
83            stack_depth,
84            current_position,
85            current_char,
86            printable ? current_char : '.',
87            bytecode_name);
88     for (int i = 0; i < bytecode_length; i++) {
89       printf(", %02x", pc[i]);
90     }
91     printf(" ");
92     for (int i = 1; i < bytecode_length; i++) {
93       unsigned char b = pc[i];
94       if (b < 127 && b >= 32) {
95         printf("%c", b);
96       } else {
97         printf(".");
98       }
99     }
100     printf("\n");
101   }
102 }
103 
104 
105 #define BYTECODE(name)                                                      \
106   case BC_##name:                                                           \
107     TraceInterpreter(code_base,                                             \
108                      pc,                                                    \
109                      static_cast<int>(backtrack_sp - backtrack_stack_base), \
110                      current,                                               \
111                      current_char,                                          \
112                      BC_##name##_LENGTH,                                    \
113                      #name);
114 #else
115 #define BYTECODE(name)                                                      \
116   case BC_##name:
117 #endif
118 
119 
Load32Aligned(const byte * pc)120 static int32_t Load32Aligned(const byte* pc) {
121   ASSERT((reinterpret_cast<intptr_t>(pc) & 3) == 0);
122   return *reinterpret_cast<const int32_t *>(pc);
123 }
124 
125 
Load16Aligned(const byte * pc)126 static int32_t Load16Aligned(const byte* pc) {
127   ASSERT((reinterpret_cast<intptr_t>(pc) & 1) == 0);
128   return *reinterpret_cast<const uint16_t *>(pc);
129 }
130 
131 
132 // A simple abstraction over the backtracking stack used by the interpreter.
133 // This backtracking stack does not grow automatically, but it ensures that the
134 // the memory held by the stack is released or remembered in a cache if the
135 // matching terminates.
136 class BacktrackStack {
137  public:
BacktrackStack()138   explicit BacktrackStack() {
139     data_ = NewArray<int>(kBacktrackStackSize);
140   }
141 
~BacktrackStack()142   ~BacktrackStack() {
143     DeleteArray(data_);
144   }
145 
data() const146   int* data() const { return data_; }
147 
max_size() const148   int max_size() const { return kBacktrackStackSize; }
149 
150  private:
151   static const int kBacktrackStackSize = 10000;
152 
153   int* data_;
154 
155   DISALLOW_COPY_AND_ASSIGN(BacktrackStack);
156 };
157 
158 
159 template <typename Char>
RawMatch(Isolate * isolate,const byte * code_base,Vector<const Char> subject,int * registers,int current,uint32_t current_char)160 static RegExpImpl::IrregexpResult RawMatch(Isolate* isolate,
161                                            const byte* code_base,
162                                            Vector<const Char> subject,
163                                            int* registers,
164                                            int current,
165                                            uint32_t current_char) {
166   const byte* pc = code_base;
167   // BacktrackStack ensures that the memory allocated for the backtracking stack
168   // is returned to the system or cached if there is no stack being cached at
169   // the moment.
170   BacktrackStack backtrack_stack;
171   int* backtrack_stack_base = backtrack_stack.data();
172   int* backtrack_sp = backtrack_stack_base;
173   int backtrack_stack_space = backtrack_stack.max_size();
174 #ifdef DEBUG
175   if (FLAG_trace_regexp_bytecodes) {
176     PrintF("\n\nStart bytecode interpreter\n\n");
177   }
178 #endif
179   while (true) {
180     int32_t insn = Load32Aligned(pc);
181     switch (insn & BYTECODE_MASK) {
182       BYTECODE(BREAK)
183         UNREACHABLE();
184         return RegExpImpl::RE_FAILURE;
185       BYTECODE(PUSH_CP)
186         if (--backtrack_stack_space < 0) {
187           return RegExpImpl::RE_EXCEPTION;
188         }
189         *backtrack_sp++ = current;
190         pc += BC_PUSH_CP_LENGTH;
191         break;
192       BYTECODE(PUSH_BT)
193         if (--backtrack_stack_space < 0) {
194           return RegExpImpl::RE_EXCEPTION;
195         }
196         *backtrack_sp++ = Load32Aligned(pc + 4);
197         pc += BC_PUSH_BT_LENGTH;
198         break;
199       BYTECODE(PUSH_REGISTER)
200         if (--backtrack_stack_space < 0) {
201           return RegExpImpl::RE_EXCEPTION;
202         }
203         *backtrack_sp++ = registers[insn >> BYTECODE_SHIFT];
204         pc += BC_PUSH_REGISTER_LENGTH;
205         break;
206       BYTECODE(SET_REGISTER)
207         registers[insn >> BYTECODE_SHIFT] = Load32Aligned(pc + 4);
208         pc += BC_SET_REGISTER_LENGTH;
209         break;
210       BYTECODE(ADVANCE_REGISTER)
211         registers[insn >> BYTECODE_SHIFT] += Load32Aligned(pc + 4);
212         pc += BC_ADVANCE_REGISTER_LENGTH;
213         break;
214       BYTECODE(SET_REGISTER_TO_CP)
215         registers[insn >> BYTECODE_SHIFT] = current + Load32Aligned(pc + 4);
216         pc += BC_SET_REGISTER_TO_CP_LENGTH;
217         break;
218       BYTECODE(SET_CP_TO_REGISTER)
219         current = registers[insn >> BYTECODE_SHIFT];
220         pc += BC_SET_CP_TO_REGISTER_LENGTH;
221         break;
222       BYTECODE(SET_REGISTER_TO_SP)
223         registers[insn >> BYTECODE_SHIFT] =
224             static_cast<int>(backtrack_sp - backtrack_stack_base);
225         pc += BC_SET_REGISTER_TO_SP_LENGTH;
226         break;
227       BYTECODE(SET_SP_TO_REGISTER)
228         backtrack_sp = backtrack_stack_base + registers[insn >> BYTECODE_SHIFT];
229         backtrack_stack_space = backtrack_stack.max_size() -
230             static_cast<int>(backtrack_sp - backtrack_stack_base);
231         pc += BC_SET_SP_TO_REGISTER_LENGTH;
232         break;
233       BYTECODE(POP_CP)
234         backtrack_stack_space++;
235         --backtrack_sp;
236         current = *backtrack_sp;
237         pc += BC_POP_CP_LENGTH;
238         break;
239       BYTECODE(POP_BT)
240         backtrack_stack_space++;
241         --backtrack_sp;
242         pc = code_base + *backtrack_sp;
243         break;
244       BYTECODE(POP_REGISTER)
245         backtrack_stack_space++;
246         --backtrack_sp;
247         registers[insn >> BYTECODE_SHIFT] = *backtrack_sp;
248         pc += BC_POP_REGISTER_LENGTH;
249         break;
250       BYTECODE(FAIL)
251         return RegExpImpl::RE_FAILURE;
252       BYTECODE(SUCCEED)
253         return RegExpImpl::RE_SUCCESS;
254       BYTECODE(ADVANCE_CP)
255         current += insn >> BYTECODE_SHIFT;
256         pc += BC_ADVANCE_CP_LENGTH;
257         break;
258       BYTECODE(GOTO)
259         pc = code_base + Load32Aligned(pc + 4);
260         break;
261       BYTECODE(ADVANCE_CP_AND_GOTO)
262         current += insn >> BYTECODE_SHIFT;
263         pc = code_base + Load32Aligned(pc + 4);
264         break;
265       BYTECODE(CHECK_GREEDY)
266         if (current == backtrack_sp[-1]) {
267           backtrack_sp--;
268           backtrack_stack_space++;
269           pc = code_base + Load32Aligned(pc + 4);
270         } else {
271           pc += BC_CHECK_GREEDY_LENGTH;
272         }
273         break;
274       BYTECODE(LOAD_CURRENT_CHAR) {
275         int pos = current + (insn >> BYTECODE_SHIFT);
276         if (pos >= subject.length()) {
277           pc = code_base + Load32Aligned(pc + 4);
278         } else {
279           current_char = subject[pos];
280           pc += BC_LOAD_CURRENT_CHAR_LENGTH;
281         }
282         break;
283       }
284       BYTECODE(LOAD_CURRENT_CHAR_UNCHECKED) {
285         int pos = current + (insn >> BYTECODE_SHIFT);
286         current_char = subject[pos];
287         pc += BC_LOAD_CURRENT_CHAR_UNCHECKED_LENGTH;
288         break;
289       }
290       BYTECODE(LOAD_2_CURRENT_CHARS) {
291         int pos = current + (insn >> BYTECODE_SHIFT);
292         if (pos + 2 > subject.length()) {
293           pc = code_base + Load32Aligned(pc + 4);
294         } else {
295           Char next = subject[pos + 1];
296           current_char =
297               (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
298           pc += BC_LOAD_2_CURRENT_CHARS_LENGTH;
299         }
300         break;
301       }
302       BYTECODE(LOAD_2_CURRENT_CHARS_UNCHECKED) {
303         int pos = current + (insn >> BYTECODE_SHIFT);
304         Char next = subject[pos + 1];
305         current_char = (subject[pos] | (next << (kBitsPerByte * sizeof(Char))));
306         pc += BC_LOAD_2_CURRENT_CHARS_UNCHECKED_LENGTH;
307         break;
308       }
309       BYTECODE(LOAD_4_CURRENT_CHARS) {
310         ASSERT(sizeof(Char) == 1);
311         int pos = current + (insn >> BYTECODE_SHIFT);
312         if (pos + 4 > subject.length()) {
313           pc = code_base + Load32Aligned(pc + 4);
314         } else {
315           Char next1 = subject[pos + 1];
316           Char next2 = subject[pos + 2];
317           Char next3 = subject[pos + 3];
318           current_char = (subject[pos] |
319                           (next1 << 8) |
320                           (next2 << 16) |
321                           (next3 << 24));
322           pc += BC_LOAD_4_CURRENT_CHARS_LENGTH;
323         }
324         break;
325       }
326       BYTECODE(LOAD_4_CURRENT_CHARS_UNCHECKED) {
327         ASSERT(sizeof(Char) == 1);
328         int pos = current + (insn >> BYTECODE_SHIFT);
329         Char next1 = subject[pos + 1];
330         Char next2 = subject[pos + 2];
331         Char next3 = subject[pos + 3];
332         current_char = (subject[pos] |
333                         (next1 << 8) |
334                         (next2 << 16) |
335                         (next3 << 24));
336         pc += BC_LOAD_4_CURRENT_CHARS_UNCHECKED_LENGTH;
337         break;
338       }
339       BYTECODE(CHECK_4_CHARS) {
340         uint32_t c = Load32Aligned(pc + 4);
341         if (c == current_char) {
342           pc = code_base + Load32Aligned(pc + 8);
343         } else {
344           pc += BC_CHECK_4_CHARS_LENGTH;
345         }
346         break;
347       }
348       BYTECODE(CHECK_CHAR) {
349         uint32_t c = (insn >> BYTECODE_SHIFT);
350         if (c == current_char) {
351           pc = code_base + Load32Aligned(pc + 4);
352         } else {
353           pc += BC_CHECK_CHAR_LENGTH;
354         }
355         break;
356       }
357       BYTECODE(CHECK_NOT_4_CHARS) {
358         uint32_t c = Load32Aligned(pc + 4);
359         if (c != current_char) {
360           pc = code_base + Load32Aligned(pc + 8);
361         } else {
362           pc += BC_CHECK_NOT_4_CHARS_LENGTH;
363         }
364         break;
365       }
366       BYTECODE(CHECK_NOT_CHAR) {
367         uint32_t c = (insn >> BYTECODE_SHIFT);
368         if (c != current_char) {
369           pc = code_base + Load32Aligned(pc + 4);
370         } else {
371           pc += BC_CHECK_NOT_CHAR_LENGTH;
372         }
373         break;
374       }
375       BYTECODE(AND_CHECK_4_CHARS) {
376         uint32_t c = Load32Aligned(pc + 4);
377         if (c == (current_char & Load32Aligned(pc + 8))) {
378           pc = code_base + Load32Aligned(pc + 12);
379         } else {
380           pc += BC_AND_CHECK_4_CHARS_LENGTH;
381         }
382         break;
383       }
384       BYTECODE(AND_CHECK_CHAR) {
385         uint32_t c = (insn >> BYTECODE_SHIFT);
386         if (c == (current_char & Load32Aligned(pc + 4))) {
387           pc = code_base + Load32Aligned(pc + 8);
388         } else {
389           pc += BC_AND_CHECK_CHAR_LENGTH;
390         }
391         break;
392       }
393       BYTECODE(AND_CHECK_NOT_4_CHARS) {
394         uint32_t c = Load32Aligned(pc + 4);
395         if (c != (current_char & Load32Aligned(pc + 8))) {
396           pc = code_base + Load32Aligned(pc + 12);
397         } else {
398           pc += BC_AND_CHECK_NOT_4_CHARS_LENGTH;
399         }
400         break;
401       }
402       BYTECODE(AND_CHECK_NOT_CHAR) {
403         uint32_t c = (insn >> BYTECODE_SHIFT);
404         if (c != (current_char & Load32Aligned(pc + 4))) {
405           pc = code_base + Load32Aligned(pc + 8);
406         } else {
407           pc += BC_AND_CHECK_NOT_CHAR_LENGTH;
408         }
409         break;
410       }
411       BYTECODE(MINUS_AND_CHECK_NOT_CHAR) {
412         uint32_t c = (insn >> BYTECODE_SHIFT);
413         uint32_t minus = Load16Aligned(pc + 4);
414         uint32_t mask = Load16Aligned(pc + 6);
415         if (c != ((current_char - minus) & mask)) {
416           pc = code_base + Load32Aligned(pc + 8);
417         } else {
418           pc += BC_MINUS_AND_CHECK_NOT_CHAR_LENGTH;
419         }
420         break;
421       }
422       BYTECODE(CHECK_CHAR_IN_RANGE) {
423         uint32_t from = Load16Aligned(pc + 4);
424         uint32_t to = Load16Aligned(pc + 6);
425         if (from <= current_char && current_char <= to) {
426           pc = code_base + Load32Aligned(pc + 8);
427         } else {
428           pc += BC_CHECK_CHAR_IN_RANGE_LENGTH;
429         }
430         break;
431       }
432       BYTECODE(CHECK_CHAR_NOT_IN_RANGE) {
433         uint32_t from = Load16Aligned(pc + 4);
434         uint32_t to = Load16Aligned(pc + 6);
435         if (from > current_char || current_char > to) {
436           pc = code_base + Load32Aligned(pc + 8);
437         } else {
438           pc += BC_CHECK_CHAR_NOT_IN_RANGE_LENGTH;
439         }
440         break;
441       }
442       BYTECODE(CHECK_BIT_IN_TABLE) {
443         int mask = RegExpMacroAssembler::kTableMask;
444         byte b = pc[8 + ((current_char & mask) >> kBitsPerByteLog2)];
445         int bit = (current_char & (kBitsPerByte - 1));
446         if ((b & (1 << bit)) != 0) {
447           pc = code_base + Load32Aligned(pc + 4);
448         } else {
449           pc += BC_CHECK_BIT_IN_TABLE_LENGTH;
450         }
451         break;
452       }
453       BYTECODE(CHECK_LT) {
454         uint32_t limit = (insn >> BYTECODE_SHIFT);
455         if (current_char < limit) {
456           pc = code_base + Load32Aligned(pc + 4);
457         } else {
458           pc += BC_CHECK_LT_LENGTH;
459         }
460         break;
461       }
462       BYTECODE(CHECK_GT) {
463         uint32_t limit = (insn >> BYTECODE_SHIFT);
464         if (current_char > limit) {
465           pc = code_base + Load32Aligned(pc + 4);
466         } else {
467           pc += BC_CHECK_GT_LENGTH;
468         }
469         break;
470       }
471       BYTECODE(CHECK_REGISTER_LT)
472         if (registers[insn >> BYTECODE_SHIFT] < Load32Aligned(pc + 4)) {
473           pc = code_base + Load32Aligned(pc + 8);
474         } else {
475           pc += BC_CHECK_REGISTER_LT_LENGTH;
476         }
477         break;
478       BYTECODE(CHECK_REGISTER_GE)
479         if (registers[insn >> BYTECODE_SHIFT] >= Load32Aligned(pc + 4)) {
480           pc = code_base + Load32Aligned(pc + 8);
481         } else {
482           pc += BC_CHECK_REGISTER_GE_LENGTH;
483         }
484         break;
485       BYTECODE(CHECK_REGISTER_EQ_POS)
486         if (registers[insn >> BYTECODE_SHIFT] == current) {
487           pc = code_base + Load32Aligned(pc + 4);
488         } else {
489           pc += BC_CHECK_REGISTER_EQ_POS_LENGTH;
490         }
491         break;
492       BYTECODE(CHECK_NOT_REGS_EQUAL)
493         if (registers[insn >> BYTECODE_SHIFT] ==
494             registers[Load32Aligned(pc + 4)]) {
495           pc += BC_CHECK_NOT_REGS_EQUAL_LENGTH;
496         } else {
497           pc = code_base + Load32Aligned(pc + 8);
498         }
499         break;
500       BYTECODE(CHECK_NOT_BACK_REF) {
501         int from = registers[insn >> BYTECODE_SHIFT];
502         int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
503         if (from < 0 || len <= 0) {
504           pc += BC_CHECK_NOT_BACK_REF_LENGTH;
505           break;
506         }
507         if (current + len > subject.length()) {
508           pc = code_base + Load32Aligned(pc + 4);
509           break;
510         } else {
511           int i;
512           for (i = 0; i < len; i++) {
513             if (subject[from + i] != subject[current + i]) {
514               pc = code_base + Load32Aligned(pc + 4);
515               break;
516             }
517           }
518           if (i < len) break;
519           current += len;
520         }
521         pc += BC_CHECK_NOT_BACK_REF_LENGTH;
522         break;
523       }
524       BYTECODE(CHECK_NOT_BACK_REF_NO_CASE) {
525         int from = registers[insn >> BYTECODE_SHIFT];
526         int len = registers[(insn >> BYTECODE_SHIFT) + 1] - from;
527         if (from < 0 || len <= 0) {
528           pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
529           break;
530         }
531         if (current + len > subject.length()) {
532           pc = code_base + Load32Aligned(pc + 4);
533           break;
534         } else {
535           if (BackRefMatchesNoCase(isolate->interp_canonicalize_mapping(),
536                                    from, current, len, subject)) {
537             current += len;
538             pc += BC_CHECK_NOT_BACK_REF_NO_CASE_LENGTH;
539           } else {
540             pc = code_base + Load32Aligned(pc + 4);
541           }
542         }
543         break;
544       }
545       BYTECODE(CHECK_AT_START)
546         if (current == 0) {
547           pc = code_base + Load32Aligned(pc + 4);
548         } else {
549           pc += BC_CHECK_AT_START_LENGTH;
550         }
551         break;
552       BYTECODE(CHECK_NOT_AT_START)
553         if (current == 0) {
554           pc += BC_CHECK_NOT_AT_START_LENGTH;
555         } else {
556           pc = code_base + Load32Aligned(pc + 4);
557         }
558         break;
559       BYTECODE(SET_CURRENT_POSITION_FROM_END) {
560         int by = static_cast<uint32_t>(insn) >> BYTECODE_SHIFT;
561         if (subject.length() - current > by) {
562           current = subject.length() - by;
563           current_char = subject[current - 1];
564         }
565         pc += BC_SET_CURRENT_POSITION_FROM_END_LENGTH;
566         break;
567       }
568       default:
569         UNREACHABLE();
570         break;
571     }
572   }
573 }
574 
575 
Match(Isolate * isolate,Handle<ByteArray> code_array,Handle<String> subject,int * registers,int start_position)576 RegExpImpl::IrregexpResult IrregexpInterpreter::Match(
577     Isolate* isolate,
578     Handle<ByteArray> code_array,
579     Handle<String> subject,
580     int* registers,
581     int start_position) {
582   ASSERT(subject->IsFlat());
583 
584   DisallowHeapAllocation no_gc;
585   const byte* code_base = code_array->GetDataStartAddress();
586   uc16 previous_char = '\n';
587   String::FlatContent subject_content = subject->GetFlatContent();
588   if (subject_content.IsAscii()) {
589     Vector<const uint8_t> subject_vector = subject_content.ToOneByteVector();
590     if (start_position != 0) previous_char = subject_vector[start_position - 1];
591     return RawMatch(isolate,
592                     code_base,
593                     subject_vector,
594                     registers,
595                     start_position,
596                     previous_char);
597   } else {
598     ASSERT(subject_content.IsTwoByte());
599     Vector<const uc16> subject_vector = subject_content.ToUC16Vector();
600     if (start_position != 0) previous_char = subject_vector[start_position - 1];
601     return RawMatch(isolate,
602                     code_base,
603                     subject_vector,
604                     registers,
605                     start_position,
606                     previous_char);
607   }
608 }
609 
610 } }  // namespace v8::internal
611