• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-macro-assembler.h"
6 
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11 
12 #ifdef V8_I18N_SUPPORT
13 #include "unicode/uchar.h"
14 #endif  // V8_I18N_SUPPORT
15 
16 namespace v8 {
17 namespace internal {
18 
RegExpMacroAssembler(Isolate * isolate,Zone * zone)19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20     : slow_safe_compiler_(false),
21       global_mode_(NOT_GLOBAL),
22       isolate_(isolate),
23       zone_(zone) {}
24 
25 
~RegExpMacroAssembler()26 RegExpMacroAssembler::~RegExpMacroAssembler() {
27 }
28 
29 
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31                                                      Address byte_offset2,
32                                                      size_t byte_length,
33                                                      Isolate* isolate) {
34   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35       isolate->regexp_macro_assembler_canonicalize();
36   // This function is not allowed to cause a garbage collection.
37   // A GC might move the calling generated code and invalidate the
38   // return address on the stack.
39   DCHECK(byte_length % 2 == 0);
40   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42   size_t length = byte_length >> 1;
43 
44 #ifdef V8_I18N_SUPPORT
45   if (isolate == nullptr) {
46     for (size_t i = 0; i < length; i++) {
47       uc32 c1 = substring1[i];
48       uc32 c2 = substring2[i];
49       if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50         // Non-BMP characters do not have case-equivalents in the BMP.
51         // Both have to be non-BMP for them to be able to match.
52         if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53         if (i + 1 < length) {
54           uc16 c1t = substring1[i + 1];
55           uc16 c2t = substring2[i + 1];
56           if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57               unibrow::Utf16::IsTrailSurrogate(c2t)) {
58             c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59             c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60             i++;
61           }
62         }
63       }
64       c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65       c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66       if (c1 != c2) return 0;
67     }
68     return 1;
69   }
70 #endif  // V8_I18N_SUPPORT
71   DCHECK_NOT_NULL(isolate);
72   for (size_t i = 0; i < length; i++) {
73     unibrow::uchar c1 = substring1[i];
74     unibrow::uchar c2 = substring2[i];
75     if (c1 != c2) {
76       unibrow::uchar s1[1] = {c1};
77       canonicalize->get(c1, '\0', s1);
78       if (s1[0] != c2) {
79         unibrow::uchar s2[1] = {c2};
80         canonicalize->get(c2, '\0', s2);
81         if (s1[0] != s2[0]) {
82           return 0;
83         }
84       }
85     }
86   }
87   return 1;
88 }
89 
90 
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)91 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92                                                    Label* on_failure) {
93   Label ok;
94   // Check that current character is not a trail surrogate.
95   LoadCurrentCharacter(cp_offset, &ok);
96   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97   // Check that previous character is not a lead surrogate.
98   LoadCurrentCharacter(cp_offset - 1, &ok);
99   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100   Bind(&ok);
101 }
102 
CheckPosition(int cp_offset,Label * on_outside_input)103 void RegExpMacroAssembler::CheckPosition(int cp_offset,
104                                          Label* on_outside_input) {
105   LoadCurrentCharacter(cp_offset, on_outside_input, true);
106 }
107 
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)108 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109                                                       Label* on_no_match) {
110   return false;
111 }
112 
113 #ifndef V8_INTERPRETED_REGEXP  // Avoid unused code, e.g., on ARM.
114 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)115 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116                                                        Zone* zone)
117     : RegExpMacroAssembler(isolate, zone) {}
118 
119 
~NativeRegExpMacroAssembler()120 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121 }
122 
123 
CanReadUnaligned()124 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
126 }
127 
StringCharacterPosition(String * subject,int start_index)128 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129     String* subject,
130     int start_index) {
131   if (subject->IsConsString()) {
132     subject = ConsString::cast(subject)->first();
133   } else if (subject->IsSlicedString()) {
134     start_index += SlicedString::cast(subject)->offset();
135     subject = SlicedString::cast(subject)->parent();
136   }
137   if (subject->IsThinString()) {
138     subject = ThinString::cast(subject)->actual();
139   }
140   DCHECK(start_index >= 0);
141   DCHECK(start_index <= subject->length());
142   if (subject->IsSeqOneByteString()) {
143     return reinterpret_cast<const byte*>(
144         SeqOneByteString::cast(subject)->GetChars() + start_index);
145   } else if (subject->IsSeqTwoByteString()) {
146     return reinterpret_cast<const byte*>(
147         SeqTwoByteString::cast(subject)->GetChars() + start_index);
148   } else if (subject->IsExternalOneByteString()) {
149     return reinterpret_cast<const byte*>(
150         ExternalOneByteString::cast(subject)->GetChars() + start_index);
151   } else {
152     DCHECK(subject->IsExternalTwoByteString());
153     return reinterpret_cast<const byte*>(
154         ExternalTwoByteString::cast(subject)->GetChars() + start_index);
155   }
156 }
157 
158 
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)159 int NativeRegExpMacroAssembler::CheckStackGuardState(
160     Isolate* isolate, int start_index, bool is_direct_call,
161     Address* return_address, Code* re_code, String** subject,
162     const byte** input_start, const byte** input_end) {
163   DCHECK(re_code->instruction_start() <= *return_address);
164   DCHECK(*return_address <= re_code->instruction_end());
165   int return_value = 0;
166   // Prepare for possible GC.
167   HandleScope handles(isolate);
168   Handle<Code> code_handle(re_code);
169   Handle<String> subject_handle(*subject);
170   bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
171 
172   StackLimitCheck check(isolate);
173   if (check.JsHasOverflowed()) {
174     isolate->StackOverflow();
175     return_value = EXCEPTION;
176   } else if (is_direct_call) {
177     // If not real stack overflow the stack guard was used to interrupt
178     // execution for another purpose.  If this is a direct call from JavaScript
179     // retry the RegExp forcing the call through the runtime system.
180     // Currently the direct call cannot handle a GC.
181     return_value = RETRY;
182   } else {
183     Object* result = isolate->stack_guard()->HandleInterrupts();
184     if (result->IsException(isolate)) return_value = EXCEPTION;
185   }
186 
187   DisallowHeapAllocation no_gc;
188 
189   if (*code_handle != re_code) {  // Return address no longer valid
190     intptr_t delta = code_handle->address() - re_code->address();
191     // Overwrite the return address on the stack.
192     *return_address += delta;
193   }
194 
195   // If we continue, we need to update the subject string addresses.
196   if (return_value == 0) {
197     // String encoding might have changed.
198     if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
199       // If we changed between an LATIN1 and an UC16 string, the specialized
200       // code cannot be used, and we need to restart regexp matching from
201       // scratch (including, potentially, compiling a new version of the code).
202       return_value = RETRY;
203     } else {
204       *subject = *subject_handle;
205       intptr_t byte_length = *input_end - *input_start;
206       *input_start = StringCharacterPosition(*subject, start_index);
207       *input_end = *input_start + byte_length;
208     }
209   }
210   return return_value;
211 }
212 
213 
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)214 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
215     Handle<Code> regexp_code,
216     Handle<String> subject,
217     int* offsets_vector,
218     int offsets_vector_length,
219     int previous_index,
220     Isolate* isolate) {
221 
222   DCHECK(subject->IsFlat());
223   DCHECK(previous_index >= 0);
224   DCHECK(previous_index <= subject->length());
225 
226   // No allocations before calling the regexp, but we can't use
227   // DisallowHeapAllocation, since regexps might be preempted, and another
228   // thread might do allocation anyway.
229 
230   String* subject_ptr = *subject;
231   // Character offsets into string.
232   int start_offset = previous_index;
233   int char_length = subject_ptr->length() - start_offset;
234   int slice_offset = 0;
235 
236   // The string has been flattened, so if it is a cons string it contains the
237   // full string in the first part.
238   if (StringShape(subject_ptr).IsCons()) {
239     DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
240     subject_ptr = ConsString::cast(subject_ptr)->first();
241   } else if (StringShape(subject_ptr).IsSliced()) {
242     SlicedString* slice = SlicedString::cast(subject_ptr);
243     subject_ptr = slice->parent();
244     slice_offset = slice->offset();
245   }
246   if (StringShape(subject_ptr).IsThin()) {
247     subject_ptr = ThinString::cast(subject_ptr)->actual();
248   }
249   // Ensure that an underlying string has the same representation.
250   bool is_one_byte = subject_ptr->IsOneByteRepresentation();
251   DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
252   // String is now either Sequential or External
253   int char_size_shift = is_one_byte ? 0 : 1;
254 
255   const byte* input_start =
256       StringCharacterPosition(subject_ptr, start_offset + slice_offset);
257   int byte_length = char_length << char_size_shift;
258   const byte* input_end = input_start + byte_length;
259   Result res = Execute(*regexp_code,
260                        *subject,
261                        start_offset,
262                        input_start,
263                        input_end,
264                        offsets_vector,
265                        offsets_vector_length,
266                        isolate);
267   return res;
268 }
269 
270 
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)271 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
272     Code* code,
273     String* input,  // This needs to be the unpacked (sliced, cons) string.
274     int start_offset,
275     const byte* input_start,
276     const byte* input_end,
277     int* output,
278     int output_size,
279     Isolate* isolate) {
280   // Ensure that the minimum stack has been allocated.
281   RegExpStackScope stack_scope(isolate);
282   Address stack_base = stack_scope.stack()->stack_base();
283 
284   int direct_call = 0;
285   int result = CALL_GENERATED_REGEXP_CODE(
286       isolate, code->entry(), input, start_offset, input_start, input_end,
287       output, output_size, stack_base, direct_call, isolate);
288   DCHECK(result >= RETRY);
289 
290   if (result == EXCEPTION && !isolate->has_pending_exception()) {
291     // We detected a stack overflow (on the backtrack stack) in RegExp code,
292     // but haven't created the exception yet.
293     isolate->StackOverflow();
294   }
295   return static_cast<Result>(result);
296 }
297 
298 
299 const byte NativeRegExpMacroAssembler::word_character_map[] = {
300     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
301     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
302     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
303     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304 
305     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
306     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // '0' - '7'
308     0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
309 
310     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'A' - 'G'
311     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'H' - 'O'
312     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'P' - 'W'
313     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu,  // 'X' - 'Z', '_'
314 
315     0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'a' - 'g'
316     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'h' - 'o'
317     0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu,  // 'p' - 'w'
318     0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
319     // Latin-1 range
320     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324 
325     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329 
330     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334 
335     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339 };
340 
341 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)342 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
343                                               Address* stack_base,
344                                               Isolate* isolate) {
345   RegExpStack* regexp_stack = isolate->regexp_stack();
346   size_t size = regexp_stack->stack_capacity();
347   Address old_stack_base = regexp_stack->stack_base();
348   DCHECK(old_stack_base == *stack_base);
349   DCHECK(stack_pointer <= old_stack_base);
350   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
351   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
352   if (new_stack_base == NULL) {
353     return NULL;
354   }
355   *stack_base = new_stack_base;
356   intptr_t stack_content_size = old_stack_base - stack_pointer;
357   return new_stack_base - stack_content_size;
358 }
359 
360 #endif  // V8_INTERPRETED_REGEXP
361 
362 }  // namespace internal
363 }  // namespace v8
364