1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-macro-assembler.h"
6
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11
12 namespace v8 {
13 namespace internal {
14
RegExpMacroAssembler(Isolate * isolate,Zone * zone)15 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
16 : slow_safe_compiler_(false),
17 global_mode_(NOT_GLOBAL),
18 isolate_(isolate),
19 zone_(zone) {}
20
21
~RegExpMacroAssembler()22 RegExpMacroAssembler::~RegExpMacroAssembler() {
23 }
24
25
26 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
27
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)28 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
29 Zone* zone)
30 : RegExpMacroAssembler(isolate, zone) {}
31
32
~NativeRegExpMacroAssembler()33 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
34 }
35
36
CanReadUnaligned()37 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
38 return FLAG_enable_unaligned_accesses && !slow_safe();
39 }
40
StringCharacterPosition(String * subject,int start_index)41 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
42 String* subject,
43 int start_index) {
44 if (subject->IsConsString()) {
45 subject = ConsString::cast(subject)->first();
46 } else if (subject->IsSlicedString()) {
47 start_index += SlicedString::cast(subject)->offset();
48 subject = SlicedString::cast(subject)->parent();
49 }
50 DCHECK(start_index >= 0);
51 DCHECK(start_index <= subject->length());
52 if (subject->IsSeqOneByteString()) {
53 return reinterpret_cast<const byte*>(
54 SeqOneByteString::cast(subject)->GetChars() + start_index);
55 } else if (subject->IsSeqTwoByteString()) {
56 return reinterpret_cast<const byte*>(
57 SeqTwoByteString::cast(subject)->GetChars() + start_index);
58 } else if (subject->IsExternalOneByteString()) {
59 return reinterpret_cast<const byte*>(
60 ExternalOneByteString::cast(subject)->GetChars() + start_index);
61 } else {
62 return reinterpret_cast<const byte*>(
63 ExternalTwoByteString::cast(subject)->GetChars() + start_index);
64 }
65 }
66
67
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)68 int NativeRegExpMacroAssembler::CheckStackGuardState(
69 Isolate* isolate, int start_index, bool is_direct_call,
70 Address* return_address, Code* re_code, String** subject,
71 const byte** input_start, const byte** input_end) {
72 DCHECK(re_code->instruction_start() <= *return_address);
73 DCHECK(*return_address <= re_code->instruction_end());
74 int return_value = 0;
75 // Prepare for possible GC.
76 HandleScope handles(isolate);
77 Handle<Code> code_handle(re_code);
78 Handle<String> subject_handle(*subject);
79 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
80
81 StackLimitCheck check(isolate);
82 if (check.JsHasOverflowed()) {
83 isolate->StackOverflow();
84 return_value = EXCEPTION;
85 } else if (is_direct_call) {
86 // If not real stack overflow the stack guard was used to interrupt
87 // execution for another purpose. If this is a direct call from JavaScript
88 // retry the RegExp forcing the call through the runtime system.
89 // Currently the direct call cannot handle a GC.
90 return_value = RETRY;
91 } else {
92 Object* result = isolate->stack_guard()->HandleInterrupts();
93 if (result->IsException()) return_value = EXCEPTION;
94 }
95
96 DisallowHeapAllocation no_gc;
97
98 if (*code_handle != re_code) { // Return address no longer valid
99 intptr_t delta = code_handle->address() - re_code->address();
100 // Overwrite the return address on the stack.
101 *return_address += delta;
102 }
103
104 // If we continue, we need to update the subject string addresses.
105 if (return_value == 0) {
106 // String encoding might have changed.
107 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
108 // If we changed between an LATIN1 and an UC16 string, the specialized
109 // code cannot be used, and we need to restart regexp matching from
110 // scratch (including, potentially, compiling a new version of the code).
111 return_value = RETRY;
112 } else {
113 *subject = *subject_handle;
114 intptr_t byte_length = *input_end - *input_start;
115 *input_start = StringCharacterPosition(*subject, start_index);
116 *input_end = *input_start + byte_length;
117 }
118 }
119 return return_value;
120 }
121
122
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)123 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
124 Handle<Code> regexp_code,
125 Handle<String> subject,
126 int* offsets_vector,
127 int offsets_vector_length,
128 int previous_index,
129 Isolate* isolate) {
130
131 DCHECK(subject->IsFlat());
132 DCHECK(previous_index >= 0);
133 DCHECK(previous_index <= subject->length());
134
135 // No allocations before calling the regexp, but we can't use
136 // DisallowHeapAllocation, since regexps might be preempted, and another
137 // thread might do allocation anyway.
138
139 String* subject_ptr = *subject;
140 // Character offsets into string.
141 int start_offset = previous_index;
142 int char_length = subject_ptr->length() - start_offset;
143 int slice_offset = 0;
144
145 // The string has been flattened, so if it is a cons string it contains the
146 // full string in the first part.
147 if (StringShape(subject_ptr).IsCons()) {
148 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
149 subject_ptr = ConsString::cast(subject_ptr)->first();
150 } else if (StringShape(subject_ptr).IsSliced()) {
151 SlicedString* slice = SlicedString::cast(subject_ptr);
152 subject_ptr = slice->parent();
153 slice_offset = slice->offset();
154 }
155 // Ensure that an underlying string has the same representation.
156 bool is_one_byte = subject_ptr->IsOneByteRepresentation();
157 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
158 // String is now either Sequential or External
159 int char_size_shift = is_one_byte ? 0 : 1;
160
161 const byte* input_start =
162 StringCharacterPosition(subject_ptr, start_offset + slice_offset);
163 int byte_length = char_length << char_size_shift;
164 const byte* input_end = input_start + byte_length;
165 Result res = Execute(*regexp_code,
166 *subject,
167 start_offset,
168 input_start,
169 input_end,
170 offsets_vector,
171 offsets_vector_length,
172 isolate);
173 return res;
174 }
175
176
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)177 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
178 Code* code,
179 String* input, // This needs to be the unpacked (sliced, cons) string.
180 int start_offset,
181 const byte* input_start,
182 const byte* input_end,
183 int* output,
184 int output_size,
185 Isolate* isolate) {
186 // Ensure that the minimum stack has been allocated.
187 RegExpStackScope stack_scope(isolate);
188 Address stack_base = stack_scope.stack()->stack_base();
189
190 int direct_call = 0;
191 int result = CALL_GENERATED_REGEXP_CODE(
192 isolate, code->entry(), input, start_offset, input_start, input_end,
193 output, output_size, stack_base, direct_call, isolate);
194 DCHECK(result >= RETRY);
195
196 if (result == EXCEPTION && !isolate->has_pending_exception()) {
197 // We detected a stack overflow (on the backtrack stack) in RegExp code,
198 // but haven't created the exception yet.
199 isolate->StackOverflow();
200 }
201 return static_cast<Result>(result);
202 }
203
204
205 const byte NativeRegExpMacroAssembler::word_character_map[] = {
206 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
207 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
208 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
209 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
210
211 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
212 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
213 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
214 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
215
216 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G'
217 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
218 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
219 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
220
221 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g'
222 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
223 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
224 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
225 // Latin-1 range
226 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
227 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
228 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
229 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
230
231 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
232 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
233 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
234 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
235
236 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
237 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
238 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
239 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
240
241 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
242 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
243 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
244 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
245 };
246
247
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)248 int NativeRegExpMacroAssembler::CaseInsensitiveCompareUC16(
249 Address byte_offset1,
250 Address byte_offset2,
251 size_t byte_length,
252 Isolate* isolate) {
253 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
254 isolate->regexp_macro_assembler_canonicalize();
255 // This function is not allowed to cause a garbage collection.
256 // A GC might move the calling generated code and invalidate the
257 // return address on the stack.
258 DCHECK(byte_length % 2 == 0);
259 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
260 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
261 size_t length = byte_length >> 1;
262
263 for (size_t i = 0; i < length; i++) {
264 unibrow::uchar c1 = substring1[i];
265 unibrow::uchar c2 = substring2[i];
266 if (c1 != c2) {
267 unibrow::uchar s1[1] = { c1 };
268 canonicalize->get(c1, '\0', s1);
269 if (s1[0] != c2) {
270 unibrow::uchar s2[1] = { c2 };
271 canonicalize->get(c2, '\0', s2);
272 if (s1[0] != s2[0]) {
273 return 0;
274 }
275 }
276 }
277 }
278 return 1;
279 }
280
281
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)282 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
283 Address* stack_base,
284 Isolate* isolate) {
285 RegExpStack* regexp_stack = isolate->regexp_stack();
286 size_t size = regexp_stack->stack_capacity();
287 Address old_stack_base = regexp_stack->stack_base();
288 DCHECK(old_stack_base == *stack_base);
289 DCHECK(stack_pointer <= old_stack_base);
290 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
291 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
292 if (new_stack_base == NULL) {
293 return NULL;
294 }
295 *stack_base = new_stack_base;
296 intptr_t stack_content_size = old_stack_base - stack_pointer;
297 return new_stack_base - stack_content_size;
298 }
299
300 #endif // V8_INTERPRETED_REGEXP
301
302 } // namespace internal
303 } // namespace v8
304