1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-macro-assembler.h"
6
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11
12 #ifdef V8_I18N_SUPPORT
13 #include "unicode/uchar.h"
14 #endif // V8_I18N_SUPPORT
15
16 namespace v8 {
17 namespace internal {
18
RegExpMacroAssembler(Isolate * isolate,Zone * zone)19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20 : slow_safe_compiler_(false),
21 global_mode_(NOT_GLOBAL),
22 isolate_(isolate),
23 zone_(zone) {}
24
25
~RegExpMacroAssembler()26 RegExpMacroAssembler::~RegExpMacroAssembler() {
27 }
28
29
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31 Address byte_offset2,
32 size_t byte_length,
33 Isolate* isolate) {
34 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35 isolate->regexp_macro_assembler_canonicalize();
36 // This function is not allowed to cause a garbage collection.
37 // A GC might move the calling generated code and invalidate the
38 // return address on the stack.
39 DCHECK(byte_length % 2 == 0);
40 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42 size_t length = byte_length >> 1;
43
44 #ifdef V8_I18N_SUPPORT
45 if (isolate == nullptr) {
46 for (size_t i = 0; i < length; i++) {
47 uc32 c1 = substring1[i];
48 uc32 c2 = substring2[i];
49 if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50 // Non-BMP characters do not have case-equivalents in the BMP.
51 // Both have to be non-BMP for them to be able to match.
52 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53 if (i + 1 < length) {
54 uc16 c1t = substring1[i + 1];
55 uc16 c2t = substring2[i + 1];
56 if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57 unibrow::Utf16::IsTrailSurrogate(c2t)) {
58 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60 i++;
61 }
62 }
63 }
64 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66 if (c1 != c2) return 0;
67 }
68 return 1;
69 }
70 #endif // V8_I18N_SUPPORT
71 DCHECK_NOT_NULL(isolate);
72 for (size_t i = 0; i < length; i++) {
73 unibrow::uchar c1 = substring1[i];
74 unibrow::uchar c2 = substring2[i];
75 if (c1 != c2) {
76 unibrow::uchar s1[1] = {c1};
77 canonicalize->get(c1, '\0', s1);
78 if (s1[0] != c2) {
79 unibrow::uchar s2[1] = {c2};
80 canonicalize->get(c2, '\0', s2);
81 if (s1[0] != s2[0]) {
82 return 0;
83 }
84 }
85 }
86 }
87 return 1;
88 }
89
90
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)91 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92 Label* on_failure) {
93 Label ok;
94 // Check that current character is not a trail surrogate.
95 LoadCurrentCharacter(cp_offset, &ok);
96 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97 // Check that previous character is not a lead surrogate.
98 LoadCurrentCharacter(cp_offset - 1, &ok);
99 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100 Bind(&ok);
101 }
102
CheckPosition(int cp_offset,Label * on_outside_input)103 void RegExpMacroAssembler::CheckPosition(int cp_offset,
104 Label* on_outside_input) {
105 LoadCurrentCharacter(cp_offset, on_outside_input, true);
106 }
107
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)108 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109 Label* on_no_match) {
110 return false;
111 }
112
113 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
114
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)115 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116 Zone* zone)
117 : RegExpMacroAssembler(isolate, zone) {}
118
119
~NativeRegExpMacroAssembler()120 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121 }
122
123
CanReadUnaligned()124 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125 return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
126 }
127
StringCharacterPosition(String * subject,int start_index)128 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129 String* subject,
130 int start_index) {
131 if (subject->IsConsString()) {
132 subject = ConsString::cast(subject)->first();
133 } else if (subject->IsSlicedString()) {
134 start_index += SlicedString::cast(subject)->offset();
135 subject = SlicedString::cast(subject)->parent();
136 }
137 if (subject->IsThinString()) {
138 subject = ThinString::cast(subject)->actual();
139 }
140 DCHECK(start_index >= 0);
141 DCHECK(start_index <= subject->length());
142 if (subject->IsSeqOneByteString()) {
143 return reinterpret_cast<const byte*>(
144 SeqOneByteString::cast(subject)->GetChars() + start_index);
145 } else if (subject->IsSeqTwoByteString()) {
146 return reinterpret_cast<const byte*>(
147 SeqTwoByteString::cast(subject)->GetChars() + start_index);
148 } else if (subject->IsExternalOneByteString()) {
149 return reinterpret_cast<const byte*>(
150 ExternalOneByteString::cast(subject)->GetChars() + start_index);
151 } else {
152 DCHECK(subject->IsExternalTwoByteString());
153 return reinterpret_cast<const byte*>(
154 ExternalTwoByteString::cast(subject)->GetChars() + start_index);
155 }
156 }
157
158
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)159 int NativeRegExpMacroAssembler::CheckStackGuardState(
160 Isolate* isolate, int start_index, bool is_direct_call,
161 Address* return_address, Code* re_code, String** subject,
162 const byte** input_start, const byte** input_end) {
163 DCHECK(re_code->instruction_start() <= *return_address);
164 DCHECK(*return_address <= re_code->instruction_end());
165 int return_value = 0;
166 // Prepare for possible GC.
167 HandleScope handles(isolate);
168 Handle<Code> code_handle(re_code);
169 Handle<String> subject_handle(*subject);
170 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
171
172 StackLimitCheck check(isolate);
173 if (check.JsHasOverflowed()) {
174 isolate->StackOverflow();
175 return_value = EXCEPTION;
176 } else if (is_direct_call) {
177 // If not real stack overflow the stack guard was used to interrupt
178 // execution for another purpose. If this is a direct call from JavaScript
179 // retry the RegExp forcing the call through the runtime system.
180 // Currently the direct call cannot handle a GC.
181 return_value = RETRY;
182 } else {
183 Object* result = isolate->stack_guard()->HandleInterrupts();
184 if (result->IsException(isolate)) return_value = EXCEPTION;
185 }
186
187 DisallowHeapAllocation no_gc;
188
189 if (*code_handle != re_code) { // Return address no longer valid
190 intptr_t delta = code_handle->address() - re_code->address();
191 // Overwrite the return address on the stack.
192 *return_address += delta;
193 }
194
195 // If we continue, we need to update the subject string addresses.
196 if (return_value == 0) {
197 // String encoding might have changed.
198 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
199 // If we changed between an LATIN1 and an UC16 string, the specialized
200 // code cannot be used, and we need to restart regexp matching from
201 // scratch (including, potentially, compiling a new version of the code).
202 return_value = RETRY;
203 } else {
204 *subject = *subject_handle;
205 intptr_t byte_length = *input_end - *input_start;
206 *input_start = StringCharacterPosition(*subject, start_index);
207 *input_end = *input_start + byte_length;
208 }
209 }
210 return return_value;
211 }
212
213
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)214 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
215 Handle<Code> regexp_code,
216 Handle<String> subject,
217 int* offsets_vector,
218 int offsets_vector_length,
219 int previous_index,
220 Isolate* isolate) {
221
222 DCHECK(subject->IsFlat());
223 DCHECK(previous_index >= 0);
224 DCHECK(previous_index <= subject->length());
225
226 // No allocations before calling the regexp, but we can't use
227 // DisallowHeapAllocation, since regexps might be preempted, and another
228 // thread might do allocation anyway.
229
230 String* subject_ptr = *subject;
231 // Character offsets into string.
232 int start_offset = previous_index;
233 int char_length = subject_ptr->length() - start_offset;
234 int slice_offset = 0;
235
236 // The string has been flattened, so if it is a cons string it contains the
237 // full string in the first part.
238 if (StringShape(subject_ptr).IsCons()) {
239 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
240 subject_ptr = ConsString::cast(subject_ptr)->first();
241 } else if (StringShape(subject_ptr).IsSliced()) {
242 SlicedString* slice = SlicedString::cast(subject_ptr);
243 subject_ptr = slice->parent();
244 slice_offset = slice->offset();
245 }
246 if (StringShape(subject_ptr).IsThin()) {
247 subject_ptr = ThinString::cast(subject_ptr)->actual();
248 }
249 // Ensure that an underlying string has the same representation.
250 bool is_one_byte = subject_ptr->IsOneByteRepresentation();
251 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
252 // String is now either Sequential or External
253 int char_size_shift = is_one_byte ? 0 : 1;
254
255 const byte* input_start =
256 StringCharacterPosition(subject_ptr, start_offset + slice_offset);
257 int byte_length = char_length << char_size_shift;
258 const byte* input_end = input_start + byte_length;
259 Result res = Execute(*regexp_code,
260 *subject,
261 start_offset,
262 input_start,
263 input_end,
264 offsets_vector,
265 offsets_vector_length,
266 isolate);
267 return res;
268 }
269
270
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)271 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
272 Code* code,
273 String* input, // This needs to be the unpacked (sliced, cons) string.
274 int start_offset,
275 const byte* input_start,
276 const byte* input_end,
277 int* output,
278 int output_size,
279 Isolate* isolate) {
280 // Ensure that the minimum stack has been allocated.
281 RegExpStackScope stack_scope(isolate);
282 Address stack_base = stack_scope.stack()->stack_base();
283
284 int direct_call = 0;
285 int result = CALL_GENERATED_REGEXP_CODE(
286 isolate, code->entry(), input, start_offset, input_start, input_end,
287 output, output_size, stack_base, direct_call, isolate);
288 DCHECK(result >= RETRY);
289
290 if (result == EXCEPTION && !isolate->has_pending_exception()) {
291 // We detected a stack overflow (on the backtrack stack) in RegExp code,
292 // but haven't created the exception yet.
293 isolate->StackOverflow();
294 }
295 return static_cast<Result>(result);
296 }
297
298
299 const byte NativeRegExpMacroAssembler::word_character_map[] = {
300 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
301 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
302 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
303 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
304
305 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
306 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
307 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
308 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
309
310 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G'
311 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
312 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
313 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
314
315 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g'
316 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
317 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
318 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
319 // Latin-1 range
320 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
323 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324
325 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
328 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329
330 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334
335 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339 };
340
341
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)342 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
343 Address* stack_base,
344 Isolate* isolate) {
345 RegExpStack* regexp_stack = isolate->regexp_stack();
346 size_t size = regexp_stack->stack_capacity();
347 Address old_stack_base = regexp_stack->stack_base();
348 DCHECK(old_stack_base == *stack_base);
349 DCHECK(stack_pointer <= old_stack_base);
350 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
351 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
352 if (new_stack_base == NULL) {
353 return NULL;
354 }
355 *stack_base = new_stack_base;
356 intptr_t stack_content_size = old_stack_base - stack_pointer;
357 return new_stack_base - stack_content_size;
358 }
359
360 #endif // V8_INTERPRETED_REGEXP
361
362 } // namespace internal
363 } // namespace v8
364