1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-macro-assembler.h"
6
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11
12 #ifdef V8_I18N_SUPPORT
13 #include "unicode/uchar.h"
14 #endif // V8_I18N_SUPPORT
15
16 namespace v8 {
17 namespace internal {
18
RegExpMacroAssembler(Isolate * isolate,Zone * zone)19 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
20 : slow_safe_compiler_(false),
21 global_mode_(NOT_GLOBAL),
22 isolate_(isolate),
23 zone_(zone) {}
24
25
~RegExpMacroAssembler()26 RegExpMacroAssembler::~RegExpMacroAssembler() {
27 }
28
29
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)30 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
31 Address byte_offset2,
32 size_t byte_length,
33 Isolate* isolate) {
34 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
35 isolate->regexp_macro_assembler_canonicalize();
36 // This function is not allowed to cause a garbage collection.
37 // A GC might move the calling generated code and invalidate the
38 // return address on the stack.
39 DCHECK(byte_length % 2 == 0);
40 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
41 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
42 size_t length = byte_length >> 1;
43
44 #ifdef V8_I18N_SUPPORT
45 if (isolate == nullptr) {
46 for (size_t i = 0; i < length; i++) {
47 uc32 c1 = substring1[i];
48 uc32 c2 = substring2[i];
49 if (unibrow::Utf16::IsLeadSurrogate(c1)) {
50 // Non-BMP characters do not have case-equivalents in the BMP.
51 // Both have to be non-BMP for them to be able to match.
52 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
53 if (i + 1 < length) {
54 uc16 c1t = substring1[i + 1];
55 uc16 c2t = substring2[i + 1];
56 if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
57 unibrow::Utf16::IsTrailSurrogate(c2t)) {
58 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
59 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
60 i++;
61 }
62 }
63 }
64 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
65 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
66 if (c1 != c2) return 0;
67 }
68 return 1;
69 }
70 #endif // V8_I18N_SUPPORT
71 DCHECK_NOT_NULL(isolate);
72 for (size_t i = 0; i < length; i++) {
73 unibrow::uchar c1 = substring1[i];
74 unibrow::uchar c2 = substring2[i];
75 if (c1 != c2) {
76 unibrow::uchar s1[1] = {c1};
77 canonicalize->get(c1, '\0', s1);
78 if (s1[0] != c2) {
79 unibrow::uchar s2[1] = {c2};
80 canonicalize->get(c2, '\0', s2);
81 if (s1[0] != s2[0]) {
82 return 0;
83 }
84 }
85 }
86 }
87 return 1;
88 }
89
90
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)91 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
92 Label* on_failure) {
93 Label ok;
94 // Check that current character is not a trail surrogate.
95 LoadCurrentCharacter(cp_offset, &ok);
96 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
97 // Check that previous character is not a lead surrogate.
98 LoadCurrentCharacter(cp_offset - 1, &ok);
99 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
100 Bind(&ok);
101 }
102
CheckPosition(int cp_offset,Label * on_outside_input)103 void RegExpMacroAssembler::CheckPosition(int cp_offset,
104 Label* on_outside_input) {
105 LoadCurrentCharacter(cp_offset, on_outside_input, true);
106 }
107
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)108 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
109 Label* on_no_match) {
110 return false;
111 }
112
113 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
114
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)115 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
116 Zone* zone)
117 : RegExpMacroAssembler(isolate, zone) {}
118
119
~NativeRegExpMacroAssembler()120 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
121 }
122
123
CanReadUnaligned()124 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
125 return FLAG_enable_unaligned_accesses && !slow_safe();
126 }
127
StringCharacterPosition(String * subject,int start_index)128 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
129 String* subject,
130 int start_index) {
131 if (subject->IsConsString()) {
132 subject = ConsString::cast(subject)->first();
133 } else if (subject->IsSlicedString()) {
134 start_index += SlicedString::cast(subject)->offset();
135 subject = SlicedString::cast(subject)->parent();
136 }
137 DCHECK(start_index >= 0);
138 DCHECK(start_index <= subject->length());
139 if (subject->IsSeqOneByteString()) {
140 return reinterpret_cast<const byte*>(
141 SeqOneByteString::cast(subject)->GetChars() + start_index);
142 } else if (subject->IsSeqTwoByteString()) {
143 return reinterpret_cast<const byte*>(
144 SeqTwoByteString::cast(subject)->GetChars() + start_index);
145 } else if (subject->IsExternalOneByteString()) {
146 return reinterpret_cast<const byte*>(
147 ExternalOneByteString::cast(subject)->GetChars() + start_index);
148 } else {
149 return reinterpret_cast<const byte*>(
150 ExternalTwoByteString::cast(subject)->GetChars() + start_index);
151 }
152 }
153
154
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)155 int NativeRegExpMacroAssembler::CheckStackGuardState(
156 Isolate* isolate, int start_index, bool is_direct_call,
157 Address* return_address, Code* re_code, String** subject,
158 const byte** input_start, const byte** input_end) {
159 DCHECK(re_code->instruction_start() <= *return_address);
160 DCHECK(*return_address <= re_code->instruction_end());
161 int return_value = 0;
162 // Prepare for possible GC.
163 HandleScope handles(isolate);
164 Handle<Code> code_handle(re_code);
165 Handle<String> subject_handle(*subject);
166 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
167
168 StackLimitCheck check(isolate);
169 if (check.JsHasOverflowed()) {
170 isolate->StackOverflow();
171 return_value = EXCEPTION;
172 } else if (is_direct_call) {
173 // If not real stack overflow the stack guard was used to interrupt
174 // execution for another purpose. If this is a direct call from JavaScript
175 // retry the RegExp forcing the call through the runtime system.
176 // Currently the direct call cannot handle a GC.
177 return_value = RETRY;
178 } else {
179 Object* result = isolate->stack_guard()->HandleInterrupts();
180 if (result->IsException(isolate)) return_value = EXCEPTION;
181 }
182
183 DisallowHeapAllocation no_gc;
184
185 if (*code_handle != re_code) { // Return address no longer valid
186 intptr_t delta = code_handle->address() - re_code->address();
187 // Overwrite the return address on the stack.
188 *return_address += delta;
189 }
190
191 // If we continue, we need to update the subject string addresses.
192 if (return_value == 0) {
193 // String encoding might have changed.
194 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
195 // If we changed between an LATIN1 and an UC16 string, the specialized
196 // code cannot be used, and we need to restart regexp matching from
197 // scratch (including, potentially, compiling a new version of the code).
198 return_value = RETRY;
199 } else {
200 *subject = *subject_handle;
201 intptr_t byte_length = *input_end - *input_start;
202 *input_start = StringCharacterPosition(*subject, start_index);
203 *input_end = *input_start + byte_length;
204 }
205 }
206 return return_value;
207 }
208
209
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)210 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
211 Handle<Code> regexp_code,
212 Handle<String> subject,
213 int* offsets_vector,
214 int offsets_vector_length,
215 int previous_index,
216 Isolate* isolate) {
217
218 DCHECK(subject->IsFlat());
219 DCHECK(previous_index >= 0);
220 DCHECK(previous_index <= subject->length());
221
222 // No allocations before calling the regexp, but we can't use
223 // DisallowHeapAllocation, since regexps might be preempted, and another
224 // thread might do allocation anyway.
225
226 String* subject_ptr = *subject;
227 // Character offsets into string.
228 int start_offset = previous_index;
229 int char_length = subject_ptr->length() - start_offset;
230 int slice_offset = 0;
231
232 // The string has been flattened, so if it is a cons string it contains the
233 // full string in the first part.
234 if (StringShape(subject_ptr).IsCons()) {
235 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
236 subject_ptr = ConsString::cast(subject_ptr)->first();
237 } else if (StringShape(subject_ptr).IsSliced()) {
238 SlicedString* slice = SlicedString::cast(subject_ptr);
239 subject_ptr = slice->parent();
240 slice_offset = slice->offset();
241 }
242 // Ensure that an underlying string has the same representation.
243 bool is_one_byte = subject_ptr->IsOneByteRepresentation();
244 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
245 // String is now either Sequential or External
246 int char_size_shift = is_one_byte ? 0 : 1;
247
248 const byte* input_start =
249 StringCharacterPosition(subject_ptr, start_offset + slice_offset);
250 int byte_length = char_length << char_size_shift;
251 const byte* input_end = input_start + byte_length;
252 Result res = Execute(*regexp_code,
253 *subject,
254 start_offset,
255 input_start,
256 input_end,
257 offsets_vector,
258 offsets_vector_length,
259 isolate);
260 return res;
261 }
262
263
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)264 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
265 Code* code,
266 String* input, // This needs to be the unpacked (sliced, cons) string.
267 int start_offset,
268 const byte* input_start,
269 const byte* input_end,
270 int* output,
271 int output_size,
272 Isolate* isolate) {
273 // Ensure that the minimum stack has been allocated.
274 RegExpStackScope stack_scope(isolate);
275 Address stack_base = stack_scope.stack()->stack_base();
276
277 int direct_call = 0;
278 int result = CALL_GENERATED_REGEXP_CODE(
279 isolate, code->entry(), input, start_offset, input_start, input_end,
280 output, output_size, stack_base, direct_call, isolate);
281 DCHECK(result >= RETRY);
282
283 if (result == EXCEPTION && !isolate->has_pending_exception()) {
284 // We detected a stack overflow (on the backtrack stack) in RegExp code,
285 // but haven't created the exception yet.
286 isolate->StackOverflow();
287 }
288 return static_cast<Result>(result);
289 }
290
291
292 const byte NativeRegExpMacroAssembler::word_character_map[] = {
293 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
294 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
295 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
296 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
297
298 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
299 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
300 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // '0' - '7'
301 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
302
303 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'A' - 'G'
304 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'H' - 'O'
305 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'P' - 'W'
306 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0xffu, // 'X' - 'Z', '_'
307
308 0x00u, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'a' - 'g'
309 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'h' - 'o'
310 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, 0xffu, // 'p' - 'w'
311 0xffu, 0xffu, 0xffu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
312 // Latin-1 range
313 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
314 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
315 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
316 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
317
318 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
319 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
320 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
321 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
322
323 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
324 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
325 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
326 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
327
328 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
329 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
330 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 };
333
334
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)335 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
336 Address* stack_base,
337 Isolate* isolate) {
338 RegExpStack* regexp_stack = isolate->regexp_stack();
339 size_t size = regexp_stack->stack_capacity();
340 Address old_stack_base = regexp_stack->stack_base();
341 DCHECK(old_stack_base == *stack_base);
342 DCHECK(stack_pointer <= old_stack_base);
343 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
344 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
345 if (new_stack_base == NULL) {
346 return NULL;
347 }
348 *stack_base = new_stack_base;
349 intptr_t stack_content_size = old_stack_base - stack_pointer;
350 return new_stack_base - stack_content_size;
351 }
352
353 #endif // V8_INTERPRETED_REGEXP
354
355 } // namespace internal
356 } // namespace v8
357