1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-macro-assembler.h"
6
7 #include "src/assembler.h"
8 #include "src/isolate-inl.h"
9 #include "src/regexp/regexp-stack.h"
10 #include "src/simulator.h"
11 #include "src/unicode-inl.h"
12
13 #ifdef V8_INTL_SUPPORT
14 #include "unicode/uchar.h"
15 #endif // V8_INTL_SUPPORT
16
17 namespace v8 {
18 namespace internal {
19
RegExpMacroAssembler(Isolate * isolate,Zone * zone)20 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
21 : slow_safe_compiler_(false),
22 global_mode_(NOT_GLOBAL),
23 isolate_(isolate),
24 zone_(zone) {}
25
26
~RegExpMacroAssembler()27 RegExpMacroAssembler::~RegExpMacroAssembler() {
28 }
29
30
CaseInsensitiveCompareUC16(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)31 int RegExpMacroAssembler::CaseInsensitiveCompareUC16(Address byte_offset1,
32 Address byte_offset2,
33 size_t byte_length,
34 Isolate* isolate) {
35 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
36 isolate->regexp_macro_assembler_canonicalize();
37 // This function is not allowed to cause a garbage collection.
38 // A GC might move the calling generated code and invalidate the
39 // return address on the stack.
40 DCHECK_EQ(0, byte_length % 2);
41 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
42 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
43 size_t length = byte_length >> 1;
44
45 #ifdef V8_INTL_SUPPORT
46 if (isolate == nullptr) {
47 for (size_t i = 0; i < length; i++) {
48 uc32 c1 = substring1[i];
49 uc32 c2 = substring2[i];
50 if (unibrow::Utf16::IsLeadSurrogate(c1)) {
51 // Non-BMP characters do not have case-equivalents in the BMP.
52 // Both have to be non-BMP for them to be able to match.
53 if (!unibrow::Utf16::IsLeadSurrogate(c2)) return 0;
54 if (i + 1 < length) {
55 uc16 c1t = substring1[i + 1];
56 uc16 c2t = substring2[i + 1];
57 if (unibrow::Utf16::IsTrailSurrogate(c1t) &&
58 unibrow::Utf16::IsTrailSurrogate(c2t)) {
59 c1 = unibrow::Utf16::CombineSurrogatePair(c1, c1t);
60 c2 = unibrow::Utf16::CombineSurrogatePair(c2, c2t);
61 i++;
62 }
63 }
64 }
65 c1 = u_foldCase(c1, U_FOLD_CASE_DEFAULT);
66 c2 = u_foldCase(c2, U_FOLD_CASE_DEFAULT);
67 if (c1 != c2) return 0;
68 }
69 return 1;
70 }
71 #endif // V8_INTL_SUPPORT
72 DCHECK_NOT_NULL(isolate);
73 for (size_t i = 0; i < length; i++) {
74 unibrow::uchar c1 = substring1[i];
75 unibrow::uchar c2 = substring2[i];
76 if (c1 != c2) {
77 unibrow::uchar s1[1] = {c1};
78 canonicalize->get(c1, '\0', s1);
79 if (s1[0] != c2) {
80 unibrow::uchar s2[1] = {c2};
81 canonicalize->get(c2, '\0', s2);
82 if (s1[0] != s2[0]) {
83 return 0;
84 }
85 }
86 }
87 }
88 return 1;
89 }
90
91
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)92 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
93 Label* on_failure) {
94 Label ok;
95 // Check that current character is not a trail surrogate.
96 LoadCurrentCharacter(cp_offset, &ok);
97 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
98 // Check that previous character is not a lead surrogate.
99 LoadCurrentCharacter(cp_offset - 1, &ok);
100 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
101 Bind(&ok);
102 }
103
CheckPosition(int cp_offset,Label * on_outside_input)104 void RegExpMacroAssembler::CheckPosition(int cp_offset,
105 Label* on_outside_input) {
106 LoadCurrentCharacter(cp_offset, on_outside_input, true);
107 }
108
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)109 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
110 Label* on_no_match) {
111 return false;
112 }
113
114 #ifndef V8_INTERPRETED_REGEXP // Avoid unused code, e.g., on ARM.
115
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)116 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
117 Zone* zone)
118 : RegExpMacroAssembler(isolate, zone) {}
119
120
~NativeRegExpMacroAssembler()121 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() {
122 }
123
124
CanReadUnaligned()125 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
126 return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
127 }
128
StringCharacterPosition(String * subject,int start_index)129 const byte* NativeRegExpMacroAssembler::StringCharacterPosition(
130 String* subject,
131 int start_index) {
132 if (subject->IsConsString()) {
133 subject = ConsString::cast(subject)->first();
134 } else if (subject->IsSlicedString()) {
135 start_index += SlicedString::cast(subject)->offset();
136 subject = SlicedString::cast(subject)->parent();
137 }
138 if (subject->IsThinString()) {
139 subject = ThinString::cast(subject)->actual();
140 }
141 DCHECK_LE(0, start_index);
142 DCHECK_LE(start_index, subject->length());
143 if (subject->IsSeqOneByteString()) {
144 return reinterpret_cast<const byte*>(
145 SeqOneByteString::cast(subject)->GetChars() + start_index);
146 } else if (subject->IsSeqTwoByteString()) {
147 return reinterpret_cast<const byte*>(
148 SeqTwoByteString::cast(subject)->GetChars() + start_index);
149 } else if (subject->IsExternalOneByteString()) {
150 return reinterpret_cast<const byte*>(
151 ExternalOneByteString::cast(subject)->GetChars() + start_index);
152 } else {
153 DCHECK(subject->IsExternalTwoByteString());
154 return reinterpret_cast<const byte*>(
155 ExternalTwoByteString::cast(subject)->GetChars() + start_index);
156 }
157 }
158
159
CheckStackGuardState(Isolate * isolate,int start_index,bool is_direct_call,Address * return_address,Code * re_code,String ** subject,const byte ** input_start,const byte ** input_end)160 int NativeRegExpMacroAssembler::CheckStackGuardState(
161 Isolate* isolate, int start_index, bool is_direct_call,
162 Address* return_address, Code* re_code, String** subject,
163 const byte** input_start, const byte** input_end) {
164 DCHECK(re_code->raw_instruction_start() <= *return_address);
165 DCHECK(*return_address <= re_code->raw_instruction_end());
166 int return_value = 0;
167 // Prepare for possible GC.
168 HandleScope handles(isolate);
169 Handle<Code> code_handle(re_code, isolate);
170 Handle<String> subject_handle(*subject, isolate);
171 bool is_one_byte = subject_handle->IsOneByteRepresentationUnderneath();
172
173 StackLimitCheck check(isolate);
174 bool js_has_overflowed = check.JsHasOverflowed();
175
176 if (is_direct_call) {
177 // Direct calls from JavaScript can be interrupted in two ways:
178 // 1. A real stack overflow, in which case we let the caller throw the
179 // exception.
180 // 2. The stack guard was used to interrupt execution for another purpose,
181 // forcing the call through the runtime system.
182 return_value = js_has_overflowed ? EXCEPTION : RETRY;
183 } else if (js_has_overflowed) {
184 isolate->StackOverflow();
185 return_value = EXCEPTION;
186 } else {
187 Object* result = isolate->stack_guard()->HandleInterrupts();
188 if (result->IsException(isolate)) return_value = EXCEPTION;
189 }
190
191 DisallowHeapAllocation no_gc;
192
193 if (*code_handle != re_code) { // Return address no longer valid
194 intptr_t delta = code_handle->address() - re_code->address();
195 // Overwrite the return address on the stack.
196 *return_address += delta;
197 }
198
199 // If we continue, we need to update the subject string addresses.
200 if (return_value == 0) {
201 // String encoding might have changed.
202 if (subject_handle->IsOneByteRepresentationUnderneath() != is_one_byte) {
203 // If we changed between an LATIN1 and an UC16 string, the specialized
204 // code cannot be used, and we need to restart regexp matching from
205 // scratch (including, potentially, compiling a new version of the code).
206 return_value = RETRY;
207 } else {
208 *subject = *subject_handle;
209 intptr_t byte_length = *input_end - *input_start;
210 *input_start = StringCharacterPosition(*subject, start_index);
211 *input_end = *input_start + byte_length;
212 }
213 }
214 return return_value;
215 }
216
217
Match(Handle<Code> regexp_code,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)218 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Match(
219 Handle<Code> regexp_code,
220 Handle<String> subject,
221 int* offsets_vector,
222 int offsets_vector_length,
223 int previous_index,
224 Isolate* isolate) {
225
226 DCHECK(subject->IsFlat());
227 DCHECK_LE(0, previous_index);
228 DCHECK_LE(previous_index, subject->length());
229
230 // No allocations before calling the regexp, but we can't use
231 // DisallowHeapAllocation, since regexps might be preempted, and another
232 // thread might do allocation anyway.
233
234 String* subject_ptr = *subject;
235 // Character offsets into string.
236 int start_offset = previous_index;
237 int char_length = subject_ptr->length() - start_offset;
238 int slice_offset = 0;
239
240 // The string has been flattened, so if it is a cons string it contains the
241 // full string in the first part.
242 if (StringShape(subject_ptr).IsCons()) {
243 DCHECK_EQ(0, ConsString::cast(subject_ptr)->second()->length());
244 subject_ptr = ConsString::cast(subject_ptr)->first();
245 } else if (StringShape(subject_ptr).IsSliced()) {
246 SlicedString* slice = SlicedString::cast(subject_ptr);
247 subject_ptr = slice->parent();
248 slice_offset = slice->offset();
249 }
250 if (StringShape(subject_ptr).IsThin()) {
251 subject_ptr = ThinString::cast(subject_ptr)->actual();
252 }
253 // Ensure that an underlying string has the same representation.
254 bool is_one_byte = subject_ptr->IsOneByteRepresentation();
255 DCHECK(subject_ptr->IsExternalString() || subject_ptr->IsSeqString());
256 // String is now either Sequential or External
257 int char_size_shift = is_one_byte ? 0 : 1;
258
259 const byte* input_start =
260 StringCharacterPosition(subject_ptr, start_offset + slice_offset);
261 int byte_length = char_length << char_size_shift;
262 const byte* input_end = input_start + byte_length;
263 Result res = Execute(*regexp_code,
264 *subject,
265 start_offset,
266 input_start,
267 input_end,
268 offsets_vector,
269 offsets_vector_length,
270 isolate);
271 return res;
272 }
273
274
Execute(Code * code,String * input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate)275 NativeRegExpMacroAssembler::Result NativeRegExpMacroAssembler::Execute(
276 Code* code,
277 String* input, // This needs to be the unpacked (sliced, cons) string.
278 int start_offset,
279 const byte* input_start,
280 const byte* input_end,
281 int* output,
282 int output_size,
283 Isolate* isolate) {
284 // Ensure that the minimum stack has been allocated.
285 RegExpStackScope stack_scope(isolate);
286 Address stack_base = stack_scope.stack()->stack_base();
287
288 int direct_call = 0;
289
290 using RegexpMatcherSig = int(
291 String * input, int start_offset, // NOLINT(readability/casting)
292 const byte* input_start, const byte* input_end, int* output,
293 int output_size, Address stack_base, int direct_call, Isolate* isolate);
294
295 auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
296 int result = fn.Call(input, start_offset, input_start, input_end, output,
297 output_size, stack_base, direct_call, isolate);
298 DCHECK(result >= RETRY);
299
300 if (result == EXCEPTION && !isolate->has_pending_exception()) {
301 // We detected a stack overflow (on the backtrack stack) in RegExp code,
302 // but haven't created the exception yet.
303 isolate->StackOverflow();
304 }
305 return static_cast<Result>(result);
306 }
307
308 // clang-format off
309 const byte NativeRegExpMacroAssembler::word_character_map[] = {
310 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
311 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
312 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
313 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
314
315 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
316 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
317 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // '0' - '7'
318 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // '8' - '9'
319
320 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'A' - 'G'
321 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'H' - 'O'
322 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'P' - 'W'
323 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu, // 'X' - 'Z', '_'
324
325 0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'a' - 'g'
326 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'h' - 'o'
327 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, // 'p' - 'w'
328 0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, // 'x' - 'z'
329 // Latin-1 range
330 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
331 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
332 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
333 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
334
335 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339
340 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
341 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
342 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
343 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
344
345 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
346 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
347 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
348 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
349 };
350 // clang-format on
351
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)352 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
353 Address* stack_base,
354 Isolate* isolate) {
355 RegExpStack* regexp_stack = isolate->regexp_stack();
356 size_t size = regexp_stack->stack_capacity();
357 Address old_stack_base = regexp_stack->stack_base();
358 DCHECK(old_stack_base == *stack_base);
359 DCHECK(stack_pointer <= old_stack_base);
360 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
361 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
362 if (new_stack_base == kNullAddress) {
363 return kNullAddress;
364 }
365 *stack_base = new_stack_base;
366 intptr_t stack_content_size = old_stack_base - stack_pointer;
367 return new_stack_base - stack_content_size;
368 }
369
370 #endif // V8_INTERPRETED_REGEXP
371
372 } // namespace internal
373 } // namespace v8
374