1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/regexp/regexp-macro-assembler.h"
6
7 #include "src/codegen/assembler.h"
8 #include "src/execution/isolate-inl.h"
9 #include "src/execution/pointer-authentication.h"
10 #include "src/execution/simulator.h"
11 #include "src/regexp/regexp-stack.h"
12 #include "src/regexp/special-case.h"
13 #include "src/strings/unicode-inl.h"
14
15 #ifdef V8_INTL_SUPPORT
16 #include "unicode/uchar.h"
17 #include "unicode/unistr.h"
18 #endif // V8_INTL_SUPPORT
19
20 namespace v8 {
21 namespace internal {
22
RegExpMacroAssembler(Isolate * isolate,Zone * zone)23 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
24 : slow_safe_compiler_(false),
25 global_mode_(NOT_GLOBAL),
26 isolate_(isolate),
27 zone_(zone) {}
28
// Destructor, defined out of line with the compiler-generated behavior.
RegExpMacroAssembler::~RegExpMacroAssembler() = default;
30
CaseInsensitiveCompareNonUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)31 int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
32 Address byte_offset2,
33 size_t byte_length,
34 Isolate* isolate) {
35 #ifdef V8_INTL_SUPPORT
36 // This function is not allowed to cause a garbage collection.
37 // A GC might move the calling generated code and invalidate the
38 // return address on the stack.
39 DisallowHeapAllocation no_gc;
40 DCHECK_EQ(0, byte_length % 2);
41 size_t length = byte_length / 2;
42 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
43 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
44
45 for (size_t i = 0; i < length; i++) {
46 UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]);
47 UChar32 c2 = RegExpCaseFolding::Canonicalize(substring2[i]);
48 if (c1 != c2) {
49 return 0;
50 }
51 }
52 return 1;
53 #else
54 return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length,
55 isolate);
56 #endif
57 }
58
CaseInsensitiveCompareUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)59 int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
60 Address byte_offset2,
61 size_t byte_length,
62 Isolate* isolate) {
63 // This function is not allowed to cause a garbage collection.
64 // A GC might move the calling generated code and invalidate the
65 // return address on the stack.
66 DisallowHeapAllocation no_gc;
67 DCHECK_EQ(0, byte_length % 2);
68
69 #ifdef V8_INTL_SUPPORT
70 int32_t length = static_cast<int32_t>(byte_length >> 1);
71 icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
72 length);
73 return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
74 length, U_FOLD_CASE_DEFAULT) == 0;
75 #else
76 uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
77 uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
78 size_t length = byte_length >> 1;
79 DCHECK_NOT_NULL(isolate);
80 unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
81 isolate->regexp_macro_assembler_canonicalize();
82 for (size_t i = 0; i < length; i++) {
83 unibrow::uchar c1 = substring1[i];
84 unibrow::uchar c2 = substring2[i];
85 if (c1 != c2) {
86 unibrow::uchar s1[1] = {c1};
87 canonicalize->get(c1, '\0', s1);
88 if (s1[0] != c2) {
89 unibrow::uchar s2[1] = {c2};
90 canonicalize->get(c2, '\0', s2);
91 if (s1[0] != s2[0]) {
92 return 0;
93 }
94 }
95 }
96 }
97 return 1;
98 #endif // V8_INTL_SUPPORT
99 }
100
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)101 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
102 Label* on_failure) {
103 Label ok;
104 // Check that current character is not a trail surrogate.
105 LoadCurrentCharacter(cp_offset, &ok);
106 CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
107 // Check that previous character is not a lead surrogate.
108 LoadCurrentCharacter(cp_offset - 1, &ok);
109 CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
110 Bind(&ok);
111 }
112
CheckPosition(int cp_offset,Label * on_outside_input)113 void RegExpMacroAssembler::CheckPosition(int cp_offset,
114 Label* on_outside_input) {
115 LoadCurrentCharacter(cp_offset, on_outside_input, true);
116 }
117
LoadCurrentCharacter(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)118 void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
119 Label* on_end_of_input,
120 bool check_bounds,
121 int characters,
122 int eats_at_least) {
123 // By default, eats_at_least = characters.
124 if (eats_at_least == kUseCharactersValue) {
125 eats_at_least = characters;
126 }
127
128 LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
129 eats_at_least);
130 }
131
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)132 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
133 Label* on_no_match) {
134 return false;
135 }
136
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)137 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
138 Zone* zone)
139 : RegExpMacroAssembler(isolate, zone) {}
140
// Destructor, defined out of line with the compiler-generated behavior.
NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
142
LoadCurrentCharacterImpl(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)143 void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
144 int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
145 int eats_at_least) {
146 // It's possible to preload a small number of characters when each success
147 // path requires a large number of characters, but not the reverse.
148 DCHECK_GE(eats_at_least, characters);
149
150 DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
151 if (check_bounds) {
152 if (cp_offset >= 0) {
153 CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input);
154 } else {
155 CheckPosition(cp_offset, on_end_of_input);
156 }
157 }
158 LoadCurrentCharacterUnchecked(cp_offset, characters);
159 }
160
CanReadUnaligned()161 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
162 return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
163 }
164
165 #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
166
// This method may only be called after an interrupt.
//
// Decides how regexp execution should proceed and returns one of:
//   EXCEPTION - a stack overflow was detected, or handling the pending
//               interrupts produced an exception;
//   RETRY     - matching has to be restarted (interrupt pending on a direct
//               call from JS, or the subject changed its character width);
//   0         - execution may continue normally.
//
// For calls originating from the runtime, the code object and the subject
// string are held in handles while interrupts are processed. Afterwards the
// return address at |return_address| is re-targeted if the code object
// changed, and |*subject|, |*input_start| and |*input_end| are refreshed from
// the subject handle when execution continues.
int NativeRegExpMacroAssembler::CheckStackGuardState(
    Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
    Address* return_address, Code re_code, Address* subject,
    const byte** input_start, const byte** input_end) {
  DisallowHeapAllocation no_gc;
  // The return address must point into the instructions of |re_code|.
  Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
  DCHECK_LE(re_code.raw_instruction_start(), old_pc);
  DCHECK_LE(old_pc, re_code.raw_instruction_end());

  StackLimitCheck check(isolate);
  bool js_has_overflowed = check.JsHasOverflowed();

  if (call_origin == RegExp::CallOrigin::kFromJs) {
    // Direct calls from JavaScript can be interrupted in two ways:
    // 1. A real stack overflow, in which case we let the caller throw the
    //    exception.
    // 2. The stack guard was used to interrupt execution for another purpose,
    //    forcing the call through the runtime system.

    // Bug(v8:9540) Investigate why this method is called from JS although no
    // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
    // to continue execution normally.
    if (js_has_overflowed) {
      return EXCEPTION;
    } else if (check.InterruptRequested()) {
      return RETRY;
    } else {
      return 0;
    }
  }
  DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);

  // Prepare for possible GC.
  HandleScope handles(isolate);
  Handle<Code> code_handle(re_code, isolate);
  Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
  // Remember the character width so that a change can be detected below.
  bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
  int return_value = 0;

  if (js_has_overflowed) {
    // Allocation is re-enabled only for the scope that raises the overflow.
    AllowHeapAllocation yes_gc;
    isolate->StackOverflow();
    return_value = EXCEPTION;
  } else if (check.InterruptRequested()) {
    // Allocation is re-enabled only while the interrupts are being handled.
    AllowHeapAllocation yes_gc;
    Object result = isolate->stack_guard()->HandleInterrupts();
    if (result.IsException(isolate)) return_value = EXCEPTION;
  }

  if (*code_handle != re_code) {  // Return address no longer valid
    // Overwrite the return address on the stack.
    // The new pc keeps the same distance from the code object's address.
    intptr_t delta = code_handle->address() - re_code.address();
    Address new_pc = old_pc + delta;
    // TODO(v8:10026): avoid replacing a signed pointer.
    PointerAuthentication::ReplacePC(return_address, new_pc, 0);
  }

  // If we continue, we need to update the subject string addresses.
  if (return_value == 0) {
    // String encoding might have changed.
    if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
        is_one_byte) {
      // If we changed between an LATIN1 and an UC16 string, the specialized
      // code cannot be used, and we need to restart regexp matching from
      // scratch (including, potentially, compiling a new version of the code).
      return_value = RETRY;
    } else {
      *subject = subject_handle->ptr();
      // Keep the length of the input window, but rebase it onto the current
      // address of the character at |start_index|.
      intptr_t byte_length = *input_end - *input_start;
      *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
      *input_end = *input_start + byte_length;
    }
  }
  return return_value;
}
243
244 // Returns a {Result} sentinel, or the number of successful matches.
Match(Handle<JSRegExp> regexp,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)245 int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
246 Handle<String> subject,
247 int* offsets_vector,
248 int offsets_vector_length,
249 int previous_index, Isolate* isolate) {
250 DCHECK(subject->IsFlat());
251 DCHECK_LE(0, previous_index);
252 DCHECK_LE(previous_index, subject->length());
253
254 // No allocations before calling the regexp, but we can't use
255 // DisallowHeapAllocation, since regexps might be preempted, and another
256 // thread might do allocation anyway.
257
258 String subject_ptr = *subject;
259 // Character offsets into string.
260 int start_offset = previous_index;
261 int char_length = subject_ptr.length() - start_offset;
262 int slice_offset = 0;
263
264 // The string has been flattened, so if it is a cons string it contains the
265 // full string in the first part.
266 if (StringShape(subject_ptr).IsCons()) {
267 DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
268 subject_ptr = ConsString::cast(subject_ptr).first();
269 } else if (StringShape(subject_ptr).IsSliced()) {
270 SlicedString slice = SlicedString::cast(subject_ptr);
271 subject_ptr = slice.parent();
272 slice_offset = slice.offset();
273 }
274 if (StringShape(subject_ptr).IsThin()) {
275 subject_ptr = ThinString::cast(subject_ptr).actual();
276 }
277 // Ensure that an underlying string has the same representation.
278 bool is_one_byte = subject_ptr.IsOneByteRepresentation();
279 DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
280 // String is now either Sequential or External
281 int char_size_shift = is_one_byte ? 0 : 1;
282
283 DisallowHeapAllocation no_gc;
284 const byte* input_start =
285 subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
286 int byte_length = char_length << char_size_shift;
287 const byte* input_end = input_start + byte_length;
288 return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
289 offsets_vector_length, isolate, *regexp);
290 }
291
292 // Returns a {Result} sentinel, or the number of successful matches.
293 // TODO(pthier): The JSRegExp object is passed to native irregexp code to match
294 // the signature of the interpreter. We should get rid of JS objects passed to
295 // internal methods.
Execute(String input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate,JSRegExp regexp)296 int NativeRegExpMacroAssembler::Execute(
297 String input, // This needs to be the unpacked (sliced, cons) string.
298 int start_offset, const byte* input_start, const byte* input_end,
299 int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
300 // Ensure that the minimum stack has been allocated.
301 RegExpStackScope stack_scope(isolate);
302 Address stack_base = stack_scope.stack()->stack_base();
303
304 bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
305 Code code = Code::cast(regexp.Code(is_one_byte));
306 RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
307
308 using RegexpMatcherSig = int(
309 Address input_string, int start_offset, // NOLINT(readability/casting)
310 const byte* input_start, const byte* input_end, int* output,
311 int output_size, Address stack_base, int call_origin, Isolate* isolate,
312 Address regexp);
313
314 auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
315 int result =
316 fn.Call(input.ptr(), start_offset, input_start, input_end, output,
317 output_size, stack_base, call_origin, isolate, regexp.ptr());
318 DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
319
320 if (result == EXCEPTION && !isolate->has_pending_exception()) {
321 // We detected a stack overflow (on the backtrack stack) in RegExp code,
322 // but haven't created the exception yet. Additionally, we allow heap
323 // allocation because even though it invalidates {input_start} and
324 // {input_end}, we are about to return anyway.
325 AllowHeapAllocation allow_allocation;
326 isolate->StackOverflow();
327 }
328 return result;
329 }
330
331 #endif // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
332
333 // clang-format off
// Table with 256 entries (32 rows of 8), one per character code 0x00..0xFF.
// An entry is 0xFF for the ASCII digits '0'-'9', the letters 'A'-'Z' and
// 'a'-'z', and the underscore '_'; every other entry, including the entire
// upper (0x80..0xFF) half, is 0x00.
// NOTE(review): presumably consulted by generated code to classify word
// characters -- the users of this table are not visible in this file.
const byte NativeRegExpMacroAssembler::word_character_map[] = {
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
    0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'

    0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
    0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
    0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
    // Latin-1 range
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,

    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
    0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
};
375 // clang-format on
376
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)377 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
378 Address* stack_base,
379 Isolate* isolate) {
380 RegExpStack* regexp_stack = isolate->regexp_stack();
381 size_t size = regexp_stack->stack_capacity();
382 Address old_stack_base = regexp_stack->stack_base();
383 DCHECK(old_stack_base == *stack_base);
384 DCHECK(stack_pointer <= old_stack_base);
385 DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
386 Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
387 if (new_stack_base == kNullAddress) {
388 return kNullAddress;
389 }
390 *stack_base = new_stack_base;
391 intptr_t stack_content_size = old_stack_base - stack_pointer;
392 return new_stack_base - stack_content_size;
393 }
394
395 } // namespace internal
396 } // namespace v8
397