• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/regexp/regexp-macro-assembler.h"
6 
7 #include "src/codegen/assembler.h"
8 #include "src/execution/isolate-inl.h"
9 #include "src/execution/pointer-authentication.h"
10 #include "src/execution/simulator.h"
11 #include "src/regexp/regexp-stack.h"
12 #include "src/regexp/special-case.h"
13 #include "src/strings/unicode-inl.h"
14 
15 #ifdef V8_INTL_SUPPORT
16 #include "unicode/uchar.h"
17 #include "unicode/unistr.h"
18 #endif  // V8_INTL_SUPPORT
19 
20 namespace v8 {
21 namespace internal {
22 
RegExpMacroAssembler(Isolate * isolate,Zone * zone)23 RegExpMacroAssembler::RegExpMacroAssembler(Isolate* isolate, Zone* zone)
24     : slow_safe_compiler_(false),
25       global_mode_(NOT_GLOBAL),
26       isolate_(isolate),
27       zone_(zone) {}
28 
29 RegExpMacroAssembler::~RegExpMacroAssembler() = default;
30 
CaseInsensitiveCompareNonUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)31 int RegExpMacroAssembler::CaseInsensitiveCompareNonUnicode(Address byte_offset1,
32                                                            Address byte_offset2,
33                                                            size_t byte_length,
34                                                            Isolate* isolate) {
35 #ifdef V8_INTL_SUPPORT
36   // This function is not allowed to cause a garbage collection.
37   // A GC might move the calling generated code and invalidate the
38   // return address on the stack.
39   DisallowHeapAllocation no_gc;
40   DCHECK_EQ(0, byte_length % 2);
41   size_t length = byte_length / 2;
42   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
43   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
44 
45   for (size_t i = 0; i < length; i++) {
46     UChar32 c1 = RegExpCaseFolding::Canonicalize(substring1[i]);
47     UChar32 c2 = RegExpCaseFolding::Canonicalize(substring2[i]);
48     if (c1 != c2) {
49       return 0;
50     }
51   }
52   return 1;
53 #else
54   return CaseInsensitiveCompareUnicode(byte_offset1, byte_offset2, byte_length,
55                                        isolate);
56 #endif
57 }
58 
CaseInsensitiveCompareUnicode(Address byte_offset1,Address byte_offset2,size_t byte_length,Isolate * isolate)59 int RegExpMacroAssembler::CaseInsensitiveCompareUnicode(Address byte_offset1,
60                                                         Address byte_offset2,
61                                                         size_t byte_length,
62                                                         Isolate* isolate) {
63   // This function is not allowed to cause a garbage collection.
64   // A GC might move the calling generated code and invalidate the
65   // return address on the stack.
66   DisallowHeapAllocation no_gc;
67   DCHECK_EQ(0, byte_length % 2);
68 
69 #ifdef V8_INTL_SUPPORT
70   int32_t length = static_cast<int32_t>(byte_length >> 1);
71   icu::UnicodeString uni_str_1(reinterpret_cast<const char16_t*>(byte_offset1),
72                                length);
73   return uni_str_1.caseCompare(reinterpret_cast<const char16_t*>(byte_offset2),
74                                length, U_FOLD_CASE_DEFAULT) == 0;
75 #else
76   uc16* substring1 = reinterpret_cast<uc16*>(byte_offset1);
77   uc16* substring2 = reinterpret_cast<uc16*>(byte_offset2);
78   size_t length = byte_length >> 1;
79   DCHECK_NOT_NULL(isolate);
80   unibrow::Mapping<unibrow::Ecma262Canonicalize>* canonicalize =
81       isolate->regexp_macro_assembler_canonicalize();
82   for (size_t i = 0; i < length; i++) {
83     unibrow::uchar c1 = substring1[i];
84     unibrow::uchar c2 = substring2[i];
85     if (c1 != c2) {
86       unibrow::uchar s1[1] = {c1};
87       canonicalize->get(c1, '\0', s1);
88       if (s1[0] != c2) {
89         unibrow::uchar s2[1] = {c2};
90         canonicalize->get(c2, '\0', s2);
91         if (s1[0] != s2[0]) {
92           return 0;
93         }
94       }
95     }
96   }
97   return 1;
98 #endif  // V8_INTL_SUPPORT
99 }
100 
CheckNotInSurrogatePair(int cp_offset,Label * on_failure)101 void RegExpMacroAssembler::CheckNotInSurrogatePair(int cp_offset,
102                                                    Label* on_failure) {
103   Label ok;
104   // Check that current character is not a trail surrogate.
105   LoadCurrentCharacter(cp_offset, &ok);
106   CheckCharacterNotInRange(kTrailSurrogateStart, kTrailSurrogateEnd, &ok);
107   // Check that previous character is not a lead surrogate.
108   LoadCurrentCharacter(cp_offset - 1, &ok);
109   CheckCharacterInRange(kLeadSurrogateStart, kLeadSurrogateEnd, on_failure);
110   Bind(&ok);
111 }
112 
CheckPosition(int cp_offset,Label * on_outside_input)113 void RegExpMacroAssembler::CheckPosition(int cp_offset,
114                                          Label* on_outside_input) {
115   LoadCurrentCharacter(cp_offset, on_outside_input, true);
116 }
117 
LoadCurrentCharacter(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)118 void RegExpMacroAssembler::LoadCurrentCharacter(int cp_offset,
119                                                 Label* on_end_of_input,
120                                                 bool check_bounds,
121                                                 int characters,
122                                                 int eats_at_least) {
123   // By default, eats_at_least = characters.
124   if (eats_at_least == kUseCharactersValue) {
125     eats_at_least = characters;
126   }
127 
128   LoadCurrentCharacterImpl(cp_offset, on_end_of_input, check_bounds, characters,
129                            eats_at_least);
130 }
131 
CheckSpecialCharacterClass(uc16 type,Label * on_no_match)132 bool RegExpMacroAssembler::CheckSpecialCharacterClass(uc16 type,
133                                                       Label* on_no_match) {
134   return false;
135 }
136 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)137 NativeRegExpMacroAssembler::NativeRegExpMacroAssembler(Isolate* isolate,
138                                                        Zone* zone)
139     : RegExpMacroAssembler(isolate, zone) {}
140 
141 NativeRegExpMacroAssembler::~NativeRegExpMacroAssembler() = default;
142 
LoadCurrentCharacterImpl(int cp_offset,Label * on_end_of_input,bool check_bounds,int characters,int eats_at_least)143 void NativeRegExpMacroAssembler::LoadCurrentCharacterImpl(
144     int cp_offset, Label* on_end_of_input, bool check_bounds, int characters,
145     int eats_at_least) {
146   // It's possible to preload a small number of characters when each success
147   // path requires a large number of characters, but not the reverse.
148   DCHECK_GE(eats_at_least, characters);
149 
150   DCHECK(base::IsInRange(cp_offset, kMinCPOffset, kMaxCPOffset));
151   if (check_bounds) {
152     if (cp_offset >= 0) {
153       CheckPosition(cp_offset + eats_at_least - 1, on_end_of_input);
154     } else {
155       CheckPosition(cp_offset, on_end_of_input);
156     }
157   }
158   LoadCurrentCharacterUnchecked(cp_offset, characters);
159 }
160 
CanReadUnaligned()161 bool NativeRegExpMacroAssembler::CanReadUnaligned() {
162   return FLAG_enable_regexp_unaligned_accesses && !slow_safe();
163 }
164 
165 #ifndef COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
166 
167 // This method may only be called after an interrupt.
CheckStackGuardState(Isolate * isolate,int start_index,RegExp::CallOrigin call_origin,Address * return_address,Code re_code,Address * subject,const byte ** input_start,const byte ** input_end)168 int NativeRegExpMacroAssembler::CheckStackGuardState(
169     Isolate* isolate, int start_index, RegExp::CallOrigin call_origin,
170     Address* return_address, Code re_code, Address* subject,
171     const byte** input_start, const byte** input_end) {
172   DisallowHeapAllocation no_gc;
173   Address old_pc = PointerAuthentication::AuthenticatePC(return_address, 0);
174   DCHECK_LE(re_code.raw_instruction_start(), old_pc);
175   DCHECK_LE(old_pc, re_code.raw_instruction_end());
176 
177   StackLimitCheck check(isolate);
178   bool js_has_overflowed = check.JsHasOverflowed();
179 
180   if (call_origin == RegExp::CallOrigin::kFromJs) {
181     // Direct calls from JavaScript can be interrupted in two ways:
182     // 1. A real stack overflow, in which case we let the caller throw the
183     //    exception.
184     // 2. The stack guard was used to interrupt execution for another purpose,
185     //    forcing the call through the runtime system.
186 
187     // Bug(v8:9540) Investigate why this method is called from JS although no
188     // stackoverflow or interrupt is pending on ARM64. We return 0 in this case
189     // to continue execution normally.
190     if (js_has_overflowed) {
191       return EXCEPTION;
192     } else if (check.InterruptRequested()) {
193       return RETRY;
194     } else {
195       return 0;
196     }
197   }
198   DCHECK(call_origin == RegExp::CallOrigin::kFromRuntime);
199 
200   // Prepare for possible GC.
201   HandleScope handles(isolate);
202   Handle<Code> code_handle(re_code, isolate);
203   Handle<String> subject_handle(String::cast(Object(*subject)), isolate);
204   bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject_handle);
205   int return_value = 0;
206 
207   if (js_has_overflowed) {
208     AllowHeapAllocation yes_gc;
209     isolate->StackOverflow();
210     return_value = EXCEPTION;
211   } else if (check.InterruptRequested()) {
212     AllowHeapAllocation yes_gc;
213     Object result = isolate->stack_guard()->HandleInterrupts();
214     if (result.IsException(isolate)) return_value = EXCEPTION;
215   }
216 
217   if (*code_handle != re_code) {  // Return address no longer valid
218     // Overwrite the return address on the stack.
219     intptr_t delta = code_handle->address() - re_code.address();
220     Address new_pc = old_pc + delta;
221     // TODO(v8:10026): avoid replacing a signed pointer.
222     PointerAuthentication::ReplacePC(return_address, new_pc, 0);
223   }
224 
225   // If we continue, we need to update the subject string addresses.
226   if (return_value == 0) {
227     // String encoding might have changed.
228     if (String::IsOneByteRepresentationUnderneath(*subject_handle) !=
229         is_one_byte) {
230       // If we changed between an LATIN1 and an UC16 string, the specialized
231       // code cannot be used, and we need to restart regexp matching from
232       // scratch (including, potentially, compiling a new version of the code).
233       return_value = RETRY;
234     } else {
235       *subject = subject_handle->ptr();
236       intptr_t byte_length = *input_end - *input_start;
237       *input_start = subject_handle->AddressOfCharacterAt(start_index, no_gc);
238       *input_end = *input_start + byte_length;
239     }
240   }
241   return return_value;
242 }
243 
244 // Returns a {Result} sentinel, or the number of successful matches.
Match(Handle<JSRegExp> regexp,Handle<String> subject,int * offsets_vector,int offsets_vector_length,int previous_index,Isolate * isolate)245 int NativeRegExpMacroAssembler::Match(Handle<JSRegExp> regexp,
246                                       Handle<String> subject,
247                                       int* offsets_vector,
248                                       int offsets_vector_length,
249                                       int previous_index, Isolate* isolate) {
250   DCHECK(subject->IsFlat());
251   DCHECK_LE(0, previous_index);
252   DCHECK_LE(previous_index, subject->length());
253 
254   // No allocations before calling the regexp, but we can't use
255   // DisallowHeapAllocation, since regexps might be preempted, and another
256   // thread might do allocation anyway.
257 
258   String subject_ptr = *subject;
259   // Character offsets into string.
260   int start_offset = previous_index;
261   int char_length = subject_ptr.length() - start_offset;
262   int slice_offset = 0;
263 
264   // The string has been flattened, so if it is a cons string it contains the
265   // full string in the first part.
266   if (StringShape(subject_ptr).IsCons()) {
267     DCHECK_EQ(0, ConsString::cast(subject_ptr).second().length());
268     subject_ptr = ConsString::cast(subject_ptr).first();
269   } else if (StringShape(subject_ptr).IsSliced()) {
270     SlicedString slice = SlicedString::cast(subject_ptr);
271     subject_ptr = slice.parent();
272     slice_offset = slice.offset();
273   }
274   if (StringShape(subject_ptr).IsThin()) {
275     subject_ptr = ThinString::cast(subject_ptr).actual();
276   }
277   // Ensure that an underlying string has the same representation.
278   bool is_one_byte = subject_ptr.IsOneByteRepresentation();
279   DCHECK(subject_ptr.IsExternalString() || subject_ptr.IsSeqString());
280   // String is now either Sequential or External
281   int char_size_shift = is_one_byte ? 0 : 1;
282 
283   DisallowHeapAllocation no_gc;
284   const byte* input_start =
285       subject_ptr.AddressOfCharacterAt(start_offset + slice_offset, no_gc);
286   int byte_length = char_length << char_size_shift;
287   const byte* input_end = input_start + byte_length;
288   return Execute(*subject, start_offset, input_start, input_end, offsets_vector,
289                  offsets_vector_length, isolate, *regexp);
290 }
291 
292 // Returns a {Result} sentinel, or the number of successful matches.
293 // TODO(pthier): The JSRegExp object is passed to native irregexp code to match
294 // the signature of the interpreter. We should get rid of JS objects passed to
295 // internal methods.
Execute(String input,int start_offset,const byte * input_start,const byte * input_end,int * output,int output_size,Isolate * isolate,JSRegExp regexp)296 int NativeRegExpMacroAssembler::Execute(
297     String input,  // This needs to be the unpacked (sliced, cons) string.
298     int start_offset, const byte* input_start, const byte* input_end,
299     int* output, int output_size, Isolate* isolate, JSRegExp regexp) {
300   // Ensure that the minimum stack has been allocated.
301   RegExpStackScope stack_scope(isolate);
302   Address stack_base = stack_scope.stack()->stack_base();
303 
304   bool is_one_byte = String::IsOneByteRepresentationUnderneath(input);
305   Code code = Code::cast(regexp.Code(is_one_byte));
306   RegExp::CallOrigin call_origin = RegExp::CallOrigin::kFromRuntime;
307 
308   using RegexpMatcherSig = int(
309       Address input_string, int start_offset,  // NOLINT(readability/casting)
310       const byte* input_start, const byte* input_end, int* output,
311       int output_size, Address stack_base, int call_origin, Isolate* isolate,
312       Address regexp);
313 
314   auto fn = GeneratedCode<RegexpMatcherSig>::FromCode(code);
315   int result =
316       fn.Call(input.ptr(), start_offset, input_start, input_end, output,
317               output_size, stack_base, call_origin, isolate, regexp.ptr());
318   DCHECK_GE(result, SMALLEST_REGEXP_RESULT);
319 
320   if (result == EXCEPTION && !isolate->has_pending_exception()) {
321     // We detected a stack overflow (on the backtrack stack) in RegExp code,
322     // but haven't created the exception yet. Additionally, we allow heap
323     // allocation because even though it invalidates {input_start} and
324     // {input_end}, we are about to return anyway.
325     AllowHeapAllocation allow_allocation;
326     isolate->StackOverflow();
327   }
328   return result;
329 }
330 
331 #endif  // !COMPILING_IRREGEXP_FOR_EXTERNAL_EMBEDDER
332 
333 // clang-format off
334 const byte NativeRegExpMacroAssembler::word_character_map[] = {
335     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
336     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
337     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
338     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
339 
340     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
341     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
342     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // '0' - '7'
343     0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // '8' - '9'
344 
345     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'A' - 'G'
346     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'H' - 'O'
347     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'P' - 'W'
348     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0xFFu,  // 'X' - 'Z', '_'
349 
350     0x00u, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'a' - 'g'
351     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'h' - 'o'
352     0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu, 0xFFu,  // 'p' - 'w'
353     0xFFu, 0xFFu, 0xFFu, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,  // 'x' - 'z'
354     // Latin-1 range
355     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
356     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
357     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
358     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
359 
360     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
361     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
362     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
363     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
364 
365     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
366     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
367     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
368     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
369 
370     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
371     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
372     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
373     0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u, 0x00u,
374 };
375 // clang-format on
376 
GrowStack(Address stack_pointer,Address * stack_base,Isolate * isolate)377 Address NativeRegExpMacroAssembler::GrowStack(Address stack_pointer,
378                                               Address* stack_base,
379                                               Isolate* isolate) {
380   RegExpStack* regexp_stack = isolate->regexp_stack();
381   size_t size = regexp_stack->stack_capacity();
382   Address old_stack_base = regexp_stack->stack_base();
383   DCHECK(old_stack_base == *stack_base);
384   DCHECK(stack_pointer <= old_stack_base);
385   DCHECK(static_cast<size_t>(old_stack_base - stack_pointer) <= size);
386   Address new_stack_base = regexp_stack->EnsureCapacity(size * 2);
387   if (new_stack_base == kNullAddress) {
388     return kNullAddress;
389   }
390   *stack_base = new_stack_base;
391   intptr_t stack_content_size = old_stack_base - stack_pointer;
392   return new_stack_base - stack_content_size;
393 }
394 
395 }  // namespace internal
396 }  // namespace v8
397