• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2012 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
6 #define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
7 
8 #include "src/base/strings.h"
9 #include "src/regexp/regexp-ast.h"
10 #include "src/regexp/regexp.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 class ByteArray;
16 class JSRegExp;
17 class Label;
18 class String;
19 
20 static const base::uc32 kLeadSurrogateStart = 0xd800;
21 static const base::uc32 kLeadSurrogateEnd = 0xdbff;
22 static const base::uc32 kTrailSurrogateStart = 0xdc00;
23 static const base::uc32 kTrailSurrogateEnd = 0xdfff;
24 static const base::uc32 kNonBmpStart = 0x10000;
25 static const base::uc32 kNonBmpEnd = 0x10ffff;
26 
27 class RegExpMacroAssembler {
28  public:
29   // The implementation must be able to handle at least:
30   static constexpr int kMaxRegisterCount = (1 << 16);
31   static constexpr int kMaxRegister = kMaxRegisterCount - 1;
32   static constexpr int kMaxCPOffset = (1 << 15) - 1;
33   static constexpr int kMinCPOffset = -(1 << 15);
34 
35   static constexpr int kTableSizeBits = 7;
36   static constexpr int kTableSize = 1 << kTableSizeBits;
37   static constexpr int kTableMask = kTableSize - 1;
38 
39   static constexpr int kUseCharactersValue = -1;
40 
41   RegExpMacroAssembler(Isolate* isolate, Zone* zone);
42   virtual ~RegExpMacroAssembler() = default;
43 
44   virtual Handle<HeapObject> GetCode(Handle<String> source) = 0;
45 
46   // This function is called when code generation is aborted, so that
47   // the assembler could clean up internal data structures.
AbortedCodeGeneration()48   virtual void AbortedCodeGeneration() {}
49   // The maximal number of pushes between stack checks. Users must supply
50   // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck)
51   // at least once for every stack_limit() pushes that are executed.
52   virtual int stack_limit_slack() = 0;
53   virtual bool CanReadUnaligned() const = 0;
54 
55   virtual void AdvanceCurrentPosition(int by) = 0;  // Signed cp change.
56   virtual void AdvanceRegister(int reg, int by) = 0;  // r[reg] += by.
57   // Continues execution from the position pushed on the top of the backtrack
58   // stack by an earlier PushBacktrack(Label*).
59   virtual void Backtrack() = 0;
60   virtual void Bind(Label* label) = 0;
61   // Dispatch after looking the current character up in a 2-bits-per-entry
62   // map.  The destinations vector has up to 4 labels.
63   virtual void CheckCharacter(unsigned c, Label* on_equal) = 0;
64   // Bitwise and the current character with the given constant and then
65   // check for a match with c.
66   virtual void CheckCharacterAfterAnd(unsigned c,
67                                       unsigned and_with,
68                                       Label* on_equal) = 0;
69   virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0;
70   virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0;
71   virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0;
72   virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0;
73   virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0;
74   virtual void CheckNotBackReference(int start_reg, bool read_backward,
75                                      Label* on_no_match) = 0;
76   virtual void CheckNotBackReferenceIgnoreCase(int start_reg,
77                                                bool read_backward, bool unicode,
78                                                Label* on_no_match) = 0;
79   // Check the current character for a match with a literal character.  If we
80   // fail to match then goto the on_failure label.  End of input always
81   // matches.  If the label is nullptr then we should pop a backtrack address
82   // off the stack and go to that.
83   virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0;
84   virtual void CheckNotCharacterAfterAnd(unsigned c,
85                                          unsigned and_with,
86                                          Label* on_not_equal) = 0;
87   // Subtract a constant from the current character, then and with the given
88   // constant and then check for a match with c.
89   virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus,
90                                               base::uc16 and_with,
91                                               Label* on_not_equal) = 0;
92   virtual void CheckCharacterInRange(base::uc16 from,
93                                      base::uc16 to,  // Both inclusive.
94                                      Label* on_in_range) = 0;
95   virtual void CheckCharacterNotInRange(base::uc16 from,
96                                         base::uc16 to,  // Both inclusive.
97                                         Label* on_not_in_range) = 0;
98   // Returns true if the check was emitted, false otherwise.
99   virtual bool CheckCharacterInRangeArray(
100       const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0;
101   virtual bool CheckCharacterNotInRangeArray(
102       const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0;
103 
104   // The current character (modulus the kTableSize) is looked up in the byte
105   // array, and if the found byte is non-zero, we jump to the on_bit_set label.
106   virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0;
107 
108   // Checks whether the given offset from the current position is before
109   // the end of the string.  May overwrite the current character.
110   virtual void CheckPosition(int cp_offset, Label* on_outside_input);
111   // Check whether a standard/default character class matches the current
112   // character. Returns false if the type of special character class does
113   // not have custom support.
114   // May clobber the current loaded character.
CheckSpecialCharacterClass(StandardCharacterSet type,Label * on_no_match)115   virtual bool CheckSpecialCharacterClass(StandardCharacterSet type,
116                                           Label* on_no_match) {
117     return false;
118   }
119 
120   // Control-flow integrity:
121   // Define a jump target and bind a label.
BindJumpTarget(Label * label)122   virtual void BindJumpTarget(Label* label) { Bind(label); }
123 
124   virtual void Fail() = 0;
125   virtual void GoTo(Label* label) = 0;
126   // Check whether a register is >= a given constant and go to a label if it
127   // is.  Backtracks instead if the label is nullptr.
128   virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0;
129   // Check whether a register is < a given constant and go to a label if it is.
130   // Backtracks instead if the label is nullptr.
131   virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0;
132   // Check whether a register is == to the current position and go to a
133   // label if it is.
134   virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0;
135   V8_EXPORT_PRIVATE void LoadCurrentCharacter(
136       int cp_offset, Label* on_end_of_input, bool check_bounds = true,
137       int characters = 1, int eats_at_least = kUseCharactersValue);
138   virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
139                                         bool check_bounds, int characters,
140                                         int eats_at_least) = 0;
141   virtual void PopCurrentPosition() = 0;
142   virtual void PopRegister(int register_index) = 0;
143   // Pushes the label on the backtrack stack, so that a following Backtrack
144   // will go to this label. Always checks the backtrack stack limit.
145   virtual void PushBacktrack(Label* label) = 0;
146   virtual void PushCurrentPosition() = 0;
147   enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true };
148   virtual void PushRegister(int register_index,
149                             StackCheckFlag check_stack_limit) = 0;
150   virtual void ReadCurrentPositionFromRegister(int reg) = 0;
151   virtual void ReadStackPointerFromRegister(int reg) = 0;
152   virtual void SetCurrentPositionFromEnd(int by) = 0;
153   virtual void SetRegister(int register_index, int to) = 0;
154   // Return whether the matching (with a global regexp) will be restarted.
155   virtual bool Succeed() = 0;
156   virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0;
157   virtual void ClearRegisters(int reg_from, int reg_to) = 0;
158   virtual void WriteStackPointerToRegister(int reg) = 0;
159 
160   // Check that we are not in the middle of a surrogate pair.
161   void CheckNotInSurrogatePair(int cp_offset, Label* on_failure);
162 
163 #define IMPLEMENTATIONS_LIST(V) \
164   V(IA32)                       \
165   V(ARM)                        \
166   V(ARM64)                      \
167   V(MIPS)                       \
168   V(LOONG64)                    \
169   V(RISCV)                      \
170   V(S390)                       \
171   V(PPC)                        \
172   V(X64)                        \
173   V(Bytecode)
174 
175   enum IrregexpImplementation {
176 #define V(Name) k##Name##Implementation,
177     IMPLEMENTATIONS_LIST(V)
178 #undef V
179   };
180 
ImplementationToString(IrregexpImplementation impl)181   inline const char* ImplementationToString(IrregexpImplementation impl) {
182     static const char* const kNames[] = {
183 #define V(Name) #Name,
184         IMPLEMENTATIONS_LIST(V)
185 #undef V
186     };
187     return kNames[impl];
188   }
189 #undef IMPLEMENTATIONS_LIST
190   virtual IrregexpImplementation Implementation() = 0;
191 
192   // Compare two-byte strings case insensitively.
193   //
194   // Called from generated code.
195   static int CaseInsensitiveCompareNonUnicode(Address byte_offset1,
196                                               Address byte_offset2,
197                                               size_t byte_length,
198                                               Isolate* isolate);
199   static int CaseInsensitiveCompareUnicode(Address byte_offset1,
200                                            Address byte_offset2,
201                                            size_t byte_length,
202                                            Isolate* isolate);
203 
204   // `raw_byte_array` is a ByteArray containing a set of character ranges,
205   // where ranges are encoded as uint16_t elements:
206   //
207   //  [from0, to0, from1, to1, ..., fromN, toN], or
208   //  [from0, to0, from1, to1, ..., fromN]  (open-ended last interval).
209   //
210   // fromN is inclusive, toN is exclusive. Returns zero if not in a range,
211   // non-zero otherwise.
212   //
213   // Called from generated code.
214   static uint32_t IsCharacterInRangeArray(uint32_t current_char,
215                                           Address raw_byte_array,
216                                           Isolate* isolate);
217 
218   // Controls the generation of large inlined constants in the code.
set_slow_safe(bool ssc)219   void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; }
slow_safe()220   bool slow_safe() const { return slow_safe_compiler_; }
221 
222   // Controls after how many backtracks irregexp should abort execution.  If it
223   // can fall back to the experimental engine (see `set_can_fallback`), it will
224   // return the appropriate error code, otherwise it will return the number of
225   // matches found so far (perhaps none).
set_backtrack_limit(uint32_t backtrack_limit)226   void set_backtrack_limit(uint32_t backtrack_limit) {
227     backtrack_limit_ = backtrack_limit;
228   }
229 
230   // Set whether or not irregexp can fall back to the experimental engine on
231   // excessive backtracking.  The number of backtracks considered excessive can
232   // be controlled with set_backtrack_limit.
set_can_fallback(bool val)233   void set_can_fallback(bool val) { can_fallback_ = val; }
234 
235   enum GlobalMode {
236     NOT_GLOBAL,
237     GLOBAL_NO_ZERO_LENGTH_CHECK,
238     GLOBAL,
239     GLOBAL_UNICODE
240   };
241   // Set whether the regular expression has the global flag.  Exiting due to
242   // a failure in a global regexp may still mean success overall.
set_global_mode(GlobalMode mode)243   inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; }
global()244   inline bool global() const { return global_mode_ != NOT_GLOBAL; }
global_with_zero_length_check()245   inline bool global_with_zero_length_check() const {
246     return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE;
247   }
global_unicode()248   inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; }
249 
isolate()250   Isolate* isolate() const { return isolate_; }
zone()251   Zone* zone() const { return zone_; }
252 
253  protected:
254   bool has_backtrack_limit() const;
backtrack_limit()255   uint32_t backtrack_limit() const { return backtrack_limit_; }
256 
can_fallback()257   bool can_fallback() const { return can_fallback_; }
258 
259  private:
260   bool slow_safe_compiler_;
261   uint32_t backtrack_limit_;
262   bool can_fallback_ = false;
263   GlobalMode global_mode_;
264   Isolate* const isolate_;
265   Zone* const zone_;
266 };
267 
268 class NativeRegExpMacroAssembler: public RegExpMacroAssembler {
269  public:
270   // Type of input string to generate code for.
271   enum Mode { LATIN1 = 1, UC16 = 2 };
272 
273   // Result of calling generated native RegExp code.
274   // RETRY: Something significant changed during execution, and the matching
275   //        should be retried from scratch.
276   // EXCEPTION: Something failed during execution. If no exception has been
277   //            thrown, it's an internal out-of-memory, and the caller should
278   //            throw the exception.
279   // FAILURE: Matching failed.
280   // SUCCESS: Matching succeeded, and the output array has been filled with
281   //          capture positions.
282   // FALLBACK_TO_EXPERIMENTAL: Execute the regexp on this subject using the
283   //                           experimental engine instead.
284   enum Result {
285     FAILURE = RegExp::kInternalRegExpFailure,
286     SUCCESS = RegExp::kInternalRegExpSuccess,
287     EXCEPTION = RegExp::kInternalRegExpException,
288     RETRY = RegExp::kInternalRegExpRetry,
289     FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental,
290     SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult,
291   };
292 
NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)293   NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone)
294       : RegExpMacroAssembler(isolate, zone) {}
295   ~NativeRegExpMacroAssembler() override = default;
296 
297   // Returns a {Result} sentinel, or the number of successful matches.
298   static int Match(Handle<JSRegExp> regexp, Handle<String> subject,
299                    int* offsets_vector, int offsets_vector_length,
300                    int previous_index, Isolate* isolate);
301 
302   V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset,
303                                                  const byte* input_start,
304                                                  const byte* input_end,
305                                                  int* output, int output_size,
306                                                  Isolate* isolate,
307                                                  JSRegExp regexp);
308 
309   bool CanReadUnaligned() const override;
310 
311   void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input,
312                                 bool check_bounds, int characters,
313                                 int eats_at_least) override;
314   // Load a number of characters at the given offset from the
315   // current position, into the current-character register.
316   virtual void LoadCurrentCharacterUnchecked(int cp_offset,
317                                              int character_count) = 0;
318 
319   // Called from RegExp if the backtrack stack limit is hit. Tries to expand
320   // the stack. Returns the new stack-pointer if successful, or returns 0 if
321   // unable to grow the stack.
322   // This function must not trigger a garbage collection.
323   //
324   // Called from generated code.
325   static Address GrowStack(Isolate* isolate);
326 
327   // Called from generated code.
328   static int CheckStackGuardState(Isolate* isolate, int start_index,
329                                   RegExp::CallOrigin call_origin,
330                                   Address* return_address, Code re_code,
331                                   Address* subject, const byte** input_start,
332                                   const byte** input_end);
333 
word_character_map_address()334   static Address word_character_map_address() {
335     return reinterpret_cast<Address>(&word_character_map[0]);
336   }
337 
338  protected:
339   // Byte map of one byte characters with a 0xff if the character is a word
340   // character (digit, letter or underscore) and 0x00 otherwise.
341   // Used by generated RegExp code.
342   static const byte word_character_map[256];
343 
344   Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges);
345 
346  private:
347   // Returns a {Result} sentinel, or the number of successful matches.
348   static int Execute(String input, int start_offset, const byte* input_start,
349                      const byte* input_end, int* output, int output_size,
350                      Isolate* isolate, JSRegExp regexp);
351 
352   std::unordered_map<uint32_t, Handle<ByteArray>> range_array_cache_;
353 };
354 
355 }  // namespace internal
356 }  // namespace v8
357 
358 #endif  // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_
359