1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ 6 #define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ 7 8 #include "src/codegen/label.h" 9 #include "src/regexp/regexp-ast.h" 10 #include "src/regexp/regexp.h" 11 12 namespace v8 { 13 namespace internal { 14 15 static const uc32 kLeadSurrogateStart = 0xd800; 16 static const uc32 kLeadSurrogateEnd = 0xdbff; 17 static const uc32 kTrailSurrogateStart = 0xdc00; 18 static const uc32 kTrailSurrogateEnd = 0xdfff; 19 static const uc32 kNonBmpStart = 0x10000; 20 static const uc32 kNonBmpEnd = 0x10ffff; 21 22 struct DisjunctDecisionRow { 23 RegExpCharacterClass cc; 24 Label* on_match; 25 }; 26 27 28 class RegExpMacroAssembler { 29 public: 30 // The implementation must be able to handle at least: 31 static constexpr int kMaxRegisterCount = (1 << 16); 32 static constexpr int kMaxRegister = kMaxRegisterCount - 1; 33 static constexpr int kMaxCPOffset = (1 << 15) - 1; 34 static constexpr int kMinCPOffset = -(1 << 15); 35 36 static constexpr int kTableSizeBits = 7; 37 static constexpr int kTableSize = 1 << kTableSizeBits; 38 static constexpr int kTableMask = kTableSize - 1; 39 40 static constexpr int kUseCharactersValue = -1; 41 42 enum IrregexpImplementation { 43 kIA32Implementation, 44 kARMImplementation, 45 kARM64Implementation, 46 kMIPSImplementation, 47 kS390Implementation, 48 kPPCImplementation, 49 kX64Implementation, 50 kX87Implementation, 51 kBytecodeImplementation 52 }; 53 54 enum StackCheckFlag { 55 kNoStackLimitCheck = false, 56 kCheckStackLimit = true 57 }; 58 59 RegExpMacroAssembler(Isolate* isolate, Zone* zone); 60 virtual ~RegExpMacroAssembler(); 61 // This function is called when code generation is aborted, so that 62 // the assembler could clean up internal data structures. AbortedCodeGeneration()63 virtual void AbortedCodeGeneration() {} 64 // The maximal number of pushes between stack checks. Users must supply 65 // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck) 66 // at least once for every stack_limit() pushes that are executed. 67 virtual int stack_limit_slack() = 0; 68 virtual bool CanReadUnaligned() = 0; 69 virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change. 70 virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by. 71 // Continues execution from the position pushed on the top of the backtrack 72 // stack by an earlier PushBacktrack(Label*). 73 virtual void Backtrack() = 0; 74 virtual void Bind(Label* label) = 0; 75 // Dispatch after looking the current character up in a 2-bits-per-entry 76 // map. The destinations vector has up to 4 labels. 77 virtual void CheckCharacter(unsigned c, Label* on_equal) = 0; 78 // Bitwise and the current character with the given constant and then 79 // check for a match with c. 80 virtual void CheckCharacterAfterAnd(unsigned c, 81 unsigned and_with, 82 Label* on_equal) = 0; 83 virtual void CheckCharacterGT(uc16 limit, Label* on_greater) = 0; 84 virtual void CheckCharacterLT(uc16 limit, Label* on_less) = 0; 85 virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0; 86 virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0; 87 virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0; 88 virtual void CheckNotBackReference(int start_reg, bool read_backward, 89 Label* on_no_match) = 0; 90 virtual void CheckNotBackReferenceIgnoreCase(int start_reg, 91 bool read_backward, bool unicode, 92 Label* on_no_match) = 0; 93 // Check the current character for a match with a literal character. If we 94 // fail to match then goto the on_failure label. End of input always 95 // matches. If the label is nullptr then we should pop a backtrack address 96 // off the stack and go to that. 97 virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0; 98 virtual void CheckNotCharacterAfterAnd(unsigned c, 99 unsigned and_with, 100 Label* on_not_equal) = 0; 101 // Subtract a constant from the current character, then and with the given 102 // constant and then check for a match with c. 103 virtual void CheckNotCharacterAfterMinusAnd(uc16 c, 104 uc16 minus, 105 uc16 and_with, 106 Label* on_not_equal) = 0; 107 virtual void CheckCharacterInRange(uc16 from, 108 uc16 to, // Both inclusive. 109 Label* on_in_range) = 0; 110 virtual void CheckCharacterNotInRange(uc16 from, 111 uc16 to, // Both inclusive. 112 Label* on_not_in_range) = 0; 113 114 // The current character (modulus the kTableSize) is looked up in the byte 115 // array, and if the found byte is non-zero, we jump to the on_bit_set label. 116 virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0; 117 118 // Checks whether the given offset from the current position is before 119 // the end of the string. May overwrite the current character. 120 virtual void CheckPosition(int cp_offset, Label* on_outside_input); 121 // Check whether a standard/default character class matches the current 122 // character. Returns false if the type of special character class does 123 // not have custom support. 124 // May clobber the current loaded character. 125 virtual bool CheckSpecialCharacterClass(uc16 type, Label* on_no_match); 126 127 // Control-flow integrity: 128 // Define a jump target and bind a label. BindJumpTarget(Label * label)129 virtual void BindJumpTarget(Label* label) { Bind(label); } 130 131 virtual void Fail() = 0; 132 virtual Handle<HeapObject> GetCode(Handle<String> source) = 0; 133 virtual void GoTo(Label* label) = 0; 134 // Check whether a register is >= a given constant and go to a label if it 135 // is. Backtracks instead if the label is nullptr. 136 virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0; 137 // Check whether a register is < a given constant and go to a label if it is. 138 // Backtracks instead if the label is nullptr. 139 virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0; 140 // Check whether a register is == to the current position and go to a 141 // label if it is. 142 virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0; 143 virtual IrregexpImplementation Implementation() = 0; 144 V8_EXPORT_PRIVATE void LoadCurrentCharacter( 145 int cp_offset, Label* on_end_of_input, bool check_bounds = true, 146 int characters = 1, int eats_at_least = kUseCharactersValue); 147 virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, 148 bool check_bounds, int characters, 149 int eats_at_least) = 0; 150 virtual void PopCurrentPosition() = 0; 151 virtual void PopRegister(int register_index) = 0; 152 // Pushes the label on the backtrack stack, so that a following Backtrack 153 // will go to this label. Always checks the backtrack stack limit. 154 virtual void PushBacktrack(Label* label) = 0; 155 virtual void PushCurrentPosition() = 0; 156 virtual void PushRegister(int register_index, 157 StackCheckFlag check_stack_limit) = 0; 158 virtual void ReadCurrentPositionFromRegister(int reg) = 0; 159 virtual void ReadStackPointerFromRegister(int reg) = 0; 160 virtual void SetCurrentPositionFromEnd(int by) = 0; 161 virtual void SetRegister(int register_index, int to) = 0; 162 // Return whether the matching (with a global regexp) will be restarted. 163 virtual bool Succeed() = 0; 164 virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; 165 virtual void ClearRegisters(int reg_from, int reg_to) = 0; 166 virtual void WriteStackPointerToRegister(int reg) = 0; 167 168 // Compare two-byte strings case insensitively. 169 // Called from generated RegExp code. 170 static int CaseInsensitiveCompareNonUnicode(Address byte_offset1, 171 Address byte_offset2, 172 size_t byte_length, 173 Isolate* isolate); 174 static int CaseInsensitiveCompareUnicode(Address byte_offset1, 175 Address byte_offset2, 176 size_t byte_length, 177 Isolate* isolate); 178 179 // Check that we are not in the middle of a surrogate pair. 180 void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); 181 182 // Controls the generation of large inlined constants in the code. set_slow_safe(bool ssc)183 void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } slow_safe()184 bool slow_safe() { return slow_safe_compiler_; } 185 186 // Controls after how many backtracks irregexp should abort execution. If it 187 // can fall back to the experimental engine (see `set_can_fallback`), it will 188 // return the appropriate error code, otherwise it will return the number of 189 // matches found so far (perhaps none). set_backtrack_limit(uint32_t backtrack_limit)190 void set_backtrack_limit(uint32_t backtrack_limit) { 191 backtrack_limit_ = backtrack_limit; 192 } 193 194 // Set whether or not irregexp can fall back to the experimental engine on 195 // excessive backtracking. The number of backtracks considered excessive can 196 // be controlled with set_backtrack_limit. set_can_fallback(bool val)197 void set_can_fallback(bool val) { can_fallback_ = val; } 198 199 enum GlobalMode { 200 NOT_GLOBAL, 201 GLOBAL_NO_ZERO_LENGTH_CHECK, 202 GLOBAL, 203 GLOBAL_UNICODE 204 }; 205 // Set whether the regular expression has the global flag. Exiting due to 206 // a failure in a global regexp may still mean success overall. set_global_mode(GlobalMode mode)207 inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } global()208 inline bool global() { return global_mode_ != NOT_GLOBAL; } global_with_zero_length_check()209 inline bool global_with_zero_length_check() { 210 return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; 211 } global_unicode()212 inline bool global_unicode() { return global_mode_ == GLOBAL_UNICODE; } 213 isolate()214 Isolate* isolate() const { return isolate_; } zone()215 Zone* zone() const { return zone_; } 216 217 protected: has_backtrack_limit()218 bool has_backtrack_limit() const { 219 return backtrack_limit_ != JSRegExp::kNoBacktrackLimit; 220 } backtrack_limit()221 uint32_t backtrack_limit() const { return backtrack_limit_; } 222 can_fallback()223 bool can_fallback() const { return can_fallback_; } 224 225 private: 226 bool slow_safe_compiler_; 227 uint32_t backtrack_limit_ = JSRegExp::kNoBacktrackLimit; 228 bool can_fallback_ = false; 229 GlobalMode global_mode_; 230 Isolate* isolate_; 231 Zone* zone_; 232 }; 233 234 class NativeRegExpMacroAssembler: public RegExpMacroAssembler { 235 public: 236 // Type of input string to generate code for. 237 enum Mode { LATIN1 = 1, UC16 = 2 }; 238 239 // Result of calling generated native RegExp code. 240 // RETRY: Something significant changed during execution, and the matching 241 // should be retried from scratch. 242 // EXCEPTION: Something failed during execution. If no exception has been 243 // thrown, it's an internal out-of-memory, and the caller should 244 // throw the exception. 245 // FAILURE: Matching failed. 246 // SUCCESS: Matching succeeded, and the output array has been filled with 247 // capture positions. 248 // FALLBACK_TO_EXPERIMENTAL: Execute the regexp on this subject using the 249 // experimental engine instead. 250 enum Result { 251 FAILURE = RegExp::kInternalRegExpFailure, 252 SUCCESS = RegExp::kInternalRegExpSuccess, 253 EXCEPTION = RegExp::kInternalRegExpException, 254 RETRY = RegExp::kInternalRegExpRetry, 255 FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental, 256 SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult, 257 }; 258 259 NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone); 260 ~NativeRegExpMacroAssembler() override; 261 bool CanReadUnaligned() override; 262 263 // Returns a {Result} sentinel, or the number of successful matches. 264 static int Match(Handle<JSRegExp> regexp, Handle<String> subject, 265 int* offsets_vector, int offsets_vector_length, 266 int previous_index, Isolate* isolate); 267 268 // Called from RegExp if the backtrack stack limit is hit. 269 // Tries to expand the stack. Returns the new stack-pointer if 270 // successful, and updates the stack_top address, or returns 0 if unable 271 // to grow the stack. 272 // This function must not trigger a garbage collection. 273 static Address GrowStack(Address stack_pointer, Address* stack_top, 274 Isolate* isolate); 275 276 static int CheckStackGuardState(Isolate* isolate, int start_index, 277 RegExp::CallOrigin call_origin, 278 Address* return_address, Code re_code, 279 Address* subject, const byte** input_start, 280 const byte** input_end); 281 282 // Byte map of one byte characters with a 0xff if the character is a word 283 // character (digit, letter or underscore) and 0x00 otherwise. 284 // Used by generated RegExp code. 285 static const byte word_character_map[256]; 286 word_character_map_address()287 static Address word_character_map_address() { 288 return reinterpret_cast<Address>(&word_character_map[0]); 289 } 290 291 // Returns a {Result} sentinel, or the number of successful matches. 292 V8_EXPORT_PRIVATE static int Execute(String input, int start_offset, 293 const byte* input_start, 294 const byte* input_end, int* output, 295 int output_size, Isolate* isolate, 296 JSRegExp regexp); 297 void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, 298 bool check_bounds, int characters, 299 int eats_at_least) override; 300 // Load a number of characters at the given offset from the 301 // current position, into the current-character register. 302 virtual void LoadCurrentCharacterUnchecked(int cp_offset, 303 int character_count) = 0; 304 }; 305 306 } // namespace internal 307 } // namespace v8 308 309 #endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ 310