1 // Copyright 2012 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ 6 #define V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ 7 8 #include "src/base/strings.h" 9 #include "src/regexp/regexp-ast.h" 10 #include "src/regexp/regexp.h" 11 12 namespace v8 { 13 namespace internal { 14 15 class ByteArray; 16 class JSRegExp; 17 class Label; 18 class String; 19 20 static const base::uc32 kLeadSurrogateStart = 0xd800; 21 static const base::uc32 kLeadSurrogateEnd = 0xdbff; 22 static const base::uc32 kTrailSurrogateStart = 0xdc00; 23 static const base::uc32 kTrailSurrogateEnd = 0xdfff; 24 static const base::uc32 kNonBmpStart = 0x10000; 25 static const base::uc32 kNonBmpEnd = 0x10ffff; 26 27 class RegExpMacroAssembler { 28 public: 29 // The implementation must be able to handle at least: 30 static constexpr int kMaxRegisterCount = (1 << 16); 31 static constexpr int kMaxRegister = kMaxRegisterCount - 1; 32 static constexpr int kMaxCPOffset = (1 << 15) - 1; 33 static constexpr int kMinCPOffset = -(1 << 15); 34 35 static constexpr int kTableSizeBits = 7; 36 static constexpr int kTableSize = 1 << kTableSizeBits; 37 static constexpr int kTableMask = kTableSize - 1; 38 39 static constexpr int kUseCharactersValue = -1; 40 41 RegExpMacroAssembler(Isolate* isolate, Zone* zone); 42 virtual ~RegExpMacroAssembler() = default; 43 44 virtual Handle<HeapObject> GetCode(Handle<String> source) = 0; 45 46 // This function is called when code generation is aborted, so that 47 // the assembler could clean up internal data structures. AbortedCodeGeneration()48 virtual void AbortedCodeGeneration() {} 49 // The maximal number of pushes between stack checks. Users must supply 50 // kCheckStackLimit flag to push operations (instead of kNoStackLimitCheck) 51 // at least once for every stack_limit() pushes that are executed. 52 virtual int stack_limit_slack() = 0; 53 virtual bool CanReadUnaligned() const = 0; 54 55 virtual void AdvanceCurrentPosition(int by) = 0; // Signed cp change. 56 virtual void AdvanceRegister(int reg, int by) = 0; // r[reg] += by. 57 // Continues execution from the position pushed on the top of the backtrack 58 // stack by an earlier PushBacktrack(Label*). 59 virtual void Backtrack() = 0; 60 virtual void Bind(Label* label) = 0; 61 // Dispatch after looking the current character up in a 2-bits-per-entry 62 // map. The destinations vector has up to 4 labels. 63 virtual void CheckCharacter(unsigned c, Label* on_equal) = 0; 64 // Bitwise and the current character with the given constant and then 65 // check for a match with c. 66 virtual void CheckCharacterAfterAnd(unsigned c, 67 unsigned and_with, 68 Label* on_equal) = 0; 69 virtual void CheckCharacterGT(base::uc16 limit, Label* on_greater) = 0; 70 virtual void CheckCharacterLT(base::uc16 limit, Label* on_less) = 0; 71 virtual void CheckGreedyLoop(Label* on_tos_equals_current_position) = 0; 72 virtual void CheckAtStart(int cp_offset, Label* on_at_start) = 0; 73 virtual void CheckNotAtStart(int cp_offset, Label* on_not_at_start) = 0; 74 virtual void CheckNotBackReference(int start_reg, bool read_backward, 75 Label* on_no_match) = 0; 76 virtual void CheckNotBackReferenceIgnoreCase(int start_reg, 77 bool read_backward, bool unicode, 78 Label* on_no_match) = 0; 79 // Check the current character for a match with a literal character. If we 80 // fail to match then goto the on_failure label. End of input always 81 // matches. If the label is nullptr then we should pop a backtrack address 82 // off the stack and go to that. 83 virtual void CheckNotCharacter(unsigned c, Label* on_not_equal) = 0; 84 virtual void CheckNotCharacterAfterAnd(unsigned c, 85 unsigned and_with, 86 Label* on_not_equal) = 0; 87 // Subtract a constant from the current character, then and with the given 88 // constant and then check for a match with c. 89 virtual void CheckNotCharacterAfterMinusAnd(base::uc16 c, base::uc16 minus, 90 base::uc16 and_with, 91 Label* on_not_equal) = 0; 92 virtual void CheckCharacterInRange(base::uc16 from, 93 base::uc16 to, // Both inclusive. 94 Label* on_in_range) = 0; 95 virtual void CheckCharacterNotInRange(base::uc16 from, 96 base::uc16 to, // Both inclusive. 97 Label* on_not_in_range) = 0; 98 // Returns true if the check was emitted, false otherwise. 99 virtual bool CheckCharacterInRangeArray( 100 const ZoneList<CharacterRange>* ranges, Label* on_in_range) = 0; 101 virtual bool CheckCharacterNotInRangeArray( 102 const ZoneList<CharacterRange>* ranges, Label* on_not_in_range) = 0; 103 104 // The current character (modulus the kTableSize) is looked up in the byte 105 // array, and if the found byte is non-zero, we jump to the on_bit_set label. 106 virtual void CheckBitInTable(Handle<ByteArray> table, Label* on_bit_set) = 0; 107 108 // Checks whether the given offset from the current position is before 109 // the end of the string. May overwrite the current character. 110 virtual void CheckPosition(int cp_offset, Label* on_outside_input); 111 // Check whether a standard/default character class matches the current 112 // character. Returns false if the type of special character class does 113 // not have custom support. 114 // May clobber the current loaded character. CheckSpecialCharacterClass(StandardCharacterSet type,Label * on_no_match)115 virtual bool CheckSpecialCharacterClass(StandardCharacterSet type, 116 Label* on_no_match) { 117 return false; 118 } 119 120 // Control-flow integrity: 121 // Define a jump target and bind a label. BindJumpTarget(Label * label)122 virtual void BindJumpTarget(Label* label) { Bind(label); } 123 124 virtual void Fail() = 0; 125 virtual void GoTo(Label* label) = 0; 126 // Check whether a register is >= a given constant and go to a label if it 127 // is. Backtracks instead if the label is nullptr. 128 virtual void IfRegisterGE(int reg, int comparand, Label* if_ge) = 0; 129 // Check whether a register is < a given constant and go to a label if it is. 130 // Backtracks instead if the label is nullptr. 131 virtual void IfRegisterLT(int reg, int comparand, Label* if_lt) = 0; 132 // Check whether a register is == to the current position and go to a 133 // label if it is. 134 virtual void IfRegisterEqPos(int reg, Label* if_eq) = 0; 135 V8_EXPORT_PRIVATE void LoadCurrentCharacter( 136 int cp_offset, Label* on_end_of_input, bool check_bounds = true, 137 int characters = 1, int eats_at_least = kUseCharactersValue); 138 virtual void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, 139 bool check_bounds, int characters, 140 int eats_at_least) = 0; 141 virtual void PopCurrentPosition() = 0; 142 virtual void PopRegister(int register_index) = 0; 143 // Pushes the label on the backtrack stack, so that a following Backtrack 144 // will go to this label. Always checks the backtrack stack limit. 145 virtual void PushBacktrack(Label* label) = 0; 146 virtual void PushCurrentPosition() = 0; 147 enum StackCheckFlag { kNoStackLimitCheck = false, kCheckStackLimit = true }; 148 virtual void PushRegister(int register_index, 149 StackCheckFlag check_stack_limit) = 0; 150 virtual void ReadCurrentPositionFromRegister(int reg) = 0; 151 virtual void ReadStackPointerFromRegister(int reg) = 0; 152 virtual void SetCurrentPositionFromEnd(int by) = 0; 153 virtual void SetRegister(int register_index, int to) = 0; 154 // Return whether the matching (with a global regexp) will be restarted. 155 virtual bool Succeed() = 0; 156 virtual void WriteCurrentPositionToRegister(int reg, int cp_offset) = 0; 157 virtual void ClearRegisters(int reg_from, int reg_to) = 0; 158 virtual void WriteStackPointerToRegister(int reg) = 0; 159 160 // Check that we are not in the middle of a surrogate pair. 161 void CheckNotInSurrogatePair(int cp_offset, Label* on_failure); 162 163 #define IMPLEMENTATIONS_LIST(V) \ 164 V(IA32) \ 165 V(ARM) \ 166 V(ARM64) \ 167 V(MIPS) \ 168 V(LOONG64) \ 169 V(RISCV) \ 170 V(S390) \ 171 V(PPC) \ 172 V(X64) \ 173 V(Bytecode) 174 175 enum IrregexpImplementation { 176 #define V(Name) k##Name##Implementation, 177 IMPLEMENTATIONS_LIST(V) 178 #undef V 179 }; 180 ImplementationToString(IrregexpImplementation impl)181 inline const char* ImplementationToString(IrregexpImplementation impl) { 182 static const char* const kNames[] = { 183 #define V(Name) #Name, 184 IMPLEMENTATIONS_LIST(V) 185 #undef V 186 }; 187 return kNames[impl]; 188 } 189 #undef IMPLEMENTATIONS_LIST 190 virtual IrregexpImplementation Implementation() = 0; 191 192 // Compare two-byte strings case insensitively. 193 // 194 // Called from generated code. 195 static int CaseInsensitiveCompareNonUnicode(Address byte_offset1, 196 Address byte_offset2, 197 size_t byte_length, 198 Isolate* isolate); 199 static int CaseInsensitiveCompareUnicode(Address byte_offset1, 200 Address byte_offset2, 201 size_t byte_length, 202 Isolate* isolate); 203 204 // `raw_byte_array` is a ByteArray containing a set of character ranges, 205 // where ranges are encoded as uint16_t elements: 206 // 207 // [from0, to0, from1, to1, ..., fromN, toN], or 208 // [from0, to0, from1, to1, ..., fromN] (open-ended last interval). 209 // 210 // fromN is inclusive, toN is exclusive. Returns zero if not in a range, 211 // non-zero otherwise. 212 // 213 // Called from generated code. 214 static uint32_t IsCharacterInRangeArray(uint32_t current_char, 215 Address raw_byte_array, 216 Isolate* isolate); 217 218 // Controls the generation of large inlined constants in the code. set_slow_safe(bool ssc)219 void set_slow_safe(bool ssc) { slow_safe_compiler_ = ssc; } slow_safe()220 bool slow_safe() const { return slow_safe_compiler_; } 221 222 // Controls after how many backtracks irregexp should abort execution. If it 223 // can fall back to the experimental engine (see `set_can_fallback`), it will 224 // return the appropriate error code, otherwise it will return the number of 225 // matches found so far (perhaps none). set_backtrack_limit(uint32_t backtrack_limit)226 void set_backtrack_limit(uint32_t backtrack_limit) { 227 backtrack_limit_ = backtrack_limit; 228 } 229 230 // Set whether or not irregexp can fall back to the experimental engine on 231 // excessive backtracking. The number of backtracks considered excessive can 232 // be controlled with set_backtrack_limit. set_can_fallback(bool val)233 void set_can_fallback(bool val) { can_fallback_ = val; } 234 235 enum GlobalMode { 236 NOT_GLOBAL, 237 GLOBAL_NO_ZERO_LENGTH_CHECK, 238 GLOBAL, 239 GLOBAL_UNICODE 240 }; 241 // Set whether the regular expression has the global flag. Exiting due to 242 // a failure in a global regexp may still mean success overall. set_global_mode(GlobalMode mode)243 inline void set_global_mode(GlobalMode mode) { global_mode_ = mode; } global()244 inline bool global() const { return global_mode_ != NOT_GLOBAL; } global_with_zero_length_check()245 inline bool global_with_zero_length_check() const { 246 return global_mode_ == GLOBAL || global_mode_ == GLOBAL_UNICODE; 247 } global_unicode()248 inline bool global_unicode() const { return global_mode_ == GLOBAL_UNICODE; } 249 isolate()250 Isolate* isolate() const { return isolate_; } zone()251 Zone* zone() const { return zone_; } 252 253 protected: 254 bool has_backtrack_limit() const; backtrack_limit()255 uint32_t backtrack_limit() const { return backtrack_limit_; } 256 can_fallback()257 bool can_fallback() const { return can_fallback_; } 258 259 private: 260 bool slow_safe_compiler_; 261 uint32_t backtrack_limit_; 262 bool can_fallback_ = false; 263 GlobalMode global_mode_; 264 Isolate* const isolate_; 265 Zone* const zone_; 266 }; 267 268 class NativeRegExpMacroAssembler: public RegExpMacroAssembler { 269 public: 270 // Type of input string to generate code for. 271 enum Mode { LATIN1 = 1, UC16 = 2 }; 272 273 // Result of calling generated native RegExp code. 274 // RETRY: Something significant changed during execution, and the matching 275 // should be retried from scratch. 276 // EXCEPTION: Something failed during execution. If no exception has been 277 // thrown, it's an internal out-of-memory, and the caller should 278 // throw the exception. 279 // FAILURE: Matching failed. 280 // SUCCESS: Matching succeeded, and the output array has been filled with 281 // capture positions. 282 // FALLBACK_TO_EXPERIMENTAL: Execute the regexp on this subject using the 283 // experimental engine instead. 284 enum Result { 285 FAILURE = RegExp::kInternalRegExpFailure, 286 SUCCESS = RegExp::kInternalRegExpSuccess, 287 EXCEPTION = RegExp::kInternalRegExpException, 288 RETRY = RegExp::kInternalRegExpRetry, 289 FALLBACK_TO_EXPERIMENTAL = RegExp::kInternalRegExpFallbackToExperimental, 290 SMALLEST_REGEXP_RESULT = RegExp::kInternalRegExpSmallestResult, 291 }; 292 NativeRegExpMacroAssembler(Isolate * isolate,Zone * zone)293 NativeRegExpMacroAssembler(Isolate* isolate, Zone* zone) 294 : RegExpMacroAssembler(isolate, zone) {} 295 ~NativeRegExpMacroAssembler() override = default; 296 297 // Returns a {Result} sentinel, or the number of successful matches. 298 static int Match(Handle<JSRegExp> regexp, Handle<String> subject, 299 int* offsets_vector, int offsets_vector_length, 300 int previous_index, Isolate* isolate); 301 302 V8_EXPORT_PRIVATE static int ExecuteForTesting(String input, int start_offset, 303 const byte* input_start, 304 const byte* input_end, 305 int* output, int output_size, 306 Isolate* isolate, 307 JSRegExp regexp); 308 309 bool CanReadUnaligned() const override; 310 311 void LoadCurrentCharacterImpl(int cp_offset, Label* on_end_of_input, 312 bool check_bounds, int characters, 313 int eats_at_least) override; 314 // Load a number of characters at the given offset from the 315 // current position, into the current-character register. 316 virtual void LoadCurrentCharacterUnchecked(int cp_offset, 317 int character_count) = 0; 318 319 // Called from RegExp if the backtrack stack limit is hit. Tries to expand 320 // the stack. Returns the new stack-pointer if successful, or returns 0 if 321 // unable to grow the stack. 322 // This function must not trigger a garbage collection. 323 // 324 // Called from generated code. 325 static Address GrowStack(Isolate* isolate); 326 327 // Called from generated code. 328 static int CheckStackGuardState(Isolate* isolate, int start_index, 329 RegExp::CallOrigin call_origin, 330 Address* return_address, Code re_code, 331 Address* subject, const byte** input_start, 332 const byte** input_end); 333 word_character_map_address()334 static Address word_character_map_address() { 335 return reinterpret_cast<Address>(&word_character_map[0]); 336 } 337 338 protected: 339 // Byte map of one byte characters with a 0xff if the character is a word 340 // character (digit, letter or underscore) and 0x00 otherwise. 341 // Used by generated RegExp code. 342 static const byte word_character_map[256]; 343 344 Handle<ByteArray> GetOrAddRangeArray(const ZoneList<CharacterRange>* ranges); 345 346 private: 347 // Returns a {Result} sentinel, or the number of successful matches. 348 static int Execute(String input, int start_offset, const byte* input_start, 349 const byte* input_end, int* output, int output_size, 350 Isolate* isolate, JSRegExp regexp); 351 352 std::unordered_map<uint32_t, Handle<ByteArray>> range_array_cache_; 353 }; 354 355 } // namespace internal 356 } // namespace v8 357 358 #endif // V8_REGEXP_REGEXP_MACRO_ASSEMBLER_H_ 359