1 // Copyright 2020 the V8 project authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style license that can be 3 // found in the LICENSE file. 4 5 #ifndef V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_ 6 #define V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_ 7 8 #include <ios> 9 10 #include "src/base/strings.h" 11 #include "src/base/vector.h" 12 #include "src/regexp/regexp-ast.h" 13 14 // ---------------------------------------------------------------------------- 15 // Definition and semantics of the EXPERIMENTAL bytecode. 16 // Background: 17 // - Russ Cox's blog post series on regular expression matching, in particular 18 // https://swtch.com/~rsc/regexp/regexp2.html 19 // - The re2 regular regexp library: https://github.com/google/re2 20 // 21 // This comment describes the bytecode used by the experimental regexp engine 22 // and its abstract semantics in terms of a VM. An implementation of the 23 // semantics that avoids exponential runtime can be found in `NfaInterpreter`. 24 // 25 // The experimental bytecode describes a non-deterministic finite automaton. It 26 // runs on a multithreaded virtual machine (VM), i.e. in several threads 27 // concurrently. (These "threads" don't need to be actual operating system 28 // threads.) Apart from a list of threads, the VM maintains an immutable 29 // shared input string which threads can read from. Each thread is given by a 30 // program counter (PC, index of the current instruction), a fixed number of 31 // registers of indices into the input string, and a monotonically increasing 32 // index which represents the current position within the input string. 33 // 34 // For the precise encoding of the instruction set, see the definition `struct 35 // RegExpInstruction` below. Currently we support the following instructions: 36 // - CONSUME_RANGE: Check whether the codepoint of the current character is 37 // contained in a non-empty closed interval [min, max] specified in the 38 // instruction payload. Abort this thread if false, otherwise advance the 39 // input position by 1 and continue with the next instruction. 40 // - ACCEPT: Stop this thread and signify the end of a match at the current 41 // input position. 42 // - FORK: If executed by a thread t, spawn a new thread t0 whose register 43 // values and input position agree with those of t, but whose PC value is set 44 // to the value specified in the instruction payload. The register values of 45 // t and t0 agree directly after the FORK, but they can diverge. Thread t 46 // continues with the instruction directly after the current FORK 47 // instruction. 48 // - JMP: Instead of incrementing the PC value after execution of this 49 // instruction by 1, set PC of this thread to the value specified in the 50 // instruction payload and continue there. 51 // - SET_REGISTER_TO_CP: Set a register specified in the paylod to the current 52 // position (CP) within the input, then continue with the next instruction. 53 // - CLEAR_REGISTER: Clear the register specified in the payload by resetting 54 // it to the initial value -1. 55 // 56 // Special care must be exercised with respect to thread priority. It is 57 // possible that more than one thread executes an ACCEPT statement. The output 58 // of the program is given by the contents of the matching thread's registers, 59 // so this is ambiguous in case of multiple matches. To resolve the ambiguity, 60 // every implementation of the VM must output the match that a backtracking 61 // implementation would output (i.e. behave the same as Irregexp). 62 // 63 // A backtracking implementation of the VM maintains a stack of postponed 64 // threads. Upon encountering a FORK statement, this VM will create a copy of 65 // the current thread, set the copy's PC value according to the instruction 66 // payload, and push it to the stack of postponed threads. The VM will then 67 // continue execution of the current thread. 68 // 69 // If at some point a thread t executes a MATCH statement, the VM stops and 70 // outputs the registers of t. Postponed threads are discarded. On the other 71 // hand, if a thread t is aborted because some input character didn't pass a 72 // check, then the VM pops the topmost postponed thread and continues execution 73 // with this thread. If there are no postponed threads, then the VM outputs 74 // failure, i.e. no matches. 75 // 76 // Equivalently, we can describe the behavior of the backtracking VM in terms 77 // of priority: Threads are linearly ordered by priority, and matches generated 78 // by threads with high priority must be preferred over matches generated by 79 // threads with low priority, regardless of the chronological order in which 80 // matches were found. If a thread t executes a FORK statement and spawns a 81 // thread t0, then the priority of t0 is such that the following holds: 82 // * t0 < t, i.e. t0 has lower priority than t. 83 // * For all threads u such that u != t and u != t0, we have t0 < u iff t < u, 84 // i.e. the t0 compares to other threads the same as t. 85 // For example, if there are currently 3 threads s, t, u such that s < t < u, 86 // then after t executes a fork, the thread priorities will be s < t0 < t < u. 87 88 namespace v8 { 89 namespace internal { 90 91 // Bytecode format. 92 // Currently very simple fixed-size: The opcode is encoded in the first 4 93 // bytes, the payload takes another 4 bytes. 94 struct RegExpInstruction { 95 enum Opcode : int32_t { 96 ACCEPT, 97 ASSERTION, 98 CLEAR_REGISTER, 99 CONSUME_RANGE, 100 FORK, 101 JMP, 102 SET_REGISTER_TO_CP, 103 }; 104 105 struct Uc16Range { 106 base::uc16 min; // Inclusive. 107 base::uc16 max; // Inclusive. 108 }; 109 ConsumeRangeRegExpInstruction110 static RegExpInstruction ConsumeRange(base::uc16 min, base::uc16 max) { 111 RegExpInstruction result; 112 result.opcode = CONSUME_RANGE; 113 result.payload.consume_range = Uc16Range{min, max}; 114 return result; 115 } 116 ConsumeAnyCharRegExpInstruction117 static RegExpInstruction ConsumeAnyChar() { 118 return ConsumeRange(0x0000, 0xFFFF); 119 } 120 FailRegExpInstruction121 static RegExpInstruction Fail() { 122 // This is encoded as the empty CONSUME_RANGE of characters 0xFFFF <= c <= 123 // 0x0000. 124 return ConsumeRange(0xFFFF, 0x0000); 125 } 126 ForkRegExpInstruction127 static RegExpInstruction Fork(int32_t alt_index) { 128 RegExpInstruction result; 129 result.opcode = FORK; 130 result.payload.pc = alt_index; 131 return result; 132 } 133 JmpRegExpInstruction134 static RegExpInstruction Jmp(int32_t alt_index) { 135 RegExpInstruction result; 136 result.opcode = JMP; 137 result.payload.pc = alt_index; 138 return result; 139 } 140 AcceptRegExpInstruction141 static RegExpInstruction Accept() { 142 RegExpInstruction result; 143 result.opcode = ACCEPT; 144 return result; 145 } 146 SetRegisterToCpRegExpInstruction147 static RegExpInstruction SetRegisterToCp(int32_t register_index) { 148 RegExpInstruction result; 149 result.opcode = SET_REGISTER_TO_CP; 150 result.payload.register_index = register_index; 151 return result; 152 } 153 ClearRegisterRegExpInstruction154 static RegExpInstruction ClearRegister(int32_t register_index) { 155 RegExpInstruction result; 156 result.opcode = CLEAR_REGISTER; 157 result.payload.register_index = register_index; 158 return result; 159 } 160 AssertionRegExpInstruction161 static RegExpInstruction Assertion(RegExpAssertion::Type t) { 162 RegExpInstruction result; 163 result.opcode = ASSERTION; 164 result.payload.assertion_type = t; 165 return result; 166 } 167 168 Opcode opcode; 169 union { 170 // Payload of CONSUME_RANGE: 171 Uc16Range consume_range; 172 // Payload of FORK and JMP, the next/forked program counter (pc): 173 int32_t pc; 174 // Payload of SET_REGISTER_TO_CP and CLEAR_REGISTER: 175 int32_t register_index; 176 // Payload of ASSERTION: 177 RegExpAssertion::Type assertion_type; 178 } payload; 179 STATIC_ASSERT(sizeof(payload) == 4); 180 }; 181 STATIC_ASSERT(sizeof(RegExpInstruction) == 8); 182 // TODO(mbid,v8:10765): This is rather wasteful. We can fit the opcode in 2-3 183 // bits, so the remaining 29/30 bits can be used as payload. Problem: The 184 // payload of CONSUME_RANGE consists of two 16-bit values `min` and `max`, so 185 // this wouldn't fit. We could encode the payload of a CONSUME_RANGE 186 // instruction by the start of the interval and its length instead, and then 187 // only allows lengths that fit into 14/13 bits. A longer range can then be 188 // encoded as a disjunction of smaller ranges. 189 // 190 // Another thought: CONSUME_RANGEs are only valid if the payloads are such that 191 // min <= max. Thus there are 192 // 193 // 2^16 + 2^16 - 1 + ... + 1 194 // = 2^16 * (2^16 + 1) / 2 195 // = 2^31 + 2^15 196 // 197 // valid payloads for a CONSUME_RANGE instruction. If we want to fit 198 // instructions into 4 bytes, we would still have almost 2^31 instructions left 199 // over if we encode everything as tight as possible. For example, we could 200 // use another 2^29 values for JMP, another 2^29 for FORK, 1 value for ACCEPT, 201 // and then still have almost 2^30 instructions left over for something like 202 // zero-width assertions and captures. 203 204 std::ostream& operator<<(std::ostream& os, const RegExpInstruction& inst); 205 std::ostream& operator<<(std::ostream& os, 206 base::Vector<const RegExpInstruction> insts); 207 208 } // namespace internal 209 } // namespace v8 210 211 #endif // V8_REGEXP_EXPERIMENTAL_EXPERIMENTAL_BYTECODE_H_ 212