1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_REGEXP_REGEXP_BYTECODES_H_
6 #define V8_REGEXP_REGEXP_BYTECODES_H_
7
8 #include "src/base/bounds.h"
9 #include "src/base/macros.h"
10 #include "src/base/strings.h"
11 #include "src/common/globals.h"
12
13 namespace v8 {
14 namespace internal {
15
16 // Maximum number of bytecodes that will be used (next power of 2 of actually
17 // defined bytecodes).
18 // All slots between the last actually defined bytecode and maximum id will be
19 // filled with BREAKs, indicating an invalid operation. This way using
20 // BYTECODE_MASK guarantees no OOB access to the dispatch table.
21 constexpr int kRegExpPaddedBytecodeCount = 1 << 6;
22 constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
23 // The first argument is packed in with the byte code in one word, but so it
24 // has 24 bits, but it can be positive and negative so only use 23 bits for
25 // positive values.
26 const unsigned int MAX_FIRST_ARG = 0x7fffffu;
27 const int BYTECODE_SHIFT = 8;
28 STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
29
30 // The list of bytecodes, in format: V(Name, Code, ByteLength).
31 // TODO(pthier): Argument offsets of bytecodes should be easily accessible by
32 // name or at least by position.
33 // TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
34 #define BYTECODE_ITERATOR(V) \
35 V(BREAK, 0, 4) /* bc8 */ \
36 V(PUSH_CP, 1, 4) /* bc8 pad24 */ \
37 V(PUSH_BT, 2, 8) /* bc8 pad24 offset32 */ \
38 V(PUSH_REGISTER, 3, 4) /* bc8 reg_idx24 */ \
39 V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32 */ \
40 V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24 */ \
41 V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24 */ \
42 V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24 */ \
43 V(SET_REGISTER, 8, 8) /* bc8 reg_idx24 value32 */ \
44 V(ADVANCE_REGISTER, 9, 8) /* bc8 reg_idx24 value32 */ \
45 V(POP_CP, 10, 4) /* bc8 pad24 */ \
46 V(POP_BT, 11, 4) /* bc8 pad24 */ \
47 V(POP_REGISTER, 12, 4) /* bc8 reg_idx24 */ \
48 V(FAIL, 13, 4) /* bc8 pad24 */ \
49 V(SUCCEED, 14, 4) /* bc8 pad24 */ \
50 V(ADVANCE_CP, 15, 4) /* bc8 offset24 */ \
51 /* Jump to another bytecode given its offset. */ \
52 /* Bit Layout: */ \
53 /* 0x00 - 0x07: 0x10 (fixed) Bytecode */ \
54 /* 0x08 - 0x1F: 0x00 (unused) Padding */ \
55 /* 0x20 - 0x3F: Address of bytecode to jump to */ \
56 V(GOTO, 16, 8) /* bc8 pad24 addr32 */ \
57 /* Check if offset is in range and load character at given offset. */ \
58 /* Bit Layout: */ \
59 /* 0x00 - 0x07: 0x11 (fixed) Bytecode */ \
60 /* 0x08 - 0x1F: Offset from current position */ \
61 /* 0x20 - 0x3F: Address of bytecode when load is out of range */ \
62 V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32 */ \
63 /* Load character at given offset without range checks. */ \
64 /* Bit Layout: */ \
65 /* 0x00 - 0x07: 0x12 (fixed) Bytecode */ \
66 /* 0x08 - 0x1F: Offset from current position */ \
67 V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4) /* bc8 offset24 */ \
68 V(LOAD_2_CURRENT_CHARS, 19, 8) /* bc8 offset24 addr32 */ \
69 V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */ \
70 V(LOAD_4_CURRENT_CHARS, 21, 8) /* bc8 offset24 addr32 */ \
71 V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */ \
72 V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32 */ \
73 /* Check if current character is equal to a given character */ \
74 /* Bit Layout: */ \
75 /* 0x00 - 0x07: 0x19 (fixed) Bytecode */ \
76 /* 0x08 - 0x0F: 0x00 (unused) Padding */ \
77 /* 0x10 - 0x1F: Character to check */ \
78 /* 0x20 - 0x3F: Address of bytecode when matched */ \
79 V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32 */ \
80 V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */ \
81 V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32 */ \
82 V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
83 /* Checks if the current character combined with mask (bitwise and) */ \
84 /* matches a character (e.g. used when two characters in a disjunction */ \
85 /* differ by only a single bit */ \
86 /* Bit Layout: */ \
87 /* 0x00 - 0x07: 0x1c (fixed) Bytecode */ \
88 /* 0x08 - 0x0F: 0x00 (unused) Padding */ \
89 /* 0x10 - 0x1F: Character to match against (after mask aplied) */ \
90 /* 0x20 - 0x3F: Bitmask bitwise and combined with current character */ \
91 /* 0x40 - 0x5F: Address of bytecode when matched */ \
92 V(AND_CHECK_CHAR, 28, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
93 V(AND_CHECK_NOT_4_CHARS, 29, 16) /* bc8 pad24 uint32 uint32 addr32 */ \
94 V(AND_CHECK_NOT_CHAR, 30, 12) /* bc8 pad8 uint16 uint32 addr32 */ \
95 V(MINUS_AND_CHECK_NOT_CHAR, 31, \
96 12) /* bc8 pad8 base::uc16 base::uc16 base::uc16 addr32 */ \
97 V(CHECK_CHAR_IN_RANGE, 32, 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
98 V(CHECK_CHAR_NOT_IN_RANGE, 33, \
99 12) /* bc8 pad24 base::uc16 base::uc16 addr32 */ \
100 /* Checks if the current character matches any of the characters encoded */ \
101 /* in a bit table. Similar to/inspired by boyer moore string search */ \
102 /* Bit Layout: */ \
103 /* 0x00 - 0x07: 0x22 (fixed) Bytecode */ \
104 /* 0x08 - 0x1F: 0x00 (unused) Padding */ \
105 /* 0x20 - 0x3F: Address of bytecode when bit is set */ \
106 /* 0x40 - 0xBF: Bit table */ \
107 V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128 */ \
108 V(CHECK_LT, 35, 8) /* bc8 pad8 base::uc16 addr32 */ \
109 V(CHECK_GT, 36, 8) /* bc8 pad8 base::uc16 addr32 */ \
110 V(CHECK_NOT_BACK_REF, 37, 8) /* bc8 reg_idx24 addr32 */ \
111 V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */ \
112 V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8) \
113 V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8) /* bc8 reg_idx24 addr32 */ \
114 V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */ \
115 V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8) \
116 V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */ \
117 V(CHECK_REGISTER_LT, 44, 12) /* bc8 reg_idx24 value32 addr32 */ \
118 V(CHECK_REGISTER_GE, 45, 12) /* bc8 reg_idx24 value32 addr32 */ \
119 V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */ \
120 V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32 */ \
121 V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */ \
122 /* Checks if the current position matches top of backtrack stack */ \
123 /* Bit Layout: */ \
124 /* 0x00 - 0x07: 0x31 (fixed) Bytecode */ \
125 /* 0x08 - 0x1F: 0x00 (unused) Padding */ \
126 /* 0x20 - 0x3F: Address of bytecode when current matches tos */ \
127 V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32 */ \
128 /* Advance character pointer by given offset and jump to another bytecode.*/ \
129 /* Bit Layout: */ \
130 /* 0x00 - 0x07: 0x32 (fixed) Bytecode */ \
131 /* 0x08 - 0x1F: Number of characters to advance */ \
132 /* 0x20 - 0x3F: Address of bytecode to jump to */ \
133 V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32 */ \
134 V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */ \
135 /* Checks if current position + given offset is in range. */ \
136 /* Bit Layout: */ \
137 /* 0x00 - 0x07: 0x34 (fixed) Bytecode */ \
138 /* 0x08 - 0x1F: Offset from current position */ \
139 /* 0x20 - 0x3F: Address of bytecode when position is out of range */ \
140 V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32 */ \
141 /* Combination of: */ \
142 /* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO */ \
143 /* Emitted by RegExpBytecodePeepholeOptimization. */ \
144 /* Bit Layout: */ \
145 /* 0x00 - 0x07 0x35 (fixed) Bytecode */ \
146 /* 0x08 - 0x1F Load character offset from current position */ \
147 /* 0x20 - 0x3F Number of characters to advance */ \
148 /* 0x40 - 0xBF Bit Table */ \
149 /* 0xC0 - 0xDF Address of bytecode when character is matched */ \
150 /* 0xE0 - 0xFF Address of bytecode when no match */ \
151 V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32) \
152 /* Combination of: */ \
153 /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR */ \
154 /* and ADVANCE_CP_AND_GOTO */ \
155 /* Emitted by RegExpBytecodePeepholeOptimization. */ \
156 /* Bit Layout: */ \
157 /* 0x00 - 0x07 0x36 (fixed) Bytecode */ \
158 /* 0x08 - 0x1F Load character offset from current position */ \
159 /* 0x20 - 0x2F Number of characters to advance */ \
160 /* 0x30 - 0x3F Character to match against (after mask applied) */ \
161 /* 0x40 - 0x5F: Bitmask bitwise and combined with current character */ \
162 /* 0x60 - 0x7F Minimum number of characters this pattern consumes */ \
163 /* 0x80 - 0x9F Address of bytecode when character is matched */ \
164 /* 0xA0 - 0xBF Address of bytecode when no match */ \
165 V(SKIP_UNTIL_CHAR_AND, 54, 24) \
166 /* Combination of: */ \
167 /* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
168 /* Emitted by RegExpBytecodePeepholeOptimization. */ \
169 /* Bit Layout: */ \
170 /* 0x00 - 0x07 0x37 (fixed) Bytecode */ \
171 /* 0x08 - 0x1F Load character offset from current position */ \
172 /* 0x20 - 0x2F Number of characters to advance */ \
173 /* 0x30 - 0x3F Character to match */ \
174 /* 0x40 - 0x5F Address of bytecode when character is matched */ \
175 /* 0x60 - 0x7F Address of bytecode when no match */ \
176 V(SKIP_UNTIL_CHAR, 55, 16) \
177 /* Combination of: */ \
178 /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR */ \
179 /* and ADVANCE_CP_AND_GOTO */ \
180 /* Emitted by RegExpBytecodePeepholeOptimization. */ \
181 /* Bit Layout: */ \
182 /* 0x00 - 0x07 0x38 (fixed) Bytecode */ \
183 /* 0x08 - 0x1F Load character offset from current position */ \
184 /* 0x20 - 0x2F Number of characters to advance */ \
185 /* 0x30 - 0x3F Character to match */ \
186 /* 0x40 - 0x5F Minimum number of characters this pattern consumes */ \
187 /* 0x60 - 0x7F Address of bytecode when character is matched */ \
188 /* 0x80 - 0x9F Address of bytecode when no match */ \
189 V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20) \
190 /* Combination of: */ \
191 /* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO */ \
192 /* Emitted by RegExpBytecodePeepholeOptimization. */ \
193 /* Bit Layout: */ \
194 /* 0x00 - 0x07 0x39 (fixed) Bytecode */ \
195 /* 0x08 - 0x1F Load character offset from current position */ \
196 /* 0x20 - 0x3F Number of characters to advance */ \
197 /* 0x40 - 0x4F Character to match */ \
198 /* 0x50 - 0x5F Other Character to match */ \
199 /* 0x60 - 0x7F Address of bytecode when either character is matched */ \
200 /* 0x80 - 0x9F Address of bytecode when no match */ \
201 V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20) \
202 /* Combination of: */ \
203 /* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and */ \
204 /* and ADVANCE_CP_AND_GOTO */ \
205 /* Emitted by RegExpBytecodePeepholeOptimization. */ \
206 /* Bit Layout: */ \
207 /* 0x00 - 0x07 0x3A (fixed) Bytecode */ \
208 /* 0x08 - 0x1F Load character offset from current position */ \
209 /* 0x20 - 0x2F Number of characters to advance */ \
210 /* 0x30 - 0x3F Character to check if it is less than current char */ \
211 /* 0x40 - 0xBF Bit Table */ \
212 /* 0xC0 - 0xDF Address of bytecode when character is matched */ \
213 /* 0xE0 - 0xFF Address of bytecode when no match */ \
214 V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)
215
216 #define COUNT(...) +1
217 static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
218 #undef COUNT
219
220 // Just making sure we assigned values above properly. They should be
221 // contiguous, strictly increasing, and start at 0.
222 // TODO(jgruber): Do not explicitly assign values, instead generate them
223 // implicitly from the list order.
224 STATIC_ASSERT(kRegExpBytecodeCount == 59);
225
226 #define DECLARE_BYTECODES(name, code, length) \
227 static constexpr int BC_##name = code;
228 BYTECODE_ITERATOR(DECLARE_BYTECODES)
229 #undef DECLARE_BYTECODES
230
231 static constexpr int kRegExpBytecodeLengths[] = {
232 #define DECLARE_BYTECODE_LENGTH(name, code, length) length,
233 BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
234 #undef DECLARE_BYTECODE_LENGTH
235 };
236
RegExpBytecodeLength(int bytecode)237 inline constexpr int RegExpBytecodeLength(int bytecode) {
238 DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
239 return kRegExpBytecodeLengths[bytecode];
240 }
241
242 static constexpr const char* const kRegExpBytecodeNames[] = {
243 #define DECLARE_BYTECODE_NAME(name, ...) #name,
244 BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
245 #undef DECLARE_BYTECODE_NAME
246 };
247
RegExpBytecodeName(int bytecode)248 inline constexpr const char* RegExpBytecodeName(int bytecode) {
249 DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
250 return kRegExpBytecodeNames[bytecode];
251 }
252
253 void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
254 void RegExpBytecodeDisassemble(const byte* code_base, int length,
255 const char* pattern);
256
257 } // namespace internal
258 } // namespace v8
259
260 #endif // V8_REGEXP_REGEXP_BYTECODES_H_
261