• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2011 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #ifndef V8_REGEXP_REGEXP_BYTECODES_H_
6 #define V8_REGEXP_REGEXP_BYTECODES_H_
7 
8 #include "src/base/bounds.h"
9 #include "src/base/macros.h"
10 #include "src/common/globals.h"
11 
12 namespace v8 {
13 namespace internal {
14 
15 // Maximum number of bytecodes that will be used (next power of 2 of actually
16 // defined bytecodes).
17 // All slots between the last actually defined bytecode and maximum id will be
18 // filled with BREAKs, indicating an invalid operation. This way using
19 // BYTECODE_MASK guarantees no OOB access to the dispatch table.
20 constexpr int kRegExpPaddedBytecodeCount = 1 << 6;
21 constexpr int BYTECODE_MASK = kRegExpPaddedBytecodeCount - 1;
22 // The first argument is packed in with the byte code in one word, but so it
23 // has 24 bits, but it can be positive and negative so only use 23 bits for
24 // positive values.
25 const unsigned int MAX_FIRST_ARG = 0x7fffffu;
26 const int BYTECODE_SHIFT = 8;
27 STATIC_ASSERT(1 << BYTECODE_SHIFT > BYTECODE_MASK);
28 
29 // TODO(pthier): Argument offsets of bytecodes should be easily accessible by
30 // name or at least by position.
31 // TODO(jgruber): More precise types (e.g. int32/uint32 instead of value32).
32 #define BYTECODE_ITERATOR(V)                                                   \
33   V(BREAK, 0, 4)              /* bc8                                        */ \
34   V(PUSH_CP, 1, 4)            /* bc8 pad24                                  */ \
35   V(PUSH_BT, 2, 8)            /* bc8 pad24 offset32                         */ \
36   V(PUSH_REGISTER, 3, 4)      /* bc8 reg_idx24                              */ \
37   V(SET_REGISTER_TO_CP, 4, 8) /* bc8 reg_idx24 offset32                     */ \
38   V(SET_CP_TO_REGISTER, 5, 4) /* bc8 reg_idx24                              */ \
39   V(SET_REGISTER_TO_SP, 6, 4) /* bc8 reg_idx24                              */ \
40   V(SET_SP_TO_REGISTER, 7, 4) /* bc8 reg_idx24                              */ \
41   V(SET_REGISTER, 8, 8)       /* bc8 reg_idx24 value32                      */ \
42   V(ADVANCE_REGISTER, 9, 8)   /* bc8 reg_idx24 value32                      */ \
43   V(POP_CP, 10, 4)            /* bc8 pad24                                  */ \
44   V(POP_BT, 11, 4)            /* bc8 pad24                                  */ \
45   V(POP_REGISTER, 12, 4)      /* bc8 reg_idx24                              */ \
46   V(FAIL, 13, 4)              /* bc8 pad24                                  */ \
47   V(SUCCEED, 14, 4)           /* bc8 pad24                                  */ \
48   V(ADVANCE_CP, 15, 4)        /* bc8 offset24                               */ \
49   /* Jump to another bytecode given its offset.                             */ \
50   /* Bit Layout:                                                            */ \
51   /* 0x00 - 0x07:   0x10 (fixed) Bytecode                                   */ \
52   /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
53   /* 0x20 - 0x3F:   Address of bytecode to jump to                          */ \
54   V(GOTO, 16, 8) /* bc8 pad24 addr32                           */              \
55   /* Check if offset is in range and load character at given offset.        */ \
56   /* Bit Layout:                                                            */ \
57   /* 0x00 - 0x07:   0x11 (fixed) Bytecode                                   */ \
58   /* 0x08 - 0x1F:   Offset from current position                            */ \
59   /* 0x20 - 0x3F:   Address of bytecode when load is out of range           */ \
60   V(LOAD_CURRENT_CHAR, 17, 8) /* bc8 offset24 addr32                        */ \
61   /* Load character at given offset without range checks.                   */ \
62   /* Bit Layout:                                                            */ \
63   /* 0x00 - 0x07:   0x12 (fixed) Bytecode                                   */ \
64   /* 0x08 - 0x1F:   Offset from current position                            */ \
65   V(LOAD_CURRENT_CHAR_UNCHECKED, 18, 4)    /* bc8 offset24 */                  \
66   V(LOAD_2_CURRENT_CHARS, 19, 8)           /* bc8 offset24 addr32 */           \
67   V(LOAD_2_CURRENT_CHARS_UNCHECKED, 20, 4) /* bc8 offset24 */                  \
68   V(LOAD_4_CURRENT_CHARS, 21, 8)           /* bc8 offset24 addr32 */           \
69   V(LOAD_4_CURRENT_CHARS_UNCHECKED, 22, 4) /* bc8 offset24 */                  \
70   V(CHECK_4_CHARS, 23, 12) /* bc8 pad24 uint32 addr32                    */    \
71   /* Check if current character is equal to a given character               */ \
72   /* Bit Layout:                                                            */ \
73   /* 0x00 - 0x07:   0x19 (fixed) Bytecode                                   */ \
74   /* 0x08 - 0x0F:   0x00 (unused) Padding                                   */ \
75   /* 0x10 - 0x1F:   Character to check                                      */ \
76   /* 0x20 - 0x3F:   Address of bytecode when matched                        */ \
77   V(CHECK_CHAR, 24, 8) /* bc8 pad8 uint16 addr32                     */        \
78   V(CHECK_NOT_4_CHARS, 25, 12) /* bc8 pad24 uint32 addr32 */                   \
79   V(CHECK_NOT_CHAR, 26, 8) /* bc8 pad8 uint16 addr32                     */    \
80   V(AND_CHECK_4_CHARS, 27, 16) /* bc8 pad24 uint32 uint32 addr32 */            \
81   /* Checks if the current character combined with mask (bitwise and)       */ \
82   /* matches a character (e.g. used when two characters in a disjunction    */ \
83   /* differ by only a single bit                                            */ \
84   /* Bit Layout:                                                            */ \
85   /* 0x00 - 0x07:   0x1c (fixed) Bytecode                                   */ \
86   /* 0x08 - 0x0F:   0x00 (unused) Padding                                   */ \
87   /* 0x10 - 0x1F:   Character to match against (after mask aplied)          */ \
88   /* 0x20 - 0x3F:   Bitmask bitwise and combined with current character     */ \
89   /* 0x40 - 0x5F:   Address of bytecode when matched                        */ \
90   V(AND_CHECK_CHAR, 28, 12)           /* bc8 pad8 uint16 uint32 addr32      */ \
91   V(AND_CHECK_NOT_4_CHARS, 29, 16)    /* bc8 pad24 uint32 uint32 addr32 */     \
92   V(AND_CHECK_NOT_CHAR, 30, 12)       /* bc8 pad8 uint16 uint32 addr32 */      \
93   V(MINUS_AND_CHECK_NOT_CHAR, 31, 12) /* bc8 pad8 uc16 uc16 uc16 addr32 */     \
94   V(CHECK_CHAR_IN_RANGE, 32, 12)      /* bc8 pad24 uc16 uc16 addr32 */         \
95   V(CHECK_CHAR_NOT_IN_RANGE, 33, 12)  /* bc8 pad24 uc16 uc16 addr32 */         \
96   /* Checks if the current character matches any of the characters encoded  */ \
97   /* in a bit table. Similar to/inspired by boyer moore string search       */ \
98   /* Bit Layout:                                                            */ \
99   /* 0x00 - 0x07:   0x22 (fixed) Bytecode                                   */ \
100   /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
101   /* 0x20 - 0x3F:   Address of bytecode when bit is set                     */ \
102   /* 0x40 - 0xBF:   Bit table                                               */ \
103   V(CHECK_BIT_IN_TABLE, 34, 24) /* bc8 pad24 addr32 bits128           */       \
104   V(CHECK_LT, 35, 8) /* bc8 pad8 uc16 addr32                       */          \
105   V(CHECK_GT, 36, 8) /* bc8 pad8 uc16 addr32                       */          \
106   V(CHECK_NOT_BACK_REF, 37, 8)         /* bc8 reg_idx24 addr32 */              \
107   V(CHECK_NOT_BACK_REF_NO_CASE, 38, 8) /* bc8 reg_idx24 addr32 */              \
108   V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE, 39, 8)                                 \
109   V(CHECK_NOT_BACK_REF_BACKWARD, 40, 8)         /* bc8 reg_idx24 addr32 */     \
110   V(CHECK_NOT_BACK_REF_NO_CASE_BACKWARD, 41, 8) /* bc8 reg_idx24 addr32 */     \
111   V(CHECK_NOT_BACK_REF_NO_CASE_UNICODE_BACKWARD, 42, 8)                        \
112   V(CHECK_NOT_REGS_EQUAL, 43, 12) /* bc8 regidx24 reg_idx32 addr32 */          \
113   V(CHECK_REGISTER_LT, 44, 12)    /* bc8 reg_idx24 value32 addr32 */           \
114   V(CHECK_REGISTER_GE, 45, 12)    /* bc8 reg_idx24 value32 addr32 */           \
115   V(CHECK_REGISTER_EQ_POS, 46, 8) /* bc8 reg_idx24 addr32 */                   \
116   V(CHECK_AT_START, 47, 8) /* bc8 pad24 addr32                           */    \
117   V(CHECK_NOT_AT_START, 48, 8) /* bc8 offset24 addr32 */                       \
118   /* Checks if the current position matches top of backtrack stack          */ \
119   /* Bit Layout:                                                            */ \
120   /* 0x00 - 0x07:   0x31 (fixed) Bytecode                                   */ \
121   /* 0x08 - 0x1F:   0x00 (unused) Padding                                   */ \
122   /* 0x20 - 0x3F:   Address of bytecode when current matches tos            */ \
123   V(CHECK_GREEDY, 49, 8) /* bc8 pad24 addr32                           */      \
124   /* Advance character pointer by given offset and jump to another bytecode.*/ \
125   /* Bit Layout:                                                            */ \
126   /* 0x00 - 0x07:   0x32 (fixed) Bytecode                                   */ \
127   /* 0x08 - 0x1F:   Number of characters to advance                         */ \
128   /* 0x20 - 0x3F:   Address of bytecode to jump to                          */ \
129   V(ADVANCE_CP_AND_GOTO, 50, 8) /* bc8 offset24 addr32                    */   \
130   V(SET_CURRENT_POSITION_FROM_END, 51, 4) /* bc8 idx24 */                      \
131   /* Checks if current position + given offset is in range.                 */ \
132   /* Bit Layout:                                                            */ \
133   /* 0x00 - 0x07:   0x34 (fixed) Bytecode                                   */ \
134   /* 0x08 - 0x1F:   Offset from current position                            */ \
135   /* 0x20 - 0x3F:   Address of bytecode when position is out of range       */ \
136   V(CHECK_CURRENT_POSITION, 52, 8) /* bc8 idx24 addr32                     */  \
137   /* Combination of:                                                        */ \
138   /* LOAD_CURRENT_CHAR, CHECK_BIT_IN_TABLE and ADVANCE_CP_AND_GOTO          */ \
139   /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
140   /* Bit Layout:                                                            */ \
141   /* 0x00 - 0x07    0x35 (fixed) Bytecode                                   */ \
142   /* 0x08 - 0x1F    Load character offset from current position             */ \
143   /* 0x20 - 0x3F    Number of characters to advance                         */ \
144   /* 0x40 - 0xBF    Bit Table                                               */ \
145   /* 0xC0 - 0xDF    Address of bytecode when character is matched           */ \
146   /* 0xE0 - 0xFF    Address of bytecode when no match                       */ \
147   V(SKIP_UNTIL_BIT_IN_TABLE, 53, 32)                                           \
148   /* Combination of:                                                        */ \
149   /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, AND_CHECK_CHAR    */ \
150   /* and ADVANCE_CP_AND_GOTO                                                */ \
151   /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
152   /* Bit Layout:                                                            */ \
153   /* 0x00 - 0x07    0x36 (fixed) Bytecode                                   */ \
154   /* 0x08 - 0x1F    Load character offset from current position             */ \
155   /* 0x20 - 0x2F    Number of characters to advance                         */ \
156   /* 0x30 - 0x3F    Character to match against (after mask applied)         */ \
157   /* 0x40 - 0x5F:   Bitmask bitwise and combined with current character     */ \
158   /* 0x60 - 0x7F    Minimum number of characters this pattern consumes      */ \
159   /* 0x80 - 0x9F    Address of bytecode when character is matched           */ \
160   /* 0xA0 - 0xBF    Address of bytecode when no match                       */ \
161   V(SKIP_UNTIL_CHAR_AND, 54, 24)                                               \
162   /* Combination of:                                                        */ \
163   /* LOAD_CURRENT_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO                  */ \
164   /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
165   /* Bit Layout:                                                            */ \
166   /* 0x00 - 0x07    0x37 (fixed) Bytecode                                   */ \
167   /* 0x08 - 0x1F    Load character offset from current position             */ \
168   /* 0x20 - 0x2F    Number of characters to advance                         */ \
169   /* 0x30 - 0x3F    Character to match                                      */ \
170   /* 0x40 - 0x5F    Address of bytecode when character is matched           */ \
171   /* 0x60 - 0x7F    Address of bytecode when no match                       */ \
172   V(SKIP_UNTIL_CHAR, 55, 16)                                                   \
173   /* Combination of:                                                        */ \
174   /* CHECK_CURRENT_POSITION, LOAD_CURRENT_CHAR_UNCHECKED, CHECK_CHAR        */ \
175   /* and ADVANCE_CP_AND_GOTO                                                */ \
176   /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
177   /* Bit Layout:                                                            */ \
178   /* 0x00 - 0x07    0x38 (fixed) Bytecode                                   */ \
179   /* 0x08 - 0x1F    Load character offset from current position             */ \
180   /* 0x20 - 0x2F    Number of characters to advance                         */ \
181   /* 0x30 - 0x3F    Character to match                                      */ \
182   /* 0x40 - 0x5F    Minimum number of characters this pattern consumes      */ \
183   /* 0x60 - 0x7F    Address of bytecode when character is matched           */ \
184   /* 0x80 - 0x9F    Address of bytecode when no match                       */ \
185   V(SKIP_UNTIL_CHAR_POS_CHECKED, 56, 20)                                       \
186   /* Combination of:                                                        */ \
187   /* LOAD_CURRENT_CHAR, CHECK_CHAR, CHECK_CHAR and ADVANCE_CP_AND_GOTO      */ \
188   /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
189   /* Bit Layout:                                                            */ \
190   /* 0x00 - 0x07    0x39 (fixed) Bytecode                                   */ \
191   /* 0x08 - 0x1F    Load character offset from current position             */ \
192   /* 0x20 - 0x3F    Number of characters to advance                         */ \
193   /* 0x40 - 0x4F    Character to match                                      */ \
194   /* 0x50 - 0x5F    Other Character to match                                */ \
195   /* 0x60 - 0x7F    Address of bytecode when either character is matched    */ \
196   /* 0x80 - 0x9F    Address of bytecode when no match                       */ \
197   V(SKIP_UNTIL_CHAR_OR_CHAR, 57, 20)                                           \
198   /* Combination of:                                                        */ \
199   /* LOAD_CURRENT_CHAR, CHECK_GT, CHECK_BIT_IN_TABLE, GOTO and              */ \
200   /* and ADVANCE_CP_AND_GOTO                                                */ \
201   /* Emitted by RegExpBytecodePeepholeOptimization.                         */ \
202   /* Bit Layout:                                                            */ \
203   /* 0x00 - 0x07    0x3A (fixed) Bytecode                                   */ \
204   /* 0x08 - 0x1F    Load character offset from current position             */ \
205   /* 0x20 - 0x2F    Number of characters to advance                         */ \
206   /* 0x30 - 0x3F    Character to check if it is less than current char      */ \
207   /* 0x40 - 0xBF    Bit Table                                               */ \
208   /* 0xC0 - 0xDF    Address of bytecode when character is matched           */ \
209   /* 0xE0 - 0xFF    Address of bytecode when no match                       */ \
210   V(SKIP_UNTIL_GT_OR_NOT_BIT_IN_TABLE, 58, 32)
211 
212 #define COUNT(...) +1
213 static constexpr int kRegExpBytecodeCount = BYTECODE_ITERATOR(COUNT);
214 #undef COUNT
215 
216 // Just making sure we assigned values above properly. They should be
217 // contiguous, strictly increasing, and start at 0.
218 // TODO(jgruber): Do not explicitly assign values, instead generate them
219 // implicitly from the list order.
220 STATIC_ASSERT(kRegExpBytecodeCount == 59);
221 
222 #define DECLARE_BYTECODES(name, code, length) \
223   static constexpr int BC_##name = code;
224 BYTECODE_ITERATOR(DECLARE_BYTECODES)
225 #undef DECLARE_BYTECODES
226 
227 static constexpr int kRegExpBytecodeLengths[] = {
228 #define DECLARE_BYTECODE_LENGTH(name, code, length) length,
229     BYTECODE_ITERATOR(DECLARE_BYTECODE_LENGTH)
230 #undef DECLARE_BYTECODE_LENGTH
231 };
232 
RegExpBytecodeLength(int bytecode)233 inline constexpr int RegExpBytecodeLength(int bytecode) {
234   CONSTEXPR_DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
235   return kRegExpBytecodeLengths[bytecode];
236 }
237 
238 static constexpr const char* const kRegExpBytecodeNames[] = {
239 #define DECLARE_BYTECODE_NAME(name, ...) #name,
240     BYTECODE_ITERATOR(DECLARE_BYTECODE_NAME)
241 #undef DECLARE_BYTECODE_NAME
242 };
243 
RegExpBytecodeName(int bytecode)244 inline constexpr const char* RegExpBytecodeName(int bytecode) {
245   CONSTEXPR_DCHECK(base::IsInRange(bytecode, 0, kRegExpBytecodeCount - 1));
246   return kRegExpBytecodeNames[bytecode];
247 }
248 
249 void RegExpBytecodeDisassembleSingle(const byte* code_base, const byte* pc);
250 void RegExpBytecodeDisassemble(const byte* code_base, int length,
251                                const char* pattern);
252 
253 }  // namespace internal
254 }  // namespace v8
255 
256 #endif  // V8_REGEXP_REGEXP_BYTECODES_H_
257