1 // Copyright 2017 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #ifndef V8_OBJECTS_JS_REGEXP_H_
6 #define V8_OBJECTS_JS_REGEXP_H_
7
8 #include "src/objects/js-array.h"
9 #include "torque-generated/bit-fields.h"
10
11 // Has to be the last include (doesn't have include guards):
12 #include "src/objects/object-macros.h"
13
14 namespace v8 {
15 namespace internal {
16
17 #include "torque-generated/src/objects/js-regexp-tq.inc"
18
19 // Regular expressions
20 // The regular expression holds a single reference to a FixedArray in
21 // the kDataOffset field.
22 // The FixedArray contains the following data:
23 // - tag : type of regexp implementation (not compiled yet, atom or irregexp)
24 // - reference to the original source string
25 // - reference to the original flag string
26 // If it is an atom regexp
27 // - a reference to a literal string to search for
28 // If it is an irregexp regexp:
29 // - a reference to code for Latin1 inputs (bytecode or compiled), or a smi
30 // used for tracking the last usage (used for regexp code flushing).
31 // - a reference to code for UC16 inputs (bytecode or compiled), or a smi
32 // used for tracking the last usage (used for regexp code flushing).
33 // - max number of registers used by irregexp implementations.
34 // - number of capture registers (output values) of the regexp.
35 class JSRegExp : public TorqueGeneratedJSRegExp<JSRegExp, JSObject> {
36 public:
37 // Meaning of Type:
38 // NOT_COMPILED: Initial value. No data has been stored in the JSRegExp yet.
39 // ATOM: A simple string to match against using an indexOf operation.
40 // IRREGEXP: Compiled with Irregexp.
41 // EXPERIMENTAL: Compiled to use the new linear time engine.
42 enum Type { NOT_COMPILED, ATOM, IRREGEXP, EXPERIMENTAL };
DEFINE_TORQUE_GENERATED_JS_REG_EXP_FLAGS()43 DEFINE_TORQUE_GENERATED_JS_REG_EXP_FLAGS()
44
45 static base::Optional<Flag> FlagFromChar(char c) {
46 STATIC_ASSERT(kFlagCount == 7);
47 // clang-format off
48 return c == 'g' ? base::Optional<Flag>(kGlobal)
49 : c == 'i' ? base::Optional<Flag>(kIgnoreCase)
50 : c == 'm' ? base::Optional<Flag>(kMultiline)
51 : c == 'y' ? base::Optional<Flag>(kSticky)
52 : c == 'u' ? base::Optional<Flag>(kUnicode)
53 : c == 's' ? base::Optional<Flag>(kDotAll)
54 : (FLAG_enable_experimental_regexp_engine && c == 'l')
55 ? base::Optional<Flag>(kLinear)
56 : base::Optional<Flag>();
57 // clang-format on
58 }
59
60 STATIC_ASSERT(static_cast<int>(kNone) == v8::RegExp::kNone);
61 STATIC_ASSERT(static_cast<int>(kGlobal) == v8::RegExp::kGlobal);
62 STATIC_ASSERT(static_cast<int>(kIgnoreCase) == v8::RegExp::kIgnoreCase);
63 STATIC_ASSERT(static_cast<int>(kMultiline) == v8::RegExp::kMultiline);
64 STATIC_ASSERT(static_cast<int>(kSticky) == v8::RegExp::kSticky);
65 STATIC_ASSERT(static_cast<int>(kUnicode) == v8::RegExp::kUnicode);
66 STATIC_ASSERT(static_cast<int>(kDotAll) == v8::RegExp::kDotAll);
67 STATIC_ASSERT(static_cast<int>(kLinear) == v8::RegExp::kLinear);
68 STATIC_ASSERT(kFlagCount == v8::RegExp::kFlagCount);
69
70 DECL_ACCESSORS(last_index, Object)
71
72 // If the backtrack limit is set to this marker value, no limit is applied.
73 static constexpr uint32_t kNoBacktrackLimit = 0;
74
75 V8_EXPORT_PRIVATE static MaybeHandle<JSRegExp> New(
76 Isolate* isolate, Handle<String> source, Flags flags,
77 uint32_t backtrack_limit = kNoBacktrackLimit);
78 static Handle<JSRegExp> Copy(Handle<JSRegExp> regexp);
79
80 static MaybeHandle<JSRegExp> Initialize(
81 Handle<JSRegExp> regexp, Handle<String> source, Flags flags,
82 uint32_t backtrack_limit = kNoBacktrackLimit);
83 static MaybeHandle<JSRegExp> Initialize(Handle<JSRegExp> regexp,
84 Handle<String> source,
85 Handle<String> flags_string);
86
87 static Flags FlagsFromString(Isolate* isolate, Handle<String> flags,
88 bool* success);
89
90 bool CanTierUp();
91 bool MarkedForTierUp();
92 void ResetLastTierUpTick();
93 void TierUpTick();
94 void MarkTierUpForNextExec();
95
96 inline Type TypeTag() const;
TypeSupportsCaptures(Type t)97 static bool TypeSupportsCaptures(Type t) {
98 return t == IRREGEXP || t == EXPERIMENTAL;
99 }
100
101 // Maximum number of captures allowed.
102 static constexpr int kMaxCaptures = 1 << 16;
103
104 // Number of captures (without the match itself).
105 inline int CaptureCount() const;
106 // Each capture (including the match itself) needs two registers.
RegistersForCaptureCount(int count)107 static int RegistersForCaptureCount(int count) { return (count + 1) * 2; }
108
109 inline int MaxRegisterCount() const;
110 inline Flags GetFlags();
111 inline String Pattern();
112 inline Object CaptureNameMap();
113 inline Object DataAt(int index) const;
114 // Set implementation data after the object has been prepared.
115 inline void SetDataAt(int index, Object value);
116 inline void SetCaptureNameMap(Handle<FixedArray> capture_name_map);
117
code_index(bool is_latin1)118 static constexpr int code_index(bool is_latin1) {
119 return is_latin1 ? kIrregexpLatin1CodeIndex : kIrregexpUC16CodeIndex;
120 }
121
bytecode_index(bool is_latin1)122 static constexpr int bytecode_index(bool is_latin1) {
123 return is_latin1 ? kIrregexpLatin1BytecodeIndex
124 : kIrregexpUC16BytecodeIndex;
125 }
126
127 // This could be a Smi kUninitializedValue or Code.
128 V8_EXPORT_PRIVATE Object Code(bool is_latin1) const;
129 // This could be a Smi kUninitializedValue or ByteArray.
130 V8_EXPORT_PRIVATE Object Bytecode(bool is_latin1) const;
131
132 bool ShouldProduceBytecode();
133 inline bool HasCompiledCode() const;
134 inline void DiscardCompiledCodeForSerialization();
135
136 uint32_t BacktrackLimit() const;
137
138 // Dispatched behavior.
139 DECL_PRINTER(JSRegExp)
140 DECL_VERIFIER(JSRegExp)
141
142 /* This is already an in-object field. */
143 // TODO(v8:8944): improve handling of in-object fields
144 static constexpr int kLastIndexOffset = kHeaderSize;
145
146 // Indices in the data array.
147 static const int kTagIndex = 0;
148 static const int kSourceIndex = kTagIndex + 1;
149 static const int kFlagsIndex = kSourceIndex + 1;
150 static const int kDataIndex = kFlagsIndex + 1;
151
152 // TODO(jgruber): Rename kDataIndex to something more appropriate.
153 // There is no 'data' field, kDataIndex is just a marker for the
154 // first non-generic index.
155 static constexpr int kMinDataArrayLength = kDataIndex;
156
157 // The data fields are used in different ways depending on the
158 // value of the tag.
159 // Atom regexps (literal strings).
160 static const int kAtomPatternIndex = kDataIndex;
161
162 static const int kAtomDataSize = kAtomPatternIndex + 1;
163
164 // Irregexp compiled code or trampoline to interpreter for Latin1. If
165 // compilation fails, this fields hold an exception object that should be
166 // thrown if the regexp is used again.
167 static const int kIrregexpLatin1CodeIndex = kDataIndex;
168 // Irregexp compiled code or trampoline to interpreter for UC16. If
169 // compilation fails, this fields hold an exception object that should be
170 // thrown if the regexp is used again.
171 static const int kIrregexpUC16CodeIndex = kDataIndex + 1;
172 // Bytecode to interpret the regexp for Latin1. Contains kUninitializedValue
173 // if we haven't compiled the regexp yet, regexp are always compiled or if
174 // tier-up has happened (i.e. when kIrregexpLatin1CodeIndex contains native
175 // irregexp code).
176 static const int kIrregexpLatin1BytecodeIndex = kDataIndex + 2;
177 // Bytecode to interpret the regexp for UC16. Contains kUninitializedValue if
178 // we haven't compiled the regxp yet, regexp are always compiled or if tier-up
179 // has happened (i.e. when kIrregexpUC16CodeIndex contains native irregexp
180 // code).
181 static const int kIrregexpUC16BytecodeIndex = kDataIndex + 3;
182 // Maximal number of registers used by either Latin1 or UC16.
183 // Only used to check that there is enough stack space
184 static const int kIrregexpMaxRegisterCountIndex = kDataIndex + 4;
185 // Number of captures in the compiled regexp.
186 static const int kIrregexpCaptureCountIndex = kDataIndex + 5;
187 // Maps names of named capture groups (at indices 2i) to their corresponding
188 // (1-based) capture group indices (at indices 2i + 1).
189 static const int kIrregexpCaptureNameMapIndex = kDataIndex + 6;
190 // Tier-up ticks are set to the value of the tier-up ticks flag. The value is
191 // decremented on each execution of the bytecode, so that the tier-up
192 // happens once the ticks reach zero.
193 // This value is ignored if the regexp-tier-up flag isn't turned on.
194 static const int kIrregexpTicksUntilTierUpIndex = kDataIndex + 7;
195 // A smi containing either the backtracking limit or kNoBacktrackLimit.
196 // TODO(jgruber): If needed, this limit could be packed into other fields
197 // above to save space.
198 static const int kIrregexpBacktrackLimit = kDataIndex + 8;
199 static const int kIrregexpDataSize = kDataIndex + 9;
200
201 // TODO(mbid,v8:10765): At the moment the EXPERIMENTAL data array conforms
202 // to the format of an IRREGEXP data array, with most fields set to some
203 // default/uninitialized value. This is because EXPERIMENTAL and IRREGEXP
204 // regexps take the same code path in `RegExpExecInternal`, which reads off
205 // various fields from the data array. `RegExpExecInternal` should probably
206 // distinguish between EXPERIMENTAL and IRREGEXP, and then we can get rid of
207 // all the IRREGEXP only fields.
208 static constexpr int kExperimentalDataSize = kIrregexpDataSize;
209
210 // In-object fields.
211 static const int kLastIndexFieldIndex = 0;
212 static const int kInObjectFieldCount = 1;
213
214 // Descriptor array index to important methods in the prototype.
215 static const int kExecFunctionDescriptorIndex = 1;
216 static const int kSymbolMatchFunctionDescriptorIndex = 13;
217 static const int kSymbolMatchAllFunctionDescriptorIndex = 14;
218 static const int kSymbolReplaceFunctionDescriptorIndex = 15;
219 static const int kSymbolSearchFunctionDescriptorIndex = 16;
220 static const int kSymbolSplitFunctionDescriptorIndex = 17;
221
222 // The uninitialized value for a regexp code object.
223 static const int kUninitializedValue = -1;
224
225 // The heuristic value for the length of the subject string for which we
226 // tier-up to the compiler immediately, instead of using the interpreter.
227 static constexpr int kTierUpForSubjectLengthValue = 1000;
228
229 TQ_OBJECT_CONSTRUCTORS(JSRegExp)
230 };
231
// Operator definitions for JSRegExp::Flags. The macro must be expanded
// exactly once for this type.
DEFINE_OPERATORS_FOR_FLAGS(JSRegExp::Flags)
233
234 // JSRegExpResult is just a JSArray with a specific initial map.
235 // This initial map adds in-object properties for "index" and "input"
236 // properties, as assigned by RegExp.prototype.exec, which allows
237 // faster creation of RegExp exec results.
238 // This class just holds constants used when creating the result.
239 // After creation the result must be treated as a JSArray in all regards.
240 class JSRegExpResult : public JSArray {
241 public:
242 DECL_CAST(JSRegExpResult)
243
244 // TODO(joshualitt): We would like to add printers and verifiers to
245 // JSRegExpResult, and maybe JSRegExpResultIndices, but both have the same
246 // instance type as JSArray.
247
248 // Layout description.
249 DEFINE_FIELD_OFFSET_CONSTANTS(JSArray::kHeaderSize,
250 TORQUE_GENERATED_JS_REG_EXP_RESULT_FIELDS)
251
252 static MaybeHandle<JSArray> GetAndCacheIndices(
253 Isolate* isolate, Handle<JSRegExpResult> regexp_result);
254
255 // Indices of in-object properties.
256 static const int kIndexIndex = 0;
257 static const int kInputIndex = 1;
258 static const int kGroupsIndex = 2;
259
260 // Private internal only fields.
261 static const int kCachedIndicesOrRegExpIndex = 3;
262 static const int kNamesIndex = 4;
263 static const int kRegExpInputIndex = 5;
264 static const int kRegExpLastIndex = 6;
265 static const int kInObjectPropertyCount = 7;
266
267 OBJECT_CONSTRUCTORS(JSRegExpResult, JSArray);
268 };
269
270 // JSRegExpResultIndices is just a JSArray with a specific initial map.
271 // This initial map adds in-object properties for "group"
272 // properties, as assigned by RegExp.prototype.exec, which allows
273 // faster creation of RegExp exec results.
274 // This class just holds constants used when creating the result.
275 // After creation the result must be treated as a JSArray in all regards.
276 class JSRegExpResultIndices : public JSArray {
277 public:
278 DECL_CAST(JSRegExpResultIndices)
279
280 // Layout description.
281 DEFINE_FIELD_OFFSET_CONSTANTS(
282 JSArray::kHeaderSize, TORQUE_GENERATED_JS_REG_EXP_RESULT_INDICES_FIELDS)
283
284 static Handle<JSRegExpResultIndices> BuildIndices(
285 Isolate* isolate, Handle<RegExpMatchInfo> match_info,
286 Handle<Object> maybe_names);
287
288 // Indices of in-object properties.
289 static const int kGroupsIndex = 0;
290 static const int kInObjectPropertyCount = 1;
291
292 // Descriptor index of groups.
293 static const int kGroupsDescriptorIndex = 1;
294
295 OBJECT_CONSTRUCTORS(JSRegExpResultIndices, JSArray);
296 };
297
298 } // namespace internal
299 } // namespace v8
300
301 #include "src/objects/object-macros-undef.h"
302
303 #endif // V8_OBJECTS_JS_REGEXP_H_
304