1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/objects/js-regexp.h"
6
7 #include "src/base/strings.h"
8 #include "src/common/globals.h"
9 #include "src/objects/code.h"
10 #include "src/objects/js-array-inl.h"
11 #include "src/objects/js-regexp-inl.h"
12 #include "src/regexp/regexp.h"
13
14 namespace v8 {
15 namespace internal {
16
BuildIndices(Isolate * isolate,Handle<RegExpMatchInfo> match_info,Handle<Object> maybe_names)17 Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices(
18 Isolate* isolate, Handle<RegExpMatchInfo> match_info,
19 Handle<Object> maybe_names) {
20 Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast(
21 isolate->factory()->NewJSObjectFromMap(
22 isolate->regexp_result_indices_map())));
23
24 // Initialize indices length to avoid having a partially initialized object
25 // should GC be triggered by creating a NewFixedArray.
26 indices->set_length(Smi::zero());
27
28 // Build indices array from RegExpMatchInfo.
29 int num_indices = match_info->NumberOfCaptureRegisters();
30 int num_results = num_indices >> 1;
31 Handle<FixedArray> indices_array =
32 isolate->factory()->NewFixedArray(num_results);
33 JSArray::SetContent(indices, indices_array);
34
35 for (int i = 0; i < num_results; i++) {
36 int base_offset = i * 2;
37 int start_offset = match_info->Capture(base_offset);
38 int end_offset = match_info->Capture(base_offset + 1);
39
40 // Any unmatched captures are set to undefined, otherwise we set them to a
41 // subarray of the indices.
42 if (start_offset == -1) {
43 indices_array->set(i, ReadOnlyRoots(isolate).undefined_value());
44 } else {
45 Handle<FixedArray> indices_sub_array(
46 isolate->factory()->NewFixedArray(2));
47 indices_sub_array->set(0, Smi::FromInt(start_offset));
48 indices_sub_array->set(1, Smi::FromInt(end_offset));
49 Handle<JSArray> indices_sub_jsarray =
50 isolate->factory()->NewJSArrayWithElements(indices_sub_array,
51 PACKED_SMI_ELEMENTS, 2);
52 indices_array->set(i, *indices_sub_jsarray);
53 }
54 }
55
56 // If there are no capture groups, set the groups property to undefined.
57 FieldIndex groups_index = FieldIndex::ForDescriptor(
58 indices->map(), InternalIndex(kGroupsDescriptorIndex));
59 if (maybe_names->IsUndefined(isolate)) {
60 indices->FastPropertyAtPut(groups_index,
61 ReadOnlyRoots(isolate).undefined_value());
62 return indices;
63 }
64
65 // Create a groups property which returns a dictionary of named captures to
66 // their corresponding capture indices.
67 Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names));
68 int num_names = names->length() >> 1;
69 Handle<HeapObject> group_names;
70 if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) {
71 group_names = isolate->factory()->NewSwissNameDictionary(num_names);
72 } else {
73 group_names = isolate->factory()->NewNameDictionary(num_names);
74 }
75 for (int i = 0; i < num_names; i++) {
76 int base_offset = i * 2;
77 int name_offset = base_offset;
78 int index_offset = base_offset + 1;
79 Handle<String> name(String::cast(names->get(name_offset)), isolate);
80 Handle<Smi> smi_index(Smi::cast(names->get(index_offset)), isolate);
81 Handle<Object> capture_indices(indices_array->get(smi_index->value()),
82 isolate);
83 if (!capture_indices->IsUndefined(isolate)) {
84 capture_indices = Handle<JSArray>::cast(capture_indices);
85 }
86 if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) {
87 group_names = SwissNameDictionary::Add(
88 isolate, Handle<SwissNameDictionary>::cast(group_names), name,
89 capture_indices, PropertyDetails::Empty());
90 } else {
91 group_names = NameDictionary::Add(
92 isolate, Handle<NameDictionary>::cast(group_names), name,
93 capture_indices, PropertyDetails::Empty());
94 }
95 }
96
97 // Convert group_names to a JSObject and store at the groups property of the
98 // result indices.
99 Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array();
100 Handle<HeapObject> null =
101 Handle<HeapObject>::cast(isolate->factory()->null_value());
102 Handle<JSObject> js_group_names =
103 isolate->factory()->NewSlowJSObjectWithPropertiesAndElements(
104 null, group_names, elements);
105 indices->FastPropertyAtPut(groups_index, *js_group_names);
106 return indices;
107 }
108
backtrack_limit() const109 uint32_t JSRegExp::backtrack_limit() const {
110 CHECK_EQ(type_tag(), IRREGEXP);
111 return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit)));
112 }
113
114 // static
FlagsFromString(Isolate * isolate,Handle<String> flags)115 base::Optional<JSRegExp::Flags> JSRegExp::FlagsFromString(
116 Isolate* isolate, Handle<String> flags) {
117 const int length = flags->length();
118
119 // A longer flags string cannot be valid.
120 if (length > JSRegExp::kFlagCount) return {};
121
122 RegExpFlags value;
123 FlatStringReader reader(isolate, String::Flatten(isolate, flags));
124
125 for (int i = 0; i < length; i++) {
126 base::Optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i));
127 if (!flag.has_value()) return {};
128 if (value & flag.value()) return {}; // Duplicate.
129 value |= flag.value();
130 }
131
132 return JSRegExp::AsJSRegExpFlags(value);
133 }
134
135 // static
StringFromFlags(Isolate * isolate,JSRegExp::Flags flags)136 Handle<String> JSRegExp::StringFromFlags(Isolate* isolate,
137 JSRegExp::Flags flags) {
138 static constexpr int kStringTerminator = 1;
139 int cursor = 0;
140 char buffer[kFlagCount + kStringTerminator];
141 #define V(Lower, Camel, LowerCamel, Char, Bit) \
142 if (flags & JSRegExp::k##Camel) buffer[cursor++] = Char;
143 REGEXP_FLAG_LIST(V)
144 #undef V
145 buffer[cursor++] = '\0';
146 DCHECK_LE(cursor, kFlagCount + kStringTerminator);
147 return isolate->factory()->NewStringFromAsciiChecked(buffer);
148 }
149
150 // static
New(Isolate * isolate,Handle<String> pattern,Flags flags,uint32_t backtrack_limit)151 MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern,
152 Flags flags, uint32_t backtrack_limit) {
153 Handle<JSFunction> constructor = isolate->regexp_function();
154 Handle<JSRegExp> regexp =
155 Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor));
156
157 return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit);
158 }
159
code(bool is_latin1) const160 Object JSRegExp::code(bool is_latin1) const {
161 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
162 Object value = DataAt(code_index(is_latin1));
163 DCHECK_IMPLIES(V8_EXTERNAL_CODE_SPACE_BOOL, value.IsSmi() || value.IsCodeT());
164 return value;
165 }
166
set_code(bool is_latin1,Handle<Code> code)167 void JSRegExp::set_code(bool is_latin1, Handle<Code> code) {
168 SetDataAt(code_index(is_latin1), ToCodeT(*code));
169 }
170
bytecode(bool is_latin1) const171 Object JSRegExp::bytecode(bool is_latin1) const {
172 DCHECK(type_tag() == JSRegExp::IRREGEXP ||
173 type_tag() == JSRegExp::EXPERIMENTAL);
174 return DataAt(bytecode_index(is_latin1));
175 }
176
set_bytecode_and_trampoline(Isolate * isolate,Handle<ByteArray> bytecode)177 void JSRegExp::set_bytecode_and_trampoline(Isolate* isolate,
178 Handle<ByteArray> bytecode) {
179 SetDataAt(kIrregexpLatin1BytecodeIndex, *bytecode);
180 SetDataAt(kIrregexpUC16BytecodeIndex, *bytecode);
181
182 Handle<CodeT> trampoline =
183 BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
184 SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
185 SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
186 }
187
ShouldProduceBytecode()188 bool JSRegExp::ShouldProduceBytecode() {
189 return FLAG_regexp_interpret_all ||
190 (FLAG_regexp_tier_up && !MarkedForTierUp());
191 }
192
193 // Only irregexps are subject to tier-up.
CanTierUp()194 bool JSRegExp::CanTierUp() {
195 return FLAG_regexp_tier_up && type_tag() == JSRegExp::IRREGEXP;
196 }
197
198 // An irregexp is considered to be marked for tier up if the tier-up ticks
199 // value reaches zero.
MarkedForTierUp()200 bool JSRegExp::MarkedForTierUp() {
201 DCHECK(data().IsFixedArray());
202
203 if (!CanTierUp()) {
204 return false;
205 }
206
207 return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0;
208 }
209
ResetLastTierUpTick()210 void JSRegExp::ResetLastTierUpTick() {
211 DCHECK(FLAG_regexp_tier_up);
212 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
213 int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1;
214 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
215 Smi::FromInt(tier_up_ticks));
216 }
217
TierUpTick()218 void JSRegExp::TierUpTick() {
219 DCHECK(FLAG_regexp_tier_up);
220 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
221 int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex));
222 if (tier_up_ticks == 0) {
223 return;
224 }
225 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
226 Smi::FromInt(tier_up_ticks - 1));
227 }
228
MarkTierUpForNextExec()229 void JSRegExp::MarkTierUpForNextExec() {
230 DCHECK(FLAG_regexp_tier_up);
231 DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
232 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
233 Smi::zero());
234 }
235
236 // static
Initialize(Handle<JSRegExp> regexp,Handle<String> source,Handle<String> flags_string)237 MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
238 Handle<String> source,
239 Handle<String> flags_string) {
240 Isolate* isolate = regexp->GetIsolate();
241 base::Optional<Flags> flags =
242 JSRegExp::FlagsFromString(isolate, flags_string);
243 if (!flags.has_value()) {
244 THROW_NEW_ERROR(
245 isolate,
246 NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string),
247 JSRegExp);
248 }
249 return Initialize(regexp, source, flags.value());
250 }
251
252 namespace {
253
IsLineTerminator(int c)254 bool IsLineTerminator(int c) {
255 // Expected to return true for '\n', '\r', 0x2028, and 0x2029.
256 return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
257 }
258
259 // TODO(jgruber): Consider merging CountAdditionalEscapeChars and
260 // WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
261 // and move related code closer to each other.
262 template <typename Char>
CountAdditionalEscapeChars(Handle<String> source,bool * needs_escapes_out)263 int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) {
264 DisallowGarbageCollection no_gc;
265 int escapes = 0;
266 bool needs_escapes = false;
267 bool in_char_class = false;
268 base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
269 for (int i = 0; i < src.length(); i++) {
270 const Char c = src[i];
271 if (c == '\\') {
272 if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
273 // This '\' is ignored since the next character itself will be escaped.
274 escapes--;
275 } else {
276 // Escape. Skip next character, which will be copied verbatim;
277 i++;
278 }
279 } else if (c == '/' && !in_char_class) {
280 // Not escaped forward-slash needs escape.
281 needs_escapes = true;
282 escapes++;
283 } else if (c == '[') {
284 in_char_class = true;
285 } else if (c == ']') {
286 in_char_class = false;
287 } else if (c == '\n') {
288 needs_escapes = true;
289 escapes++;
290 } else if (c == '\r') {
291 needs_escapes = true;
292 escapes++;
293 } else if (static_cast<int>(c) == 0x2028) {
294 needs_escapes = true;
295 escapes += std::strlen("\\u2028") - 1;
296 } else if (static_cast<int>(c) == 0x2029) {
297 needs_escapes = true;
298 escapes += std::strlen("\\u2029") - 1;
299 } else {
300 DCHECK(!IsLineTerminator(c));
301 }
302 }
303 DCHECK(!in_char_class);
304 DCHECK_GE(escapes, 0);
305 DCHECK_IMPLIES(escapes != 0, needs_escapes);
306 *needs_escapes_out = needs_escapes;
307 return escapes;
308 }
309
310 template <typename Char>
WriteStringToCharVector(base::Vector<Char> v,int * d,const char * string)311 void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) {
312 int s = 0;
313 while (string[s] != '\0') v[(*d)++] = string[s++];
314 }
315
316 template <typename Char, typename StringType>
WriteEscapedRegExpSource(Handle<String> source,Handle<StringType> result)317 Handle<StringType> WriteEscapedRegExpSource(Handle<String> source,
318 Handle<StringType> result) {
319 DisallowGarbageCollection no_gc;
320 base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
321 base::Vector<Char> dst(result->GetChars(no_gc), result->length());
322 int s = 0;
323 int d = 0;
324 bool in_char_class = false;
325 while (s < src.length()) {
326 const Char c = src[s];
327 if (c == '\\') {
328 if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
329 // This '\' is ignored since the next character itself will be escaped.
330 s++;
331 continue;
332 } else {
333 // Escape. Copy this and next character.
334 dst[d++] = src[s++];
335 }
336 if (s == src.length()) break;
337 } else if (c == '/' && !in_char_class) {
338 // Not escaped forward-slash needs escape.
339 dst[d++] = '\\';
340 } else if (c == '[') {
341 in_char_class = true;
342 } else if (c == ']') {
343 in_char_class = false;
344 } else if (c == '\n') {
345 WriteStringToCharVector(dst, &d, "\\n");
346 s++;
347 continue;
348 } else if (c == '\r') {
349 WriteStringToCharVector(dst, &d, "\\r");
350 s++;
351 continue;
352 } else if (static_cast<int>(c) == 0x2028) {
353 WriteStringToCharVector(dst, &d, "\\u2028");
354 s++;
355 continue;
356 } else if (static_cast<int>(c) == 0x2029) {
357 WriteStringToCharVector(dst, &d, "\\u2029");
358 s++;
359 continue;
360 } else {
361 DCHECK(!IsLineTerminator(c));
362 }
363 dst[d++] = src[s++];
364 }
365 DCHECK_EQ(result->length(), d);
366 DCHECK(!in_char_class);
367 return result;
368 }
369
EscapeRegExpSource(Isolate * isolate,Handle<String> source)370 MaybeHandle<String> EscapeRegExpSource(Isolate* isolate,
371 Handle<String> source) {
372 DCHECK(source->IsFlat());
373 if (source->length() == 0) return isolate->factory()->query_colon_string();
374 bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
375 bool needs_escapes = false;
376 int additional_escape_chars =
377 one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
378 : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes);
379 if (!needs_escapes) return source;
380 int length = source->length() + additional_escape_chars;
381 if (one_byte) {
382 Handle<SeqOneByteString> result;
383 ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
384 isolate->factory()->NewRawOneByteString(length),
385 String);
386 return WriteEscapedRegExpSource<uint8_t>(source, result);
387 } else {
388 Handle<SeqTwoByteString> result;
389 ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
390 isolate->factory()->NewRawTwoByteString(length),
391 String);
392 return WriteEscapedRegExpSource<base::uc16>(source, result);
393 }
394 }
395
396 } // namespace
397
398 // static
Initialize(Handle<JSRegExp> regexp,Handle<String> source,Flags flags,uint32_t backtrack_limit)399 MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
400 Handle<String> source, Flags flags,
401 uint32_t backtrack_limit) {
402 Isolate* isolate = regexp->GetIsolate();
403 Factory* factory = isolate->factory();
404 // If source is the empty string we set it to "(?:)" instead as
405 // suggested by ECMA-262, 5th, section 15.10.4.1.
406 if (source->length() == 0) source = factory->query_colon_string();
407
408 source = String::Flatten(isolate, source);
409
410 RETURN_ON_EXCEPTION(
411 isolate,
412 RegExp::Compile(isolate, regexp, source, JSRegExp::AsRegExpFlags(flags),
413 backtrack_limit),
414 JSRegExp);
415
416 Handle<String> escaped_source;
417 ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source,
418 EscapeRegExpSource(isolate, source), JSRegExp);
419
420 regexp->set_source(*escaped_source);
421 regexp->set_flags(Smi::FromInt(flags));
422
423 Map map = regexp->map();
424 Object constructor = map.GetConstructor();
425 if (constructor.IsJSFunction() &&
426 JSFunction::cast(constructor).initial_map() == map) {
427 // If we still have the original map, set in-object properties directly.
428 regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex,
429 Smi::FromInt(kInitialLastIndexValue),
430 SKIP_WRITE_BARRIER);
431 } else {
432 // Map has changed, so use generic, but slower, method.
433 RETURN_ON_EXCEPTION(
434 isolate,
435 Object::SetProperty(
436 isolate, regexp, factory->lastIndex_string(),
437 Handle<Smi>(Smi::FromInt(kInitialLastIndexValue), isolate)),
438 JSRegExp);
439 }
440
441 return regexp;
442 }
443
444 } // namespace internal
445 } // namespace v8
446