1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "src/objects/js-regexp.h"
6
7 #include "src/objects/js-array-inl.h"
8 #include "src/objects/js-regexp-inl.h"
9 #include "src/regexp/regexp.h"
10
11 namespace v8 {
12 namespace internal {
13
GetAndCacheIndices(Isolate * isolate,Handle<JSRegExpResult> regexp_result)14 MaybeHandle<JSArray> JSRegExpResult::GetAndCacheIndices(
15 Isolate* isolate, Handle<JSRegExpResult> regexp_result) {
16 // Check for cached indices. We do a slow lookup and set of
17 // the cached_indices_or_match_info and names fields just in
18 // case they have been migrated to dictionaries.
19 Handle<Object> indices_or_regexp(
20 GetProperty(
21 isolate, regexp_result,
22 isolate->factory()->regexp_result_cached_indices_or_regexp_symbol())
23 .ToHandleChecked());
24 if (indices_or_regexp->IsJSRegExp()) {
25 // Build and cache indices for next lookup.
26 // TODO(joshualitt): Instead of caching the indices, we could call
27 // ReconfigureToDataProperty on 'indices' setting its value to this
28 // newly created array. However, care would have to be taken to ensure
29 // a new map is not created each time.
30
31 // Grab regexp, its last_index, and the original subject string from the
32 // result and the re-execute the regexp to generate a new MatchInfo.
33 Handle<JSRegExp> regexp(JSRegExp::cast(*indices_or_regexp), isolate);
34 Handle<Object> input_object(
35 GetProperty(isolate, regexp_result,
36 isolate->factory()->regexp_result_regexp_input_symbol())
37 .ToHandleChecked());
38 Handle<String> subject(String::cast(*input_object), isolate);
39 Handle<Object> last_index_object(
40 GetProperty(
41 isolate, regexp_result,
42 isolate->factory()->regexp_result_regexp_last_index_symbol())
43 .ToHandleChecked());
44
45 int capture_count = regexp->CaptureCount();
46 Handle<RegExpMatchInfo> match_info =
47 RegExpMatchInfo::New(isolate, capture_count);
48
49 int last_index = Smi::ToInt(*last_index_object);
50 Handle<Object> result;
51 ASSIGN_RETURN_ON_EXCEPTION(
52 isolate, result,
53 RegExp::Exec(isolate, regexp, subject, last_index, match_info),
54 JSArray);
55 DCHECK_EQ(*result, *match_info);
56
57 Handle<Object> maybe_names(
58 GetProperty(isolate, regexp_result,
59 isolate->factory()->regexp_result_names_symbol())
60 .ToHandleChecked());
61 indices_or_regexp =
62 JSRegExpResultIndices::BuildIndices(isolate, match_info, maybe_names);
63
64 // Cache the result and clear the names array, last_index and subject.
65 SetProperty(
66 isolate, regexp_result,
67 isolate->factory()->regexp_result_cached_indices_or_regexp_symbol(),
68 indices_or_regexp)
69 .ToHandleChecked();
70 SetProperty(isolate, regexp_result,
71 isolate->factory()->regexp_result_names_symbol(),
72 isolate->factory()->undefined_value())
73 .ToHandleChecked();
74 SetProperty(isolate, regexp_result,
75 isolate->factory()->regexp_result_regexp_last_index_symbol(),
76 isolate->factory()->undefined_value())
77 .ToHandleChecked();
78 SetProperty(isolate, regexp_result,
79 isolate->factory()->regexp_result_regexp_input_symbol(),
80 isolate->factory()->undefined_value())
81 .ToHandleChecked();
82 }
83 return Handle<JSArray>::cast(indices_or_regexp);
84 }
85
BuildIndices(Isolate * isolate,Handle<RegExpMatchInfo> match_info,Handle<Object> maybe_names)86 Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices(
87 Isolate* isolate, Handle<RegExpMatchInfo> match_info,
88 Handle<Object> maybe_names) {
89 Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast(
90 isolate->factory()->NewJSObjectFromMap(
91 isolate->regexp_result_indices_map())));
92
93 // Initialize indices length to avoid having a partially initialized object
94 // should GC be triggered by creating a NewFixedArray.
95 indices->set_length(Smi::zero());
96
97 // Build indices array from RegExpMatchInfo.
98 int num_indices = match_info->NumberOfCaptureRegisters();
99 int num_results = num_indices >> 1;
100 Handle<FixedArray> indices_array =
101 isolate->factory()->NewFixedArray(num_results);
102 JSArray::SetContent(indices, indices_array);
103
104 for (int i = 0; i < num_results; i++) {
105 int base_offset = i * 2;
106 int start_offset = match_info->Capture(base_offset);
107 int end_offset = match_info->Capture(base_offset + 1);
108
109 // Any unmatched captures are set to undefined, otherwise we set them to a
110 // subarray of the indices.
111 if (start_offset == -1) {
112 indices_array->set(i, ReadOnlyRoots(isolate).undefined_value());
113 } else {
114 Handle<FixedArray> indices_sub_array(
115 isolate->factory()->NewFixedArray(2));
116 indices_sub_array->set(0, Smi::FromInt(start_offset));
117 indices_sub_array->set(1, Smi::FromInt(end_offset));
118 Handle<JSArray> indices_sub_jsarray =
119 isolate->factory()->NewJSArrayWithElements(indices_sub_array,
120 PACKED_SMI_ELEMENTS, 2);
121 indices_array->set(i, *indices_sub_jsarray);
122 }
123 }
124
125 // If there are no capture groups, set the groups property to undefined.
126 FieldIndex groups_index = FieldIndex::ForDescriptor(
127 indices->map(), InternalIndex(kGroupsDescriptorIndex));
128 if (maybe_names->IsUndefined(isolate)) {
129 indices->RawFastPropertyAtPut(groups_index,
130 ReadOnlyRoots(isolate).undefined_value());
131 return indices;
132 }
133
134 // Create a groups property which returns a dictionary of named captures to
135 // their corresponding capture indices.
136 Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names));
137 int num_names = names->length() >> 1;
138 Handle<NameDictionary> group_names = NameDictionary::New(isolate, num_names);
139 for (int i = 0; i < num_names; i++) {
140 int base_offset = i * 2;
141 int name_offset = base_offset;
142 int index_offset = base_offset + 1;
143 Handle<String> name(String::cast(names->get(name_offset)), isolate);
144 Handle<Smi> smi_index(Smi::cast(names->get(index_offset)), isolate);
145 Handle<Object> capture_indices(indices_array->get(smi_index->value()),
146 isolate);
147 if (!capture_indices->IsUndefined(isolate)) {
148 capture_indices = Handle<JSArray>::cast(capture_indices);
149 }
150 group_names = NameDictionary::Add(
151 isolate, group_names, name, capture_indices, PropertyDetails::Empty());
152 }
153
154 // Convert group_names to a JSObject and store at the groups property of the
155 // result indices.
156 Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array();
157 Handle<HeapObject> null =
158 Handle<HeapObject>::cast(isolate->factory()->null_value());
159 Handle<JSObject> js_group_names =
160 isolate->factory()->NewSlowJSObjectWithPropertiesAndElements(
161 null, group_names, elements);
162 indices->RawFastPropertyAtPut(groups_index, *js_group_names);
163 return indices;
164 }
165
BacktrackLimit() const166 uint32_t JSRegExp::BacktrackLimit() const {
167 CHECK_EQ(TypeTag(), IRREGEXP);
168 return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit)));
169 }
170
171 // static
FlagsFromString(Isolate * isolate,Handle<String> flags,bool * success)172 JSRegExp::Flags JSRegExp::FlagsFromString(Isolate* isolate,
173 Handle<String> flags, bool* success) {
174 int length = flags->length();
175 if (length == 0) {
176 *success = true;
177 return JSRegExp::kNone;
178 }
179 // A longer flags string cannot be valid.
180 if (length > JSRegExp::kFlagCount) return JSRegExp::Flags(0);
181 JSRegExp::Flags value(0);
182 if (flags->IsSeqOneByteString()) {
183 DisallowHeapAllocation no_gc;
184 SeqOneByteString seq_flags = SeqOneByteString::cast(*flags);
185 for (int i = 0; i < length; i++) {
186 base::Optional<JSRegExp::Flag> maybe_flag =
187 JSRegExp::FlagFromChar(seq_flags.Get(i));
188 if (!maybe_flag.has_value()) return JSRegExp::Flags(0);
189 JSRegExp::Flag flag = *maybe_flag;
190 // Duplicate flag.
191 if (value & flag) return JSRegExp::Flags(0);
192 value |= flag;
193 }
194 } else {
195 flags = String::Flatten(isolate, flags);
196 DisallowHeapAllocation no_gc;
197 String::FlatContent flags_content = flags->GetFlatContent(no_gc);
198 for (int i = 0; i < length; i++) {
199 base::Optional<JSRegExp::Flag> maybe_flag =
200 JSRegExp::FlagFromChar(flags_content.Get(i));
201 if (!maybe_flag.has_value()) return JSRegExp::Flags(0);
202 JSRegExp::Flag flag = *maybe_flag;
203 // Duplicate flag.
204 if (value & flag) return JSRegExp::Flags(0);
205 value |= flag;
206 }
207 }
208 *success = true;
209 return value;
210 }
211
212 // static
New(Isolate * isolate,Handle<String> pattern,Flags flags,uint32_t backtrack_limit)213 MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern,
214 Flags flags, uint32_t backtrack_limit) {
215 Handle<JSFunction> constructor = isolate->regexp_function();
216 Handle<JSRegExp> regexp =
217 Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor));
218
219 return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit);
220 }
221
222 // static
Copy(Handle<JSRegExp> regexp)223 Handle<JSRegExp> JSRegExp::Copy(Handle<JSRegExp> regexp) {
224 Isolate* const isolate = regexp->GetIsolate();
225 return Handle<JSRegExp>::cast(isolate->factory()->CopyJSObject(regexp));
226 }
227
Code(bool is_latin1) const228 Object JSRegExp::Code(bool is_latin1) const {
229 DCHECK_EQ(TypeTag(), JSRegExp::IRREGEXP);
230 return DataAt(code_index(is_latin1));
231 }
232
Bytecode(bool is_latin1) const233 Object JSRegExp::Bytecode(bool is_latin1) const {
234 DCHECK_EQ(TypeTag(), JSRegExp::IRREGEXP);
235 return DataAt(bytecode_index(is_latin1));
236 }
237
ShouldProduceBytecode()238 bool JSRegExp::ShouldProduceBytecode() {
239 return FLAG_regexp_interpret_all ||
240 (FLAG_regexp_tier_up && !MarkedForTierUp());
241 }
242
243 // Only irregexps are subject to tier-up.
CanTierUp()244 bool JSRegExp::CanTierUp() {
245 return FLAG_regexp_tier_up && TypeTag() == JSRegExp::IRREGEXP;
246 }
247
248 // An irregexp is considered to be marked for tier up if the tier-up ticks
249 // value reaches zero.
MarkedForTierUp()250 bool JSRegExp::MarkedForTierUp() {
251 DCHECK(data().IsFixedArray());
252
253 if (!CanTierUp()) {
254 return false;
255 }
256
257 return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0;
258 }
259
ResetLastTierUpTick()260 void JSRegExp::ResetLastTierUpTick() {
261 DCHECK(FLAG_regexp_tier_up);
262 DCHECK_EQ(TypeTag(), JSRegExp::IRREGEXP);
263 int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1;
264 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
265 Smi::FromInt(tier_up_ticks));
266 }
267
TierUpTick()268 void JSRegExp::TierUpTick() {
269 DCHECK(FLAG_regexp_tier_up);
270 DCHECK_EQ(TypeTag(), JSRegExp::IRREGEXP);
271 int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex));
272 if (tier_up_ticks == 0) {
273 return;
274 }
275 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
276 Smi::FromInt(tier_up_ticks - 1));
277 }
278
MarkTierUpForNextExec()279 void JSRegExp::MarkTierUpForNextExec() {
280 DCHECK(FLAG_regexp_tier_up);
281 DCHECK_EQ(TypeTag(), JSRegExp::IRREGEXP);
282 FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
283 Smi::zero());
284 }
285
286 // static
Initialize(Handle<JSRegExp> regexp,Handle<String> source,Handle<String> flags_string)287 MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
288 Handle<String> source,
289 Handle<String> flags_string) {
290 Isolate* isolate = regexp->GetIsolate();
291 bool success = false;
292 Flags flags = JSRegExp::FlagsFromString(isolate, flags_string, &success);
293 if (!success) {
294 THROW_NEW_ERROR(
295 isolate,
296 NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string),
297 JSRegExp);
298 }
299 return Initialize(regexp, source, flags);
300 }
301
302 namespace {
303
IsLineTerminator(int c)304 bool IsLineTerminator(int c) {
305 // Expected to return true for '\n', '\r', 0x2028, and 0x2029.
306 return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
307 }
308
309 // TODO(jgruber): Consider merging CountAdditionalEscapeChars and
310 // WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
311 // and move related code closer to each other.
312 template <typename Char>
CountAdditionalEscapeChars(Handle<String> source,bool * needs_escapes_out)313 int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) {
314 DisallowHeapAllocation no_gc;
315 int escapes = 0;
316 bool needs_escapes = false;
317 bool in_char_class = false;
318 Vector<const Char> src = source->GetCharVector<Char>(no_gc);
319 for (int i = 0; i < src.length(); i++) {
320 const Char c = src[i];
321 if (c == '\\') {
322 if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
323 // This '\' is ignored since the next character itself will be escaped.
324 escapes--;
325 } else {
326 // Escape. Skip next character, which will be copied verbatim;
327 i++;
328 }
329 } else if (c == '/' && !in_char_class) {
330 // Not escaped forward-slash needs escape.
331 needs_escapes = true;
332 escapes++;
333 } else if (c == '[') {
334 in_char_class = true;
335 } else if (c == ']') {
336 in_char_class = false;
337 } else if (c == '\n') {
338 needs_escapes = true;
339 escapes++;
340 } else if (c == '\r') {
341 needs_escapes = true;
342 escapes++;
343 } else if (static_cast<int>(c) == 0x2028) {
344 needs_escapes = true;
345 escapes += std::strlen("\\u2028") - 1;
346 } else if (static_cast<int>(c) == 0x2029) {
347 needs_escapes = true;
348 escapes += std::strlen("\\u2029") - 1;
349 } else {
350 DCHECK(!IsLineTerminator(c));
351 }
352 }
353 DCHECK(!in_char_class);
354 DCHECK_GE(escapes, 0);
355 DCHECK_IMPLIES(escapes != 0, needs_escapes);
356 *needs_escapes_out = needs_escapes;
357 return escapes;
358 }
359
360 template <typename Char>
WriteStringToCharVector(Vector<Char> v,int * d,const char * string)361 void WriteStringToCharVector(Vector<Char> v, int* d, const char* string) {
362 int s = 0;
363 while (string[s] != '\0') v[(*d)++] = string[s++];
364 }
365
366 template <typename Char, typename StringType>
WriteEscapedRegExpSource(Handle<String> source,Handle<StringType> result)367 Handle<StringType> WriteEscapedRegExpSource(Handle<String> source,
368 Handle<StringType> result) {
369 DisallowHeapAllocation no_gc;
370 Vector<const Char> src = source->GetCharVector<Char>(no_gc);
371 Vector<Char> dst(result->GetChars(no_gc), result->length());
372 int s = 0;
373 int d = 0;
374 bool in_char_class = false;
375 while (s < src.length()) {
376 const Char c = src[s];
377 if (c == '\\') {
378 if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
379 // This '\' is ignored since the next character itself will be escaped.
380 s++;
381 continue;
382 } else {
383 // Escape. Copy this and next character.
384 dst[d++] = src[s++];
385 }
386 if (s == src.length()) break;
387 } else if (c == '/' && !in_char_class) {
388 // Not escaped forward-slash needs escape.
389 dst[d++] = '\\';
390 } else if (c == '[') {
391 in_char_class = true;
392 } else if (c == ']') {
393 in_char_class = false;
394 } else if (c == '\n') {
395 WriteStringToCharVector(dst, &d, "\\n");
396 s++;
397 continue;
398 } else if (c == '\r') {
399 WriteStringToCharVector(dst, &d, "\\r");
400 s++;
401 continue;
402 } else if (static_cast<int>(c) == 0x2028) {
403 WriteStringToCharVector(dst, &d, "\\u2028");
404 s++;
405 continue;
406 } else if (static_cast<int>(c) == 0x2029) {
407 WriteStringToCharVector(dst, &d, "\\u2029");
408 s++;
409 continue;
410 } else {
411 DCHECK(!IsLineTerminator(c));
412 }
413 dst[d++] = src[s++];
414 }
415 DCHECK_EQ(result->length(), d);
416 DCHECK(!in_char_class);
417 return result;
418 }
419
EscapeRegExpSource(Isolate * isolate,Handle<String> source)420 MaybeHandle<String> EscapeRegExpSource(Isolate* isolate,
421 Handle<String> source) {
422 DCHECK(source->IsFlat());
423 if (source->length() == 0) return isolate->factory()->query_colon_string();
424 bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
425 bool needs_escapes = false;
426 int additional_escape_chars =
427 one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
428 : CountAdditionalEscapeChars<uc16>(source, &needs_escapes);
429 if (!needs_escapes) return source;
430 int length = source->length() + additional_escape_chars;
431 if (one_byte) {
432 Handle<SeqOneByteString> result;
433 ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
434 isolate->factory()->NewRawOneByteString(length),
435 String);
436 return WriteEscapedRegExpSource<uint8_t>(source, result);
437 } else {
438 Handle<SeqTwoByteString> result;
439 ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
440 isolate->factory()->NewRawTwoByteString(length),
441 String);
442 return WriteEscapedRegExpSource<uc16>(source, result);
443 }
444 }
445
446 } // namespace
447
448 // static
Initialize(Handle<JSRegExp> regexp,Handle<String> source,Flags flags,uint32_t backtrack_limit)449 MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
450 Handle<String> source, Flags flags,
451 uint32_t backtrack_limit) {
452 Isolate* isolate = regexp->GetIsolate();
453 Factory* factory = isolate->factory();
454 // If source is the empty string we set it to "(?:)" instead as
455 // suggested by ECMA-262, 5th, section 15.10.4.1.
456 if (source->length() == 0) source = factory->query_colon_string();
457
458 source = String::Flatten(isolate, source);
459
460 RETURN_ON_EXCEPTION(
461 isolate, RegExp::Compile(isolate, regexp, source, flags, backtrack_limit),
462 JSRegExp);
463
464 Handle<String> escaped_source;
465 ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source,
466 EscapeRegExpSource(isolate, source), JSRegExp);
467
468 regexp->set_source(*escaped_source);
469 regexp->set_flags(Smi::FromInt(flags));
470
471 Map map = regexp->map();
472 Object constructor = map.GetConstructor();
473 if (constructor.IsJSFunction() &&
474 JSFunction::cast(constructor).initial_map() == map) {
475 // If we still have the original map, set in-object properties directly.
476 regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex, Smi::zero(),
477 SKIP_WRITE_BARRIER);
478 } else {
479 // Map has changed, so use generic, but slower, method.
480 RETURN_ON_EXCEPTION(
481 isolate,
482 Object::SetProperty(isolate, regexp, factory->lastIndex_string(),
483 Handle<Smi>(Smi::zero(), isolate)),
484 JSRegExp);
485 }
486
487 return regexp;
488 }
489
490 } // namespace internal
491 } // namespace v8
492