• Home
  • Line#
  • Scopes#
  • Navigate#
  • Raw
  • Download
1 // Copyright 2019 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/objects/js-regexp.h"
6 
7 #include "src/base/strings.h"
8 #include "src/common/globals.h"
9 #include "src/objects/code.h"
10 #include "src/objects/js-array-inl.h"
11 #include "src/objects/js-regexp-inl.h"
12 #include "src/regexp/regexp.h"
13 
14 namespace v8 {
15 namespace internal {
16 
BuildIndices(Isolate * isolate,Handle<RegExpMatchInfo> match_info,Handle<Object> maybe_names)17 Handle<JSRegExpResultIndices> JSRegExpResultIndices::BuildIndices(
18     Isolate* isolate, Handle<RegExpMatchInfo> match_info,
19     Handle<Object> maybe_names) {
20   Handle<JSRegExpResultIndices> indices(Handle<JSRegExpResultIndices>::cast(
21       isolate->factory()->NewJSObjectFromMap(
22           isolate->regexp_result_indices_map())));
23 
24   // Initialize indices length to avoid having a partially initialized object
25   // should GC be triggered by creating a NewFixedArray.
26   indices->set_length(Smi::zero());
27 
28   // Build indices array from RegExpMatchInfo.
29   int num_indices = match_info->NumberOfCaptureRegisters();
30   int num_results = num_indices >> 1;
31   Handle<FixedArray> indices_array =
32       isolate->factory()->NewFixedArray(num_results);
33   JSArray::SetContent(indices, indices_array);
34 
35   for (int i = 0; i < num_results; i++) {
36     int base_offset = i * 2;
37     int start_offset = match_info->Capture(base_offset);
38     int end_offset = match_info->Capture(base_offset + 1);
39 
40     // Any unmatched captures are set to undefined, otherwise we set them to a
41     // subarray of the indices.
42     if (start_offset == -1) {
43       indices_array->set(i, ReadOnlyRoots(isolate).undefined_value());
44     } else {
45       Handle<FixedArray> indices_sub_array(
46           isolate->factory()->NewFixedArray(2));
47       indices_sub_array->set(0, Smi::FromInt(start_offset));
48       indices_sub_array->set(1, Smi::FromInt(end_offset));
49       Handle<JSArray> indices_sub_jsarray =
50           isolate->factory()->NewJSArrayWithElements(indices_sub_array,
51                                                      PACKED_SMI_ELEMENTS, 2);
52       indices_array->set(i, *indices_sub_jsarray);
53     }
54   }
55 
56   // If there are no capture groups, set the groups property to undefined.
57   FieldIndex groups_index = FieldIndex::ForDescriptor(
58       indices->map(), InternalIndex(kGroupsDescriptorIndex));
59   if (maybe_names->IsUndefined(isolate)) {
60     indices->FastPropertyAtPut(groups_index,
61                                ReadOnlyRoots(isolate).undefined_value());
62     return indices;
63   }
64 
65   // Create a groups property which returns a dictionary of named captures to
66   // their corresponding capture indices.
67   Handle<FixedArray> names(Handle<FixedArray>::cast(maybe_names));
68   int num_names = names->length() >> 1;
69   Handle<HeapObject> group_names;
70   if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) {
71     group_names = isolate->factory()->NewSwissNameDictionary(num_names);
72   } else {
73     group_names = isolate->factory()->NewNameDictionary(num_names);
74   }
75   for (int i = 0; i < num_names; i++) {
76     int base_offset = i * 2;
77     int name_offset = base_offset;
78     int index_offset = base_offset + 1;
79     Handle<String> name(String::cast(names->get(name_offset)), isolate);
80     Handle<Smi> smi_index(Smi::cast(names->get(index_offset)), isolate);
81     Handle<Object> capture_indices(indices_array->get(smi_index->value()),
82                                    isolate);
83     if (!capture_indices->IsUndefined(isolate)) {
84       capture_indices = Handle<JSArray>::cast(capture_indices);
85     }
86     if (V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL) {
87       group_names = SwissNameDictionary::Add(
88           isolate, Handle<SwissNameDictionary>::cast(group_names), name,
89           capture_indices, PropertyDetails::Empty());
90     } else {
91       group_names = NameDictionary::Add(
92           isolate, Handle<NameDictionary>::cast(group_names), name,
93           capture_indices, PropertyDetails::Empty());
94     }
95   }
96 
97   // Convert group_names to a JSObject and store at the groups property of the
98   // result indices.
99   Handle<FixedArrayBase> elements = isolate->factory()->empty_fixed_array();
100   Handle<HeapObject> null =
101       Handle<HeapObject>::cast(isolate->factory()->null_value());
102   Handle<JSObject> js_group_names =
103       isolate->factory()->NewSlowJSObjectWithPropertiesAndElements(
104           null, group_names, elements);
105   indices->FastPropertyAtPut(groups_index, *js_group_names);
106   return indices;
107 }
108 
backtrack_limit() const109 uint32_t JSRegExp::backtrack_limit() const {
110   CHECK_EQ(type_tag(), IRREGEXP);
111   return static_cast<uint32_t>(Smi::ToInt(DataAt(kIrregexpBacktrackLimit)));
112 }
113 
114 // static
FlagsFromString(Isolate * isolate,Handle<String> flags)115 base::Optional<JSRegExp::Flags> JSRegExp::FlagsFromString(
116     Isolate* isolate, Handle<String> flags) {
117   const int length = flags->length();
118 
119   // A longer flags string cannot be valid.
120   if (length > JSRegExp::kFlagCount) return {};
121 
122   RegExpFlags value;
123   FlatStringReader reader(isolate, String::Flatten(isolate, flags));
124 
125   for (int i = 0; i < length; i++) {
126     base::Optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i));
127     if (!flag.has_value()) return {};
128     if (value & flag.value()) return {};  // Duplicate.
129     value |= flag.value();
130   }
131 
132   return JSRegExp::AsJSRegExpFlags(value);
133 }
134 
135 // static
StringFromFlags(Isolate * isolate,JSRegExp::Flags flags)136 Handle<String> JSRegExp::StringFromFlags(Isolate* isolate,
137                                          JSRegExp::Flags flags) {
138   static constexpr int kStringTerminator = 1;
139   int cursor = 0;
140   char buffer[kFlagCount + kStringTerminator];
141 #define V(Lower, Camel, LowerCamel, Char, Bit) \
142   if (flags & JSRegExp::k##Camel) buffer[cursor++] = Char;
143   REGEXP_FLAG_LIST(V)
144 #undef V
145   buffer[cursor++] = '\0';
146   DCHECK_LE(cursor, kFlagCount + kStringTerminator);
147   return isolate->factory()->NewStringFromAsciiChecked(buffer);
148 }
149 
150 // static
New(Isolate * isolate,Handle<String> pattern,Flags flags,uint32_t backtrack_limit)151 MaybeHandle<JSRegExp> JSRegExp::New(Isolate* isolate, Handle<String> pattern,
152                                     Flags flags, uint32_t backtrack_limit) {
153   Handle<JSFunction> constructor = isolate->regexp_function();
154   Handle<JSRegExp> regexp =
155       Handle<JSRegExp>::cast(isolate->factory()->NewJSObject(constructor));
156 
157   return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit);
158 }
159 
code(bool is_latin1) const160 Object JSRegExp::code(bool is_latin1) const {
161   DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
162   Object value = DataAt(code_index(is_latin1));
163   DCHECK_IMPLIES(V8_EXTERNAL_CODE_SPACE_BOOL, value.IsSmi() || value.IsCodeT());
164   return value;
165 }
166 
set_code(bool is_latin1,Handle<Code> code)167 void JSRegExp::set_code(bool is_latin1, Handle<Code> code) {
168   SetDataAt(code_index(is_latin1), ToCodeT(*code));
169 }
170 
bytecode(bool is_latin1) const171 Object JSRegExp::bytecode(bool is_latin1) const {
172   DCHECK(type_tag() == JSRegExp::IRREGEXP ||
173          type_tag() == JSRegExp::EXPERIMENTAL);
174   return DataAt(bytecode_index(is_latin1));
175 }
176 
set_bytecode_and_trampoline(Isolate * isolate,Handle<ByteArray> bytecode)177 void JSRegExp::set_bytecode_and_trampoline(Isolate* isolate,
178                                            Handle<ByteArray> bytecode) {
179   SetDataAt(kIrregexpLatin1BytecodeIndex, *bytecode);
180   SetDataAt(kIrregexpUC16BytecodeIndex, *bytecode);
181 
182   Handle<CodeT> trampoline =
183       BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
184   SetDataAt(JSRegExp::kIrregexpLatin1CodeIndex, *trampoline);
185   SetDataAt(JSRegExp::kIrregexpUC16CodeIndex, *trampoline);
186 }
187 
ShouldProduceBytecode()188 bool JSRegExp::ShouldProduceBytecode() {
189   return FLAG_regexp_interpret_all ||
190          (FLAG_regexp_tier_up && !MarkedForTierUp());
191 }
192 
193 // Only irregexps are subject to tier-up.
CanTierUp()194 bool JSRegExp::CanTierUp() {
195   return FLAG_regexp_tier_up && type_tag() == JSRegExp::IRREGEXP;
196 }
197 
198 // An irregexp is considered to be marked for tier up if the tier-up ticks
199 // value reaches zero.
MarkedForTierUp()200 bool JSRegExp::MarkedForTierUp() {
201   DCHECK(data().IsFixedArray());
202 
203   if (!CanTierUp()) {
204     return false;
205   }
206 
207   return Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) == 0;
208 }
209 
ResetLastTierUpTick()210 void JSRegExp::ResetLastTierUpTick() {
211   DCHECK(FLAG_regexp_tier_up);
212   DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
213   int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex)) + 1;
214   FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
215                                Smi::FromInt(tier_up_ticks));
216 }
217 
TierUpTick()218 void JSRegExp::TierUpTick() {
219   DCHECK(FLAG_regexp_tier_up);
220   DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
221   int tier_up_ticks = Smi::ToInt(DataAt(kIrregexpTicksUntilTierUpIndex));
222   if (tier_up_ticks == 0) {
223     return;
224   }
225   FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
226                                Smi::FromInt(tier_up_ticks - 1));
227 }
228 
MarkTierUpForNextExec()229 void JSRegExp::MarkTierUpForNextExec() {
230   DCHECK(FLAG_regexp_tier_up);
231   DCHECK_EQ(type_tag(), JSRegExp::IRREGEXP);
232   FixedArray::cast(data()).set(JSRegExp::kIrregexpTicksUntilTierUpIndex,
233                                Smi::zero());
234 }
235 
236 // static
Initialize(Handle<JSRegExp> regexp,Handle<String> source,Handle<String> flags_string)237 MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
238                                            Handle<String> source,
239                                            Handle<String> flags_string) {
240   Isolate* isolate = regexp->GetIsolate();
241   base::Optional<Flags> flags =
242       JSRegExp::FlagsFromString(isolate, flags_string);
243   if (!flags.has_value()) {
244     THROW_NEW_ERROR(
245         isolate,
246         NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string),
247         JSRegExp);
248   }
249   return Initialize(regexp, source, flags.value());
250 }
251 
252 namespace {
253 
IsLineTerminator(int c)254 bool IsLineTerminator(int c) {
255   // Expected to return true for '\n', '\r', 0x2028, and 0x2029.
256   return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
257 }
258 
259 // TODO(jgruber): Consider merging CountAdditionalEscapeChars and
260 // WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
261 // and move related code closer to each other.
262 template <typename Char>
CountAdditionalEscapeChars(Handle<String> source,bool * needs_escapes_out)263 int CountAdditionalEscapeChars(Handle<String> source, bool* needs_escapes_out) {
264   DisallowGarbageCollection no_gc;
265   int escapes = 0;
266   bool needs_escapes = false;
267   bool in_char_class = false;
268   base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
269   for (int i = 0; i < src.length(); i++) {
270     const Char c = src[i];
271     if (c == '\\') {
272       if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
273         // This '\' is ignored since the next character itself will be escaped.
274         escapes--;
275       } else {
276         // Escape. Skip next character, which will be copied verbatim;
277         i++;
278       }
279     } else if (c == '/' && !in_char_class) {
280       // Not escaped forward-slash needs escape.
281       needs_escapes = true;
282       escapes++;
283     } else if (c == '[') {
284       in_char_class = true;
285     } else if (c == ']') {
286       in_char_class = false;
287     } else if (c == '\n') {
288       needs_escapes = true;
289       escapes++;
290     } else if (c == '\r') {
291       needs_escapes = true;
292       escapes++;
293     } else if (static_cast<int>(c) == 0x2028) {
294       needs_escapes = true;
295       escapes += std::strlen("\\u2028") - 1;
296     } else if (static_cast<int>(c) == 0x2029) {
297       needs_escapes = true;
298       escapes += std::strlen("\\u2029") - 1;
299     } else {
300       DCHECK(!IsLineTerminator(c));
301     }
302   }
303   DCHECK(!in_char_class);
304   DCHECK_GE(escapes, 0);
305   DCHECK_IMPLIES(escapes != 0, needs_escapes);
306   *needs_escapes_out = needs_escapes;
307   return escapes;
308 }
309 
310 template <typename Char>
WriteStringToCharVector(base::Vector<Char> v,int * d,const char * string)311 void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) {
312   int s = 0;
313   while (string[s] != '\0') v[(*d)++] = string[s++];
314 }
315 
316 template <typename Char, typename StringType>
WriteEscapedRegExpSource(Handle<String> source,Handle<StringType> result)317 Handle<StringType> WriteEscapedRegExpSource(Handle<String> source,
318                                             Handle<StringType> result) {
319   DisallowGarbageCollection no_gc;
320   base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
321   base::Vector<Char> dst(result->GetChars(no_gc), result->length());
322   int s = 0;
323   int d = 0;
324   bool in_char_class = false;
325   while (s < src.length()) {
326     const Char c = src[s];
327     if (c == '\\') {
328       if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
329         // This '\' is ignored since the next character itself will be escaped.
330         s++;
331         continue;
332       } else {
333         // Escape. Copy this and next character.
334         dst[d++] = src[s++];
335       }
336       if (s == src.length()) break;
337     } else if (c == '/' && !in_char_class) {
338       // Not escaped forward-slash needs escape.
339       dst[d++] = '\\';
340     } else if (c == '[') {
341       in_char_class = true;
342     } else if (c == ']') {
343       in_char_class = false;
344     } else if (c == '\n') {
345       WriteStringToCharVector(dst, &d, "\\n");
346       s++;
347       continue;
348     } else if (c == '\r') {
349       WriteStringToCharVector(dst, &d, "\\r");
350       s++;
351       continue;
352     } else if (static_cast<int>(c) == 0x2028) {
353       WriteStringToCharVector(dst, &d, "\\u2028");
354       s++;
355       continue;
356     } else if (static_cast<int>(c) == 0x2029) {
357       WriteStringToCharVector(dst, &d, "\\u2029");
358       s++;
359       continue;
360     } else {
361       DCHECK(!IsLineTerminator(c));
362     }
363     dst[d++] = src[s++];
364   }
365   DCHECK_EQ(result->length(), d);
366   DCHECK(!in_char_class);
367   return result;
368 }
369 
EscapeRegExpSource(Isolate * isolate,Handle<String> source)370 MaybeHandle<String> EscapeRegExpSource(Isolate* isolate,
371                                        Handle<String> source) {
372   DCHECK(source->IsFlat());
373   if (source->length() == 0) return isolate->factory()->query_colon_string();
374   bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
375   bool needs_escapes = false;
376   int additional_escape_chars =
377       one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
378                : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes);
379   if (!needs_escapes) return source;
380   int length = source->length() + additional_escape_chars;
381   if (one_byte) {
382     Handle<SeqOneByteString> result;
383     ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
384                                isolate->factory()->NewRawOneByteString(length),
385                                String);
386     return WriteEscapedRegExpSource<uint8_t>(source, result);
387   } else {
388     Handle<SeqTwoByteString> result;
389     ASSIGN_RETURN_ON_EXCEPTION(isolate, result,
390                                isolate->factory()->NewRawTwoByteString(length),
391                                String);
392     return WriteEscapedRegExpSource<base::uc16>(source, result);
393   }
394 }
395 
396 }  // namespace
397 
398 // static
Initialize(Handle<JSRegExp> regexp,Handle<String> source,Flags flags,uint32_t backtrack_limit)399 MaybeHandle<JSRegExp> JSRegExp::Initialize(Handle<JSRegExp> regexp,
400                                            Handle<String> source, Flags flags,
401                                            uint32_t backtrack_limit) {
402   Isolate* isolate = regexp->GetIsolate();
403   Factory* factory = isolate->factory();
404   // If source is the empty string we set it to "(?:)" instead as
405   // suggested by ECMA-262, 5th, section 15.10.4.1.
406   if (source->length() == 0) source = factory->query_colon_string();
407 
408   source = String::Flatten(isolate, source);
409 
410   RETURN_ON_EXCEPTION(
411       isolate,
412       RegExp::Compile(isolate, regexp, source, JSRegExp::AsRegExpFlags(flags),
413                       backtrack_limit),
414       JSRegExp);
415 
416   Handle<String> escaped_source;
417   ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source,
418                              EscapeRegExpSource(isolate, source), JSRegExp);
419 
420   regexp->set_source(*escaped_source);
421   regexp->set_flags(Smi::FromInt(flags));
422 
423   Map map = regexp->map();
424   Object constructor = map.GetConstructor();
425   if (constructor.IsJSFunction() &&
426       JSFunction::cast(constructor).initial_map() == map) {
427     // If we still have the original map, set in-object properties directly.
428     regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex,
429                                   Smi::FromInt(kInitialLastIndexValue),
430                                   SKIP_WRITE_BARRIER);
431   } else {
432     // Map has changed, so use generic, but slower, method.
433     RETURN_ON_EXCEPTION(
434         isolate,
435         Object::SetProperty(
436             isolate, regexp, factory->lastIndex_string(),
437             Handle<Smi>(Smi::FromInt(kInitialLastIndexValue), isolate)),
438         JSRegExp);
439   }
440 
441   return regexp;
442 }
443 
444 }  // namespace internal
445 }  // namespace v8
446