v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
js-regexp.cc
Go to the documentation of this file.
1// Copyright 2019 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
7#include <optional>
8
9#include "src/base/strings.h"
10#include "src/common/globals.h"
11#include "src/objects/code.h"
14#include "src/regexp/regexp.h"
15
16namespace v8::internal {
17
19 Isolate* isolate, DirectHandle<RegExpMatchInfo> match_info,
20 DirectHandle<Object> maybe_names) {
22 Cast<JSRegExpResultIndices>(isolate->factory()->NewJSObjectFromMap(
23 isolate->regexp_result_indices_map())));
24
25 // Initialize indices length to avoid having a partially initialized object
26 // should GC be triggered by creating a NewFixedArray.
27 indices->set_length(Smi::zero());
28
29 // Build indices array from RegExpMatchInfo.
30 int num_indices = match_info->number_of_capture_registers();
31 int num_results = num_indices >> 1;
32 DirectHandle<FixedArray> indices_array =
33 isolate->factory()->NewFixedArray(num_results);
34 JSArray::SetContent(indices, indices_array);
35
36 for (int i = 0; i < num_results; i++) {
37 const int start_offset =
38 match_info->capture(RegExpMatchInfo::capture_start_index(i));
39 const int end_offset =
40 match_info->capture(RegExpMatchInfo::capture_end_index(i));
41
42 // Any unmatched captures are set to undefined, otherwise we set them to a
43 // subarray of the indices.
44 if (start_offset == -1) {
45 indices_array->set(i, ReadOnlyRoots(isolate).undefined_value());
46 } else {
47 DirectHandle<FixedArray> indices_sub_array(
48 isolate->factory()->NewFixedArray(2));
49 indices_sub_array->set(0, Smi::FromInt(start_offset));
50 indices_sub_array->set(1, Smi::FromInt(end_offset));
51 DirectHandle<JSArray> indices_sub_jsarray =
52 isolate->factory()->NewJSArrayWithElements(indices_sub_array,
54 indices_array->set(i, *indices_sub_jsarray);
55 }
56 }
57
58 // If there are no capture groups, set the groups property to undefined.
60 indices->map(), InternalIndex(kGroupsDescriptorIndex));
61 if (IsUndefined(*maybe_names, isolate)) {
62 indices->FastPropertyAtPut(groups_index,
63 ReadOnlyRoots(isolate).undefined_value());
64 return indices;
65 }
66
67 // Create a groups property which returns a dictionary of named captures to
68 // their corresponding capture indices.
69 auto names = Cast<FixedArray>(maybe_names);
70 int num_names = names->length() >> 1;
71 DirectHandle<HeapObject> group_names;
73 group_names = isolate->factory()->NewSwissNameDictionary(num_names);
74 } else {
75 group_names = isolate->factory()->NewNameDictionary(num_names);
76 }
77 DirectHandle<PropertyDictionary> group_names_dict =
78 Cast<PropertyDictionary>(group_names);
79 for (int i = 0; i < num_names; i++) {
80 int base_offset = i * 2;
81 int name_offset = base_offset;
82 int index_offset = base_offset + 1;
83 DirectHandle<String> name(Cast<String>(names->get(name_offset)), isolate);
84 Tagged<Smi> smi_index = Cast<Smi>(names->get(index_offset));
85 DirectHandle<Object> capture_indices(indices_array->get(smi_index.value()),
86 isolate);
87 if (!IsUndefined(*capture_indices, isolate)) {
88 capture_indices = Cast<JSArray>(capture_indices);
89 }
90 InternalIndex group_entry = group_names_dict->FindEntry(isolate, name);
91 // Duplicate group entries are possible if the capture groups are in
92 // different alternatives, i.e. only one of them can actually match.
93 // Therefore when we find a duplicate entry, either the current entry is
94 // undefined (didn't match anything) or the indices for the current capture
95 // are undefined. In the latter case we don't do anything, in the former
96 // case we update the entry.
97 if (group_entry.is_found()) {
98 DCHECK(v8_flags.js_regexp_duplicate_named_groups);
99 if (!IsUndefined(*capture_indices, isolate)) {
100 DCHECK(IsUndefined(group_names_dict->ValueAt(group_entry), isolate));
101 group_names_dict->ValueAtPut(group_entry, *capture_indices);
102 }
103 } else {
104 group_names_dict =
105 PropertyDictionary::Add(isolate, group_names_dict, name,
106 capture_indices, PropertyDetails::Empty());
107 }
108 }
109
110 // Convert group_names to a JSObject and store at the groups property of the
111 // result indices.
113 isolate->factory()->empty_fixed_array();
114 DirectHandle<Null> null = isolate->factory()->null_value();
115 DirectHandle<JSObject> js_group_names =
116 isolate->factory()->NewSlowJSObjectWithPropertiesAndElements(
117 null, group_names, elements);
118 indices->FastPropertyAtPut(groups_index, *js_group_names);
119 return indices;
120}
121
122// static
123std::optional<JSRegExp::Flags> JSRegExp::FlagsFromString(
124 Isolate* isolate, DirectHandle<String> flags) {
125 const int length = flags->length();
126
127 // A longer flags string cannot be valid.
128 if (length > JSRegExp::kFlagCount) return {};
129
131 FlatStringReader reader(isolate, String::Flatten(isolate, flags));
132
133 for (int i = 0; i < length; i++) {
134 std::optional<RegExpFlag> flag = JSRegExp::FlagFromChar(reader.Get(i));
135 if (!flag.has_value()) return {};
136 if (value & flag.value()) return {}; // Duplicate.
137 value |= flag.value();
138 }
139
140 return JSRegExp::AsJSRegExpFlags(value);
141}
142
143// static
145 JSRegExp::Flags flags) {
146 FlagsBuffer buffer;
147 return isolate->factory()->NewStringFromAsciiChecked(
148 FlagsToString(flags, &buffer));
149}
150
151// static
154 Flags flags,
155 uint32_t backtrack_limit) {
156 DirectHandle<JSFunction> constructor = isolate->regexp_function();
158 Cast<JSRegExp>(isolate->factory()->NewJSObject(constructor));
159
160 // Clear the data field, as a GC can be triggered before the field is set
161 // during compilation.
162 regexp->clear_data();
163
164 return JSRegExp::Initialize(regexp, pattern, flags, backtrack_limit);
165}
166
167// static
170 DirectHandle<String> flags_string) {
171 Isolate* isolate = regexp->GetIsolate();
172 std::optional<Flags> flags = JSRegExp::FlagsFromString(isolate, flags_string);
173 if (!flags.has_value() ||
176 isolate,
177 NewSyntaxError(MessageTemplate::kInvalidRegExpFlags, flags_string));
178 }
179 return Initialize(regexp, source, flags.value());
180}
181
182namespace {
183
184bool IsLineTerminator(int c) {
185 // Expected to return true for '\n', '\r', 0x2028, and 0x2029.
186 return unibrow::IsLineTerminator(static_cast<unibrow::uchar>(c));
187}
188
189// TODO(jgruber): Consider merging CountAdditionalEscapeChars and
190// WriteEscapedRegExpSource into a single function to deduplicate dispatch logic
191// and move related code closer to each other.
192template <typename Char>
193int CountAdditionalEscapeChars(DirectHandle<String> source,
194 bool* needs_escapes_out) {
196 int escapes = 0;
197 bool needs_escapes = false;
198 bool in_character_class = false;
199 base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
200 for (int i = 0; i < src.length(); i++) {
201 const Char c = src[i];
202 if (c == '\\') {
203 if (i + 1 < src.length() && IsLineTerminator(src[i + 1])) {
204 // This '\' is ignored since the next character itself will be escaped.
205 escapes--;
206 } else {
207 // Escape. Skip next character, which will be copied verbatim;
208 i++;
209 }
210 } else if (c == '/' && !in_character_class) {
211 // Not escaped forward-slash needs escape.
212 needs_escapes = true;
213 escapes++;
214 } else if (c == '[') {
215 in_character_class = true;
216 } else if (c == ']') {
217 in_character_class = false;
218 } else if (c == '\n') {
219 needs_escapes = true;
220 escapes++;
221 } else if (c == '\r') {
222 needs_escapes = true;
223 escapes++;
224 } else if (static_cast<int>(c) == 0x2028) {
225 needs_escapes = true;
226 escapes += std::strlen("\\u2028") - 1;
227 } else if (static_cast<int>(c) == 0x2029) {
228 needs_escapes = true;
229 escapes += std::strlen("\\u2029") - 1;
230 } else {
232 }
233 }
234 DCHECK(!in_character_class);
235 DCHECK_GE(escapes, 0);
236 DCHECK_IMPLIES(escapes != 0, needs_escapes);
237 *needs_escapes_out = needs_escapes;
238 return escapes;
239}
240
241template <typename Char>
242void WriteStringToCharVector(base::Vector<Char> v, int* d, const char* string) {
243 int s = 0;
244 while (string[s] != '\0') v[(*d)++] = string[s++];
245}
246
247template <typename Char, typename StringType>
248DirectHandle<StringType> WriteEscapedRegExpSource(
249 DirectHandle<String> source, DirectHandle<StringType> result) {
251 base::Vector<const Char> src = source->GetCharVector<Char>(no_gc);
252 base::Vector<Char> dst(result->GetChars(no_gc), result->length());
253 int s = 0;
254 int d = 0;
255 bool in_character_class = false;
256 while (s < src.length()) {
257 const Char c = src[s];
258 if (c == '\\') {
259 if (s + 1 < src.length() && IsLineTerminator(src[s + 1])) {
260 // This '\' is ignored since the next character itself will be escaped.
261 s++;
262 continue;
263 } else {
264 // Escape. Copy this and next character.
265 dst[d++] = src[s++];
266 }
267 if (s == src.length()) break;
268 } else if (c == '/' && !in_character_class) {
269 // Not escaped forward-slash needs escape.
270 dst[d++] = '\\';
271 } else if (c == '[') {
272 in_character_class = true;
273 } else if (c == ']') {
274 in_character_class = false;
275 } else if (c == '\n') {
276 WriteStringToCharVector(dst, &d, "\\n");
277 s++;
278 continue;
279 } else if (c == '\r') {
280 WriteStringToCharVector(dst, &d, "\\r");
281 s++;
282 continue;
283 } else if (static_cast<int>(c) == 0x2028) {
284 WriteStringToCharVector(dst, &d, "\\u2028");
285 s++;
286 continue;
287 } else if (static_cast<int>(c) == 0x2029) {
288 WriteStringToCharVector(dst, &d, "\\u2029");
289 s++;
290 continue;
291 } else {
293 }
294 dst[d++] = src[s++];
295 }
296 DCHECK_EQ(result->length(), d);
297 DCHECK(!in_character_class);
298 return result;
299}
300
301MaybeDirectHandle<String> EscapeRegExpSource(Isolate* isolate,
302 DirectHandle<String> source) {
303 DCHECK(source->IsFlat());
304 if (source->length() == 0) return isolate->factory()->query_colon_string();
305 bool one_byte = String::IsOneByteRepresentationUnderneath(*source);
306 bool needs_escapes = false;
307 int additional_escape_chars =
308 one_byte ? CountAdditionalEscapeChars<uint8_t>(source, &needs_escapes)
309 : CountAdditionalEscapeChars<base::uc16>(source, &needs_escapes);
310 if (!needs_escapes) return source;
311 int length = source->length() + additional_escape_chars;
312 if (one_byte) {
313 DirectHandle<SeqOneByteString> result;
315 isolate->factory()->NewRawOneByteString(length));
316 return WriteEscapedRegExpSource<uint8_t>(source, result);
317 } else {
318 DirectHandle<SeqTwoByteString> result;
320 isolate->factory()->NewRawTwoByteString(length));
321 return WriteEscapedRegExpSource<base::uc16>(source, result);
322 }
323}
324
325} // namespace
326
327// static
330 Flags flags,
331 uint32_t backtrack_limit) {
332 Isolate* isolate = regexp->GetIsolate();
333 Factory* factory = isolate->factory();
334 // If source is the empty string we set it to "(?:)" instead as
335 // suggested by ECMA-262, 5th, section 15.10.4.1.
336 if (source->length() == 0) source = factory->query_colon_string();
337
338 source = String::Flatten(isolate, source);
339
340 RETURN_ON_EXCEPTION(isolate, RegExp::Compile(isolate, regexp, source,
342 backtrack_limit));
343
344 DirectHandle<String> escaped_source;
345 ASSIGN_RETURN_ON_EXCEPTION(isolate, escaped_source,
346 EscapeRegExpSource(isolate, source));
347
348 regexp->set_source(*escaped_source);
349 regexp->set_flags(Smi::FromInt(flags));
350
351 Tagged<Map> map = regexp->map();
352 Tagged<Object> constructor = map->GetConstructor();
353 if (IsJSFunction(constructor) &&
354 Cast<JSFunction>(constructor)->initial_map() == map) {
355 // If we still have the original map, set in-object properties directly.
356 regexp->InObjectPropertyAtPut(JSRegExp::kLastIndexFieldIndex,
359 } else {
360 // Map has changed, so use generic, but slower, method.
362 isolate,
364 isolate, regexp, factory->lastIndex_string(),
366 }
367
368 return regexp;
369}
370
372 if (type_tag() != Type::IRREGEXP) return false;
374 return re_data->has_latin1_code() || re_data->has_uc16_code();
375}
376
377// Only irregexps are subject to tier-up.
379 return v8_flags.regexp_tier_up && type_tag() == Type::IRREGEXP;
380}
381
382// An irregexp is considered to be marked for tier up if the tier-up ticks
383// value reaches zero.
385 if (!CanTierUp()) {
386 return false;
387 }
388
389 return ticks_until_tier_up() == 0;
390}
391
393 DCHECK(v8_flags.regexp_tier_up);
394 int tier_up_ticks = ticks_until_tier_up();
395 set_ticks_until_tier_up(tier_up_ticks + 1);
396}
397
399 int tier_up_ticks = ticks_until_tier_up();
400 if (tier_up_ticks == 0) {
401 return;
402 }
403
404 set_ticks_until_tier_up(tier_up_ticks - 1);
405}
406
408 DCHECK(v8_flags.regexp_tier_up);
409 set_ticks_until_tier_up(0);
410}
411
413 return v8_flags.regexp_interpret_all ||
414 (v8_flags.regexp_tier_up && !MarkedForTierUp());
415}
416
419 clear_latin1_code();
420 clear_uc16_code();
421 clear_latin1_bytecode();
422 clear_uc16_bytecode();
423}
424
426 Isolate* isolate, Tagged<TrustedByteArray> bytecode) {
427 set_latin1_bytecode(bytecode);
428 set_uc16_bytecode(bytecode);
429
430 Tagged<Code> trampoline =
431 *BUILTIN_CODE(isolate, RegExpExperimentalTrampoline);
432 set_latin1_code(trampoline);
433 set_uc16_code(trampoline);
434}
435
436} // namespace v8::internal
#define BUILTIN_CODE(isolate, name)
Definition builtins.h:45
static FieldIndex ForDescriptor(Tagged< Map > map, InternalIndex descriptor_index)
base::uc32 Get(uint32_t index) const
Definition string-inl.h:390
void SetBytecodeForExperimental(Isolate *isolate, Tagged< TrustedByteArray > bytecode)
Definition js-regexp.cc:425
void DiscardCompiledCodeForSerialization()
Definition js-regexp.cc:417
static void SetContent(DirectHandle< JSArray > array, DirectHandle< FixedArrayBase > storage)
static DirectHandle< JSRegExpResultIndices > BuildIndices(Isolate *isolate, DirectHandle< RegExpMatchInfo > match_info, DirectHandle< Object > maybe_names)
Definition js-regexp.cc:18
static constexpr int kGroupsDescriptorIndex
Definition js-regexp.h:365
static const char * FlagsToString(Flags flags, FlagsBuffer *out_buffer)
static constexpr int kLastIndexFieldIndex
Definition js-regexp.h:114
static MaybeDirectHandle< JSRegExp > Initialize(DirectHandle< JSRegExp > regexp, DirectHandle< String > source, Flags flags, uint32_t backtrack_limit=kNoBacktrackLimit)
Definition js-regexp.cc:328
static constexpr RegExpFlags AsRegExpFlags(Flags f)
Definition js-regexp.h:57
static V8_EXPORT_PRIVATE DirectHandle< String > StringFromFlags(Isolate *isolate, Flags flags)
Definition js-regexp.cc:144
static V8_EXPORT_PRIVATE MaybeDirectHandle< JSRegExp > New(Isolate *isolate, DirectHandle< String > source, Flags flags, uint32_t backtrack_limit=kNoBacktrackLimit)
Definition js-regexp.cc:152
static constexpr Flags AsJSRegExpFlags(RegExpFlags f)
Definition js-regexp.h:54
static std::optional< RegExpFlag > FlagFromChar(char c)
Definition js-regexp.h:61
static std::optional< Flags > FlagsFromString(Isolate *isolate, DirectHandle< String > flags)
Definition js-regexp.cc:123
static constexpr int kInitialLastIndexValue
Definition js-regexp.h:111
V8_EXPORT_PRIVATE static V8_WARN_UNUSED_RESULT Maybe< bool > SetProperty(LookupIterator *it, DirectHandle< Object > value, StoreOrigin store_origin, Maybe< ShouldThrow > should_throw=Nothing< ShouldThrow >())
Definition objects.cc:2439
static constexpr PropertyDetails Empty(PropertyCellType cell_type=PropertyCellType::kNoCell)
V8_EXPORT_PRIVATE bool HasCompiledCode() const
Definition js-regexp.cc:371
static constexpr int capture_start_index(int capture_index)
static constexpr int capture_end_index(int capture_index)
static V8_EXPORT_PRIVATE bool VerifyFlags(RegExpFlags flags)
Definition regexp.cc:117
static V8_WARN_UNUSED_RESULT MaybeDirectHandle< Object > Compile(Isolate *isolate, DirectHandle< JSRegExp > re, DirectHandle< String > pattern, RegExpFlags flags, uint32_t backtrack_limit)
Definition regexp.cc:200
static constexpr Tagged< Smi > FromInt(int value)
Definition smi.h:38
static constexpr Tagged< Smi > zero()
Definition smi.h:99
static V8_INLINE HandleType< String > Flatten(Isolate *isolate, HandleType< T > string, AllocationType allocation=AllocationType::kYoung)
static bool IsOneByteRepresentationUnderneath(Tagged< String > string)
Definition string-inl.h:373
#define V8_ENABLE_SWISS_NAME_DICTIONARY_BOOL
Definition globals.h:242
#define RETURN_ON_EXCEPTION(isolate, call)
Definition isolate.h:395
#define ASSIGN_RETURN_ON_EXCEPTION(isolate, dst, call)
Definition isolate.h:291
#define THROW_NEW_ERROR(isolate, call)
Definition isolate.h:307
std::string pattern
ZoneVector< RpoNumber > & result
int s
Definition mul-fft.cc:297
V8_INLINE bool IsLineTerminator(uchar c)
Definition unicode.h:267
unsigned int uchar
Definition unicode.h:21
uint16_t uc16
Definition strings.h:18
V8_EXPORT_PRIVATE base::Vector< Flag > Flags()
Definition flags.cc:300
PerThreadAssertScopeDebugOnly< false, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > DisallowGarbageCollection
@ SKIP_WRITE_BARRIER
Definition objects.h:52
too high values may cause the compiler to set high thresholds for inlining to as much as possible avoid inlined allocation of objects that cannot escape trace load stores from virtual maglev objects use TurboFan fast string builder analyze liveness of environment slots and zap dead values trace TurboFan load elimination emit data about basic block usage in builtins to this enable builtin reordering when run mksnapshot flag for emit warnings when applying builtin profile data verify register allocation in TurboFan randomly schedule instructions to stress dependency tracking enable store store elimination in TurboFan rewrite far to near simulate GC compiler thread race related to allow float parameters to be passed in simulator mode JS Wasm Run additional turbo_optimize_inlined_js_wasm_wrappers enable experimental feedback collection in generic lowering enable Turboshaft s WasmLoadElimination enable Turboshaft s low level load elimination for JS enable Turboshaft s escape analysis for string concatenation use enable Turbolev features that we want to ship in the not too far future trace individual Turboshaft reduction steps trace intermediate Turboshaft reduction steps invocation count threshold for early optimization Enables optimizations which favor memory size over execution speed Enables sampling allocation profiler with X as a sample interval min size of a semi the new space consists of two semi spaces max size of the Collect garbage after Collect garbage after keeps maps alive for< n > old space garbage collections print one detailed trace line in name
Definition flags.cc:2086
V8_EXPORT_PRIVATE FlagValues v8_flags
return value
Definition map-inl.h:893
Tagged< To > Cast(Tagged< From > value, const v8::SourceLocation &loc=INIT_SOURCE_LOCATION_IN_DEBUG)
Definition casting.h:150
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493
#define DCHECK_GE(v1, v2)
Definition logging.h:488
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_EQ(v1, v2)
Definition logging.h:485