v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
regexp.cc
Go to the documentation of this file.
1// Copyright 2012 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
5#include "src/regexp/regexp.h"
6
7#include "src/base/strings.h"
11#include "src/heap/heap-inl.h"
25#include "src/utils/ostreams.h"
26
27namespace v8 {
28namespace internal {
29
30using namespace regexp_compiler_constants; // NOLINT(build/namespaces)
31
// Static-only class (derives from AllStatic) that declares the Atom and
// Irregexp entry points used by the RegExp functions defined in this file.
//
// NOTE(review): this text comes from a rendered listing whose own line numbers
// skip inside several declarations below (e.g. 36->38, 40->42, 52->54, 56->58,
// 60->62, 71->73, 83->85, 95->97, 99->101, 105->107). The skipped lines are
// not visible here, so those declarations are missing at least one parameter
// line; restore them from the upstream file before compiling.
32class RegExpImpl final : public AllStatic {
33 public:
34 // Returns a string representation of a regular expression.
35 // Implements RegExp.prototype.toString, see ECMA-262 section 15.10.6.4.
36 // This function calls the garbage collector if necessary.
// NOTE(review): the declaration this comment describes (listing line 37) is
// not visible in this text.
38
39 // Prepares a JSRegExp object with Irregexp-specific data.
40 static void IrregexpInitialize(Isolate* isolate, DirectHandle<JSRegExp> re,
42 RegExpFlags flags, int capture_count,
43 uint32_t backtrack_limit);
44
45 // Prepare a RegExp for being executed one or more times (using
46 // IrregexpExecOnce) on the subject.
47 // This ensures that the regexp is compiled for the subject, and that
48 // the subject is flat.
49 // Returns the number of integer spaces required by IrregexpExecOnce
50 // as its "registers" argument. If the regexp cannot be compiled,
51 // an exception is thrown as indicated by a negative return value.
52 static int IrregexpPrepare(Isolate* isolate,
54 DirectHandle<String> subject);
55
// Stores atom data (pattern, flags and the literal match pattern) on `re`.
56 static void AtomCompile(Isolate* isolate, DirectHandle<JSRegExp> re,
58 DirectHandle<String> match_pattern);
59
// Handle-based overload: flattens the subject and forwards to the
// FlatContent overload below.
60 static int AtomExecRaw(Isolate* isolate,
62 DirectHandle<String> subject, int index,
63 int32_t* result_offsets_vector,
64 int result_offsets_vector_length);
// FlatContent overload: picks the one-byte/two-byte search specialization
// for the pattern and subject and returns the number of matches written.
65 static int AtomExecRaw(Isolate* isolate, const String::FlatContent& pattern,
66 const String::FlatContent& subject, int index,
67 RegExpFlags flags, int32_t* result_offsets_vector,
68 int result_offsets_vector_length,
69 const DisallowGarbageCollection& no_gc);
70
71 static int AtomExec(Isolate* isolate,
73 DirectHandle<String> subject, int index,
74 int32_t* result_offsets_vector,
75 int result_offsets_vector_length);
76
77 // Execute a regular expression on the subject, starting from index.
78 // If matching succeeds, return the number of matches. This can be larger
79 // than one in the case of global regular expressions.
80 // The captures and subcaptures are stored into the registers vector.
81 // If matching fails, returns RE_FAILURE.
82 // If execution fails, sets an exception and returns RE_EXCEPTION.
83 static int IrregexpExecRaw(Isolate* isolate,
85 DirectHandle<String> subject, int index,
86 int32_t* output, int output_size);
87
88 // Execute an Irregexp pattern. Returns the number of matches, or an
89 // empty std::optional in case of an exception.
90 V8_WARN_UNUSED_RESULT static std::optional<int> IrregexpExec(
91 Isolate* isolate, DirectHandle<IrRegExpData> regexp_data,
92 DirectHandle<String> subject, int index, int32_t* result_offsets_vector,
93 uint32_t result_offsets_vector_length);
94
// Compiles the regexp for the given subject representation. Returns false
// after throwing if compilation fails.
95 static bool CompileIrregexp(Isolate* isolate,
97 DirectHandle<String> sample_subject,
98 bool is_one_byte);
// Calls CompileIrregexp only when an initial or tier-up compilation is
// still needed; otherwise returns true immediately.
99 static inline bool EnsureCompiledIrregexp(Isolate* isolate,
101 DirectHandle<String> sample_subject,
102 bool is_one_byte);
103
104 // Returns true on success, false on failure.
105 static bool Compile(Isolate* isolate, Zone* zone, RegExpCompileData* input,
107 DirectHandle<String> sample_subject, bool is_one_byte,
108 uint32_t& backtrack_limit);
109};
110
111// static
113 return v8_flags.regexp_interpret_all || v8_flags.regexp_tier_up;
114}
115
116// static
118 if (IsUnicode(flags) && IsUnicodeSets(flags)) return false;
119 return true;
120}
121
122// static
123template <class CharT>
124bool RegExp::VerifySyntax(Zone* zone, uintptr_t stack_limit, const CharT* input,
125 int input_length, RegExpFlags flags,
126 RegExpError* regexp_error_out,
127 const DisallowGarbageCollection& no_gc) {
129 bool pattern_is_valid = RegExpParser::VerifyRegExpSyntax(
130 zone, stack_limit, input, input_length, flags, &data, no_gc);
131 *regexp_error_out = data.error;
132 return pattern_is_valid;
133}
134
135template bool RegExp::VerifySyntax<uint8_t>(Zone*, uintptr_t, const uint8_t*,
136 int, RegExpFlags,
137 RegExpError* regexp_error_out,
139template bool RegExp::VerifySyntax<base::uc16>(
140 Zone*, uintptr_t, const base::uc16*, int, RegExpFlags,
141 RegExpError* regexp_error_out, const DisallowGarbageCollection&);
142
145 RegExpError error) {
146 base::Vector<const char> error_data =
148 DirectHandle<String> error_text =
149 isolate->factory()
150 ->NewStringFromOneByte(base::Vector<const uint8_t>::cast(error_data))
151 .ToHandleChecked();
152 DirectHandle<String> flag_string =
154 THROW_NEW_ERROR(isolate, NewSyntaxError(MessageTemplate::kMalformedRegExp,
155 pattern, flag_string, error_text));
156}
157
160 RegExpError error_text) {
161 USE(ThrowRegExpException(isolate, JSRegExp::AsRegExpFlags(re_data->flags()),
162 direct_handle(re_data->source(), isolate),
163 error_text));
164}
165
167 DirectHandle<JSRegExp> regexp) {
168 return RegExpUtils::IsUnmodifiedRegExp(isolate, regexp);
169}
170
171namespace {
172
173// Identifies the sort of regexps where the regexp engine is faster
174// than the code used for atom matches.
175bool HasFewDifferentCharacters(DirectHandle<String> pattern) {
176 uint32_t length = std::min(kMaxLookaheadForBoyerMoore, pattern->length());
177 if (length <= kPatternTooShortForBoyerMoore) return false;
178 const int kMod = 128;
179 bool character_found[kMod];
180 uint32_t different = 0;
181 memset(&character_found[0], 0, sizeof(character_found));
182 for (uint32_t i = 0; i < length; i++) {
183 int ch = (pattern->Get(i) & (kMod - 1));
184 if (!character_found[ch]) {
185 character_found[ch] = true;
186 different++;
187 // We declare a regexp low-alphabet if it has at least 3 times as many
188 // characters as it has different characters.
189 if (different * 3 > length) return false;
190 }
191 }
192 return true;
193}
194
195} // namespace
196
197// Generic RegExp methods. Dispatches to implementation specific methods.
198
199// static
203 RegExpFlags flags,
204 uint32_t backtrack_limit) {
205 DCHECK(pattern->IsFlat());
206
207 // Caching is based only on the pattern and flags, but code also differs when
208 // a backtrack limit is set. A present backtrack limit is very much *not* the
209 // common case, so just skip the cache for these.
210 const bool is_compilation_cache_enabled =
211 (backtrack_limit == JSRegExp::kNoBacktrackLimit);
212
213 Zone zone(isolate->allocator(), ZONE_NAME);
214 CompilationCache* compilation_cache = nullptr;
215 if (is_compilation_cache_enabled) {
216 compilation_cache = isolate->compilation_cache();
217 MaybeDirectHandle<RegExpData> maybe_cached =
218 compilation_cache->LookupRegExp(pattern,
221 if (maybe_cached.ToHandle(&cached)) {
222 re->set_data(*cached);
223 return re;
224 }
225 }
226
227 PostponeInterruptsScope postpone(isolate);
228 RegExpCompileData parse_result;
229 DCHECK(!isolate->has_exception());
230 if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
231 &parse_result)) {
232 // Throw an exception if we fail to parse the pattern.
233 return RegExp::ThrowRegExpException(isolate, flags, pattern,
234 parse_result.error);
235 }
236
237 bool has_been_compiled = false;
238
239 if (v8_flags.default_to_experimental_regexp_engine &&
240 ExperimentalRegExp::CanBeHandled(parse_result.tree, pattern, flags,
241 parse_result.capture_count)) {
242 DCHECK(v8_flags.enable_experimental_regexp_engine);
243 ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
244 parse_result.capture_count);
245 has_been_compiled = true;
246 } else if (flags & JSRegExp::kLinear) {
247 DCHECK(v8_flags.enable_experimental_regexp_engine);
248 if (!ExperimentalRegExp::CanBeHandled(parse_result.tree, pattern, flags,
249 parse_result.capture_count)) {
250 // TODO(mbid): The error could provide a reason for why the regexp can't
251 // be executed in linear time (e.g. due to back references).
252 return RegExp::ThrowRegExpException(isolate, flags, pattern,
253 RegExpError::kNotLinear);
254 }
255 ExperimentalRegExp::Initialize(isolate, re, pattern, flags,
256 parse_result.capture_count);
257 has_been_compiled = true;
258 } else if (parse_result.simple && !IsIgnoreCase(flags) && !IsSticky(flags) &&
259 !HasFewDifferentCharacters(pattern)) {
260 // Parse-tree is a single atom that is equal to the pattern.
261 RegExpImpl::AtomCompile(isolate, re, pattern, flags, pattern);
262 has_been_compiled = true;
263 } else if (parse_result.tree->IsAtom() && !IsSticky(flags) &&
264 parse_result.capture_count == 0) {
265 RegExpAtom* atom = parse_result.tree->AsAtom();
266 // The pattern source might (?) contain escape sequences, but they're
267 // resolved in atom_string.
268 base::Vector<const base::uc16> atom_pattern = atom->data();
269 DirectHandle<String> atom_string;
271 isolate, atom_string,
272 isolate->factory()->NewStringFromTwoByte(atom_pattern));
273 if (!IsIgnoreCase(flags) && !HasFewDifferentCharacters(atom_string)) {
274 RegExpImpl::AtomCompile(isolate, re, pattern, flags, atom_string);
275 has_been_compiled = true;
276 }
277 }
278 if (!has_been_compiled) {
279 RegExpImpl::IrregexpInitialize(isolate, re, pattern, flags,
280 parse_result.capture_count, backtrack_limit);
281 }
282 // Compilation succeeded so the data is set on the regexp
283 // and we can store it in the cache.
284 DirectHandle<RegExpData> data(re->data(isolate), isolate);
285 if (is_compilation_cache_enabled) {
286 compilation_cache->PutRegExp(pattern, JSRegExp::AsJSRegExpFlags(flags),
287 data);
288 }
289
290 return re;
291}
292
293// static
296 DirectHandle<String> subject) {
297 switch (re_data->type_tag()) {
299 return true;
302 subject) == -1) {
303 DCHECK(isolate->has_exception());
304 return false;
305 }
306 return true;
309 isolate) &&
311 DCHECK(isolate->has_exception());
312 return false;
313 }
314 return true;
315 }
316 UNREACHABLE();
317}
318
319// static
321 Isolate* isolate, DirectHandle<JSRegExp> regexp,
322 DirectHandle<String> subject, int index, int32_t* result_offsets_vector,
323 uint32_t result_offsets_vector_length) {
324 DirectHandle<RegExpData> data(regexp->data(isolate), isolate);
327 subject, index, result_offsets_vector,
328 result_offsets_vector_length);
329}
330
331// static
// Runs `regexp` against `subject` starting at `index`. Loads the regexp's
// data object and, based on its type tag, forwards all arguments to
// RegExpImpl::AtomExec, RegExpImpl::IrregexpExec or ExperimentalRegExp::Exec,
// returning whatever the selected callee returns.
//
// NOTE(review): the listing's own numbering skips 338, 342 and 346 inside the
// switch, so the `case` labels that select each of the three returns are not
// visible in this text; restore them from the upstream file.
332std::optional<int> RegExp::Exec(Isolate* isolate, DirectHandle<JSRegExp> regexp,
333 DirectHandle<String> subject, int index,
334 int32_t* result_offsets_vector,
335 uint32_t result_offsets_vector_length) {
336 DirectHandle<RegExpData> data(regexp->data(isolate), isolate);
337 switch (data->type_tag()) {
// Atom data: literal string search.
339 return RegExpImpl::AtomExec(isolate, Cast<AtomRegExpData>(data), subject,
340 index, result_offsets_vector,
341 result_offsets_vector_length);
// Irregexp data.
343 return RegExpImpl::IrregexpExec(isolate, Cast<IrRegExpData>(data),
344 subject, index, result_offsets_vector,
345 result_offsets_vector_length);
// Experimental engine; its data is also cast to IrRegExpData.
347 return ExperimentalRegExp::Exec(isolate, Cast<IrRegExpData>(data),
348 subject, index, result_offsets_vector,
349 result_offsets_vector_length);
350 }
351 // This UNREACHABLE() is necessary because we don't return a value here,
352 // which causes the compiler to emit potentially unsafe code for the switch
353 // above. See the commit message and b/326086002 for more details.
354 UNREACHABLE();
355}
356
357// static
359 Isolate* isolate, DirectHandle<JSRegExp> regexp,
360 DirectHandle<String> subject, int index,
361 DirectHandle<RegExpMatchInfo> last_match_info) {
362 RegExpStackScope stack_scope(isolate);
363 DirectHandle<RegExpData> data(regexp->data(isolate), isolate);
364 int capture_count = data->capture_count();
365 int result_offsets_vector_length =
367 RegExpResultVectorScope result_vector_scope(isolate,
368 result_offsets_vector_length);
369 std::optional<int> result =
370 RegExp::Exec(isolate, regexp, subject, index, result_vector_scope.value(),
371 result_offsets_vector_length);
372 DCHECK_EQ(!result, isolate->has_exception());
373 if (!result) return {};
374
375 if (result.value() == 0) {
376 return isolate->factory()->null_value();
377 }
378
379 DCHECK_EQ(result.value(), 1);
380 return RegExp::SetLastMatchInfo(isolate, last_match_info, subject,
381 capture_count, result_vector_scope.value());
382}
383
384// RegExp Atom implementation: Simple string search using indexOf.
385
388 DirectHandle<String> match_pattern) {
389 isolate->factory()->SetRegExpAtomData(
390 re, pattern, JSRegExp::AsJSRegExpFlags(flags), match_pattern);
391}
392
393namespace {
394
// Searches `subject` for successive occurrences of `pattern` using
// StringSearch and records each one in `output` as a (start, end) pair.
//
// One match is attempted per JSRegExp::kAtomRegisterCount slots of
// `output_size`. Returns the number of matches recorded; a return of 0
// coincides with RegExp::RE_FAILURE (see the static_asserts below).
//
// NOTE(review): the listing's own numbering skips 397 and 409. The parameter
// line that should declare `pattern` and `index`, and part of the condition
// guarding the surrogate adjustment, are therefore not visible in this text;
// restore them from the upstream file.
395template <typename SChar, typename PChar>
396int AtomExecRawImpl(Isolate* isolate, base::Vector<const SChar> subject,
398 RegExpFlags flags, int32_t* output, int output_size,
399 const DisallowGarbageCollection& no_gc) {
400 const int subject_length = subject.length();
401 const int pattern_length = pattern.length();
402 DCHECK_GT(pattern_length, 0);
// Largest start position at which the whole pattern still fits in the subject.
403 const int max_index = subject_length - pattern_length;
404
405 StringSearch<PChar, SChar> search(isolate, pattern);
406 for (int i = 0; i < output_size; i += JSRegExp::kAtomRegisterCount) {
// This adjustment is compiled only for 16-bit subject characters.
407 if constexpr (std::is_same_v<SChar, uint16_t>) {
408 if (index > 0 && index < subject_length &&
410 // See https://github.com/tc39/ecma262/issues/128 and
411 // https://codereview.chromium.org/1608693003.
// If `index` sits on a trail surrogate that follows a lead surrogate, step
// back one unit so the search starts at the beginning of the pair.
412 if (unibrow::Utf16::IsTrailSurrogate(subject[index]) &&
413 unibrow::Utf16::IsLeadSurrogate(subject[index - 1])) {
414 index--;
415 }
416 }
417 }
418
// Not enough subject left for another occurrence: stop with the matches
// recorded so far.
419 if (index > max_index) {
420 static_assert(RegExp::RE_FAILURE == 0);
421 return i / JSRegExp::kAtomRegisterCount; // Return number of matches.
422 }
423 index = search.Search(subject, index);
424 if (index == -1) {
425 static_assert(RegExp::RE_FAILURE == 0);
426 return i / JSRegExp::kAtomRegisterCount; // Return number of matches.
427 } else {
428 output[i] = index; // match start
// Continue the next search immediately after this match.
429 index += pattern_length;
430 output[i + 1] = index; // match end
431 }
432 }
433
// Every available output pair was filled.
434 return output_size / JSRegExp::kAtomRegisterCount;
435}
436
437} // namespace
438
439// static
442 DirectHandle<String> subject, int index,
443 int32_t* result_offsets_vector,
444 int result_offsets_vector_length) {
445 subject = String::Flatten(isolate, subject);
446
448 Tagged<String> needle = regexp_data->pattern(isolate);
449 RegExpFlags flags = JSRegExp::AsRegExpFlags(regexp_data->flags());
450 String::FlatContent needle_content = needle->GetFlatContent(no_gc);
451 String::FlatContent subject_content = subject->GetFlatContent(no_gc);
452 return AtomExecRaw(isolate, needle_content, subject_content, index, flags,
453 result_offsets_vector, result_offsets_vector_length,
454 no_gc);
455}
456
457// static
460 const String::FlatContent& subject, int index,
461 RegExpFlags flags, int32_t* result_offsets_vector,
462 int result_offsets_vector_length,
463 const DisallowGarbageCollection& no_gc) {
464 DCHECK_GE(index, 0);
465 DCHECK_LE(index, subject.length());
466 CHECK_EQ(result_offsets_vector_length % JSRegExp::kAtomRegisterCount, 0);
467 DCHECK(pattern.IsFlat());
468 DCHECK(subject.IsFlat());
469
470 return pattern.IsOneByte()
471 ? (subject.IsOneByte()
472 ? AtomExecRawImpl(isolate, subject.ToOneByteVector(),
473 pattern.ToOneByteVector(), index, flags,
474 result_offsets_vector,
475 result_offsets_vector_length, no_gc)
476 : AtomExecRawImpl(isolate, subject.ToUC16Vector(),
477 pattern.ToOneByteVector(), index, flags,
478 result_offsets_vector,
479 result_offsets_vector_length, no_gc))
480 : (subject.IsOneByte()
481 ? AtomExecRawImpl(isolate, subject.ToOneByteVector(),
482 pattern.ToUC16Vector(), index, flags,
483 result_offsets_vector,
484 result_offsets_vector_length, no_gc)
485 : AtomExecRawImpl(isolate, subject.ToUC16Vector(),
486 pattern.ToUC16Vector(), index, flags,
487 result_offsets_vector,
488 result_offsets_vector_length, no_gc));
489}
490
491// static
// Raw-address entry point for atom matching. Interprets `data_address` as an
// AtomRegExpData object and `subject_address` as a String, obtains the flat
// contents of the stored pattern and of the subject, and forwards them with
// `index` and the output buffer to the FlatContent overload of
// RegExpImpl::AtomExecRaw, returning its result.
//
// NOTE(review): the listing's own numbering skips 497 and 499. `no_gc` is used
// below but its declaration is not visible in this text; restore the missing
// lines from the upstream file.
492intptr_t RegExp::AtomExecRaw(Isolate* isolate,
493 Address /* AtomRegExpData */ data_address,
494 Address /* String */ subject_address,
495 int32_t index, int32_t* result_offsets_vector,
496 int32_t result_offsets_vector_length) {
498
500 auto data = Cast<AtomRegExpData>(Tagged<Object>(data_address));
501 auto subject = Cast<String>(Tagged<Object>(subject_address));
502
503 Tagged<String> pattern = data->pattern(isolate);
504 RegExpFlags flags = JSRegExp::AsRegExpFlags(data->flags());
505 String::FlatContent pattern_content = pattern->GetFlatContent(no_gc);
506 String::FlatContent subject_content = subject->GetFlatContent(no_gc);
507 return RegExpImpl::AtomExecRaw(isolate, pattern_content, subject_content,
508 index, flags, result_offsets_vector,
509 result_offsets_vector_length, no_gc);
510}
511
513 DirectHandle<String> subject, int index,
514 int32_t* result_offsets_vector,
515 int result_offsets_vector_length) {
516 int res = AtomExecRaw(isolate, re_data, subject, index, result_offsets_vector,
517 result_offsets_vector_length);
518
520 return res;
521}
522
523// Irregexp implementation.
524
525// Ensures that the regexp object contains a compiled version of the
526// source for either one-byte or two-byte subject strings.
527// If the compiled version doesn't already exist, it is compiled
528// from the source pattern.
529// If compilation fails, an exception is thrown and this function
530// returns false.
533 DirectHandle<String> sample_subject,
534 bool is_one_byte) {
535 bool has_bytecode = re_data->has_bytecode(is_one_byte);
536 bool needs_initial_compilation = !re_data->has_code(is_one_byte);
537 // Recompile is needed when we're dealing with the first execution of the
538 // regexp after the decision to tier up has been made. If the tiering up
539 // strategy is not in use, this value is always false.
540 bool needs_tier_up_compilation = re_data->MarkedForTierUp() && has_bytecode;
541
542 if (v8_flags.trace_regexp_tier_up && needs_tier_up_compilation) {
543 PrintF("JSRegExp object (data: %p) needs tier-up compilation\n",
544 reinterpret_cast<void*>(re_data->ptr()));
545 }
546
547 if (!needs_initial_compilation && !needs_tier_up_compilation) {
548 DCHECK(re_data->has_code(is_one_byte));
549 DCHECK_IMPLIES(v8_flags.regexp_interpret_all, has_bytecode);
550 return true;
551 }
552
553 DCHECK_IMPLIES(needs_tier_up_compilation, has_bytecode);
554
555 return CompileIrregexp(isolate, re_data, sample_subject, is_one_byte);
556}
557
558namespace {
559
560#ifdef DEBUG
// Debug-only consistency check on the code/bytecode slots of `re_data` for
// the given string representation, run before compilation.
//
// When bytecode is about to be produced, neither code nor bytecode may be
// present yet; otherwise existing code implies existing bytecode. Violations
// trip the DCHECKs; the function itself always returns true so that it can be
// wrapped in a DCHECK at the call site.
//
// NOTE(review): the listing's own numbering skips 562, so the parameter line
// that should declare `re_data` is not visible in this text.
561bool RegExpCodeIsValidForPreCompilation(IsolateForSandbox isolate,
563 bool is_one_byte) {
564 bool has_code = re_data->has_code(is_one_byte);
565 bool has_bytecode = re_data->has_bytecode(is_one_byte);
566 if (re_data->ShouldProduceBytecode()) {
567 DCHECK(!has_code);
568 DCHECK(!has_bytecode);
569 } else {
570 DCHECK_IMPLIES(has_code, has_bytecode);
571 }
572
573 return true;
574}
575#endif
576
577struct RegExpCaptureIndexLess {
578 bool operator()(const RegExpCapture* lhs, const RegExpCapture* rhs) const {
579 DCHECK_NOT_NULL(lhs);
580 DCHECK_NOT_NULL(rhs);
581 return lhs->index() < rhs->index();
582 }
583};
584
585} // namespace
586
587// static
589 Isolate* isolate, ZoneVector<RegExpCapture*>* named_captures) {
590 if (named_captures == nullptr) return DirectHandle<FixedArray>();
591
592 DCHECK(!named_captures->empty());
593
594 // Named captures are sorted by name (because the set is used to ensure
595 // name uniqueness). But the capture name map must be sorted by index.
596
597 std::sort(named_captures->begin(), named_captures->end(),
598 RegExpCaptureIndexLess{});
599
600 int len = static_cast<int>(named_captures->size()) * 2;
601 DirectHandle<FixedArray> array = isolate->factory()->NewFixedArray(len);
602
603 int i = 0;
604 for (const RegExpCapture* capture : *named_captures) {
605 base::Vector<const base::uc16> capture_name(capture->name()->data(),
606 capture->name()->size());
607 // CSA code in ConstructNewResultFromMatchInfo requires these strings to be
608 // internalized so they can be used as property names in the 'exec' results.
610 isolate->factory()->InternalizeString(capture_name);
611 array->set(i * 2, *name);
612 array->set(i * 2 + 1, Smi::FromInt(capture->index()));
613
614 i++;
615 }
616 DCHECK_EQ(i * 2, len);
617
618 return array;
619}
620
623 DirectHandle<String> sample_subject,
624 bool is_one_byte) {
625 // Since we can't abort gracefully during compilation, check for sufficient
626 // stack space (including the additional gap as used for Turbofan
627 // compilation) here in advance.
628 StackLimitCheck check(isolate);
629 if (check.JsHasOverflowed(kStackSpaceRequiredForCompilation * KB)) {
630 if (v8_flags.correctness_fuzzer_suppressions) {
631 FATAL("Aborting on stack overflow");
632 }
633 RegExp::ThrowRegExpException(isolate, re_data,
634 RegExpError::kAnalysisStackOverflow);
635 return false;
636 }
637
638 // Compile the RegExp.
639 Zone zone(isolate->allocator(), ZONE_NAME);
640 PostponeInterruptsScope postpone(isolate);
641
642 DCHECK(RegExpCodeIsValidForPreCompilation(isolate, re_data, is_one_byte));
643
644 RegExpFlags flags = JSRegExp::AsRegExpFlags(re_data->flags());
645
646 DirectHandle<String> pattern(re_data->source(), isolate);
647 pattern = String::Flatten(isolate, pattern);
648 RegExpCompileData compile_data;
649 if (!RegExpParser::ParseRegExpFromHeapString(isolate, &zone, pattern, flags,
650 &compile_data)) {
651 // Throw an exception if we fail to parse the pattern.
652 // THIS SHOULD NOT HAPPEN. We already pre-parsed it successfully once.
654 compile_data.error));
655 return false;
656 }
657 // The compilation target is a kBytecode if we're interpreting all regexp
658 // objects, or if we're using the tier-up strategy but the tier-up hasn't
659 // happened yet. The compilation target is a kNative if we're using the
660 // tier-up strategy and we need to recompile to tier-up, or if we're producing
661 // native code for all regexp objects.
662 compile_data.compilation_target = re_data->ShouldProduceBytecode()
665 uint32_t backtrack_limit = re_data->backtrack_limit();
666 const bool compilation_succeeded =
667 Compile(isolate, &zone, &compile_data, flags, pattern, sample_subject,
668 is_one_byte, backtrack_limit);
669 if (!compilation_succeeded) {
670 DCHECK(compile_data.error != RegExpError::kNone);
671 RegExp::ThrowRegExpException(isolate, re_data, compile_data.error);
672 return false;
673 }
674
676 re_data->set_code(is_one_byte, Cast<Code>(*compile_data.code));
677
678 // Reset bytecode to uninitialized. In case we use tier-up we know that
679 // tier-up has happened this way.
680 re_data->clear_bytecode(is_one_byte);
681 } else {
682 DCHECK_EQ(compile_data.compilation_target,
684 // Store code generated by compiler in bytecode and trampoline to
685 // interpreter in code.
686 re_data->set_bytecode(is_one_byte,
687 Cast<TrustedByteArray>(*compile_data.code));
688 DirectHandle<Code> trampoline =
689 BUILTIN_CODE(isolate, RegExpInterpreterTrampoline);
690 re_data->set_code(is_one_byte, *trampoline);
691 }
693 RegExp::CreateCaptureNameMap(isolate, compile_data.named_captures);
694 re_data->set_capture_name_map(capture_name_map);
695 int register_max = re_data->max_register_count();
696 if (compile_data.register_count > register_max) {
697 re_data->set_max_register_count(compile_data.register_count);
698 }
699 re_data->set_backtrack_limit(backtrack_limit);
700
701 if (v8_flags.trace_regexp_tier_up) {
702 PrintF("JSRegExp data object %p %s size: %d\n",
703 reinterpret_cast<void*>(re_data->ptr()),
704 re_data->ShouldProduceBytecode() ? "bytecode" : "native code",
705 re_data->ShouldProduceBytecode()
706 ? re_data->bytecode(is_one_byte)->AllocatedSize()
707 : re_data->code(isolate, is_one_byte)->Size());
708 }
709
710 return true;
711}
712
715 RegExpFlags flags, int capture_count,
716 uint32_t backtrack_limit) {
717 // Initialize compiled code entries to null.
718 isolate->factory()->SetRegExpIrregexpData(re, pattern,
720 capture_count, backtrack_limit);
721}
722
723// static
726 DirectHandle<String> subject) {
727 DCHECK(subject->IsFlat());
728
729 // Check representation of the underlying storage.
730 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
731 if (!RegExpImpl::EnsureCompiledIrregexp(isolate, re_data, subject,
732 is_one_byte)) {
733 return -1;
734 }
735
736 // Only reserve room for output captures. Internal registers are allocated by
737 // the engine.
738 return JSRegExp::RegistersForCaptureCount(re_data->capture_count());
739}
740
742 DirectHandle<IrRegExpData> regexp_data,
743 DirectHandle<String> subject, int index,
744 int32_t* output, int output_size) {
745 DCHECK_LE(0, index);
746 DCHECK_LE(index, subject->length());
747 DCHECK(subject->IsFlat());
748 DCHECK_GE(output_size,
749 JSRegExp::RegistersForCaptureCount(regexp_data->capture_count()));
750
751 bool is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
752
753 if (!regexp_data->ShouldProduceBytecode()) {
754 do {
755 EnsureCompiledIrregexp(isolate, regexp_data, subject, is_one_byte);
756 // The stack is used to allocate registers for the compiled regexp code.
757 // This means that in case of failure, the output registers array is left
758 // untouched and contains the capture results from the previous successful
759 // match. We can use that to set the last match info lazily.
760 int res = NativeRegExpMacroAssembler::Match(regexp_data, subject, output,
761 output_size, index, isolate);
764 isolate->has_exception());
765 static_assert(static_cast<int>(NativeRegExpMacroAssembler::SUCCESS) ==
767 static_assert(static_cast<int>(NativeRegExpMacroAssembler::FAILURE) ==
769 static_assert(static_cast<int>(NativeRegExpMacroAssembler::EXCEPTION) ==
771 return res;
772 }
773 // If result is RETRY, the string has changed representation, and we
774 // must restart from scratch.
775 // In this case, it means we must make sure we are prepared to handle
776 // the, potentially, different subject (the string can switch between
777 // being internal and external, and even between being Latin1 and
778 // UC16, but the characters are always the same).
779 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
780 } while (true);
781 UNREACHABLE();
782 } else {
783 DCHECK(regexp_data->ShouldProduceBytecode());
784
785 do {
787 isolate, regexp_data, subject, output, output_size, index);
789 isolate->has_exception());
790
791 static_assert(IrregexpInterpreter::FAILURE == 0);
792 static_assert(IrregexpInterpreter::SUCCESS == 1);
794 static_assert(IrregexpInterpreter::EXCEPTION < 0);
795 static_assert(IrregexpInterpreter::RETRY < 0);
797 return result;
798 }
799
801 // The string has changed representation, and we must restart the
802 // match. We need to reset the tier up to start over with compilation.
803 if (v8_flags.regexp_tier_up) regexp_data->ResetLastTierUpTick();
804 is_one_byte = String::IsOneByteRepresentationUnderneath(*subject);
805 EnsureCompiledIrregexp(isolate, regexp_data, subject, is_one_byte);
806 } else {
809 return result;
810 }
811 } while (true);
812 UNREACHABLE();
813 }
814}
815
// Executes an Irregexp regexp on `subject` starting at `previous_index`.
//
// Steps: flatten the subject, optionally mark the regexp for tier-up, make
// sure it is compiled for the subject (IrregexpPrepare), then run
// IrregexpExecRaw with the caller's result vector.
//
// Returns the raw result when it is at least RegExp::RE_SUCCESS, an empty
// optional when compilation or execution raised an exception, and 0 in the
// remaining (no match) case. RE_FALLBACK_TO_EXPERIMENTAL is handled in its
// own branch.
//
// NOTE(review): the listing's own numbering skips 883 and 890, so the start of
// the fallback call and one line of the final `else` branch are not visible
// in this text; restore them from the upstream file.
816std::optional<int> RegExpImpl::IrregexpExec(
817 Isolate* isolate, DirectHandle<IrRegExpData> regexp_data,
818 DirectHandle<String> subject, int previous_index,
819 int32_t* result_offsets_vector, uint32_t result_offsets_vector_length) {
820 subject = String::Flatten(isolate, subject);
821
822#ifdef DEBUG
823 if (v8_flags.trace_regexp_bytecodes && regexp_data->ShouldProduceBytecode()) {
824 PrintF("\n\nRegexp match: /%s/\n\n",
825 regexp_data->source()->ToCString().get());
826 PrintF("\n\nSubject string: '%s'\n\n", subject->ToCString().get());
827 }
828#endif
829
// Registers needed for a single match; compared below against the size of
// the caller's vector and against IrregexpPrepare's answer.
830 const int original_register_count =
831 JSRegExp::RegistersForCaptureCount(regexp_data->capture_count());
832
833 // Maybe force early tier up:
834 if (v8_flags.regexp_tier_up) {
835 if (subject->length() >= JSRegExp::kTierUpForSubjectLengthValue) {
836 // For very long subject strings, the regexp interpreter is currently much
837 // slower than the jitted code execution. If the tier-up strategy is
838 // turned on, we want to avoid this performance penalty so we eagerly
839 // tier-up if the subject string length is equal or greater than the given
840 // heuristic value.
841 regexp_data->MarkTierUpForNextExec();
842 if (v8_flags.trace_regexp_tier_up) {
843 PrintF(
844 "Forcing tier-up for very long strings in "
845 "RegExpImpl::IrregexpExec\n");
846 }
847 } else if (static_cast<uint32_t>(original_register_count) <
848 result_offsets_vector_length) {
// The caller supplied room for more than one match's registers.
849 // Tier up because the interpreter doesn't do global execution.
850 Cast<IrRegExpData>(regexp_data)->MarkTierUpForNextExec();
851 if (v8_flags.trace_regexp_tier_up) {
852 PrintF(
853 "Forcing tier-up of RegExpData object %p for global irregexp "
854 "mode\n",
855 reinterpret_cast<void*>(regexp_data->ptr()));
856 }
857 }
858 }
859
// A negative count means compilation failed and an exception is pending.
860 int output_register_count =
861 RegExpImpl::IrregexpPrepare(isolate, regexp_data, subject);
862 if (output_register_count < 0) {
863 DCHECK(isolate->has_exception());
864 return {};
865 }
866
867 // TODO(jgruber): Consider changing these into DCHECKs once we're convinced
868 // the conditions hold.
869 CHECK_EQ(original_register_count, output_register_count);
870 CHECK_LE(static_cast<uint32_t>(output_register_count),
871 result_offsets_vector_length);
872
873 RegExpStackScope stack_scope(isolate);
874
875 int res = RegExpImpl::IrregexpExecRaw(isolate, regexp_data, subject,
876 previous_index, result_offsets_vector,
877 result_offsets_vector_length);
878
879 if (res >= RegExp::RE_SUCCESS) {
// `res` is the number of matches; all of them must fit in the vector.
880 DCHECK_LE(res * output_register_count, result_offsets_vector_length);
881 return res;
882 } else if (res == RegExp::RE_FALLBACK_TO_EXPERIMENTAL) {
// NOTE(review): the callee of this fallback call is on the missing line 883.
884 isolate, regexp_data, subject, previous_index, result_offsets_vector,
885 result_offsets_vector_length);
886 } else if (res == RegExp::RE_EXCEPTION) {
887 DCHECK(isolate->has_exception());
888 return {};
889 } else {
// Remaining case: report zero matches.
891 return 0;
892 }
893}
894
895// static
897 Isolate* isolate, DirectHandle<RegExpMatchInfo> last_match_info,
898 DirectHandle<String> subject, int capture_count, int32_t* match) {
900 RegExpMatchInfo::ReserveCaptures(isolate, last_match_info, capture_count);
901 if (*result != *last_match_info) {
902 if (*last_match_info == *isolate->regexp_last_match_info()) {
903 // This inner condition is only needed for special situations like the
904 // regexp fuzzer, where we pass our own custom RegExpMatchInfo to
905 // RegExpImpl::Exec; there we actually want to bypass the Isolate's match
906 // info and execute the regexp without side effects.
907 isolate->native_context()->set_regexp_last_match_info(*result);
908 }
909 }
910
911 int capture_register_count =
914 if (match != nullptr) {
915 for (int i = 0; i < capture_register_count; i += 2) {
916 result->set_capture(i, match[i]);
917 result->set_capture(i + 1, match[i + 1]);
918 }
919 }
920 result->set_last_subject(*subject);
921 result->set_last_input(*subject);
922 return result;
923}
924
925// static
928}
929
930namespace {
931
932// Returns true if we've either generated too much irregexp code within this
933// isolate, or the pattern string is too long.
//
// `pattern` is only inspected for its length: a pattern longer than
// RegExp::kRegExpTooLargeToOptimize yields true on its own. Otherwise the
// result is true only when BOTH the isolate's total generated regexp code and
// the heap's committed executable memory exceed the limits defined below.
934bool TooMuchRegExpCode(Isolate* isolate, DirectHandle<String> pattern) {
935 // Limit the space regexps take up on the heap. In order to limit this we
936 // would like to keep track of the amount of regexp code on the heap. This
937 // is not tracked, however. As a conservative approximation we track the
938 // total regexp code compiled including code that has subsequently been freed
939 // and the total executable memory at any point.
940 static constexpr size_t kRegExpExecutableMemoryLimit = 16 * MB;
941 static constexpr size_t kRegExpCompiledLimit = 1 * MB;
942
943 Heap* heap = isolate->heap();
 // An over-long pattern counts as "too much" by itself, before the counters
 // are consulted.
944 if (pattern->length() > RegExp::kRegExpTooLargeToOptimize) return true;
 // Both thresholds must be exceeded for the code-size check to trigger.
945 return (isolate->total_regexp_code_generated() > kRegExpCompiledLimit &&
946 heap->CommittedMemoryExecutable() > kRegExpExecutableMemoryLimit);
947}
948
949} // namespace
950
951// static
953 RegExpCompileData* data, RegExpFlags flags,
955 DirectHandle<String> sample_subject,
956 bool is_one_byte) {
957 uint32_t backtrack_limit = JSRegExp::kNoBacktrackLimit;
958 return RegExpImpl::Compile(isolate, zone, data, flags, pattern,
959 sample_subject, is_one_byte, backtrack_limit);
960}
961
964 DirectHandle<String> sample_subject, bool is_one_byte,
965 uint32_t& backtrack_limit) {
966 if (JSRegExp::RegistersForCaptureCount(data->capture_count) >
968 data->error = RegExpError::kTooLarge;
969 return false;
970 }
971
972 RegExpCompiler compiler(isolate, zone, data->capture_count, flags,
973 is_one_byte);
974
975 if (compiler.optimize()) {
976 compiler.set_optimize(!TooMuchRegExpCode(isolate, pattern));
977 }
978
979 // Sample some characters from the middle of the string.
980 static const int kSampleSize = 128;
981
982 sample_subject = String::Flatten(isolate, sample_subject);
983 uint32_t start, end;
984 if (sample_subject->length() > kSampleSize) {
985 start = (sample_subject->length() - kSampleSize) / 2;
986 end = start + kSampleSize;
987 } else {
988 start = 0;
989 end = sample_subject->length();
990 }
991 for (uint32_t i = start; i < end; i++) {
992 compiler.frequency_collator()->CountCharacter(sample_subject->Get(i));
993 }
994
995 data->node = compiler.PreprocessRegExp(data, is_one_byte);
996 if (data->error != RegExpError::kNone) {
997 return false;
998 }
999 data->error = AnalyzeRegExp(isolate, is_one_byte, flags, data->node);
1000 if (data->error != RegExpError::kNone) {
1001 return false;
1002 }
1003
1004 if (v8_flags.trace_regexp_graph) DotPrinter::DotPrint("Start", data->node);
1005
1006 // Create the correct assembler for the architecture.
1007 std::unique_ptr<RegExpMacroAssembler> macro_assembler;
1008 if (data->compilation_target == RegExpCompilationTarget::kNative) {
1009 // Native regexp implementation.
1010 DCHECK(!v8_flags.jitless);
1011
1015
1016 const int output_register_count =
1017 JSRegExp::RegistersForCaptureCount(data->capture_count);
1018#if V8_TARGET_ARCH_IA32
1019 macro_assembler.reset(new RegExpMacroAssemblerIA32(isolate, zone, mode,
1020 output_register_count));
1021#elif V8_TARGET_ARCH_X64
1022 macro_assembler.reset(new RegExpMacroAssemblerX64(isolate, zone, mode,
1023 output_register_count));
1024#elif V8_TARGET_ARCH_ARM
1025 macro_assembler.reset(new RegExpMacroAssemblerARM(isolate, zone, mode,
1026 output_register_count));
1027#elif V8_TARGET_ARCH_ARM64
1028 macro_assembler.reset(new RegExpMacroAssemblerARM64(isolate, zone, mode,
1029 output_register_count));
1030#elif V8_TARGET_ARCH_S390X
1031 macro_assembler.reset(new RegExpMacroAssemblerS390(isolate, zone, mode,
1032 output_register_count));
1033#elif V8_TARGET_ARCH_PPC64
1034 macro_assembler.reset(new RegExpMacroAssemblerPPC(isolate, zone, mode,
1035 output_register_count));
1036#elif V8_TARGET_ARCH_MIPS64
1037 macro_assembler.reset(new RegExpMacroAssemblerMIPS(isolate, zone, mode,
1038 output_register_count));
1039#elif V8_TARGET_ARCH_RISCV64
1040 macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode,
1041 output_register_count));
1042#elif V8_TARGET_ARCH_RISCV32
1043 macro_assembler.reset(new RegExpMacroAssemblerRISCV(isolate, zone, mode,
1044 output_register_count));
1045#elif V8_TARGET_ARCH_LOONG64
1046 macro_assembler.reset(new RegExpMacroAssemblerLOONG64(
1047 isolate, zone, mode, output_register_count));
1048#else
1049#error "Unsupported architecture"
1050#endif
1051 } else {
1052 DCHECK_EQ(data->compilation_target, RegExpCompilationTarget::kBytecode);
1053 // Interpreted regexp implementation.
1054 macro_assembler.reset(new RegExpBytecodeGenerator(isolate, zone));
1055 }
1056
1057 macro_assembler->set_slow_safe(TooMuchRegExpCode(isolate, pattern));
1058 if (v8_flags.enable_experimental_regexp_engine_on_excessive_backtracks &&
1059 ExperimentalRegExp::CanBeHandled(data->tree, pattern, flags,
1060 data->capture_count)) {
1061 if (backtrack_limit == JSRegExp::kNoBacktrackLimit) {
1062 backtrack_limit = v8_flags.regexp_backtracks_before_fallback;
1063 } else {
1064 backtrack_limit = std::min(
1065 backtrack_limit, v8_flags.regexp_backtracks_before_fallback.value());
1066 }
1067 macro_assembler->set_backtrack_limit(backtrack_limit);
1068 macro_assembler->set_can_fallback(true);
1069 } else {
1070 macro_assembler->set_backtrack_limit(backtrack_limit);
1071 macro_assembler->set_can_fallback(false);
1072 }
1073
1074 // Inserted here, instead of in Assembler, because it depends on information
1075 // in the AST that isn't replicated in the Node structure.
1076 bool is_end_anchored = data->tree->IsAnchoredAtEnd();
1077 bool is_start_anchored = data->tree->IsAnchoredAtStart();
1078 int max_length = data->tree->max_match();
1079 static const int kMaxBacksearchLimit = 1024;
1080 if (is_end_anchored && !is_start_anchored && !IsSticky(flags) &&
1081 max_length < kMaxBacksearchLimit) {
1082 macro_assembler->SetCurrentPositionFromEnd(max_length);
1083 }
1084
1085 if (IsGlobal(flags)) {
1087 if (data->tree->min_match() > 0) {
1089 } else if (IsEitherUnicode(flags)) {
1091 }
1092 macro_assembler->set_global_mode(mode);
1093 }
1094
1095 RegExpMacroAssembler* macro_assembler_ptr = macro_assembler.get();
1096#ifdef DEBUG
1097 std::unique_ptr<RegExpMacroAssembler> tracer_macro_assembler;
1098 if (v8_flags.trace_regexp_assembler) {
1099 tracer_macro_assembler.reset(
1100 new RegExpMacroAssemblerTracer(isolate, macro_assembler_ptr));
1101 macro_assembler_ptr = tracer_macro_assembler.get();
1102 }
1103#endif
1104
1105 RegExpCompiler::CompilationResult result = compiler.Assemble(
1106 isolate, macro_assembler_ptr, data->node, data->capture_count, pattern);
1107
1108 // Code / bytecode printing.
1109 {
1110#ifdef ENABLE_DISASSEMBLER
1111 if (v8_flags.print_regexp_code &&
1112 data->compilation_target == RegExpCompilationTarget::kNative) {
1113 CodeTracer::Scope trace_scope(isolate->GetCodeTracer());
1114 OFStream os(trace_scope.file());
1115 auto code = Cast<Code>(result.code);
1116 std::unique_ptr<char[]> pattern_cstring = pattern->ToCString();
1117 code->Disassemble(pattern_cstring.get(), os, isolate);
1118 }
1119#endif
1120 if (v8_flags.print_regexp_bytecode &&
1121 data->compilation_target == RegExpCompilationTarget::kBytecode) {
1122 auto bytecode = Cast<TrustedByteArray>(result.code);
1123 std::unique_ptr<char[]> pattern_cstring = pattern->ToCString();
1124 RegExpBytecodeDisassemble(bytecode->begin(), bytecode->length(),
1125 pattern_cstring.get());
1126 }
1127 }
1128
1129 if (result.error != RegExpError::kNone) {
1130 if (v8_flags.correctness_fuzzer_suppressions &&
1131 result.error == RegExpError::kStackOverflow) {
1132 FATAL("Aborting on stack overflow");
1133 }
1134 data->error = result.error;
1135 }
1136
1137 data->code = result.code;
1138 data->register_count = result.num_registers;
1139
1140 return result.Succeeded();
1141}
1142
1145 Isolate* isolate)
1146 : result_vector_scope_(isolate),
1147 regexp_data_(regexp_data),
1148 subject_(subject),
1149 isolate_(isolate) {
1150 DCHECK(IsGlobal(JSRegExp::AsRegExpFlags(regexp_data->flags())));
1151
1152 switch (regexp_data_->type_tag()) {
1156 break;
1157 }
1161 if (registers_per_match_ < 0) {
1162 num_matches_ = -1; // Signal exception.
1163 return;
1164 }
1165 if (Cast<IrRegExpData>(regexp_data_)->ShouldProduceBytecode()) {
1166 // Global loop in interpreted regexp is not implemented. We choose the
1167 // size of the offsets vector so that it can only store one match.
1169 } else {
1170 register_array_size_ = std::max(
1172 }
1173 break;
1174 }
1177 isolate_) &&
1180 DCHECK(isolate->has_exception());
1181 num_matches_ = -1; // Signal exception.
1182 return;
1183 }
1185 Cast<IrRegExpData>(regexp_data_)->capture_count());
1186 register_array_size_ = std::max(
1188 break;
1189 }
1190 }
1191
1192 // Cache the result vector location.
1193
1195
1196 // Set state so that fetching the results the first time triggers a call
1197 // to the compiled regexp.
1200 DCHECK_LE(2, registers_per_match_); // Each match has at least one capture.
1202 int32_t* last_match =
1204 last_match[0] = -1;
1205 last_match[1] = 0;
1206}
1207
1210 static_cast<uint32_t>(last_index + 1) < subject_->length() &&
1211 unibrow::Utf16::IsLeadSurrogate(subject_->Get(last_index)) &&
1212 unibrow::Utf16::IsTrailSurrogate(subject_->Get(last_index + 1))) {
1213 // Advance over the surrogate pair.
1214 return last_index + 2;
1215 }
1216 return last_index + 1;
1217}
1218
1221
1223 // Current batch of results exhausted.
1224 // Fail if last batch was not even fully filled.
1225 if (num_matches_ < max_matches()) {
1226 num_matches_ = 0; // Signal failed match.
1227 return nullptr;
1228 }
1229
1230 int32_t* last_match =
1232 int last_end_index = last_match[1];
1233
1234 switch (regexp_data_->type_tag()) {
1238 last_end_index, register_array_, register_array_size_);
1239 break;
1242 isolate_));
1246 *subject_, register_array_, register_array_size_, last_end_index);
1247 break;
1248 }
1250 int last_start_index = last_match[0];
1251 if (last_start_index == last_end_index) {
1252 // Zero-length match. Advance by one code point.
1253 last_end_index = AdvanceZeroLength(last_end_index);
1254 }
1255 if (static_cast<uint32_t>(last_end_index) > subject_->length()) {
1256 num_matches_ = 0; // Signal failed match.
1257 return nullptr;
1258 }
1261 last_end_index, register_array_, register_array_size_);
1262 break;
1263 }
1264 }
1265
1266 // Fall back to experimental engine if needed and possible.
1270 register_array_size_, last_end_index);
1271 }
1272
1273 if (num_matches_ <= 0) {
1274 return nullptr;
1275 }
1276
1277 // Number of matches can't exceed maximum matches.
1278 // This check is enough to prevent OOB accesses to register_array_ in the
1279 // else branch below, since current_match_index < num_matches_ in this
1280 // branch, it follows that current_match_index < max_matches(). And since
1281 // max_matches() = register_array_size_ / registers_per_match it follows
1282 // that current_match_index * registers_per_match_ < register_array_size_.
1284
1286 return register_array_;
1287 } else {
1289 }
1290}
1291
1294 if (num_matches_ == 0) {
1295 // After a failed match we shift back by one result.
1296 index -= registers_per_match_;
1297 }
1298 return &register_array_[index];
1299}
1300
1302 Tagged<Object> key_pattern,
1303 Tagged<FixedArray>* last_match_cache,
1304 ResultsCacheType type) {
1305 if (V8_UNLIKELY(!v8_flags.regexp_results_cache)) return Smi::zero();
1306 Tagged<FixedArray> cache;
1307 if (!IsInternalizedString(key_string)) return Smi::zero();
1308 if (type == STRING_SPLIT_SUBSTRINGS) {
1309 DCHECK(IsString(key_pattern));
1310 if (!IsInternalizedString(key_pattern)) return Smi::zero();
1311 cache = heap->string_split_cache();
1312 } else {
1314 DCHECK(IsRegExpDataWrapper(key_pattern));
1315 cache = heap->regexp_multiple_cache();
1316 }
1317
1318 uint32_t hash = key_string->hash();
1319 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1321 if (cache->get(index + kStringOffset) != key_string ||
1322 cache->get(index + kPatternOffset) != key_pattern) {
1323 index =
1325 if (cache->get(index + kStringOffset) != key_string ||
1326 cache->get(index + kPatternOffset) != key_pattern) {
1327 return Smi::zero();
1328 }
1329 }
1330
1331 *last_match_cache = Cast<FixedArray>(cache->get(index + kLastMatchOffset));
1332 return cache->get(index + kArrayOffset);
1333}
1334
1336 DirectHandle<String> key_string,
1337 DirectHandle<Object> key_pattern,
1338 DirectHandle<FixedArray> value_array,
1339 DirectHandle<FixedArray> last_match_cache,
1340 ResultsCacheType type) {
1341 if (V8_UNLIKELY(!v8_flags.regexp_results_cache)) return;
1342 Factory* factory = isolate->factory();
1344 if (!IsInternalizedString(*key_string)) return;
1345 if (type == STRING_SPLIT_SUBSTRINGS) {
1346 DCHECK(IsString(*key_pattern));
1347 if (!IsInternalizedString(*key_pattern)) return;
1348 cache = factory->string_split_cache();
1349 } else {
1351 DCHECK(IsRegExpDataWrapper(*key_pattern));
1352 cache = factory->regexp_multiple_cache();
1353 }
1354
1355 uint32_t hash = key_string->hash();
1356 uint32_t index = ((hash & (kRegExpResultsCacheSize - 1)) &
1358 if (cache->get(index + kStringOffset) == Smi::zero()) {
1359 cache->set(index + kStringOffset, *key_string);
1360 cache->set(index + kPatternOffset, *key_pattern);
1361 cache->set(index + kArrayOffset, *value_array);
1362 cache->set(index + kLastMatchOffset, *last_match_cache);
1363 } else {
1364 uint32_t index2 =
1366 if (cache->get(index2 + kStringOffset) == Smi::zero()) {
1367 cache->set(index2 + kStringOffset, *key_string);
1368 cache->set(index2 + kPatternOffset, *key_pattern);
1369 cache->set(index2 + kArrayOffset, *value_array);
1370 cache->set(index2 + kLastMatchOffset, *last_match_cache);
1371 } else {
1372 cache->set(index2 + kStringOffset, Smi::zero());
1373 cache->set(index2 + kPatternOffset, Smi::zero());
1374 cache->set(index2 + kArrayOffset, Smi::zero());
1375 cache->set(index2 + kLastMatchOffset, Smi::zero());
1376 cache->set(index + kStringOffset, *key_string);
1377 cache->set(index + kPatternOffset, *key_pattern);
1378 cache->set(index + kArrayOffset, *value_array);
1379 cache->set(index + kLastMatchOffset, *last_match_cache);
1380 }
1381 }
1382 // If the array is a reasonably short list of substrings, convert it into a
1383 // list of internalized strings.
1384 if (type == STRING_SPLIT_SUBSTRINGS && value_array->length() < 100) {
1385 for (int i = 0; i < value_array->length(); i++) {
1386 DirectHandle<String> str(Cast<String>(value_array->get(i)), isolate);
1387 DirectHandle<String> internalized_str = factory->InternalizeString(str);
1388 value_array->set(i, *internalized_str);
1389 }
1390 }
1391 // Convert backing store to a copy-on-write array.
1392 value_array->set_map_no_write_barrier(
1393 isolate, ReadOnlyRoots(isolate).fixed_cow_array_map());
1394}
1395
1397 for (int i = 0; i < kRegExpResultsCacheSize; i++) {
1398 cache->set(i, Smi::zero());
1399 }
1400}
1401
1402// static
1404 Tagged<String> subject,
1406 int number_of_matches,
1407 int last_match_index) {
1409 DCHECK(Smi::IsValid(number_of_matches));
1410 DCHECK(Smi::IsValid(last_match_index));
1411 if (!IsSlicedString(subject)) return;
1412 Tagged<FixedArray> cache = isolate->heap()->regexp_match_global_atom_cache();
1413 DCHECK_EQ(cache->length(), kSize);
1414 cache->set(kSubjectIndex, subject);
1415 cache->set(kPatternIndex, pattern);
1416 cache->set(kNumberOfMatchesIndex, Smi::FromInt(number_of_matches));
1417 cache->set(kLastMatchIndexIndex, Smi::FromInt(last_match_index));
1418}
1419
1420// static
1422 Tagged<String> subject,
1424 int* number_of_matches_out,
1425 int* last_match_index_out) {
1427 Tagged<FixedArray> cache = isolate->heap()->regexp_match_global_atom_cache();
1428 DCHECK_EQ(cache->length(), kSize);
1429
1430 if (!IsSlicedString(subject)) return false;
1431 if (pattern != cache->get(kPatternIndex)) return false;
1432
1433 // Here we are looking for a subject slice that 1. starts at the same point
1434 // and 2. is of equal length or longer than the cached subject slice.
1435 Tagged<SlicedString> sliced_subject = Cast<SlicedString>(subject);
1436 Tagged<Object> cached_subject_object = cache->get(kSubjectIndex);
1437 if (!Is<SlicedString>(cached_subject_object)) {
1438 // Note that while we insert only sliced strings, they may be converted into
1439 // other kinds, e.g. during GC or internalization.
1440 Clear(isolate->heap());
1441 return false;
1442 }
1443 auto cached_subject = Cast<SlicedString>(cached_subject_object);
1444 if (cached_subject->parent() != sliced_subject->parent()) return false;
1445 if (cached_subject->offset() != sliced_subject->offset()) return false;
1446 if (cached_subject->length() > sliced_subject->length()) return false;
1447
1448 *number_of_matches_out = Smi::ToInt(cache->get(kNumberOfMatchesIndex));
1449 *last_match_index_out = Smi::ToInt(cache->get(kLastMatchIndexIndex));
1450 return true;
1451}
1452
1454 MemsetTagged(heap->regexp_match_global_atom_cache()->RawFieldOfFirstElement(),
1455 Smi::zero(), kSize);
1456}
1457
// Stream-insertion operator for RegExpFlags; returns `os` so that calls can be
// chained.
// The local macro V tests a single flag (RegExpFlag::k<Camel>) against `flags`
// and writes that flag's Char to `os` when the flag is set. V is undefined
// again before the function returns, so it does not leak past this definition.
// NOTE(review): the embedded listing numbers jump from 1460 to 1462, so the
// line that would apply V is not visible in this view - confirm against the
// original file before relying on this description.
1458std::ostream& operator<<(std::ostream& os, RegExpFlags flags) {
1459#define V(Lower, Camel, LowerCamel, Char, Bit) \
1460 if (flags & RegExpFlag::k##Camel) os << Char;
1462#undef V
1463 return os;
1464}
1465
1466} // namespace internal
1467} // namespace v8
Isolate * isolate_
union v8::internal::@341::BuiltinMetadata::KindSpecificData data
#define BUILTIN_CODE(isolate, name)
Definition builtins.h:45
#define SBXCHECK_LE(lhs, rhs)
Definition check.h:67
#define SBXCHECK(condition)
Definition check.h:61
static bool IsTrailSurrogate(int code)
Definition unicode.h:109
static bool IsLeadSurrogate(int code)
Definition unicode.h:106
int length() const
Definition vector.h:64
void PutRegExp(DirectHandle< String > source, JSRegExp::Flags flags, DirectHandle< RegExpData > data)
MaybeDirectHandle< RegExpData > LookupRegExp(DirectHandle< String > source, JSRegExp::Flags flags)
static void DotPrint(const char *label, RegExpNode *node)
static int32_t ExecRaw(Isolate *isolate, RegExp::CallOrigin call_origin, Tagged< IrRegExpData > regexp_data, Tagged< String > subject, int32_t *output_registers, int32_t output_register_count, int32_t subject_index)
static void Initialize(Isolate *isolate, DirectHandle< JSRegExp > re, DirectHandle< String > pattern, RegExpFlags flags, int capture_count)
static bool CanBeHandled(RegExpTree *tree, DirectHandle< String > pattern, RegExpFlags flags, int capture_count)
static bool IsCompiled(DirectHandle< IrRegExpData > re_data, Isolate *isolate)
static V8_WARN_UNUSED_RESULT bool Compile(Isolate *isolate, DirectHandle< IrRegExpData > re_data)
static std::optional< int > Exec(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, uint32_t result_offsets_vector_length)
static std::optional< int > OneshotExec(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, uint32_t result_offsets_vector_length)
static int32_t OneshotExecRaw(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject, int32_t *output_registers, int32_t output_register_count, int32_t subject_index)
Handle< String > InternalizeString(base::Vector< const char > str, bool convert_encoding=false)
Definition factory.h:216
static int MatchForCallFromRuntime(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject_string, int *output_registers, int output_register_count, int start_position)
static const int kJSRegexpStaticOffsetsVectorSize
Definition isolate.h:1533
static constexpr int kAtomRegisterCount
Definition js-regexp.h:100
static constexpr RegExpFlags AsRegExpFlags(Flags f)
Definition js-regexp.h:57
static constexpr uint32_t kNoBacktrackLimit
Definition js-regexp.h:133
static V8_EXPORT_PRIVATE DirectHandle< String > StringFromFlags(Isolate *isolate, Flags flags)
Definition js-regexp.cc:144
static constexpr Flags AsJSRegExpFlags(RegExpFlags f)
Definition js-regexp.h:54
static constexpr int RegistersForCaptureCount(int count)
Definition js-regexp.h:90
static constexpr int kTierUpForSubjectLengthValue
Definition js-regexp.h:137
V8_WARN_UNUSED_RESULT V8_INLINE bool ToHandle(DirectHandle< S > *out) const
static int Match(DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject, int *offsets_vector, int offsets_vector_length, int previous_index, Isolate *isolate)
base::Vector< const base::uc16 > data() const
Definition regexp-ast.h:483
int32_t * LastSuccessfulMatch() const
Definition regexp.cc:1292
RegExpGlobalExecRunner(DirectHandle< RegExpData > regexp_data, DirectHandle< String > subject, Isolate *isolate)
Definition regexp.cc:1143
DirectHandle< String > subject_
Definition regexp.h:214
DirectHandle< RegExpData > regexp_data_
Definition regexp.h:213
int AdvanceZeroLength(int last_index) const
Definition regexp.cc:1208
RegExpResultVectorScope result_vector_scope_
Definition regexp.h:206
static int AtomExec(Isolate *isolate, DirectHandle< AtomRegExpData > regexp_data, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, int result_offsets_vector_length)
Definition regexp.cc:512
static V8_WARN_UNUSED_RESULT std::optional< int > IrregexpExec(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, uint32_t result_offsets_vector_length)
Definition regexp.cc:816
static int IrregexpPrepare(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject)
Definition regexp.cc:724
static int AtomExecRaw(Isolate *isolate, DirectHandle< AtomRegExpData > regexp_data, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, int result_offsets_vector_length)
Definition regexp.cc:440
static void AtomCompile(Isolate *isolate, DirectHandle< JSRegExp > re, DirectHandle< String > pattern, RegExpFlags flags, DirectHandle< String > match_pattern)
Definition regexp.cc:386
static bool EnsureCompiledIrregexp(Isolate *isolate, DirectHandle< IrRegExpData > re_data, DirectHandle< String > sample_subject, bool is_one_byte)
Definition regexp.cc:531
static void IrregexpInitialize(Isolate *isolate, DirectHandle< JSRegExp > re, DirectHandle< String > pattern, RegExpFlags flags, int capture_count, uint32_t backtrack_limit)
Definition regexp.cc:713
static int IrregexpExecRaw(Isolate *isolate, DirectHandle< IrRegExpData > regexp_data, DirectHandle< String > subject, int index, int32_t *output, int output_size)
Definition regexp.cc:741
static bool Compile(Isolate *isolate, Zone *zone, RegExpCompileData *input, RegExpFlags flags, DirectHandle< String > pattern, DirectHandle< String > sample_subject, bool is_one_byte, uint32_t &backtrack_limit)
Definition regexp.cc:962
static bool CompileIrregexp(Isolate *isolate, DirectHandle< IrRegExpData > re_data, DirectHandle< String > sample_subject, bool is_one_byte)
Definition regexp.cc:621
static DirectHandle< String > ToString(DirectHandle< Object > value)
static DirectHandle< RegExpMatchInfo > ReserveCaptures(Isolate *isolate, DirectHandle< RegExpMatchInfo > match_info, int capture_count)
static bool VerifyRegExpSyntax(Zone *zone, uintptr_t stack_limit, const CharT *input, int input_length, RegExpFlags flags, RegExpCompileData *result, const DisallowGarbageCollection &no_gc)
static bool ParseRegExpFromHeapString(Isolate *isolate, Zone *zone, DirectHandle< String > input, RegExpFlags flags, RegExpCompileData *result)
static bool TryGet(Isolate *isolate, Tagged< String > subject, Tagged< String > pattern, int *number_of_matches_out, int *last_match_index_out)
Definition regexp.cc:1421
static void TryInsert(Isolate *isolate, Tagged< String > subject, Tagged< String > pattern, int number_of_matches, int last_match_index)
Definition regexp.cc:1403
static constexpr int kPatternOffset
Definition regexp.h:244
static void Clear(Tagged< FixedArray > cache)
Definition regexp.cc:1396
static constexpr int kLastMatchOffset
Definition regexp.h:246
static void Enter(Isolate *isolate, DirectHandle< String > key_string, DirectHandle< Object > key_pattern, DirectHandle< FixedArray > value_array, DirectHandle< FixedArray > last_match_cache, ResultsCacheType type)
Definition regexp.cc:1335
static constexpr int kRegExpResultsCacheSize
Definition regexp.h:240
static constexpr int kStringOffset
Definition regexp.h:243
static constexpr int kArrayEntriesPerCacheEntry
Definition regexp.h:247
static constexpr int kArrayOffset
Definition regexp.h:245
static Tagged< Object > Lookup(Heap *heap, Tagged< String > key_string, Tagged< Object > key_pattern, Tagged< FixedArray > *last_match_out, ResultsCacheType type)
Definition regexp.cc:1301
static bool IsUnmodifiedRegExp(Isolate *isolate, DirectHandle< Object > obj)
static V8_EXPORT_PRIVATE bool CompileForTesting(Isolate *isolate, Zone *zone, RegExpCompileData *input, RegExpFlags flags, DirectHandle< String > pattern, DirectHandle< String > sample_subject, bool is_one_byte)
Definition regexp.cc:952
V8_EXPORT_PRIVATE static V8_WARN_UNUSED_RESULT std::optional< int > ExperimentalOneshotExec(Isolate *isolate, DirectHandle< JSRegExp > regexp, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, uint32_t result_offsets_vector_length)
Definition regexp.cc:320
static DirectHandle< RegExpMatchInfo > SetLastMatchInfo(Isolate *isolate, DirectHandle< RegExpMatchInfo > last_match_info, DirectHandle< String > subject, int capture_count, int32_t *match)
Definition regexp.cc:896
static V8_EXPORT_PRIVATE intptr_t AtomExecRaw(Isolate *isolate, Address data_address, Address subject_address, int32_t index, int32_t *result_offsets_vector, int32_t result_offsets_vector_length)
Definition regexp.cc:492
static bool VerifySyntax(Zone *zone, uintptr_t stack_limit, const CharT *input, int input_length, RegExpFlags flags, RegExpError *regexp_error_out, const DisallowGarbageCollection &no_gc)
Definition regexp.cc:124
V8_EXPORT_PRIVATE static V8_WARN_UNUSED_RESULT std::optional< int > Exec(Isolate *isolate, DirectHandle< JSRegExp > regexp, DirectHandle< String > subject, int index, int32_t *result_offsets_vector, uint32_t result_offsets_vector_length)
Definition regexp.cc:332
V8_EXPORT_PRIVATE static V8_WARN_UNUSED_RESULT MaybeDirectHandle< Object > Exec_Single(Isolate *isolate, DirectHandle< JSRegExp > regexp, DirectHandle< String > subject, int index, DirectHandle< RegExpMatchInfo > last_match_info)
Definition regexp.cc:358
static V8_EXPORT_PRIVATE void DotPrintForTesting(const char *label, RegExpNode *node)
Definition regexp.cc:926
static bool CanGenerateBytecode()
Definition regexp.cc:112
static DirectHandle< FixedArray > CreateCaptureNameMap(Isolate *isolate, ZoneVector< RegExpCapture * > *named_captures)
Definition regexp.cc:588
static V8_EXPORT_PRIVATE bool VerifyFlags(RegExpFlags flags)
Definition regexp.cc:117
static V8_WARN_UNUSED_RESULT MaybeDirectHandle< Object > ThrowRegExpException(Isolate *isolate, RegExpFlags flags, DirectHandle< String > pattern, RegExpError error)
Definition regexp.cc:143
static bool IsUnmodifiedRegExp(Isolate *isolate, DirectHandle< JSRegExp > regexp)
Definition regexp.cc:166
static const int kRegExpTooLargeToOptimize
Definition regexp.h:163
static V8_WARN_UNUSED_RESULT bool EnsureFullyCompiled(Isolate *isolate, DirectHandle< RegExpData > re_data, DirectHandle< String > subject)
Definition regexp.cc:294
static constexpr int kInternalRegExpFallbackToExperimental
Definition regexp.h:138
static V8_WARN_UNUSED_RESULT MaybeDirectHandle< Object > Compile(Isolate *isolate, DirectHandle< JSRegExp > re, DirectHandle< String > pattern, RegExpFlags flags, uint32_t backtrack_limit)
Definition regexp.cc:200
static constexpr int ToInt(const Tagged< Object > object)
Definition smi.h:33
static constexpr Tagged< Smi > FromInt(int value)
Definition smi.h:38
static bool constexpr IsValid(T value)
Definition smi.h:67
static constexpr Tagged< Smi > zero()
Definition smi.h:99
base::Vector< const uint8_t > ToOneByteVector() const
Definition string.h:139
base::Vector< const base::uc16 > ToUC16Vector() const
Definition string.h:145
static V8_INLINE HandleType< String > Flatten(Isolate *isolate, HandleType< T > string, AllocationType allocation=AllocationType::kYoung)
static bool IsOneByteRepresentationUnderneath(Tagged< String > string)
Definition string-inl.h:373
int start
int end
#define ASSIGN_RETURN_ON_EXCEPTION(isolate, dst, call)
Definition isolate.h:291
#define THROW_NEW_ERROR(isolate, call)
Definition isolate.h:307
Label label
DirectHandle< FixedArray > capture_name_map
std::string pattern
ZoneVector< RpoNumber > & result
uint16_t uc16
Definition strings.h:18
Vector< const char > CStrVector(const char *data)
Definition vector.h:331
bool Is(IndirectHandle< U > value)
Definition handles-inl.h:51
void PrintF(const char *format,...)
Definition utils.cc:39
constexpr bool IsEitherUnicode(RegExpFlags f)
constexpr bool ShouldOptionallyStepBackToLeadSurrogate(RegExpFlags f)
void MemsetTagged(Tagged_t *start, Tagged< MaybeObject > value, size_t counter)
Definition slots-inl.h:486
V8_INLINE DirectHandle< T > direct_handle(Tagged< T > object, Isolate *isolate)
std::ostream & operator<<(std::ostream &os, AtomicMemoryOrder order)
const char * RegExpErrorString(RegExpError error)
V8_EXPORT_PRIVATE FlagValues v8_flags
RegExpError AnalyzeRegExp(Isolate *isolate, bool is_one_byte, RegExpFlags flags, RegExpNode *node)
void RegExpBytecodeDisassemble(const uint8_t *code_base, int length, const char *pattern)
constexpr int kStackSpaceRequiredForCompilation
Definition globals.h:207
too high values may cause the compiler to set high thresholds for inlining to as much as possible avoid inlined allocation of objects that cannot escape trace load stores from virtual maglev objects use TurboFan fast string builder analyze liveness of environment slots and zap dead values trace TurboFan load elimination emit data about basic block usage in builtins to this enable builtin reordering when run mksnapshot flag for emit warnings when applying builtin profile data verify register allocation in TurboFan randomly schedule instructions to stress dependency tracking enable store store elimination in TurboFan rewrite far to near simulate GC compiler thread race related to allow float parameters to be passed in simulator mode JS Wasm Run additional turbo_optimize_inlined_js_wasm_wrappers enable experimental feedback collection in generic lowering enable Turboshaft s WasmLoadElimination enable Turboshaft s low level load elimination for JS enable Turboshaft s escape analysis for string concatenation use enable Turbolev features that we want to ship in the not too far future trace individual Turboshaft reduction steps trace intermediate Turboshaft reduction steps invocation count threshold for early optimization Enables optimizations which favor memory size over execution speed Enables sampling allocation profiler with X as a sample interval min size of a semi the new space consists of two semi spaces max size of the Collect garbage after Collect garbage after keeps maps alive for< n > old space garbage collections print one detailed trace line in allocation gc speed threshold for starting incremental marking via a task in percent of available threshold for starting incremental marking immediately in percent of available Use a single schedule for determining a marking schedule between JS and C objects schedules the minor GC task with kUserVisible priority max worker number of concurrent for NumberOfWorkerThreads start background threads that allocate memory 
concurrent_array_buffer_sweeping use parallel threads to clear weak refs in the atomic pause trace progress of the incremental marking trace object counts and memory usage * MB
Definition flags.cc:2197
Tagged< To > Cast(Tagged< From > value, const v8::SourceLocation &loc=INIT_SOURCE_LOCATION_IN_DEBUG)
Definition casting.h:150
DirectHandle< String > subject_
#define FATAL(...)
Definition logging.h:47
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define CHECK_LE(lhs, rhs)
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493
#define DCHECK_GE(v1, v2)
Definition logging.h:488
#define CHECK_EQ(lhs, rhs)
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define DCHECK_GT(v1, v2)
Definition logging.h:487
#define USE(...)
Definition macros.h:293
ZoneVector< RegExpCapture * > * named_captures
Definition regexp.h:52
DirectHandle< Object > code
Definition regexp.h:40
RegExpCompilationTarget compilation_target
Definition regexp.h:69
#define V8_WARN_UNUSED_RESULT
Definition v8config.h:671
#define V8_UNLIKELY(condition)
Definition v8config.h:660
#define ZONE_NAME
Definition zone.h:22