v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
experimental-interpreter.cc
Go to the documentation of this file.
1// Copyright 2020 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
9#include "src/sandbox/check.h"
10
11namespace v8 {
12namespace internal {
13
14namespace {
15constexpr int kUndefinedRegisterValue = -1;
16constexpr int kUndefinedMatchIndexValue = -1;
17constexpr uint64_t kUndefinedClockValue = -1;
18
19template <class Character>
20bool SatisfiesAssertion(RegExpAssertion::Type type,
21 base::Vector<const Character> context, int position) {
22 DCHECK_LE(position, context.length());
24
25 switch (type) {
27 return position == 0;
29 return position == context.length();
31 if (position == 0) return true;
32 return unibrow::IsLineTerminator(context[position - 1]);
34 if (position == context.length()) return true;
35 return unibrow::IsLineTerminator(context[position]);
37 if (context.length() == 0) {
38 return false;
39 } else if (position == 0) {
40 return IsRegExpWord(context[position]);
41 } else if (position == context.length()) {
42 return IsRegExpWord(context[position - 1]);
43 } else {
44 return IsRegExpWord(context[position - 1]) !=
45 IsRegExpWord(context[position]);
46 }
48 return !SatisfiesAssertion(RegExpAssertion::Type::BOUNDARY, context,
49 position);
50 }
51}
52
53base::Vector<RegExpInstruction> ToInstructionVector(
55 const DisallowGarbageCollection& no_gc) {
56 RegExpInstruction* inst_begin =
57 reinterpret_cast<RegExpInstruction*>(raw_bytes->begin());
58 int inst_num = raw_bytes->length() / sizeof(RegExpInstruction);
59 DCHECK_EQ(sizeof(RegExpInstruction) * inst_num, raw_bytes->length());
60 return base::Vector<RegExpInstruction>(inst_begin, inst_num);
61}
62
63template <class Character>
64base::Vector<const Character> ToCharacterVector(
66
67template <>
68base::Vector<const uint8_t> ToCharacterVector<uint8_t>(
69 Tagged<String> str, const DisallowGarbageCollection& no_gc) {
70 DCHECK(str->IsFlat());
71 String::FlatContent content = str->GetFlatContent(no_gc);
72 DCHECK(content.IsOneByte());
73 return content.ToOneByteVector();
74}
75
76template <>
77base::Vector<const base::uc16> ToCharacterVector<base::uc16>(
78 Tagged<String> str, const DisallowGarbageCollection& no_gc) {
79 DCHECK(str->IsFlat());
80 String::FlatContent content = str->GetFlatContent(no_gc);
81 DCHECK(content.IsTwoByte());
82 return content.ToUC16Vector();
83}
84
85class FilterGroups {
86 public:
87 static base::Vector<int> Filter(
88 int pc, base::Vector<int> registers,
89 base::Vector<uint64_t> quantifiers_clocks,
90 base::Vector<uint64_t> capture_clocks,
91 std::optional<base::Vector<uint64_t>> lookaround_clocks,
92 base::Vector<int> filtered_registers,
93 base::Vector<const RegExpInstruction> bytecode, Zone* zone) {
94 /* Capture groups that were not traversed in the last iteration of a
95 * quantifier need to be discarded. In order to determine which groups need
96 * to be discarded, the interpreter maintains a clock, an internal count of
97 * bytecode instructions executed. Whenever it reaches a quantifier or
98 * a capture group, it records the current clock. After a match is found,
99 * the interpreter filters out capture groups that were defined in any other
100 * iteration than the last. To do so, it compares the last clock value of
101 * the group with the last clock value of its parent quantifier/group,
102 * keeping only groups that were defined after the parent quantifier/group
103 * last iteration. The structure of the bytecode used is explained in
104 * `FilterGroupsCompileVisitor` (experimental-compiler.cc). */
105
106 return FilterGroups(pc, bytecode, zone)
107 .Run(registers, quantifiers_clocks, capture_clocks, lookaround_clocks,
108 filtered_registers);
109 }
110
111 private:
112 FilterGroups(int pc, base::Vector<const RegExpInstruction> bytecode,
113 Zone* zone)
114 : pc_(pc),
115 max_clock_(0),
116 pc_stack_(zone),
117 max_clock_stack_(zone),
118 bytecode_(bytecode) {}
119
120 /* Goes back to the parent node, restoring pc_ and max_clock_. If already at
121 * the root of the tree, completes the filtering process. */
122 void Up() {
123 if (pc_stack_.size() > 0) {
124 pc_ = pc_stack_.top();
126 pc_stack_.pop();
127 max_clock_stack_.pop();
128 }
129 }
130
131 /* Increments pc_. When at the end of a node, goes back to the parent node. */
132 void IncrementPC() {
133 if (IsAtNodeEnd()) {
134 Up();
135 } else {
136 pc_++;
137 }
138 }
139
140 bool IsAtNodeEnd() {
141 return pc_ + 1 == bytecode_.length() ||
143 }
144
145 base::Vector<int> Run(base::Vector<int> registers_,
146 base::Vector<uint64_t> quantifiers_clocks_,
147 base::Vector<uint64_t> capture_clocks_,
148 std::optional<base::Vector<uint64_t>> lookaround_clocks,
149 base::Vector<int> filtered_registers_) {
150 pc_stack_.push(pc_);
151 max_clock_stack_.push(max_clock_);
152
153 while (!pc_stack_.empty()) {
154 auto instr = bytecode_[pc_];
155 switch (instr.opcode) {
157 // We only need to come back for the next instructions if we are at
158 // the end of the node.
159 if (!IsAtNodeEnd()) {
160 pc_stack_.push(pc_ + 1);
161 max_clock_stack_.push(max_clock_);
162 }
163
164 // Enter the child's node.
165 pc_ = instr.payload.pc;
166 break;
167
169 int group_id = instr.payload.group_id;
170
171 // Checks whether the captured group should be saved or discarded.
172 int register_id = 2 * group_id;
173 if (capture_clocks_[register_id] >= max_clock_ &&
174 capture_clocks_[register_id] != kUndefinedClockValue) {
175 filtered_registers_[register_id] = registers_[register_id];
176 filtered_registers_[register_id + 1] = registers_[register_id + 1];
177 IncrementPC();
178 } else {
179 // If the node should be discarded, all its children should be too.
180 // By going back to the parent, we don't visit the children, and
181 // therefore don't copy their registers.
182 Up();
183 }
184 break;
185 }
186
188 int quantifier_id = instr.payload.quantifier_id;
189
190 // Checks whether the quantifier should be saved or discarded.
191 if (quantifiers_clocks_[quantifier_id] >= max_clock_) {
192 max_clock_ = quantifiers_clocks_[quantifier_id];
193 IncrementPC();
194 } else {
195 // If the node should be discarded, all its children should be too.
196 // By going back to the parent, we don't visit the children, and
197 // therefore don't copy their registers.
198 Up();
199 }
200 break;
201 }
202
204 // Checks whether the lookaround should be saved or discarded.
205 if (!lookaround_clocks.has_value() ||
206 lookaround_clocks->at(instr.payload.lookaround_id) >=
207 max_clock_) {
208 // lookaround_clocks->at(instr.payload.lookaround_id);
209 IncrementPC();
210 } else {
211 // If the node should be discarded, all its children should be
212 // too. By going back to the parent, we don't visit the
213 // children, and therefore don't copy their registers.
214 Up();
215 }
216 break;
217 }
218
219 default:
220 UNREACHABLE();
221 }
222 }
223
224 return filtered_registers_;
225 }
226
227 int pc_;
228
229 // The last clock encountered (either from a quantifier or a capture group).
230 // Any groups whose clock is less then max_clock_ needs to be discarded.
231 uint64_t max_clock_;
232
233 // Stores pc_ and max_clock_ when the interpreter enters a node.
234 ZoneStack<int> pc_stack_;
235 ZoneStack<uint64_t> max_clock_stack_;
236
237 base::Vector<const RegExpInstruction> bytecode_;
238};
239
240template <class Character>
241class NfaInterpreter {
242 // Executes a bytecode program in breadth-first mode, without backtracking.
243 // `Character` can be instantiated with `uint8_t` or `base::uc16` for one byte
244 // or two byte input strings.
245 //
246 // In contrast to the backtracking implementation, this has linear time
247 // complexity in the length of the input string. Breadth-first mode means
248 // that threads are executed in lockstep with respect to their input
249 // position, i.e. the threads share a common input index. This is similar
250 // to breadth-first simulation of a non-deterministic finite automaton (nfa),
251 // hence the name of the class.
252 //
253 // To follow the semantics of a backtracking VM implementation, we have to be
254 // careful about whether we stop execution when a thread executes ACCEPT.
255 // For example, consider execution of the bytecode generated by the regexp
256 //
257 // r = /abc|..|[a-c]{10,}/
258 //
259 // on input "abcccccccccccccc". Clearly the three alternatives
260 // - /abc/
261 // - /../
262 // - /[a-c]{10,}/
263 // all match this input. A backtracking implementation will report "abc" as
264 // match, because it explores the first alternative before the others.
265 //
266 // However, if we execute breadth first, then we execute the 3 threads
267 // - t1, which tries to match /abc/
268 // - t2, which tries to match /../
269 // - t3, which tries to match /[a-c]{10,}/
270 // in lockstep i.e. by iterating over the input and feeding all threads one
271 // character at a time. t2 will execute an ACCEPT after two characters,
272 // while t1 will only execute ACCEPT after three characters. Thus we find a
273 // match for the second alternative before a match of the first alternative.
274 //
275 // This shows that we cannot always stop searching as soon as some thread t
276 // executes ACCEPT: If there is a thread u with higher priority than t, then
277 // it must be finished first. If u produces a match, then we can discard the
278 // match of t because matches produced by threads with higher priority are
279 // preferred over matches of threads with lower priority. On the other hand,
280 // we are allowed to abort all threads with lower priority than t if t
281 // produces a match: Such threads can only produce worse matches. In the
282 // example above, we can abort t3 after two characters because of t2's match.
283 //
284 // Thus the interpreter keeps track of a priority-ordered list of threads.
285 // If a thread ACCEPTs, all threads with lower priority are discarded, and
286 // the search continues with the threads with higher priority. If no threads
287 // with high priority are left, we return the match that was produced by the
288 // ACCEPTing thread with highest priority.
289 //
290 // The handling of lookarounds is split into two cases: either there are only
291 // captureless lookbehinds, or other lookarounds are present (capturing and/or
292 // lookaheads). The latter case require the
293 // `experimental_regexp_engine_capture_group_opt` flag to be set. The bytecode
294 // for both cases is similar, and the interpreter will choose which algorithm
295 // to use when iterating over the bytecode in the constructor.
296 //
297 // In the first case (caputreless lookbehinds), the interpreter will run each
298 // lookbehinds in a thread, in parrallel to the main expression, and those
299 // threads will write into the `lookbehind_table_` when they find a match.
300 //
301 // In the second case, before running the main expression, the interpreter
302 // builds a table indicating at each index of the input which lookaround does
303 // match, therefore having a size of input_size x lookaround_count. It is
304 // constructed before starting the search, by running each lookaround's
305 // automaton independently on the whole input. Since a lookaround may depend
306 // on others, it is imperative to run them in a correct order, starting from
307 // those not containing any other. This order is inferred from the order in
308 // which the lookarounds' automata appear in the bytecode. Once the table is
309 // completed, the interpreter runs the main expression's bytecode to find a
310 // match.
311 //
312 // The lookaround's automaton ends with the `WRITE_LOOKAROUND_TABLE`
313 // instruction, setting the corresponding boolean of the lookaround table to
314 // true. Since the compiler appends a /.*/ at the beginning of the
315 // lookaround's automaton, the search starts with exactly one thread, and
316 // destroys those after reaching the write instruction. To work on lookaheads,
317 // they need to be ran in reverse, such that they end their match on the input
318 // where they are required, and the compiler produces their automata reversed.
319 //
320 // Once the interpreter has found a match, it still needs to compute the
321 // captures from groups within lookarounds. To achieve this, it runs each
322 // lookaround's automaton on the index where it was matched, and recovers the
323 // resulting capture groups. Once again, the lookarounds need to be run in a
324 // particular order, from parents to children, such that the lookarounds
325 // requiring other ones can add those to the list of lookarounds to capture.
326 // Since the threads only retains the index where the lookaround matched,
327 // lookbehinds need to be reversed. Again the compiler produces their automata
328 // reversed.
329 //
330 // As the lookaround table construction and the capture require automata in
331 // different directions, the compiler produces two automata per lookaround,
332 // delimited by the `START_LOOKAROUND` and `END_LOOKAROUND` instructions, and
333 // with the separation occurring right after the `WRITE_LOOKAROUND_TABLE`
334 // instruction.
335 public:
336 NfaInterpreter(Isolate* isolate, RegExp::CallOrigin call_origin,
338 int register_count_per_match, Tagged<String> input,
339 int32_t input_index, Zone* zone)
340 : isolate_(isolate),
342 bytecode_object_(bytecode),
343 bytecode_(ToInstructionVector(bytecode, no_gc_)),
344 register_count_per_match_(register_count_per_match),
346 input_object_(input),
347 input_(ToCharacterVector<Character>(input, no_gc_)),
348 input_index_(input_index),
349 clock(0),
351 zone->AllocateArray<LastInputIndex>(bytecode->length()),
352 bytecode->length()),
353 active_threads_(0, zone),
354 blocked_threads_(0, zone),
358 quantifier_array_allocator_(std::nullopt),
359 capture_clock_array_allocator_(std::nullopt),
360 best_match_thread_(std::nullopt),
361 lookarounds_(0, zone),
362 lookaround_table_(std::nullopt),
363 lookbehind_table_(std::nullopt),
365 reverse_(false),
367 filter_groups_pc_(std::nullopt),
368 zone_(zone) {
369 DCHECK(!bytecode_.empty());
371 DCHECK_LE(input_index_, input_.length());
372
373 // Iterate over the bytecode to find the PC of the filtering
374 // instructions and lookarounds, and the number of quantifiers.
375 std::optional<struct Lookaround> lookaround;
376 bool in_lookaround = false;
377 int lookaround_index;
378 for (int i = 0; i < bytecode_.length() - 1; ++i) {
379 auto& inst = bytecode_[i];
380
381 if (inst.opcode == RegExpInstruction::START_LOOKAROUND) {
382 DCHECK(!lookaround.has_value());
383 in_lookaround = true;
384
385 // Stores the partial information for a lookaround. The rest will be
386 // determined upon reaching a `WRITE_LOOKAROUND_TABLE` instruction.
387 lookaround_index = inst.payload.lookaround.index();
388 lookaround = Lookaround{.match_pc = i,
389 .capture_pc = -1,
390 .type = inst.payload.lookaround.type()};
391
392 if (inst.payload.lookaround.type() ==
395 }
396 }
397
398 if (inst.opcode == RegExpInstruction::SET_REGISTER_TO_CP &&
399 in_lookaround) {
401 }
402
404 DCHECK(lookaround.has_value());
405
406 // Fills the current lookaround data.
407 lookaround->capture_pc = i + 1;
408
409 // Since the lookarounds are not in order in the `lookarounds_` array,
410 // we first fill it until it has the correct size.
411 while (lookarounds_.length() <= lookaround_index) {
413 }
414 lookarounds_.Set(lookaround_index, *lookaround);
415 lookaround = {};
416 }
417
418 if (inst.opcode == RegExpInstruction::END_LOOKAROUND) {
419 in_lookaround = false;
420 }
421
422 // The first `FILTER_*` instruction encountered is the start of the
423 // `FILTER_*` section.
424 if (!filter_groups_pc_.has_value() && RegExpInstruction::IsFilter(inst)) {
425 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
427 }
428
430 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
432 std::max(quantifier_count_, inst.payload.quantifier_id + 1);
433 }
434 }
435
436 // Iniitializes the lookaround truth table and required allocators.
438 lookbehind_table_.emplace(lookarounds_.length(), zone_);
439 lookbehind_table_->AddBlock(false, lookarounds_.length(), zone_);
440 } else {
441 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
442
445
446 lookaround_table_.emplace(zone_);
447 for (int i = lookarounds_.length() - 1; i >= 0; --i) {
448 lookaround_table_->emplace_back(input_.length() + 1, zone_);
449 }
450 }
451
452 // Precomputes the memory consumption of a single thread, to be used by
453 // `CheckMemoryConsumption()`.
454 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
457
459 register_count_per_match_ * sizeof(int) + // RegisterArray
460 quantifier_count_ * sizeof(uint64_t) + // QuantifierClockArray
461 register_count_per_match_ * sizeof(uint64_t) + // CaptureClockArray
462 lookarounds_.length() * sizeof(uint64_t) + // LookaroundClockArray
463 lookarounds_.length() * sizeof(int) + // LookaroundMatchIndexArray
464 sizeof(InterpreterThread);
465 }
466
467 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(),
468 LastInputIndex());
469 }
470
471 // Finds matches and writes their concatenated capture registers to
472 // `output_registers`. `output_registers[i]` has to be valid for all i <
473 // output_register_count. The search continues until all remaining matches
474 // have been found or there is no space left in `output_registers`. Returns
475 // the number of matches found.
476 int FindMatches(int32_t* output_registers, int output_register_count) {
477 const int max_match_num = output_register_count / register_count_per_match_;
478
479 if (!only_captureless_lookbehinds_) {
480 int err_code = FillLookaroundTable();
481 if (err_code != RegExp::kInternalRegExpSuccess) {
482 return err_code;
483 }
484 }
485
486 int match_num = 0;
487 while (match_num != max_match_num) {
488 int err_code = FindNextMatch();
489 if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
490
491 if (!FoundMatch()) break;
492
493 base::Vector<int> registers;
494
495 err_code = GetFilteredRegisters(*best_match_thread_, registers);
496 if (err_code != RegExp::kInternalRegExpSuccess) {
497 return err_code;
498 }
499
500 output_registers =
501 std::copy(registers.begin(), registers.end(), output_registers);
502
503 ++match_num;
504
505 const int match_begin = registers[0];
506 const int match_end = registers[1];
507 DCHECK_LE(match_begin, match_end);
508 const int match_length = match_end - match_begin;
509 if (match_length != 0) {
510 SetInputIndex(match_end);
511 } else if (match_end == input_.length()) {
512 // Zero-length match, input exhausted.
513 SetInputIndex(match_end);
514 break;
515 } else {
516 // Zero-length match, more input. We don't want to report more matches
517 // here endlessly, so we advance by 1.
518 SetInputIndex(match_end + 1);
519
520 // TODO(mbid,v8:10765): If we're in unicode mode, we have to advance to
521 // the next codepoint, not to the next code unit. See also
522 // `RegExpUtils::AdvanceStringIndex`.
524 }
525 }
526
527 return match_num;
528 }
529
530 private:
531 // The state of a "thread" executing experimental regexp bytecode. (Not to
532 // be confused with an OS thread.)
533 class InterpreterThread {
534 public:
535 enum class ConsumedCharacter { DidConsume, DidNotConsume };
536
537 InterpreterThread(int pc, int* register_array_begin,
538 int* lookaround_match_index_array_begin,
539 uint64_t* quantifier_clock_array_begin,
540 uint64_t* capture_clock_array_begin,
541 uint64_t* lookaround_clock_array_begin,
542 ConsumedCharacter consumed_since_last_quantifier)
543 : pc(pc),
548 captures_clock_array_begin(capture_clock_array_begin),
551
552 // This thread's program counter, i.e. the index within `bytecode_` of the
553 // next instruction to be executed.
554 int pc;
555 // Pointer to the array of registers, which is always size
556 // `register_count_per_match_`. Should be deallocated with
557 // `register_array_allocator_`.
559
560 // Pointer to an array containing the input index when the thread did match
561 // a lookaround. Should be deallocated with
562 // `lookaround_match_index_array_allocator_`.
564
565 // Pointer to an array containing the clock when the
566 // register/quantifier/lookaround was last saved. Should be deallocated
567 // with, respectively, `quantifier_array_allocator_`,
568 // `capture_clock_array_allocator_` and `lookaround_clock_array_allocator_`.
572
573 // Describe whether the thread consumed a character since it last entered a
574 // quantifier. Since quantifier iterations that match the empty string are
575 // not allowed, we need to distinguish threads that are allowed to exit a
576 // quantifier iteration from those that are not.
578 };
579
580 V8_WARN_UNUSED_RESULT int FillLookaroundTable() {
581 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
583
584 if (lookarounds_.is_empty()) {
586 }
587
588 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(),
589 LastInputIndex());
590
591 int old_input_index = input_index_;
592
593 for (int i = lookarounds_.length() - 1; i >= 0; --i) {
594 // Clean up left-over data from last iteration.
595 for (InterpreterThread t : blocked_threads_) {
596 DestroyThread(t);
597 }
598 blocked_threads_.Rewind(0);
599
600 for (InterpreterThread t : active_threads_) {
601 DestroyThread(t);
602 }
603 active_threads_.Rewind(0);
604
607 input_index_ = reverse_ ? input_.length() : 0;
608
609 active_threads_.Add(NewEmptyThread(lookarounds_.at(i).match_pc), zone_);
610
611 int err_code = RunActiveThreadsToEnd();
612 if (err_code != RegExp::kInternalRegExpSuccess) {
613 return err_code;
614 }
615 }
616
617 reverse_ = false;
619 input_index_ = old_input_index;
620
622 }
623
624 // Update the capture groups for matched lookarounds.
625 V8_WARN_UNUSED_RESULT int FillLookaroundCaptures(
626 InterpreterThread& main_thread) {
627 DCHECK(best_match_thread_.has_value());
628 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
629 DCHECK(!only_captureless_lookbehinds_);
630
631 if (lookarounds_.is_empty()) {
633 }
634
635 // We need to capture the lookarounds from parents to childrens, since we
636 // need the index on which the lookaround was matched, and those indexes are
637 // computed when the parent expression is captured.
638 for (int i = 0; i < lookarounds_.length(); ++i) {
639 if (GetLookaroundMatchIndexArray(main_thread)[i] ==
640 kUndefinedMatchIndexValue) {
641 continue;
642 }
643
644 Lookaround& lookaround = lookarounds_[i];
645
646 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(),
647 LastInputIndex());
648
649 // Clean up left-over data from last iteration.
650 for (InterpreterThread t : blocked_threads_) {
651 DestroyThread(t);
652 }
653 blocked_threads_.Rewind(0);
654
655 for (InterpreterThread t : active_threads_) {
656 DestroyThread(t);
657 }
658 active_threads_.Rewind(0);
659
660 best_match_thread_ = std::nullopt;
661
662 reverse_ = lookaround.type == RegExpLookaround::Type::LOOKBEHIND;
663 input_index_ = GetLookaroundMatchIndexArray(main_thread)[i];
664
665 // We reuse the same thread as initial thread, to avoid having to merge
666 // the new `best_match_thread_` with the previous results.
667 main_thread.pc = lookaround.capture_pc;
668 main_thread.consumed_since_last_quantifier =
669 InterpreterThread::ConsumedCharacter::DidConsume;
670 active_threads_.Add(main_thread, zone_);
671
672 int err_code = RunActiveThreadsToEnd();
673 if (err_code != RegExp::kInternalRegExpSuccess) {
674 return err_code;
675 }
676
677 // The lookaround has already been matched once on this position during
678 // the match research.
679 DCHECK(best_match_thread_.has_value());
680 main_thread = *best_match_thread_;
681 }
682
684 }
685
686 V8_WARN_UNUSED_RESULT int RunActiveThreadsToEnd() {
687 // Run the initial thread, potentially forking new threads, until every
688 // thread is blocked without further input.
689 RunActiveThreads();
690
691 // We stop if one of the following conditions hold:
692 // - We have exhausted the entire input.
693 // - We have found a match at some point, and there are no remaining
694 // threads with higher priority than the thread that produced the match.
695 // Threads with low priority have been aborted earlier, and the remaining
696 // threads are blocked here, so the latter simply means that
697 // `blocked_threads_` is empty.
698 while ((reverse_ ? ((0 < input_index_ && input_index_ <= input_.length()))
699 : (0 <= input_index_ && input_index_ < input_.length())) &&
700 !(FoundMatch() && blocked_threads_.is_empty())) {
701 DCHECK(active_threads_.is_empty());
702
703 if (lookbehind_table_.has_value()) {
704 std::fill(lookbehind_table_->begin(), lookbehind_table_->end(), false);
705 }
706
707 if (reverse_) {
708 --input_index_;
709 }
710
711 base::uc16 input_char = input_[input_index_];
712
713 if (!reverse_) {
714 ++input_index_;
715 }
716
717 static constexpr int kTicksBetweenInterruptHandling = 64;
718 if (input_index_ % kTicksBetweenInterruptHandling == 0) {
719 int err_code = HandleInterrupts();
720 if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
721 }
722
723 // We unblock all blocked_threads_ by feeding them the input char.
724 FlushBlockedThreads(input_char);
725
726 // Run all threads until they block or accept.
727 RunActiveThreads();
728 }
729
731 }
732
733 // Handles pending interrupts if there are any. Returns
734 // RegExp::kInternalRegExpSuccess if execution can continue, and an error
735 // code otherwise.
736 V8_WARN_UNUSED_RESULT int HandleInterrupts() {
737 StackLimitCheck check(isolate_);
738 if (call_origin_ == RegExp::CallOrigin::kFromJs) {
739 // Direct calls from JavaScript can be interrupted in two ways:
740 // 1. A real stack overflow, in which case we let the caller throw the
741 // exception.
742 // 2. The stack guard was used to interrupt execution for another purpose,
743 // forcing the call through the runtime system.
744 if (check.JsHasOverflowed()) {
746 } else if (check.InterruptRequested()) {
748 }
749 } else {
751 HandleScope handles(isolate_);
752 DirectHandle<TrustedByteArray> bytecode_handle(bytecode_object_,
753 isolate_);
754 DirectHandle<String> input_handle(input_object_, isolate_);
755
756 if (check.JsHasOverflowed()) {
757 // We abort the interpreter now anyway, so gc can't invalidate any
758 // pointers.
760 isolate_->StackOverflow();
762 } else if (check.InterruptRequested()) {
763 // TODO(mbid): Is this really equivalent to whether the string is
764 // one-byte or two-byte? A comment at the declaration of
765 // IsOneByteRepresentationUnderneath says that this might fail for
766 // external strings.
767 const bool was_one_byte =
769
771 {
773 result = isolate_->stack_guard()->HandleInterrupts();
774 }
775 if (IsException(result, isolate_)) {
777 }
778
779 // If we changed between a LATIN1 and a UC16 string, we need to restart
780 // regexp matching with the appropriate template instantiation of
781 // RawMatch.
783 was_one_byte) {
785 }
786
787 // Update objects and pointers in case they have changed during gc.
788 bytecode_object_ = *bytecode_handle;
789 bytecode_ = ToInstructionVector(bytecode_object_, no_gc_);
790 input_object_ = *input_handle;
791 input_ = ToCharacterVector<Character>(input_object_, no_gc_);
792 }
793 }
795 }
796
797 // Change the current input index for future calls to `FindNextMatch`.
798 void SetInputIndex(int new_input_index) {
799 DCHECK_GE(new_input_index, 0);
800 DCHECK_LE(new_input_index, input_.length());
801
802 input_index_ = new_input_index;
803 }
804
805 // Find the next match and return the corresponding capture registers and
806 // write its capture registers to `best_match_thread_`. The search starts
807 // at the current `input_index_`. Returns RegExp::kInternalRegExpSuccess if
808 // execution could finish regularly (with or without a match) and an error
809 // code due to interrupt otherwise.
810 int FindNextMatch() {
811 DCHECK(active_threads_.is_empty());
812 // TODO(mbid,v8:10765): Can we get around resetting `pc_last_input_index_`
813 // here? As long as
814 //
815 // pc_last_input_index_[pc] < input_index_
816 //
817 // for all possible program counters pc that are reachable without input
818 // from pc = 0 and
819 //
820 // pc_last_input_index_[k] <= input_index_
821 //
822 // for all k > 0 hold I think everything should be fine. Maybe we can do
823 // something about this in `SetInputIndex`.
824 std::fill(pc_last_input_index_.begin(), pc_last_input_index_.end(),
825 LastInputIndex());
826
827 // Clean up left-over data from a previous call to FindNextMatch.
828 for (InterpreterThread t : blocked_threads_) {
829 DestroyThread(t);
830 }
831 blocked_threads_.Rewind(0);
832
833 for (InterpreterThread t : active_threads_) {
834 DestroyThread(t);
835 }
836 active_threads_.Rewind(0);
837
838 if (best_match_thread_.has_value()) {
839 DestroyThread(*best_match_thread_);
840 best_match_thread_ = std::nullopt;
841 }
842
843 active_threads_.Add(NewEmptyThread(0), zone_);
844
845 if (only_captureless_lookbehinds_) {
846 for (int i = 0; i < lookarounds_.length(); ++i) {
847 active_threads_.Add(NewEmptyThread(lookarounds_.at(i).match_pc), zone_);
848 }
849 }
850
851 int err_code = RunActiveThreadsToEnd();
852 if (err_code != RegExp::kInternalRegExpSuccess) {
853 return err_code;
854 }
855
857 }
858
859 // Run an active thread `t` until it executes a CONSUME_RANGE or ACCEPT
860 // or RANGE_COUNT instruction, or its PC value was already processed.
861 // - If processing of `t` can't continue because of CONSUME_RANGE or
862 // RANGE_COUNT, it is pushed on `blocked_threads_`.
863 // - If `t` executes ACCEPT, set `best_match` according to `t.match_begin` and
864 // the current input index. All remaining `active_threads_` are discarded.
865 int RunActiveThread(InterpreterThread t) {
866 while (true) {
867 SBXCHECK_GE(t.pc, 0);
868 SBXCHECK_LT(t.pc, bytecode_.size());
869
870 ++clock;
871
872 // Since the clock is a `uint64_t`, it is almost guaranteed
873 // not to overflow. An `uint64_t` being at least 64 bits, it
874 // would take at least a hundred years to overflow if the clock was
875 // incremented at each cycle of a 3 GHz processor.
876 DCHECK_GT(clock, 0);
877
878 if (IsPcProcessed(t.pc, t.consumed_since_last_quantifier)) {
879 DestroyThread(t);
881 }
882 MarkPcProcessed(t.pc, t.consumed_since_last_quantifier);
883
884 RegExpInstruction inst = bytecode_[t.pc];
885
886 switch (inst.opcode) {
889 blocked_threads_.Add(t, zone_);
891 }
893 if (!SatisfiesAssertion(inst.payload.assertion_type, input_,
894 input_index_)) {
895 DestroyThread(t);
897 }
898 ++t.pc;
899 break;
901 InterpreterThread fork = NewUninitializedThread(inst.payload.pc);
902 fork.consumed_since_last_quantifier =
903 t.consumed_since_last_quantifier;
904
905 base::Vector<int> fork_registers = GetRegisterArray(fork);
906 base::Vector<int> t_registers = GetRegisterArray(t);
907 DCHECK_EQ(fork_registers.length(), t_registers.length());
908 std::copy(t_registers.begin(), t_registers.end(),
909 fork_registers.begin());
910
911 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
912 base::Vector<uint64_t> fork_quantifier_clocks =
913 GetQuantifierClockArray(fork);
914 base::Vector<uint64_t> t_fork_quantifier_clocks =
915 GetQuantifierClockArray(t);
916 DCHECK_EQ(fork_quantifier_clocks.length(),
917 t_fork_quantifier_clocks.length());
918 std::copy(t_fork_quantifier_clocks.begin(),
919 t_fork_quantifier_clocks.end(),
920 fork_quantifier_clocks.begin());
921
922 base::Vector<uint64_t> fork_capture_clocks =
923 GetCaptureClockArray(fork);
924 base::Vector<uint64_t> t_fork_capture_clocks =
925 GetCaptureClockArray(t);
926 DCHECK_EQ(fork_capture_clocks.length(),
927 t_fork_capture_clocks.length());
928 std::copy(t_fork_capture_clocks.begin(),
929 t_fork_capture_clocks.end(), fork_capture_clocks.begin());
930
931 if (!only_captureless_lookbehinds_) {
932 base::Vector<int> fork_lookaround_match_index =
933 GetLookaroundMatchIndexArray(fork);
934 base::Vector<int> t_fork_lookaround_match_index =
935 GetLookaroundMatchIndexArray(t);
936 DCHECK_EQ(fork_lookaround_match_index.length(),
937 t_fork_lookaround_match_index.length());
938 std::copy(t_fork_lookaround_match_index.begin(),
939 t_fork_lookaround_match_index.end(),
940 fork_lookaround_match_index.begin());
941
942 base::Vector<uint64_t> fork_lookaround_clocks =
943 GetLookaroundClockArray(fork);
944 base::Vector<uint64_t> t_fork_lookaround_clocks =
945 GetLookaroundClockArray(t);
946 DCHECK_EQ(fork_lookaround_clocks.length(),
947 t_fork_lookaround_clocks.length());
948 std::copy(t_fork_lookaround_clocks.begin(),
949 t_fork_lookaround_clocks.end(),
950 fork_lookaround_clocks.begin());
951 }
952 }
953
954 active_threads_.Add(fork, zone_);
955
956 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
957 int err_code = CheckMemoryConsumption();
958 if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
959 }
960
961 ++t.pc;
962 break;
963 }
965 t.pc = inst.payload.pc;
966 break;
968 if (best_match_thread_.has_value()) {
969 DestroyThread(*best_match_thread_);
970 }
972
973 for (InterpreterThread s : active_threads_) {
974 DestroyThread(s);
975 }
976 active_threads_.Rewind(0);
979 GetQuantifierClockArray(t)[inst.payload.quantifier_id] = clock;
980 ++t.pc;
981 break;
982
984 SBXCHECK_BOUNDS(inst.payload.register_index,
985 register_count_per_match_);
986 GetRegisterArray(t)[inst.payload.register_index] =
987 kUndefinedRegisterValue;
988 ++t.pc;
989 break;
991 SBXCHECK_BOUNDS(inst.payload.register_index,
992 register_count_per_match_);
993 GetRegisterArray(t)[inst.payload.register_index] = input_index_;
994 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
995 GetCaptureClockArray(t)[inst.payload.register_index] = clock;
996 }
997 ++t.pc;
998 break;
1003 UNREACHABLE();
1005 t.consumed_since_last_quantifier =
1006 InterpreterThread::ConsumedCharacter::DidNotConsume;
1007 ++t.pc;
1008 break;
1010 // If the thread did not consume any character during a whole
1011 // quantifier iteration,then it must be destroyed, since quantifier
1012 // repetitions are not allowed to match the empty string.
1013 if (t.consumed_since_last_quantifier ==
1014 InterpreterThread::ConsumedCharacter::DidNotConsume) {
1015 DestroyThread(t);
1017 }
1018 ++t.pc;
1019 break;
1021 ++t.pc;
1022 break;
1023
1025 if (best_match_thread_.has_value()) {
1026 DestroyThread(*best_match_thread_);
1027 }
1029
1030 for (InterpreterThread s : active_threads_) {
1031 DestroyThread(s);
1032 }
1033 active_threads_.Rewind(0);
1036 // Reaching this instruction means that the current lookaround thread
1037 // has found a match and needs to be destroyed. Since the lookaround
1038 // is verified at this position, we update the `lookaround_table_`.
1039
1040 if (!only_captureless_lookbehinds_) {
1041 SBXCHECK_BOUNDS(current_lookaround_, lookaround_table_->size());
1042 (*lookaround_table_)[current_lookaround_][input_index_] = true;
1043 } else {
1044 SBXCHECK_BOUNDS(inst.payload.lookaround_id,
1045 lookbehind_table_->length());
1046 lookbehind_table_->Set(inst.payload.lookaround_id, true);
1047 }
1048
1049 DestroyThread(t);
1052 // Destroy the thread if the corresponding lookaround did or did not
1053 // complete a match at the current position (depending on whether or
1054 // not the lookaround is positive). The lookaround priority list
1055 // ensures that all the relevant lookarounds has already been run.
1056
1057 if (!only_captureless_lookbehinds_) {
1058 SBXCHECK_BOUNDS(inst.payload.lookaround.index(),
1059 lookaround_table_->size());
1060
1061 if ((*lookaround_table_)[inst.payload.lookaround.index()]
1062 [input_index_] !=
1063 inst.payload.lookaround.is_positive()) {
1064 DestroyThread(t);
1066 }
1067
1068 // Store the match informations for positive lookarounds.
1069 if (inst.payload.lookaround.is_positive() &&
1070 !only_captureless_lookbehinds_) {
1071 GetLookaroundClockArray(t)[inst.payload.lookaround.index()] =
1072 clock;
1073 GetLookaroundMatchIndexArray(t)[inst.payload.lookaround.index()] =
1075 }
1076
1077 ++t.pc;
1078 break;
1079 } else {
1080 const int32_t lookbehind_index = inst.payload.lookaround.index();
1081 SBXCHECK_BOUNDS(lookbehind_index, lookbehind_table_->length());
1082
1083 if (lookbehind_table_->at(lookbehind_index) !=
1084 inst.payload.lookaround.is_positive()) {
1085 DestroyThread(t);
1087 }
1088
1089 ++t.pc;
1090 break;
1091 }
1092 }
1093 }
1094 }
1095
1096 // Run each active thread until it can't continue without further input.
1097 // `active_threads_` is empty afterwards. `blocked_threads_` are sorted from
1098 // low to high priority.
1099 int RunActiveThreads() {
1100 while (!active_threads_.is_empty()) {
1101 int err_code = RunActiveThread(active_threads_.RemoveLast());
1102 if (err_code != RegExp::kInternalRegExpSuccess) return err_code;
1103 }
1104
1106 }
1107
1108 // Unblock all blocked_threads_ by feeding them an `input_char`. Should only
1109 // be called with `input_index_` pointing to the character *after*
1110 // `input_char` so that `pc_last_input_index_` is updated correctly.
1111 void FlushBlockedThreads(base::uc16 input_char) {
1112 // The threads in blocked_threads_ are sorted from high to low priority,
1113 // but active_threads_ needs to be sorted from low to high priority, so we
1114 // need to activate blocked threads in reverse order.
1115 for (int i = blocked_threads_.length() - 1; i >= 0; --i) {
1116 InterpreterThread t = blocked_threads_[i];
1117 // Number of ranges to check.
1118 int32_t ranges = 1;
1119 // Consume success.
1120 bool has_matched = false;
1121
1122 if (bytecode_[t.pc].opcode == RegExpInstruction::RANGE_COUNT) {
1123 ranges = bytecode_[t.pc].payload.num_ranges;
1124 ++t.pc;
1125 }
1126
1127 // pc of the instruction after all ranges.
1128 int next_pc = t.pc + ranges;
1129
1130 // Checking all ranges.
1131 for (int pc = t.pc; pc < next_pc; ++pc) {
1132 RegExpInstruction inst = bytecode_[pc];
1134 RegExpInstruction::Uc16Range range = inst.payload.consume_range;
1135 if (input_char >= range.min && input_char <= range.max) {
1136 // The current char matches the current range.
1137 t.pc = next_pc;
1138 t.consumed_since_last_quantifier =
1139 InterpreterThread::ConsumedCharacter::DidConsume;
1140 has_matched = true;
1141 break;
1142 }
1143 }
1144 if (has_matched) {
1145 active_threads_.Add(t, zone_);
1146 } else {
1147 DestroyThread(t);
1148 }
1149 }
1150 blocked_threads_.Rewind(0);
1151 }
1152
1153 bool FoundMatch() const { return best_match_thread_.has_value(); }
1154
1155 size_t ApproximateTotalMemoryUsage() {
1156 return (blocked_threads_.length() + active_threads_.length()) *
1158 }
1159
1160 // Checks that the approximative memory usage does not go past a fixed
1161 // threshold. Returns the appropriate error code.
1162 int CheckMemoryConsumption() {
1163 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1164
1165 // Copmputes an approximation of the total current memory usage of the
1166 // intepreter. It is based only on the threads' consumption, since the rest
1167 // is negligible in comparison.
1168 uint64_t approx = (blocked_threads_.length() + active_threads_.length()) *
1170
1171 return (approx <
1172 v8_flags.experimental_regexp_engine_capture_group_opt_max_memory_usage *
1173 MB)
1176 }
1177
1178 base::Vector<int> GetRegisterArray(InterpreterThread t) {
1179 return base::Vector<int>(t.register_array_begin, register_count_per_match_);
1180 }
1181 base::Vector<int> GetLookaroundMatchIndexArray(InterpreterThread t) {
1182 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1183 DCHECK_NOT_NULL(t.lookaround_match_index_array_begin);
1184
1185 return base::Vector<int>(t.lookaround_match_index_array_begin,
1186 lookaround_table_->size());
1187 }
1188
1189 base::Vector<uint64_t> GetQuantifierClockArray(InterpreterThread t) {
1190 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1191 DCHECK_NOT_NULL(t.captures_clock_array_begin);
1192
1193 return base::Vector<uint64_t>(t.quantifier_clock_array_begin,
1194 quantifier_count_);
1195 }
1196 base::Vector<uint64_t> GetCaptureClockArray(InterpreterThread t) {
1197 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1198 DCHECK_NOT_NULL(t.captures_clock_array_begin);
1199
1200 return base::Vector<uint64_t>(t.captures_clock_array_begin,
1201 register_count_per_match_);
1202 }
1203 base::Vector<uint64_t> GetLookaroundClockArray(InterpreterThread t) {
1204 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1205 DCHECK_NOT_NULL(t.lookaround_clock_array_begin);
1206
1207 return base::Vector<uint64_t>(t.lookaround_clock_array_begin,
1208 lookaround_table_->size());
1209 }
1210
1211 int* NewRegisterArrayUninitialized() {
1212 return register_array_allocator_.allocate(register_count_per_match_);
1213 }
1214
1215 int* NewRegisterArray(int fill_value) {
1216 int* array_begin = NewRegisterArrayUninitialized();
1217 int* array_end = array_begin + register_count_per_match_;
1218 std::fill(array_begin, array_end, fill_value);
1219 return array_begin;
1220 }
1221
1222 void FreeRegisterArray(int* register_array_begin) {
1224 register_count_per_match_);
1225 }
1226
1227 int* NewLookaroundMatchIndexArrayUninitialized() {
1228 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1230 lookaround_table_->size());
1231 }
1232
1233 int* NewLookaroundMatchIndexArray(int fill_value) {
1234 int* array_begin = NewLookaroundMatchIndexArrayUninitialized();
1235 int* array_end = array_begin + lookaround_table_->size();
1236 std::fill(array_begin, array_end, fill_value);
1237 return array_begin;
1238 }
1239
1240 void FreeLookaroundMatchIndexArray(int* lookaround_match_index_array_begin) {
1241 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1244 }
1245
1246 uint64_t* NewQuantifierClockArrayUninitialized() {
1247 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1248 return quantifier_array_allocator_->allocate(quantifier_count_);
1249 }
1250
1251 uint64_t* NewQuantifierClockArray(uint64_t fill_value) {
1252 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1253
1254 uint64_t* array_begin = NewQuantifierClockArrayUninitialized();
1255 uint64_t* array_end = array_begin + quantifier_count_;
1256 std::fill(array_begin, array_end, fill_value);
1257 return array_begin;
1258 }
1259
1260 void FreeQuantifierClockArray(uint64_t* quantifier_clock_array_begin) {
1261 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1263 quantifier_count_);
1264 }
1265
1266 uint64_t* NewCaptureClockArrayUninitialized() {
1267 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1268 return capture_clock_array_allocator_->allocate(register_count_per_match_);
1269 }
1270
1271 uint64_t* NewCaptureClockArray(uint64_t fill_value) {
1272 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1273 uint64_t* array_begin = NewCaptureClockArrayUninitialized();
1274 uint64_t* array_end = array_begin + register_count_per_match_;
1275 std::fill(array_begin, array_end, fill_value);
1276 return array_begin;
1277 }
1278
1279 void FreeCaptureClockArray(uint64_t* capture_clock_array_begin) {
1280 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1281 capture_clock_array_allocator_->deallocate(capture_clock_array_begin,
1282 register_count_per_match_);
1283 }
1284
1285 uint64_t* NewLookaroundClockArrayUninitialized() {
1286 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1287 return lookaround_clock_array_allocator_->allocate(
1288 lookaround_table_->size());
1289 }
1290
1291 uint64_t* NewLookaroundClockArray(uint64_t fill_value) {
1292 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1293 uint64_t* array_begin = NewLookaroundClockArrayUninitialized();
1294 uint64_t* array_end = array_begin + lookaround_table_->size();
1295 std::fill(array_begin, array_end, fill_value);
1296 return array_begin;
1297 }
1298
1299 void FreeLookaroundClockArray(uint64_t* lookaround_clock_array_begin) {
1300 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1302 lookaround_table_->size());
1303 }
1304
1305 // Creates an `InterpreterThread` at the given pc and allocates its arrays.
1306 // The register array is initialized to `kUndefinedRegisterValue`. The clocks'
1307 // arrays are set to `nullptr` if irrelevant, or initialized to 0.
1308 InterpreterThread NewEmptyThread(int pc) {
1309 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
1310 return InterpreterThread(
1311 pc, NewRegisterArray(kUndefinedRegisterValue),
1312 only_captureless_lookbehinds_
1313 ? nullptr
1314 : NewLookaroundMatchIndexArray(kUndefinedMatchIndexValue),
1315 NewQuantifierClockArray(0), NewCaptureClockArray(0),
1316 only_captureless_lookbehinds_ ? nullptr : NewLookaroundClockArray(0),
1317 InterpreterThread::ConsumedCharacter::DidConsume);
1318 } else {
1319 return InterpreterThread(
1320 pc, NewRegisterArray(kUndefinedRegisterValue), nullptr, nullptr,
1321 nullptr, nullptr, InterpreterThread::ConsumedCharacter::DidConsume);
1322 }
1323 }
1324
1325 // Creates an `InterpreterThread` at the given pc and allocates its arrays.
1326 // The clocks' arrays are set to `nullptr` if irrelevant. All arrays are left
1327 // uninitialized.
1328 InterpreterThread NewUninitializedThread(int pc) {
1329 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
1330 return InterpreterThread(
1331 pc, NewRegisterArrayUninitialized(),
1332 only_captureless_lookbehinds_
1333 ? nullptr
1334 : NewLookaroundMatchIndexArrayUninitialized(),
1335 NewQuantifierClockArrayUninitialized(),
1336 NewCaptureClockArrayUninitialized(),
1337 only_captureless_lookbehinds_
1338 ? nullptr
1339 : NewLookaroundClockArrayUninitialized(),
1340 InterpreterThread::ConsumedCharacter::DidConsume);
1341 } else {
1342 return InterpreterThread(
1343 pc, NewRegisterArrayUninitialized(), nullptr, nullptr, nullptr,
1344 nullptr, InterpreterThread::ConsumedCharacter::DidConsume);
1345 }
1346 }
1347
1348 V8_WARN_UNUSED_RESULT int GetFilteredRegisters(
1349 InterpreterThread main_thread, base::Vector<int>& filtered_registers) {
1350 if (!only_captureless_lookbehinds_) {
1351 int err_code = FillLookaroundCaptures(main_thread);
1352 if (err_code != RegExp::kInternalRegExpSuccess) {
1353 return err_code;
1354 }
1355 }
1356
1357 base::Vector<int> registers = GetRegisterArray(main_thread);
1358
1359 if (filter_groups_pc_.has_value()) {
1360 filtered_registers = base::Vector<int>(
1361 NewRegisterArray(kUndefinedRegisterValue), register_count_per_match_);
1362
1363 filtered_registers[0] = registers[0];
1364 filtered_registers[1] = registers[1];
1365
1366 filtered_registers = FilterGroups::Filter(
1367 *filter_groups_pc_, GetRegisterArray(main_thread),
1368 GetQuantifierClockArray(main_thread),
1369 GetCaptureClockArray(main_thread),
1370 only_captureless_lookbehinds_
1371 ? std::nullopt
1372 : std::optional(GetLookaroundClockArray(main_thread)),
1373 filtered_registers, bytecode_, zone_);
1374 } else {
1375 filtered_registers = registers;
1376 }
1377
1379 }
1380
1381 void DestroyThread(InterpreterThread t) {
1382 FreeRegisterArray(t.register_array_begin);
1383
1384 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
1385 FreeQuantifierClockArray(t.quantifier_clock_array_begin);
1386 FreeCaptureClockArray(t.captures_clock_array_begin);
1387
1388 if (!only_captureless_lookbehinds_) {
1389 FreeLookaroundClockArray(t.lookaround_clock_array_begin);
1390 FreeLookaroundMatchIndexArray(t.lookaround_match_index_array_begin);
1391 }
1392 }
1393 }
1394
1395 // It is redundant to have two threads t, t0 execute at the same PC and
1396 // consumed_since_last_quantifier values, because one of t, t0 matches iff the
1397 // other does. We can thus discard the one with lower priority. We check
1398 // whether a thread executed at some PC value by recording for every possible
1399 // value of PC what the value of input_index_ was the last time a thread
1400 // executed at PC. If a thread tries to continue execution at a PC value that
1401 // we have seen before at the current input index, we abort it. (We execute
1402 // threads with higher priority first, so the second thread is guaranteed to
1403 // have lower priority.)
1404 //
1405 // Check whether we've seen an active thread with a given pc and
1406 // consumed_since_last_quantifier value since the last increment of
1407 // `input_index_`.
1408 bool IsPcProcessed(int pc, typename InterpreterThread::ConsumedCharacter
1411 case InterpreterThread::ConsumedCharacter::DidConsume:
1412 return pc_last_input_index_[pc].having_consumed_character ==
1414 case InterpreterThread::ConsumedCharacter::DidNotConsume:
1415 return pc_last_input_index_[pc].not_having_consumed_character ==
1417 }
1418 }
1419
1420 // Mark a pc as having been processed since the last increment of
1421 // `input_index_`.
1422 void MarkPcProcessed(int pc, typename InterpreterThread::ConsumedCharacter
1425 case InterpreterThread::ConsumedCharacter::DidConsume:
1426 pc_last_input_index_[pc].having_consumed_character = input_index_;
1427 break;
1428 case InterpreterThread::ConsumedCharacter::DidNotConsume:
1429 pc_last_input_index_[pc].not_having_consumed_character = input_index_;
1430 break;
1431 }
1432 }
1433
1434 Isolate* const isolate_;
1435
1437
1439
1441 base::Vector<const RegExpInstruction> bytecode_;
1442
1443 // Number of registers used per thread.
1445
1446 // Number of quantifiers in the regexp.
1448
1450 base::Vector<const Character> input_;
1452
1453 // Global clock counting the total of executed instructions.
1454 uint64_t clock;
1455
1456 // Stores the last input index at which a thread was activated for a given pc.
1457 // Two values are stored, depending on the value
1458 // consumed_since_last_quantifier of the thread.
1459 class LastInputIndex {
1460 public:
1461 LastInputIndex() : LastInputIndex(-1, -1) {}
1462 LastInputIndex(int having_consumed_character,
1466
1469 };
1470
1471 // pc_last_input_index_[k] records the values of input_index_ the last
1472 // time a thread t such that t.pc == k was activated for both values of
1473 // consumed_since_last_quantifier. Thus pc_last_input_index.size() ==
1474 // bytecode.size(). See also `RunActiveThread`.
1475 base::Vector<LastInputIndex> pc_last_input_index_;
1476
1477 // Active threads can potentially (but not necessarily) continue without
1478 // input. Sorted from low to high priority.
1479 ZoneList<InterpreterThread> active_threads_;
1480
1481 // The pc of a blocked thread points to an instruction that consumes a
1482 // character. Sorted from high to low priority (so the opposite of
1483 // `active_threads_`).
1484 ZoneList<InterpreterThread> blocked_threads_;
1485
1486 // RecyclingZoneAllocator maintains a linked list through freed allocations
1487 // for reuse if possible.
1488 RecyclingZoneAllocator<int> register_array_allocator_;
1489 std::optional<RecyclingZoneAllocator<int>>
1491 std::optional<RecyclingZoneAllocator<uint64_t>>
1493 std::optional<RecyclingZoneAllocator<uint64_t>> quantifier_array_allocator_;
1494 std::optional<RecyclingZoneAllocator<uint64_t>>
1496
1497 std::optional<InterpreterThread> best_match_thread_;
1498
1499 struct Lookaround {
1503 };
1504
1505 // Stores the match pc, capture pc and direction of each lookaround,
1506 // mapped by lookaround id. Computed during the NFA instantiation (see the
1507 // constructor). It also serves as a priority list: the compilation ensures
1508 // that a lookaround's bytecode appears before the bytecode of all the
1509 // lookarounds it contains. Thus lookarounds must be run from the last to the
1510 // first (regarding their appearance order) to never execute a lookaround
1511 // before one of its child.
1512 ZoneList<Lookaround> lookarounds_;
1513
1514 // Truth table for the lookarounds. lookaround_table_[l][r] indicates
1515 // whether the lookaround of index l did complete a match on the position
1516 // r.
1517 // Only used when `only_captureless_lookbehinds_` is false.
1518 std::optional<ZoneVector<ZoneVector<bool>>> lookaround_table_;
1519
1520 // Truth table for the lookbehinds. lookbehind_table_[k] indicates whether
1521 // the lookbehind of index k did complete a match on the current position.
1522 // Only used when `only_captureless_lookbehinds_` is true.
1523 std::optional<ZoneList<bool>> lookbehind_table_;
1524
1525 // This indicates whether the regexp only contains captureless lookbehinds. In
1526 // that case, it can benefit from a huge optimization, and is available
1527 // without the `--experimental_regexp_engine_capture_group_opt` flag.
1529
1530 // Whether we are traversing the input from end to start.
1532
1533 // When computing the `lookaround_table_`, the id of the lookaround
1534 // currently being ran.
1536
1537 // PC of the first FILTER_* instruction. Computed during the NFA
1538 // instantiation (see the constructor). May be empty if their are no such
1539 // instructions (in the case where there are no capture groups or
1540 // quantifiers).
1541 std::optional<int> filter_groups_pc_;
1542
1544
1546};
1547
1548} // namespace
1549
1552 Tagged<TrustedByteArray> bytecode, int register_count_per_match,
1553 Tagged<String> input, int start_index, int32_t* output_registers,
1554 int output_register_count, Zone* zone) {
1555 DCHECK(input->IsFlat());
1557
1558 if (input->GetFlatContent(no_gc).IsOneByte()) {
1559 NfaInterpreter<uint8_t> interpreter(isolate, call_origin, bytecode,
1560 register_count_per_match, input,
1561 start_index, zone);
1562 return interpreter.FindMatches(output_registers, output_register_count);
1563 } else {
1564 DCHECK(input->GetFlatContent(no_gc).IsTwoByte());
1565 NfaInterpreter<base::uc16> interpreter(isolate, call_origin, bytecode,
1566 register_count_per_match, input,
1567 start_index, zone);
1568 return interpreter.FindMatches(output_registers, output_register_count);
1569 }
1570}
1571
1572} // namespace internal
1573} // namespace v8
Isolate * isolate_
friend Zone
Definition asm-types.cc:195
#define SBXCHECK_LT(lhs, rhs)
Definition check.h:66
#define SBXCHECK_BOUNDS(index, limit)
Definition check.h:68
#define SBXCHECK_GE(lhs, rhs)
Definition check.h:65
static int FindMatches(Isolate *isolate, RegExp::CallOrigin call_origin, Tagged< TrustedByteArray > bytecode, int capture_count, Tagged< String > input, int start_index, int32_t *output_registers, int output_register_count, Zone *zone)
static constexpr bool kSupportsUnicode
static constexpr int kInternalRegExpException
Definition regexp.h:136
static constexpr int kInternalRegExpSuccess
Definition regexp.h:135
static constexpr int kInternalRegExpRetry
Definition regexp.h:137
static bool IsOneByteRepresentationUnderneath(Tagged< String > string)
Definition string-inl.h:373
Zone * zone_
XMMRegister const input_
ZoneLinkedList< RegExpLookaround * > lookarounds_
ZoneStack< uint64_t > max_clock_stack_
int * lookaround_match_index_array_begin
RecyclingZoneAllocator< int > register_array_allocator_
uint64_t memory_consumption_per_thread_
const int register_count_per_match_
int not_having_consumed_character
base::Vector< LastInputIndex > pc_last_input_index_
bool only_captureless_lookbehinds_
Tagged< TrustedByteArray > bytecode_object_
ZoneStack< int > pc_stack_
std::optional< ZoneVector< ZoneVector< bool > > > lookaround_table_
uint64_t * captures_clock_array_begin
std::optional< int > filter_groups_pc_
std::optional< RecyclingZoneAllocator< uint64_t > > capture_clock_array_allocator_
std::optional< RecyclingZoneAllocator< int > > lookaround_match_index_array_allocator_
std::optional< ZoneList< bool > > lookbehind_table_
const RegExp::CallOrigin call_origin_
uint64_t clock
int current_lookaround_
uint64_t * quantifier_clock_array_begin
ZoneList< InterpreterThread > active_threads_
int having_consumed_character
ZoneList< InterpreterThread > blocked_threads_
DisallowGarbageCollection no_gc_
uint64_t max_clock_
ConsumedCharacter consumed_since_last_quantifier
std::optional< RecyclingZoneAllocator< uint64_t > > lookaround_clock_array_allocator_
int * register_array_begin
Tagged< String > input_object_
uint64_t * lookaround_clock_array_begin
std::optional< InterpreterThread > best_match_thread_
base::Vector< const RegExpInstruction > bytecode_
std::optional< RecyclingZoneAllocator< uint64_t > > quantifier_array_allocator_
Instruction * instr
ZoneVector< RpoNumber > & result
int position
Definition liveedit.cc:290
RegListBase< RegisterT > registers
int int32_t
Definition unicode.cc:40
V8_INLINE bool IsLineTerminator(uchar c)
Definition unicode.h:267
uint16_t uc16
Definition strings.h:18
PerThreadAssertScopeDebugOnly< false, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > DisallowGarbageCollection
Tagged(T object) -> Tagged< T >
PerThreadAssertScopeDebugOnly< true, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > AllowGarbageCollection
constexpr bool IsRegExpWord(base::uc32 c)
V8_EXPORT_PRIVATE FlagValues v8_flags
base::SmallVector< RegisterT, kStaticCapacity > registers_
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK_NOT_NULL(val)
Definition logging.h:492
#define DCHECK_GE(v1, v2)
Definition logging.h:488
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
#define DCHECK_GT(v1, v2)
Definition logging.h:487
static bool IsFilter(const RegExpInstruction &instruction)
#define V8_WARN_UNUSED_RESULT
Definition v8config.h:671