v8
V8 is Google’s open source high-performance JavaScript and WebAssembly engine, written in C++.
Loading...
Searching...
No Matches
experimental-compiler.cc
Go to the documentation of this file.
1// Copyright 2020 the V8 project authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4
6
7#include "src/flags/flags.h"
12
13namespace v8 {
14namespace internal {
15
16namespace {
17
18// TODO(mbid, v8:10765): Currently the experimental engine doesn't support
19// UTF-16, but this shouldn't be too hard to implement.
20constexpr base::uc32 kMaxSupportedCodepoint = 0xFFFFu;
21#ifdef DEBUG
22constexpr base::uc32 kMaxCodePoint = 0x10ffff;
23#endif // DEBUG
24
25class CanBeHandledVisitor final : private RegExpVisitor {
26 // Visitor to implement `ExperimentalRegExp::CanBeHandled`.
27 public:
28 static bool Check(RegExpTree* tree, RegExpFlags flags, int capture_count) {
29 if (!AreSuitableFlags(flags)) return false;
30 CanBeHandledVisitor visitor{flags};
31 tree->Accept(&visitor, nullptr);
32 return visitor.result_;
33 }
34
35 private:
36 explicit CanBeHandledVisitor(RegExpFlags flags) : flags_(flags) {}
37
38 static bool AreSuitableFlags(RegExpFlags flags) {
39 // TODO(mbid, v8:10765): We should be able to support all flags in the
40 // future.
41 static constexpr RegExpFlags kAllowedFlags =
42 RegExpFlag::kGlobal | RegExpFlag::kSticky | RegExpFlag::kMultiline |
43 RegExpFlag::kDotAll | RegExpFlag::kLinear;
44 // We support Unicode iff kUnicode is among the supported flags.
46 IsUnicode(kAllowedFlags));
47 return (flags & ~kAllowedFlags) == 0;
48 }
49
50 void* VisitDisjunction(RegExpDisjunction* node, void*) override {
51 for (RegExpTree* alt : *node->alternatives()) {
52 alt->Accept(this, nullptr);
53 if (!result_) {
54 return nullptr;
55 }
56 }
57 return nullptr;
58 }
59
60 void* VisitAlternative(RegExpAlternative* node, void*) override {
61 for (RegExpTree* child : *node->nodes()) {
62 child->Accept(this, nullptr);
63 if (!result_) {
64 return nullptr;
65 }
66 }
67 return nullptr;
68 }
69
70 void* VisitClassRanges(RegExpClassRanges* node, void*) override {
71 return nullptr;
72 }
73
74 void* VisitClassSetOperand(RegExpClassSetOperand* node, void*) override {
75 result_ = !node->has_strings();
76 return nullptr;
77 }
78
79 void* VisitClassSetExpression(RegExpClassSetExpression* node,
80 void*) override {
81 result_ = false;
82 return nullptr;
83 }
84
85 void* VisitAssertion(RegExpAssertion* node, void*) override {
86 return nullptr;
87 }
88
89 void* VisitAtom(RegExpAtom* node, void*) override { return nullptr; }
90
91 void* VisitText(RegExpText* node, void*) override {
92 for (TextElement& el : *node->elements()) {
93 el.tree()->Accept(this, nullptr);
94 if (!result_) {
95 return nullptr;
96 }
97 }
98 return nullptr;
99 }
100
101 void* VisitQuantifier(RegExpQuantifier* node, void*) override {
102 // Finite but large values of `min()` and `max()` are bad for the
103 // breadth-first engine because finite (optional) repetition is dealt with
104 // by replicating the bytecode of the body of the quantifier. The number
105 // of replications grows exponentially in how deeply quantifiers are nested.
106 // `replication_factor_` keeps track of how often the current node will
107 // have to be replicated in the generated bytecode, and we don't allow this
108 // to exceed some small value.
109 static constexpr int kMaxReplicationFactor = 16;
110
111 // First we rule out values for min and max that are too big even before
112 // taking into account the ambient replication_factor_. This also guards
113 // against overflows in `local_replication` or `replication_factor_`.
114 if (node->min() > kMaxReplicationFactor ||
115 (node->max() != RegExpTree::kInfinity &&
116 node->max() > kMaxReplicationFactor)) {
117 result_ = false;
118 return nullptr;
119 }
120
121 // Save the current replication factor so that it can be restored if we
122 // return with `result_ == true`.
123 int before_replication_factor = replication_factor_;
124
125 int local_replication;
126 if (node->max() == RegExpTree::kInfinity) {
127 if (node->min() > 0 && node->min_match() > 0) {
128 // Quantifier can be reduced to a non nullable plus.
129 local_replication = std::max(node->min(), 1);
130 } else {
131 local_replication = node->min() + 1;
132 }
133 } else {
134 local_replication = node->max();
135 }
136
137 replication_factor_ *= local_replication;
138 if (replication_factor_ > kMaxReplicationFactor) {
139 result_ = false;
140 return nullptr;
141 }
142
143 switch (node->quantifier_type()) {
146 break;
148 // TODO(mbid, v8:10765): It's not clear to me whether this can be
149 // supported in breadth-first mode. Re2 doesn't support it.
150 result_ = false;
151 return nullptr;
152 }
153
154 node->body()->Accept(this, nullptr);
155 replication_factor_ = before_replication_factor;
156 return nullptr;
157 }
158
159 void* VisitCapture(RegExpCapture* node, void*) override {
160 node->body()->Accept(this, nullptr);
161 return nullptr;
162 }
163
164 void* VisitGroup(RegExpGroup* node, void*) override {
165 if (flags() != node->flags()) {
166 // Flags that aren't supported by the experimental engine at all, are not
167 // supported via modifiers either.
168 // TODO(pthier): Currently the only flag supported in modifiers and in
169 // the experimental engine is multi-line, which is already handled in the
170 // parser. If more flags are supported either by the experimental engine
171 // or in modifiers we need to add general support for modifiers to the
172 // experimental engine.
173 if (!AreSuitableFlags(node->flags())) {
174 result_ = false;
175 return nullptr;
176 }
177 }
178 node->body()->Accept(this, nullptr);
179 return nullptr;
180 }
181
182 void* VisitLookaround(RegExpLookaround* node, void*) override {
183 if (IsGlobal(flags()) || IsSticky(flags())) {
184 result_ = false;
185 return nullptr;
186 }
187
188 // If `experimental_regexp_engine_capture_group_opt` is false, reject all
189 // lookaheads or capturing lookbehinds.
190 if (!v8_flags.experimental_regexp_engine_capture_group_opt &&
191 (node->type() == RegExpLookaround::LOOKAHEAD ||
192 node->capture_count() > 0)) {
193 result_ = false;
194 return nullptr;
195 }
196
197 node->body()->Accept(this, nullptr);
198 return nullptr;
199 }
200
201 void* VisitBackReference(RegExpBackReference* node, void*) override {
202 // This can't be implemented without backtracking.
203 result_ = false;
204 return nullptr;
205 }
206
207 void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
208
209 private:
210 RegExpFlags flags() const { return flags_; }
211
212 // See comment in `VisitQuantifier`:
214
215 bool result_ = true;
217};
218
219} // namespace
220
222 RegExpFlags flags,
223 int capture_count) {
224 return CanBeHandledVisitor::Check(tree, flags, capture_count);
225}
226
227namespace {
228
229// A label in bytecode which starts with no known address. The address *must*
230// be bound with `Bind` before the label goes out of scope.
231// Implemented as a linked list through the `payload.pc` of FORK and JMP
232// instructions.
233struct Label {
234 public:
235 Label() = default;
236 ~Label() {
237 DCHECK_EQ(state_, BOUND);
239 }
240
241 // Don't copy, don't move. Moving could be implemented, but it's not
242 // needed anywhere.
243 Label(const Label&) = delete;
244 Label& operator=(const Label&) = delete;
245
246 private:
247 friend class BytecodeAssembler;
248
249 // UNBOUND implies unbound_patch_list_begin_.
250 // BOUND implies bound_index_.
251 enum { UNBOUND, BOUND } state_ = UNBOUND;
252 union {
255 };
256};
257
258class BytecodeAssembler {
259 public:
260 // TODO(mbid,v8:10765): Use some upper bound for code_ capacity computed from
261 // the `tree` size we're going to compile?
262 explicit BytecodeAssembler(Zone* zone) : zone_(zone), code_(0, zone) {}
263
264 ZoneList<RegExpInstruction> IntoCode() && { return std::move(code_); }
265
266 void Accept() { code_.Add(RegExpInstruction::Accept(), zone_); }
267
268 void Assertion(RegExpAssertion::Type t) {
269 code_.Add(RegExpInstruction::Assertion(t), zone_);
270 }
271
272 void ClearRegister(int32_t register_index) {
273 code_.Add(RegExpInstruction::ClearRegister(register_index), zone_);
274 }
275
276 void ConsumeRange(base::uc16 from, base::uc16 to) {
277 code_.Add(RegExpInstruction::ConsumeRange(from, to), zone_);
278 }
279
280 void ConsumeAnyChar() {
282 }
283
284 void RangeCount(int32_t num_ranges) {
285 code_.Add(RegExpInstruction::RangeCount(num_ranges), zone_);
286 }
287
288 void Fork(Label& target) {
289 LabelledInstrImpl(RegExpInstruction::Opcode::FORK, target);
290 }
291
292 void Jmp(Label& target) {
293 LabelledInstrImpl(RegExpInstruction::Opcode::JMP, target);
294 }
295
296 void SetRegisterToCp(int32_t register_index) {
297 code_.Add(RegExpInstruction::SetRegisterToCp(register_index), zone_);
298 }
299
300 void BeginLoop() { code_.Add(RegExpInstruction::BeginLoop(), zone_); }
301
302 void EndLoop() { code_.Add(RegExpInstruction::EndLoop(), zone_); }
303
304 void StartLookaround(int lookaround_index, bool is_positive,
306 code_.Add(
307 RegExpInstruction::StartLookaround(lookaround_index, is_positive, type),
308 zone_);
309 }
310
311 void EndLookaround() { code_.Add(RegExpInstruction::EndLookaround(), zone_); }
312
313 void WriteLookaroundTable(int index) {
314 code_.Add(RegExpInstruction::WriteLookTable(index), zone_);
315 }
316
317 void ReadLookaroundTable(int index, bool is_positive,
319 code_.Add(RegExpInstruction::ReadLookTable(index, is_positive, type),
320 zone_);
321 }
322
323 void SetQuantifierToClock(int32_t quantifier_id) {
324 code_.Add(RegExpInstruction::SetQuantifierToClock(quantifier_id), zone_);
325 }
326
327 void FilterQuantifier(int32_t quantifier_id) {
328 code_.Add(RegExpInstruction::FilterQuantifier(quantifier_id), zone_);
329 }
330
331 void FilterGroup(int32_t group_id) {
332 code_.Add(RegExpInstruction::FilterGroup(group_id), zone_);
333 }
334
335 void FilterLookaround(int32_t lookaround_id) {
336 code_.Add(RegExpInstruction::FilterLookaround(lookaround_id), zone_);
337 }
338
339 void FilterChild(Label& target) {
340 LabelledInstrImpl(RegExpInstruction::Opcode::FILTER_CHILD, target);
341 }
342
343 void Bind(Label& target) {
344 DCHECK_EQ(target.state_, Label::UNBOUND);
345
346 int index = code_.length();
347
348 while (target.unbound_patch_list_begin_ != -1) {
349 RegExpInstruction& inst = code_[target.unbound_patch_list_begin_];
350 DCHECK(inst.opcode == RegExpInstruction::FORK ||
351 inst.opcode == RegExpInstruction::JMP ||
352 inst.opcode == RegExpInstruction::FILTER_CHILD);
353
354 target.unbound_patch_list_begin_ = inst.payload.pc;
355 inst.payload.pc = index;
356 }
357
358 target.state_ = Label::BOUND;
359 target.bound_index_ = index;
360 }
361
362 void Fail() { code_.Add(RegExpInstruction::Fail(), zone_); }
363
364 private:
365 void LabelledInstrImpl(RegExpInstruction::Opcode op, Label& target) {
366 RegExpInstruction result;
367 result.opcode = op;
368
369 if (target.state_ == Label::BOUND) {
370 result.payload.pc = target.bound_index_;
371 } else {
372 DCHECK_EQ(target.state_, Label::UNBOUND);
373 int new_list_begin = code_.length();
374 DCHECK_GE(new_list_begin, 0);
375
376 result.payload.pc = target.unbound_patch_list_begin_;
377
378 target.unbound_patch_list_begin_ = new_list_begin;
379 }
380
381 code_.Add(result, zone_);
382 }
383
385 ZoneList<RegExpInstruction> code_;
386};
387
388class FilterGroupsCompileVisitor final : private RegExpVisitor {
389 public:
390 static void CompileFilter(Zone* zone, RegExpTree* tree,
391 BytecodeAssembler& assembler,
392 const ZoneMap<int, int>& quantifier_id_remapping,
393 const ZoneMap<int, int>& lookaround_id_remapping) {
394 /* To filter out groups that were not matched in the last iteration of a
395 * quantifier, the regexp's AST is compiled using a special sets of
396 * instructions: `FILTER_GROUP`, `FILTER_QUANTIFIER` and `FILTER_CHILD`.
397 * They encode a simplified AST containing only the groups and quantifiers.
398 * Each node is represented as either a `FILTER_GROUP` or a
399 * `FILTER_QUANTIFIER` instruction, containing the index of the respective
400 * group or quantifier, followed by a variable number of `FILTER_CHILD`
401 * instructions each containing the index of their respective node in the
402 * bytecode.
403 *
404 * The regexp's AST is traversed in breadth-first mode, compiling one node
405 * at a time, while saving its children in a queue. */
406
407 FilterGroupsCompileVisitor visitor(assembler, zone, quantifier_id_remapping,
408 lookaround_id_remapping);
409
410 tree->Accept(&visitor, nullptr);
411
412 while (!visitor.nodes_.empty()) {
413 auto& entry = visitor.nodes_.front();
414
415 visitor.assembler_.Bind(entry.label);
416 visitor.can_compile_node_ = true;
417 entry.node->Accept(&visitor, nullptr);
418
419 visitor.nodes_.pop_front();
420 }
421 }
422
423 private:
424 FilterGroupsCompileVisitor(BytecodeAssembler& assembler, Zone* zone,
425 const ZoneMap<int, int>& quantifier_id_remapping,
426 const ZoneMap<int, int>& lookaround_id_remapping)
427 : zone_(zone),
428 assembler_(assembler),
429 nodes_(zone_),
430 can_compile_node_(false),
431 quantifier_id_remapping_(quantifier_id_remapping),
432 lookaround_id_remapping_(lookaround_id_remapping) {}
433
434 void* VisitDisjunction(RegExpDisjunction* node, void*) override {
435 for (RegExpTree* alt : *node->alternatives()) {
436 alt->Accept(this, nullptr);
437 }
438 return nullptr;
439 }
440
441 void* VisitAlternative(RegExpAlternative* node, void*) override {
442 for (RegExpTree* alt : *node->nodes()) {
443 alt->Accept(this, nullptr);
444 }
445 return nullptr;
446 }
447
448 void* VisitClassRanges(RegExpClassRanges* node, void*) override {
449 return nullptr;
450 }
451
452 void* VisitClassSetOperand(RegExpClassSetOperand* node, void*) override {
453 return nullptr;
454 }
455
456 void* VisitClassSetExpression(RegExpClassSetExpression* node,
457 void*) override {
458 return nullptr;
459 }
460
461 void* VisitAssertion(RegExpAssertion* node, void*) override {
462 return nullptr;
463 }
464
465 void* VisitAtom(RegExpAtom* node, void*) override { return nullptr; }
466
467 void* VisitText(RegExpText* node, void*) override { return nullptr; }
468
469 void* VisitQuantifier(RegExpQuantifier* node, void*) override {
470 if (can_compile_node_) {
471 assembler_.FilterQuantifier(quantifier_id_remapping_.at(node->index()));
472 can_compile_node_ = false;
473 node->body()->Accept(this, nullptr);
474 } else {
475 if (node->CaptureRegisters().is_empty()) {
476 return nullptr;
477 }
478
479 nodes_.emplace_back(node);
480 assembler_.FilterChild(nodes_.back().label);
481 }
482
483 return nullptr;
484 }
485
486 void* VisitCapture(RegExpCapture* node, void*) override {
487 if (can_compile_node_) {
488 assembler_.FilterGroup(node->index());
489 can_compile_node_ = false;
490 node->body()->Accept(this, nullptr);
491 } else {
492 nodes_.emplace_back(node);
493 assembler_.FilterChild(nodes_.back().label);
494 }
495
496 return nullptr;
497 }
498
499 void* VisitGroup(RegExpGroup* node, void*) override {
500 node->body()->Accept(this, nullptr);
501 return nullptr;
502 }
503
504 void* VisitLookaround(RegExpLookaround* node, void*) override {
505 if (can_compile_node_) {
506 assembler_.FilterLookaround(lookaround_id_remapping_.at(node->index()));
507 can_compile_node_ = false;
508 node->body()->Accept(this, nullptr);
509 } else {
510 if (node->CaptureRegisters().is_empty()) {
511 return nullptr;
512 }
513
514 nodes_.emplace_back(node);
515 assembler_.FilterChild(nodes_.back().label);
516 }
517
518 return nullptr;
519 }
520
521 void* VisitBackReference(RegExpBackReference* node, void*) override {
522 return nullptr;
523 }
524
525 void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
526
527 private:
528 class BFEntry {
529 public:
530 explicit BFEntry(RegExpTree* node) : label(), node(node) {}
531
532 Label label;
533 RegExpTree* node;
534 };
535
536 Zone* zone_;
537
538 BytecodeAssembler& assembler_;
539 ZoneLinkedList<BFEntry> nodes_;
540
541 // Whether we can compile a node or we should add it to `nodes_`. This is
542 // set to true after popping an element from the queue, and false after
543 // having compiled one.
545
546 const ZoneMap<int, int>& quantifier_id_remapping_;
547
548 // The lookarounds ids are remapped to be contiguous. This may not already be
549 // the case, since parts of the AST may be optimized out during parsing.
550 const ZoneMap<int, int>& lookaround_id_remapping_;
551};
552
553class CompileVisitor : private RegExpVisitor {
554 public:
555 static ZoneList<RegExpInstruction> Compile(RegExpTree* tree,
556 RegExpFlags flags, Zone* zone) {
557 CompileVisitor compiler(zone);
558
559 if (!IsSticky(flags) && !tree->IsAnchoredAtStart()) {
560 // The match is not anchored, i.e. may start at any input position, so we
561 // emit a preamble corresponding to /.*?/. This skips an arbitrary
562 // prefix in the input non-greedily.
563 compiler.CompileNonGreedyStar(
564 [&]() { compiler.assembler_.ConsumeAnyChar(); });
565 }
566
567 compiler.assembler_.SetRegisterToCp(0);
568 tree->Accept(&compiler, nullptr);
569 compiler.assembler_.SetRegisterToCp(1);
570 compiler.assembler_.Accept();
571
572 // Each lookaround is compiled as two different bytecode sections: a
573 // reverse, captureless automaton and a capturing one. The indexes are
574 // remapped to avoid holes in the lookaround table, which would occur when
575 // lookarounds are optimized out of the AST. The resulting indexes have the
576 // following order: lookarounds can only depend on ones with a higher index.
577 //
578 // During the compilation of the main expression, lookarounds are not
579 // immediately compiled, and are instead pushed at the back of a queue.
580 // After compiling the filtering instructions, the queue is emptied and
581 // each lookaround body is compiled into two sections:
582 //
583 // 1. A non-capturing automaton, which is reversed if it is a lookahead.
584 // 2. A capturing automaton, reversed if it is a lookbehind. Since it is
585 // only used for capturing, it can be empty if the lookaround does
586 // not contain any captures.
587 //
588 // See the comment on `NfaInterpreter` in experimental-interpreter.cc for
589 // detailed explanation on the reversal's reasons. The sections are
590 // delimited by `START_LOOKAROUND` and `END_LOOKAROUND`, and the separation
591 // occurs right after `WRITE_LOOKAROUND_TABLE`.
592 //
593 // If another lookaround is encountered during this phase, it is added
594 // at the back of the queue.
595 //
596 // This approach prevents the use of the sticky or global flags. In both
597 // cases, when resuming the search, it starts at a non null index, while the
598 // lookbehinds always need to start at the beginning of the string. A future
599 // implementation for the global flag may store the active lookbehind
600 // threads in the regexp to resume the execution of the lookbehinds
601 // automata.
602 while (!compiler.lookarounds_.empty()) {
603 auto node = compiler.lookarounds_.front();
604 compiler.CompileLookaround(node);
605 compiler.lookarounds_.pop_front();
606 }
607
608 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
609 FilterGroupsCompileVisitor::CompileFilter(
610 zone, tree, compiler.assembler_,
611 compiler.quantifier_id_remapping_.value(),
612 compiler.lookaround_id_remapping_);
613 }
614
615 return std::move(compiler.assembler_).IntoCode();
616 }
617
618 private:
619 explicit CompileVisitor(Zone* zone)
620 : zone_(zone),
621 lookarounds_(zone),
624 assembler_(zone),
625 reverse_(false),
626 ignore_captures_(false),
627 ignore_lookarounds_(false) {
628 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
629 quantifier_id_remapping_.emplace(zone_);
630 }
631 }
632
633 // Generate all the instructions to match and capture a lookaround.
634 void CompileLookaround(RegExpLookaround* lookaround) {
635 // Generate the first section, reversed in the case of a lookahead.
636 assembler_.StartLookaround(RemapLookaround(lookaround->index()),
637 lookaround->is_positive(), lookaround->type());
638
639 // If the lookaround is not anchored, we add a /.*?/ at its start, such
640 // that the resulting automaton will run over the whole input.
641 if ((lookaround->type() == RegExpLookaround::LOOKAHEAD &&
642 !lookaround->body()->IsAnchoredAtEnd()) ||
643 (lookaround->type() == RegExpLookaround::LOOKBEHIND &&
644 !lookaround->body()->IsAnchoredAtStart())) {
645 CompileNonGreedyStar([&]() { assembler_.ConsumeAnyChar(); });
646 }
647
648 reverse_ = lookaround->type() == RegExpLookaround::LOOKAHEAD;
649
650 ignore_captures_ = true;
651 lookaround->body()->Accept(this, nullptr);
652 ignore_captures_ = false;
653
654 assembler_.WriteLookaroundTable(RemapLookaround(lookaround->index()));
655
656 // Generate the second sections, reversed in the case of a lookbehind.
657 if (lookaround->capture_count() > 0 && lookaround->is_positive()) {
658 reverse_ = lookaround->type() == RegExpLookaround::LOOKBEHIND;
659
660 ignore_lookarounds_ = true;
661 lookaround->body()->Accept(this, nullptr);
662 ignore_lookarounds_ = false;
663 }
664
665 assembler_.EndLookaround();
666 }
667
668 // Generate a disjunction of code fragments compiled by a function
669 // `alt_gen`. `alt_gen` is called repeatedly with argument `int i = 0, 1,
670 // ..., alt_num - 1` and should build code corresponding to the ith
671 // alternative.
672 template <class F>
673 void CompileDisjunction(int alt_num, F&& gen_alt) {
674 // An alternative a1 | ... | an is compiled into
675 //
676 // FORK tail1
677 // <a1>
678 // JMP end
679 // tail1:
680 // FORK tail2
681 // <a2>
682 // JMP end
683 // tail2:
684 // ...
685 // ...
686 // tail{n -1}:
687 // <an>
688 // end:
689 //
690 // By the semantics of the FORK instruction (see above at definition and
691 // semantics), a forked thread has lower priority than the thread that
692 // spawned it. This means that with the code we're generating here, the
693 // thread matching the alternative a1 has indeed highest priority, followed
694 // by the thread for a2 and so on.
695
696 if (alt_num == 0) {
697 // The empty disjunction. This can never match.
698 assembler_.Fail();
699 return;
700 }
701
702 Label end;
703
704 for (int i = 0; i != alt_num - 1; ++i) {
705 Label tail;
706 assembler_.Fork(tail);
707 gen_alt(i);
708 assembler_.Jmp(end);
709 assembler_.Bind(tail);
710 }
711
712 gen_alt(alt_num - 1);
713
714 assembler_.Bind(end);
715 }
716
717 void* VisitDisjunction(RegExpDisjunction* node, void*) override {
718 ZoneList<RegExpTree*>& alts = *node->alternatives();
719 CompileDisjunction(alts.length(),
720 [&](int i) { alts[i]->Accept(this, nullptr); });
721 return nullptr;
722 }
723
724 void* VisitAlternative(RegExpAlternative* node, void*) override {
725 if (reverse_) {
726 ZoneList<RegExpTree*>* children = node->nodes();
727 for (int i = children->length() - 1; i >= 0; --i) {
728 children->at(i)->Accept(this, nullptr);
729 }
730 } else {
731 for (RegExpTree* child : *node->nodes()) {
732 child->Accept(this, nullptr);
733 }
734 }
735 return nullptr;
736 }
737
738 void* VisitAssertion(RegExpAssertion* node, void*) override {
739 assembler_.Assertion(node->assertion_type());
740 return nullptr;
741 }
742
743 void CompileCharacterRanges(ZoneList<CharacterRange>* ranges, bool negated) {
744 // A character class is compiled as disjoint character ranges.
745 // A single range is represented by a single `CONSUME_RANGE` instruction.
746 // Disjoint ranges are represented by a `RANGE_COUNT` instruction followed
747 // by `CONSUME_RANGE` instructions.
749 if (negated) {
750 // The complement of a disjoint, non-adjacent (i.e. `Canonicalize`d)
751 // union of k intervals is a union of at most k + 1 intervals.
752 ZoneList<CharacterRange>* negated_ranges =
753 zone_->New<ZoneList<CharacterRange>>(ranges->length() + 1, zone_);
754 CharacterRange::Negate(ranges, negated_ranges, zone_);
755 DCHECK_LE(negated_ranges->length(), ranges->length() + 1);
756 ranges = negated_ranges;
757 }
758
759 if (ranges->length() == 0) {
760 assembler_.Fail();
761 return;
762 }
763
764 if (ranges->length() > 1) {
765 // In the case of a multiple ranges, we need a RANGE_COUNT instruction.
766 assembler_.RangeCount(ranges->length());
767 }
768
769 for (int i = 0; i < ranges->length(); ++i) {
770 // We don't support utf16 for now, so only ranges that can be specified
771 // by (complements of) ranges with base::uc16 bounds.
772 static_assert(kMaxSupportedCodepoint <=
773 std::numeric_limits<base::uc16>::max());
774
775 base::uc32 from = (*ranges)[i].from();
776 DCHECK_LE(from, kMaxSupportedCodepoint);
777 base::uc16 from_uc16 = static_cast<base::uc16>(from);
778
779 base::uc32 to = (*ranges)[i].to();
780 DCHECK_IMPLIES(to > kMaxSupportedCodepoint, to == kMaxCodePoint);
781 base::uc16 to_uc16 =
782 static_cast<base::uc16>(std::min(to, kMaxSupportedCodepoint));
783
784 assembler_.ConsumeRange(from_uc16, to_uc16);
785 }
786 }
787
788 void* VisitClassRanges(RegExpClassRanges* node, void*) override {
789 CompileCharacterRanges(node->ranges(zone_), node->is_negated());
790 return nullptr;
791 }
792
793 void* VisitClassSetOperand(RegExpClassSetOperand* node, void*) override {
794 // TODO(v8:11935): Support strings.
795 DCHECK(!node->has_strings());
796 CompileCharacterRanges(node->ranges(), false);
797 return nullptr;
798 }
799
800 void* VisitClassSetExpression(RegExpClassSetExpression* node,
801 void*) override {
802 // TODO(v8:11935): Add support.
803 UNREACHABLE();
804 }
805
806 void* VisitAtom(RegExpAtom* node, void*) override {
807 if (reverse_) {
808 base::Vector<const base::uc16> data = node->data();
809 for (int i = data.length() - 1; i >= 0; --i) {
810 assembler_.ConsumeRange(data.at(i), data.at(i));
811 }
812 } else {
813 for (base::uc16 c : node->data()) {
814 assembler_.ConsumeRange(c, c);
815 }
816 }
817 return nullptr;
818 }
819
820 void ClearRegisters(Interval indices) {
821 if (indices.is_empty()) return;
822 DCHECK_EQ(indices.from() % 2, 0);
823 DCHECK_EQ(indices.to() % 2, 1);
824 for (int i = indices.from(); i <= indices.to(); i += 2) {
825 // It suffices to clear the register containing the `begin` of a capture
826 // because this indicates that the capture is undefined, regardless of
827 // the value in the `end` register.
828 assembler_.ClearRegister(i);
829 }
830 }
831
832 // Emit bytecode corresponding to /<emit_body>*/.
833 template <class F>
834 void CompileGreedyStar(F&& emit_body) {
835 // This is compiled into
836 //
837 // begin:
838 // FORK end
839 // BEGIN_LOOP
840 // <body>
841 // END_LOOP
842 // JMP begin
843 // end:
844 // ...
845 //
846 // This is greedy because a forked thread has lower priority than the
847 // thread that spawned it.
848 Label begin;
849 Label end;
850
851 assembler_.Bind(begin);
852 assembler_.Fork(end);
853 assembler_.BeginLoop();
854 emit_body();
855 assembler_.EndLoop();
856 assembler_.Jmp(begin);
857
858 assembler_.Bind(end);
859 }
860
861 // Emit bytecode corresponding to /<emit_body>*?/.
862 template <class F>
863 void CompileNonGreedyStar(F&& emit_body) {
864 // This is compiled into
865 //
866 // FORK body
867 // JMP end
868 // body:
869 // BEGIN_LOOP
870 // <body>
871 // END_LOOP
872 // FORK body
873 // end:
874 // ...
875
876 Label body;
877 Label end;
878
879 assembler_.Fork(body);
880 assembler_.Jmp(end);
881
882 assembler_.Bind(body);
883 assembler_.BeginLoop();
884 emit_body();
885 assembler_.EndLoop();
886 assembler_.Fork(body);
887
888 assembler_.Bind(end);
889 }
890
891 // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}/.
892 template <class F>
893 void CompileGreedyRepetition(F&& emit_body, int max_repetition_num) {
894 // This is compiled into
895 //
896 // FORK end
897 // BEGIN_LOOP
898 // <body>
899 // END_LOOP
900 // FORK end
901 // BEGIN_LOOP
902 // <body>
903 // END_LOOP
904 // ...
905 // ...
906 // FORK end
907 // <body>
908 // end:
909 // ...
910 //
911 // We add `BEGIN_LOOP` and `END_LOOP` instructions because these optional
912 // repetitions of the body cannot match the empty string.
913
914 Label end;
915 for (int i = 0; i != max_repetition_num; ++i) {
916 assembler_.Fork(end);
917 assembler_.BeginLoop();
918 emit_body();
919 assembler_.EndLoop();
920 }
921 assembler_.Bind(end);
922 }
923
924 // Emit bytecode corresponding to /<emit_body>{0, max_repetition_num}?/.
925 template <class F>
926 void CompileNonGreedyRepetition(F&& emit_body, int max_repetition_num) {
927 // This is compiled into
928 //
929 // FORK body0
930 // JMP end
931 // body0:
932 // BEGIN_LOOP
933 // <body>
934 // END_LOOP
935 //
936 // FORK body1
937 // JMP end
938 // body1:
939 // BEGIN_LOOP
940 // <body>
941 // END_LOOP
942 // ...
943 // ...
944 // body{max_repetition_num - 1}:
945 // BEGIN_LOOP
946 // <body>
947 // END_LOOP
948 // end:
949 // ...
950 //
951 // We add `BEGIN_LOOP` and `END_LOOP` instructions because these optional
952 // repetitions of the body cannot match the empty string.
953
954 Label end;
955 for (int i = 0; i != max_repetition_num; ++i) {
956 Label body;
957 assembler_.Fork(body);
958 assembler_.Jmp(end);
959
960 assembler_.Bind(body);
961 assembler_.BeginLoop();
962 emit_body();
963 assembler_.EndLoop();
964 }
965 assembler_.Bind(end);
966 }
967
  // In the general case, the first repetition of <body>+ is different
  // from the following ones as it is allowed to match the empty string. This
  // is compiled by repeating <body>, but it can result in a bytecode that
  // grows quadratically with the size of the regex when nesting pluses or
  // repetition upper-bounded with infinity.
  //
  // In the particular case where <body> cannot match the empty string, the
  // plus can be compiled without duplicating the bytecode of <body>,
  // resulting in a bytecode linear in the size of the regex in case of
  // nested non-nullable pluses.
  //
  // E.g. `/.+/` will compile `/./` once, while `/(?:.?)+/` will be compiled
  // as `/(?:.?)(?:.?)*/`, resulting in two repetitions of the body.
981
982 // Emit bytecode corresponding to /<emit_body>+/, with <emit_body> not
983 // nullable.
984 template <class F>
985 void CompileNonNullableGreedyPlus(F&& emit_body) {
986 // This is compiled into
987 //
988 // begin:
989 // <body>
990 //
991 // FORK end
992 // JMP begin
993 // end:
994 // ...
995 Label begin, end;
996
997 assembler_.Bind(begin);
998 emit_body();
999
1000 assembler_.Fork(end);
1001 assembler_.Jmp(begin);
1002 assembler_.Bind(end);
1003 }
1004
1005 // Emit bytecode corresponding to /<emit_body>+?/, with <emit_body> not
1006 // nullable.
1007 template <class F>
1008 void CompileNonNullableNonGreedyPlus(F&& emit_body) {
1009 // This is compiled into
1010 //
1011 // begin:
1012 // <body>
1013 //
1014 // FORK begin
1015 // ...
1016 Label begin;
1017
1018 assembler_.Bind(begin);
1019 emit_body();
1020
1021 assembler_.Fork(begin);
1022 }
1023
1024 void* VisitQuantifier(RegExpQuantifier* node, void*) override {
1025 // If the quantifier must match nothing, we do not produce its body, but
1026 // still need the `SET_QUANTIFIER_TO_CLOCK` for the Nfa to be able to
1027 // correctly determine the number of quantifiers.
1028 if (v8_flags.experimental_regexp_engine_capture_group_opt &&
1029 node->max() == 0) {
1030 if (!node->CaptureRegisters().is_empty()) {
1031 assembler_.SetQuantifierToClock(RemapQuantifier(node->index()));
1032 }
1033
1034 return nullptr;
1035 }
1036
1037 // Emit the body, but clear registers occurring in body first.
1038 //
1039 // TODO(mbid,v8:10765): It's not always necessary to a) capture registers
1040 // and b) clear them. For example, we don't have to capture anything for
1041 // the first 4 repetitions if node->min() >= 5, and then we don't have to
1042 // clear registers in the first node->min() repetitions.
1043 // Later, and if node->min() == 0, we don't have to clear registers before
1044 // the first optional repetition.
1045 Interval body_registers = node->body()->CaptureRegisters();
1046 auto emit_body = [&]() {
1047 if (v8_flags.experimental_regexp_engine_capture_group_opt) {
1048 assembler_.SetQuantifierToClock(RemapQuantifier(node->index()));
1049 } else {
1050 ClearRegisters(body_registers);
1051 }
1052
1053 node->body()->Accept(this, nullptr);
1054 };
1055
1056 bool can_be_reduced_to_non_nullable_plus =
1057 node->min() > 0 && node->max() == RegExpTree::kInfinity &&
1058 node->min_match() > 0;
1059
1060 if (can_be_reduced_to_non_nullable_plus) {
1061 // Compile <body>+ with an optimization allowing linear sized bytecode in
1062 // the case of nested pluses. Repetitions with infinite upperbound like
1063 // <body>{n,}, with n != 0, are compiled into <body>{n-1}<body+>, avoiding
1064 // one repetition, compared to <body>{n}<body>*.
1065
1066 // Compile the mandatory repetitions. We repeat `min() - 1` times, such
1067 // that the last repetition, compiled later, can be reused in a loop.
1068 for (int i = 0; i < node->min() - 1; ++i) {
1069 emit_body();
1070 }
1071
1072 // Compile the optional repetitions, using an optimized plus when
1073 // possible.
1074 switch (node->quantifier_type()) {
1076 UNREACHABLE();
1078 // Compile both last mandatory repetition and optional ones.
1079 CompileNonNullableGreedyPlus(emit_body);
1080 break;
1081 }
1083 // Compile both last mandatory repetition and optional ones.
1084 CompileNonNullableNonGreedyPlus(emit_body);
1085 break;
1086 }
1087 }
1088 } else {
1089 // Compile <body>+ into <body><body>*, and <body>{n,}, with n != 0, into
1090 // <body>{n}<body>*.
1091
1092 // Compile the first `min()` repetitions.
1093 for (int i = 0; i < node->min(); ++i) {
1094 emit_body();
1095 }
1096
1097 // Compile the optional repetitions, using stars or repetitions.
1098 switch (node->quantifier_type()) {
1100 UNREACHABLE();
1102 if (node->max() == RegExpTree::kInfinity) {
1103 CompileGreedyStar(emit_body);
1104 } else {
1105 DCHECK_NE(node->max(), RegExpTree::kInfinity);
1106 CompileGreedyRepetition(emit_body, node->max() - node->min());
1107 }
1108 break;
1109 }
1111 if (node->max() == RegExpTree::kInfinity) {
1112 CompileNonGreedyStar(emit_body);
1113 } else {
1114 DCHECK_NE(node->max(), RegExpTree::kInfinity);
1115 CompileNonGreedyRepetition(emit_body, node->max() - node->min());
1116 }
1117 break;
1118 }
1119 }
1120 }
1121
1122 return nullptr;
1123 }
1124
1125 void* VisitCapture(RegExpCapture* node, void*) override {
1126 if (ignore_captures_) {
1127 // Skips the `SET_REGISTER_TO_CP` instructions.
1128 node->body()->Accept(this, nullptr);
1129 } else {
1130 int index = node->index();
1131 int start_register = RegExpCapture::StartRegister(index);
1132 int end_register = RegExpCapture::EndRegister(index);
1133
1134 assembler_.SetRegisterToCp(reverse_ ? end_register : start_register);
1135 node->body()->Accept(this, nullptr);
1136 assembler_.SetRegisterToCp(reverse_ ? start_register : end_register);
1137 }
1138
1139 return nullptr;
1140 }
1141
1142 void* VisitGroup(RegExpGroup* node, void*) override {
1143 node->body()->Accept(this, nullptr);
1144 return nullptr;
1145 }
1146
1147 void* VisitLookaround(RegExpLookaround* node, void*) override {
1148 assembler_.ReadLookaroundTable(RemapLookaround(node->index()),
1149 node->is_positive(), node->type());
1150
1151 // Add the lookbehind to the queue of lookbehinds to be compiled.
1152 if (!ignore_lookarounds_) {
1153 lookarounds_.push_back(node);
1154 }
1155
1156 return nullptr;
1157 }
1158
  void* VisitBackReference(RegExpBackReference* node, void*) override {
    // Back references are presumably rejected up front by
    // `ExperimentalRegExp::CanBeHandled`, so this visitor should never
    // encounter one — TODO confirm against the CanBeHandledVisitor.
    UNREACHABLE();
  }
1162
1163 void* VisitEmpty(RegExpEmpty* node, void*) override { return nullptr; }
1164
1165 void* VisitText(RegExpText* node, void*) override {
1166 if (reverse_) {
1167 ZoneList<TextElement>* elements = node->elements();
1168 for (int i = elements->length() - 1; i >= 0; --i) {
1169 elements->at(i).tree()->Accept(this, nullptr);
1170 }
1171 } else {
1172 for (TextElement& text_el : *node->elements()) {
1173 text_el.tree()->Accept(this, nullptr);
1174 }
1175 }
1176 return nullptr;
1177 }
1178
1179 int RemapQuantifier(int id) {
1180 DCHECK(v8_flags.experimental_regexp_engine_capture_group_opt);
1181 DCHECK(quantifier_id_remapping_.has_value());
1182 auto& map = quantifier_id_remapping_.value();
1183
1184 if (!map.contains(id)) {
1185 map[id] = static_cast<int>(map.size());
1186 }
1187
1188 return map[id];
1189 }
1190
1191 int RemapLookaround(int id) {
1192 if (!lookaround_id_remapping_.contains(id)) {
1194 static_cast<int>(lookaround_id_remapping_.size());
1195 }
1196
1197 return lookaround_id_remapping_[id];
1198 }
1199
1200 private:
1201 Zone* zone_;
1202
1203 // Stores the AST of the lookbehinds encountered in a queue. They are compiled
1204 // after the main expression, in breadth-first order.
1205 ZoneLinkedList<RegExpLookaround*> lookarounds_;
1206
1207 std::optional<ZoneMap<int, int>> quantifier_id_remapping_;
1208 ZoneMap<int, int> lookaround_id_remapping_;
1209
1210 BytecodeAssembler assembler_;
1212
1213 // Do not produce `SET_REGISTER_TO_CP` instructions. Used when compiling
1214 // the lookarounds' matching automata, since the capture results will be never
1215 // be used.
1217
1218 // Indicates whether the encountered lookarounds should be pushed into the
1219 // queue. Set when compiling the lookarounds' capturing automata, as the
1220 // children lookarounds where already pushed while compiling the matching
1221 // automata.
1223};
1224
1225} // namespace
1226
1228 RegExpTree* tree, RegExpFlags flags, Zone* zone) {
1229 return CompileVisitor::Compile(tree, flags, zone);
1230}
1231
1232} // namespace internal
1233} // namespace v8
friend Zone
Definition asm-types.cc:195
static void Canonicalize(ZoneList< CharacterRange > *ranges)
static void Negate(const ZoneList< CharacterRange > *src, ZoneList< CharacterRange > *dst, Zone *zone)
static ZoneList< RegExpInstruction > Compile(RegExpTree *tree, RegExpFlags flags, Zone *zone)
static bool CanBeHandled(RegExpTree *tree, RegExpFlags flags, int capture_count)
static constexpr bool kSupportsUnicode
Label & operator=(const Label &)=delete
static int StartRegister(int index)
Definition regexp-ast.h:621
static int EndRegister(int index)
Definition regexp-ast.h:622
static const int kInfinity
Definition regexp-ast.h:196
Zone * zone_
T const result_
JSRegExp::Flags flags_
int end
enum v8::internal::@1270::DeoptimizableCodeIterator::@67 state_
bool ignore_lookarounds_
Label label
bool ignore_captures_
bool can_compile_node_
int unbound_patch_list_begin_
ZoneLinkedList< RegExpLookaround * > lookarounds_
const ZoneMap< int, int > & lookaround_id_remapping_
BytecodeAssembler & assembler_
const ZoneMap< int, int > & quantifier_id_remapping_
int bound_index_
int replication_factor_
ZoneList< RegExpInstruction > code_
ZoneLinkedList< BFEntry > nodes_
std::vector< std::unique_ptr< InstanceTypeTree > > children
ZoneVector< RpoNumber > & result
Point from
uint32_t uc32
Definition strings.h:19
uint16_t uc16
Definition strings.h:18
Node::Uses::const_iterator begin(const Node::Uses &uses)
Definition node.h:708
Flag flags[]
Definition flags.cc:3797
constexpr base::uc32 kMaxCodePoint
base::Flags< RegExpFlag > RegExpFlags
V8_EXPORT_PRIVATE FlagValues v8_flags
#define DCHECK_LE(v1, v2)
Definition logging.h:490
#define DCHECK_IMPLIES(v1, v2)
Definition logging.h:493
#define DCHECK_NE(v1, v2)
Definition logging.h:486
#define DCHECK_GE(v1, v2)
Definition logging.h:488
#define DCHECK(condition)
Definition logging.h:482
#define DCHECK_EQ(v1, v2)
Definition logging.h:485
static RegExpInstruction Accept()
static RegExpInstruction EndLoop()
static RegExpInstruction SetQuantifierToClock(int32_t quantifier_id)
static RegExpInstruction ConsumeAnyChar()
static RegExpInstruction BeginLoop()
static RegExpInstruction FilterGroup(int32_t group_id)
static RegExpInstruction ConsumeRange(base::uc16 min, base::uc16 max)
static RegExpInstruction SetRegisterToCp(int32_t register_index)
static RegExpInstruction FilterLookaround(int32_t lookaround_id)
static RegExpInstruction FilterQuantifier(int32_t quantifier_id)
static RegExpInstruction StartLookaround(int lookaround_index, bool is_positive, RegExpLookaround::Type type)
static RegExpInstruction ClearRegister(int32_t register_index)
static RegExpInstruction ReadLookTable(int32_t index, bool is_positive, RegExpLookaround::Type type)
static RegExpInstruction WriteLookTable(int32_t index)
static RegExpInstruction RangeCount(int32_t num_ranges)
static RegExpInstruction EndLookaround()
static RegExpInstruction Assertion(RegExpAssertion::Type t)