16#include "unicode/locid.h"
17#include "unicode/uniset.h"
18#include "unicode/utypes.h"
24using namespace regexp_compiler_constants;
38 return compiler->zone()->New<
TextNode>(elms, compiler->read_backward(),
43 RegExpNode* on_success) {
44 return compiler->
zone()->
New<TextNode>(
elements(), compiler->read_backward(),
50bool CompareInverseRanges(ZoneList<CharacterRange>* ranges,
51 const int* special_class,
int length) {
59 if (ranges->length() != (length >> 1) + 1)
return false;
61 CharacterRange range = ranges->at(0);
62 if (range.from() != 0)
return false;
65 if (
static_cast<base::uc32>(special_class[
i]) != (range.to() + 1)) {
68 range = ranges->at((
i >> 1) + 1);
69 if (
static_cast<base::uc32>(special_class[
i + 1]) != range.from()) {
77bool CompareRanges(ZoneList<CharacterRange>* ranges,
const int* special_class,
82 if (ranges->length() * 2 != length)
return false;
85 CharacterRange range = ranges->at(
i >> 1);
86 if (range.from() !=
static_cast<base::uc32>(special_class[
i]) ||
87 range.to() !=
static_cast<base::uc32>(special_class[
i + 1] - 1)) {
154 static_assert(kBmp1Start == 0);
155 static_assert(kBmp1Start < kBmp1End);
161 static_assert(kBmp2Start < kBmp2End);
178 static constexpr int kCount =
arraysize(kStarts);
179 static_assert(kCount ==
arraysize(kEnds));
180 static_assert(kCount ==
arraysize(kTargets));
182 for (
int i = 0;
i < kCount;
i++) {
183 if (kStarts[
i] > range.to())
break;
184 const base::uc32 from = std::max(kStarts[
i], range.from());
185 const base::uc32 to = std::min(kEnds[
i], range.to());
186 if (from > to)
continue;
196 if (v->
empty())
return nullptr;
200 for (
size_t i = 0;
i < v->
size();
i++) {
208void AddBmpCharacters(RegExpCompiler* compiler, ChoiceNode*
result,
209 RegExpNode* on_success, UnicodeRangeSplitter* splitter) {
210 ZoneList<CharacterRange>* bmp =
211 ToCanonicalZoneList(splitter->bmp(), compiler->zone());
212 if (bmp ==
nullptr)
return;
214 compiler->zone(), bmp, compiler->read_backward(), on_success)));
217using UC16Range = uint32_t;
219 return (
static_cast<uint32_t
>(from) << 16) |
to;
228void AddNonBmpSurrogatePairs(RegExpCompiler* compiler, ChoiceNode*
result,
229 RegExpNode* on_success,
230 UnicodeRangeSplitter* splitter) {
231 DCHECK(!compiler->one_byte());
232 Zone*
const zone = compiler->zone();
233 ZoneList<CharacterRange>* non_bmp =
234 ToCanonicalZoneList(splitter->non_bmp(), zone);
235 if (non_bmp ==
nullptr)
return;
250 ZoneUnorderedMap<UC16Range, ZoneList<CharacterRange>*> grouped_by_leading(
252 ZoneList<CharacterRange>* leading_with_full_trailing_range =
253 zone->New<ZoneList<CharacterRange>>(1, zone);
256 const UC16Range leading_range = ToUC16Range(from_l, to_l);
257 if (grouped_by_leading.count(leading_range) == 0) {
259 leading_with_full_trailing_range->Add(
263 grouped_by_leading[leading_range] =
264 zone->New<ZoneList<CharacterRange>>(2, zone);
272 for (
int i = 0;
i < non_bmp->
length();
i++) {
285 if (from_l == to_l) {
287 AddRange(from_l, to_l, from_t, to_t);
301 if (from_l <= to_l) {
308 if (!leading_with_full_trailing_range->is_empty()) {
311 zone, leading_with_full_trailing_range,
313 compiler->read_backward(), on_success)));
315 for (
const auto& it : grouped_by_leading) {
316 CharacterRange leading_range =
318 ZoneList<CharacterRange>* trailing_ranges = it.second;
321 zone, leading_range, trailing_ranges, compiler->read_backward(),
326RegExpNode* NegativeLookaroundAgainstReadDirectionAndMatch(
327 RegExpCompiler* compiler, ZoneList<CharacterRange>* lookbehind,
328 ZoneList<CharacterRange>* match, RegExpNode* on_success,
329 bool read_backward) {
330 Zone* zone = compiler->zone();
332 zone, match, read_backward, on_success);
333 int stack_register = compiler->UnicodeLookaroundStackRegister();
334 int position_register = compiler->UnicodeLookaroundPositionRegister();
335 RegExpLookaround::Builder lookaround(
false, match_node, stack_register,
338 zone, lookbehind, !read_backward, lookaround.on_match_success());
339 return lookaround.ForMatch(negative_match);
342RegExpNode* MatchAndNegativeLookaroundInReadDirection(
343 RegExpCompiler* compiler, ZoneList<CharacterRange>* match,
344 ZoneList<CharacterRange>* lookahead, RegExpNode* on_success,
345 bool read_backward) {
346 Zone* zone = compiler->zone();
347 int stack_register = compiler->UnicodeLookaroundStackRegister();
348 int position_register = compiler->UnicodeLookaroundPositionRegister();
349 RegExpLookaround::Builder lookaround(
false, on_success, stack_register,
352 zone, lookahead, read_backward, lookaround.on_match_success());
354 zone, match, read_backward, lookaround.ForMatch(negative_match));
357void AddLoneLeadSurrogates(RegExpCompiler* compiler, ChoiceNode*
result,
358 RegExpNode* on_success,
359 UnicodeRangeSplitter* splitter) {
360 ZoneList<CharacterRange>* lead_surrogates =
361 ToCanonicalZoneList(splitter->lead_surrogates(), compiler->zone());
362 if (lead_surrogates ==
nullptr)
return;
363 Zone* zone = compiler->zone();
369 if (compiler->read_backward()) {
372 match = NegativeLookaroundAgainstReadDirectionAndMatch(
373 compiler, trail_surrogates, lead_surrogates, on_success,
true);
377 match = MatchAndNegativeLookaroundInReadDirection(
378 compiler, lead_surrogates, trail_surrogates, on_success,
false);
380 result->AddAlternative(GuardedAlternative(match));
383void AddLoneTrailSurrogates(RegExpCompiler* compiler, ChoiceNode*
result,
384 RegExpNode* on_success,
385 UnicodeRangeSplitter* splitter) {
386 ZoneList<CharacterRange>* trail_surrogates =
387 ToCanonicalZoneList(splitter->trail_surrogates(), compiler->zone());
388 if (trail_surrogates ==
nullptr)
return;
389 Zone* zone = compiler->zone();
395 if (compiler->read_backward()) {
398 match = MatchAndNegativeLookaroundInReadDirection(
399 compiler, trail_surrogates, lead_surrogates, on_success,
true);
403 match = NegativeLookaroundAgainstReadDirectionAndMatch(
404 compiler, lead_surrogates, trail_surrogates, on_success,
false);
406 result->AddAlternative(GuardedAlternative(match));
409RegExpNode* UnanchoredAdvance(RegExpCompiler* compiler,
410 RegExpNode* on_success) {
412 DCHECK(!compiler->read_backward());
413 Zone* zone = compiler->zone();
418 ZoneList<CharacterRange>* range =
429#ifdef V8_INTL_SUPPORT
438 if (ranges->length() == 1 && ranges->at(0).IsEverything(
kNonBmpEnd))
return;
442 for (
int i = 0;
i < ranges->
length();
i++) {
443 set.add(ranges->at(
i).from(), ranges->at(
i).to());
447 set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
448 for (
int i = 0;
i < set.getRangeCount();
i++) {
449 ranges->Add(
Range(set.getRangeStart(
i), set.getRangeEnd(
i)), zone);
459 Zone*
const zone = compiler->zone();
462 const bool needs_case_folding =
464 if (needs_case_folding) {
470 return zone->
New<TextNode>(
this, compiler->read_backward(), on_success);
482 IsUnicodeSets(compiler->flags()),
483 ranges->length() == 1 && ranges->first().IsEverything(
kMaxCodePoint));
484 ZoneList<CharacterRange>* negated =
485 zone->
New<ZoneList<CharacterRange>>(2, zone);
490 if (ranges->length() == 0) {
493 return zone->New<TextNode>(fail, compiler->read_backward(), on_success);
498 return UnanchoredAdvance(compiler, on_success);
506 ChoiceNode*
result = zone->
New<ChoiceNode>(2, zone);
507 UnicodeRangeSplitter splitter(ranges);
508 AddBmpCharacters(compiler,
result, on_success, &splitter);
509 AddNonBmpSurrogatePairs(compiler,
result, on_success, &splitter);
510 AddLoneLeadSurrogates(compiler,
result, on_success, &splitter);
511 AddLoneTrailSurrogates(compiler,
result, on_success, &splitter);
513 static constexpr int kMaxRangesToInline = 32;
514 if (ranges->length() > kMaxRangesToInline)
result->SetDoNotInline();
520 RegExpNode* on_success) {
521 Zone* zone = compiler->zone();
527 ZoneList<CharacterRange>* empty =
528 zone->template New<ZoneList<CharacterRange>>(0, zone);
529 return zone->template New<RegExpClassRanges>(zone, empty)
530 ->ToNode(compiler, on_success);
532 ZoneList<RegExpTree*>* alternatives =
533 zone->template New<ZoneList<RegExpTree*>>(
size, zone);
537 RegExpTree* empty_string =
nullptr;
539 for (
auto string : *
strings()) {
540 if (
string.
second->IsEmpty()) {
541 empty_string =
string.second;
543 alternatives->Add(
string.
second, zone);
552 alternatives->Add(zone->template New<RegExpClassRanges>(
556 if (empty_string !=
nullptr) {
557 alternatives->Add(empty_string, zone);
560 RegExpTree* node =
nullptr;
563 node = alternatives->first();
565 node = zone->template New<RegExpDisjunction>(alternatives);
567 return node->ToNode(compiler, on_success);
571 RegExpNode* on_success) {
573 ZoneList<CharacterRange>* temp_ranges =
574 zone->template New<ZoneList<CharacterRange>>(4, zone);
576 return root->ToNode(compiler, on_success);
580 ranges()->AddAll(*other->ranges(), zone);
581 if (other->has_strings()) {
583 strings_ = zone->template New<CharacterClassStrings>(zone);
585 strings()->insert(other->strings()->begin(), other->strings()->end());
593 std::swap(*
ranges(), *temp_ranges);
596 if (!other->has_strings()) {
600 if (other->strings()->find(iter->first) == other->strings()->end()) {
614 std::swap(*
ranges(), *temp_ranges);
618 if (other->strings()->find(iter->first) != other->strings()->end()) {
631 if (root->IsClassSetOperand()) {
632 return root->AsClassSetOperand();
634 DCHECK(root->IsClassSetExpression());
638 switch (node->operation()) {
640 for (
int i = 1;
i < node->operands()->
length();
i++) {
649 for (
int i = 1;
i < node->operands()->
length();
i++) {
652 result->Intersect(op, temp_ranges, zone);
657 for (
int i = 1;
i < node->operands()->
length();
i++) {
660 result->Subtract(op, temp_ranges, zone);
665 if (node->is_negated()) {
668 std::swap(*
result->ranges(), *temp_ranges);
670 node->is_negated_ =
false;
673 node->operands()->Set(0,
result);
674 node->operands()->Rewind(1);
682 base::uc16 character1 = a->data().at(index_a);
684 if (character1 < character2)
return -1;
685 if (character1 > character2)
return 1;
689int CompareFirstChar(RegExpTree*
const* a, RegExpTree*
const* b) {
690 RegExpAtom* atom1 = (*a)->AsAtom();
691 RegExpAtom* atom2 = (*b)->AsAtom();
692 return CompareCharAt(atom1, 0, atom2, 0);
695int CompareLastChar(RegExpTree*
const* a, RegExpTree*
const* b) {
696 RegExpAtom* atom1 = (*a)->AsAtom();
697 RegExpAtom* atom2 = (*b)->AsAtom();
698 return CompareCharAt(atom1, atom1->length() - 1, atom2, atom2->length() - 1);
701#ifdef V8_INTL_SUPPORT
703int CompareCaseInsensitive(
const icu::UnicodeString& a,
704 const icu::UnicodeString& b) {
705 return a.caseCompare(b, U_FOLD_CASE_DEFAULT);
708int CompareCharAtCaseInsensitive(RegExpAtom* a,
int index_a, RegExpAtom* b,
710 base::uc16 character1 = a->data().at(index_a);
711 base::uc16 character2 = b->data().at(index_b);
712 return CompareCaseInsensitive(icu::UnicodeString{character1},
713 icu::UnicodeString{character2});
716int CompareFirstCharCaseInsensitive(RegExpTree*
const* a,
717 RegExpTree*
const* b) {
718 RegExpAtom* atom1 = (*a)->AsAtom();
719 RegExpAtom* atom2 = (*b)->AsAtom();
720 return CompareCharAtCaseInsensitive(atom1, 0, atom2, 0);
723int CompareLastCharCaseInsensitive(RegExpTree*
const* a, RegExpTree*
const* b) {
724 RegExpAtom* atom1 = (*a)->AsAtom();
725 RegExpAtom* atom2 = (*b)->AsAtom();
726 return CompareCharAtCaseInsensitive(atom1, atom1->length() - 1, atom2,
727 atom2->length() - 1);
730bool Equals(
bool ignore_case,
const icu::UnicodeString& a,
731 const icu::UnicodeString& b) {
732 if (a == b)
return true;
733 if (ignore_case)
return CompareCaseInsensitive(a, b) == 0;
737bool CharAtEquals(
bool ignore_case,
const RegExpAtom* a,
int index_a,
738 const RegExpAtom* b,
int index_b) {
739 return Equals(ignore_case, a->data().at(index_a), b->data().at(index_b));
748 int length = canonicalize->
get(c,
'\0', chars);
751 if (length == 1) canonical = chars[0];
755int CompareCaseInsensitive(
758 if (a == b)
return 0;
759 if (a >=
'a' || b >=
'a') {
760 a = Canonical(canonicalize, a);
761 b = Canonical(canonicalize, b);
763 return static_cast<int>(
a) -
static_cast<int>(b);
766int CompareCharAtCaseInsensitive(
768 int index_a, RegExpAtom* b,
int index_b) {
769 base::uc16 character1 = a->data().at(index_a);
770 base::uc16 character2 = b->data().at(index_b);
771 return CompareCaseInsensitive(canonicalize, character1, character2);
773int CompareFirstCharCaseInsensitive(
775 RegExpTree*
const* a, RegExpTree*
const* b) {
776 RegExpAtom* atom1 = (*a)->AsAtom();
777 RegExpAtom* atom2 = (*b)->AsAtom();
778 return CompareCharAtCaseInsensitive(canonicalize, atom1, 0, atom2, 0);
781int CompareLastCharCaseInsensitive(
783 RegExpTree*
const* a, RegExpTree*
const* b) {
784 RegExpAtom* atom1 = (*a)->AsAtom();
785 RegExpAtom* atom2 = (*b)->AsAtom();
786 return CompareCharAtCaseInsensitive(canonicalize, atom1, atom1->length() - 1,
787 atom2, atom2->length() - 1);
790bool Equals(
bool ignore_case,
793 if (a == b)
return true;
795 return CompareCaseInsensitive(canonicalize, a, b) == 0;
800bool CharAtEquals(
bool ignore_case,
802 const RegExpAtom* a,
int index_a,
const RegExpAtom* b,
804 return Equals(ignore_case, canonicalize, a->data().at(index_a),
805 b->data().at(index_b));
819 bool found_consecutive_atoms =
false;
823 if (alternative->IsAtom())
break;
827 if (
i == length)
break;
832 if (!alternative->IsAtom())
break;
844 const bool backwards = compiler->read_backward();
845 if (IsIgnoreCase(compiler->flags())) {
846#ifdef V8_INTL_SUPPORT
847 auto cmp_fun = backwards ? CompareLastCharCaseInsensitive
848 : CompareFirstCharCaseInsensitive;
849 alternatives->StableSort(cmp_fun, first_atom,
i - first_atom);
852 compiler->isolate()->regexp_macro_assembler_canonicalize();
853 auto compare_closure = [canonicalize, backwards](
RegExpTree*
const*
a,
855 return backwards ? CompareLastCharCaseInsensitive(canonicalize, a, b)
856 : CompareFirstCharCaseInsensitive(canonicalize, a, b);
858 alternatives->StableSort(compare_closure, first_atom,
i - first_atom);
861 auto cmp_fun = backwards ? CompareLastChar : CompareFirstChar;
862 alternatives->StableSort(cmp_fun, first_atom,
i - first_atom);
864 if (
i - first_atom > 1) found_consecutive_atoms =
true;
866 return found_consecutive_atoms;
872 Zone* zone = compiler->zone();
873 const bool backwards = compiler->read_backward();
876 const bool ignore_case = IsIgnoreCase(compiler->flags());
882 if (!alternative->IsAtom()) {
887 RegExpAtom*
const atom = alternative->AsAtom();
888#ifdef V8_INTL_SUPPORT
889 icu::UnicodeString common_affix(
893 compiler->isolate()->regexp_macro_assembler_canonicalize();
897 common_affix = Canonical(canonicalize, common_affix);
900 int first_with_affix =
i;
901 int affix_length = atom->
length();
905 if (!alternative->IsAtom())
break;
906 RegExpAtom*
const alt_atom = alternative->AsAtom();
907#ifdef V8_INTL_SUPPORT
908 icu::UnicodeString new_affix(
909 alt_atom->
data().
at(backwards ? alt_atom->
length() - 1 : 0));
910 if (!Equals(ignore_case, new_affix, common_affix))
break;
913 alt_atom->
data().
at(backwards ? alt_atom->
length() - 1 : 0);
914 if (!Equals(ignore_case, canonicalize, new_affix, common_affix))
break;
916 affix_length = std::min(affix_length, alt_atom->
length());
919 if (
i > first_with_affix + 2) {
925 int run_length =
i - first_with_affix;
927 for (
int j = 1; j < run_length && affix_length > 1; j++) {
929 for (
int k = 1; k < affix_length; k++) {
930 const int alt_atom_pos = backwards ? alt_atom->
length() - k : k;
931 const int old_atom_pos = backwards ? old_atom->
length() - k : k;
932#ifdef V8_INTL_SUPPORT
933 if (!CharAtEquals(ignore_case, alt_atom, alt_atom_pos, old_atom,
936 if (!CharAtEquals(ignore_case, canonicalize, alt_atom, alt_atom_pos,
937 old_atom, old_atom_pos)) {
944 const int common_start =
945 backwards ? alt_atom->
length() - affix_length : 0;
947 common_start, common_start + affix_length));
950 for (
int j = 0; j < run_length; j++) {
952 int len = old_atom->
length();
953 if (len == affix_length) {
956 const int distinct_start = backwards ? 0 : affix_length;
957 const int distinct_end = backwards ? old_atom->
length() - affix_length
960 old_atom->
data().SubVector(distinct_start, distinct_end));
961 distinct->
Add(part, zone);
967 pair->
Add(common, zone);
969 pair->
Add(common, zone);
975 for (
int j = first_with_affix; j <
i; j++) {
986 Zone* zone = compiler->zone();
994 if (!alternative->IsAtom()) {
999 RegExpAtom*
const atom = alternative->AsAtom();
1000 if (atom->
length() != 1) {
1008 bool contains_trail_surrogate =
1010 int first_in_run =
i;
1014 while (
i < length) {
1016 if (!alternative->IsAtom())
break;
1017 RegExpAtom*
const alt_atom = alternative->AsAtom();
1018 if (alt_atom->
length() != 1)
break;
1021 contains_trail_surrogate |=
1025 if (
i > first_in_run + 1) {
1027 int run_length =
i - first_in_run;
1030 for (
int j = 0; j < run_length; j++) {
1043 for (
int j = first_in_run; j <
i; j++) {
1053 compiler->ToNodeMaybeCheckForStackOverflow();
1057 if (alternatives->
length() > 2) {
1062 return alternatives->at(0)->ToNode(compiler, on_success);
1069 compiler->zone()->New<ChoiceNode>(
length, compiler->zone());
1071 GuardedAlternative alternative(
1073 result->AddAlternative(alternative);
1080 return ToNode(min(), max(), is_greedy(), body(), compiler, on_success);
1090 Zone* zone = compiler->zone();
1095 int stack_register = compiler->UnicodeLookaroundStackRegister();
1096 int position_register = compiler->UnicodeLookaroundPositionRegister();
1100 for (
int i = 0;
i < 2;
i++) {
1101 bool lookbehind_for_word =
i == 0;
1102 bool lookahead_for_word =
1106 stack_register, position_register);
1108 zone, word_range,
true, lookbehind.on_match_success());
1111 lookbehind.ForMatch(backward),
1112 stack_register, position_register);
1114 zone, word_range,
false, lookahead.on_match_success());
1122 RegExpNode* on_success) {
1124 Zone* zone = compiler->zone();
1126 switch (assertion_type()) {
1127 case Type::START_OF_LINE:
1129 case Type::START_OF_INPUT:
1131 case Type::BOUNDARY:
1133 ? BoundaryAssertionAsLookaround(compiler, on_success,
1135 : AssertionNode::AtBoundary(on_success);
1136 case Type::NON_BOUNDARY:
1138 ? BoundaryAssertionAsLookaround(compiler, on_success,
1140 : AssertionNode::AtNonBoundary(on_success);
1141 case Type::END_OF_INPUT:
1143 case Type::END_OF_LINE: {
1147 int stack_pointer_register = compiler->AllocateRegister();
1148 int position_register = compiler->AllocateRegister();
1150 ChoiceNode*
result = zone->New<ChoiceNode>(2, zone);
1152 ZoneList<CharacterRange>* newline_ranges =
1153 zone->New<ZoneList<CharacterRange>>(3, zone);
1155 newline_ranges,
false, zone);
1156 RegExpClassRanges* newline_atom =
1159 stack_pointer_register, position_register,
1163 TextNode* newline_matcher =
1164 zone->New<TextNode>(newline_atom,
false, submatch_success);
1167 stack_pointer_register, position_register, newline_matcher,
1170 GuardedAlternative eol_alternative(end_of_line);
1171 result->AddAlternative(eol_alternative);
1173 result->AddAlternative(end_alternative);
1182 RegExpNode* on_success) {
1183 RegExpNode* backref_node = on_success;
1187 for (
auto capture : *captures()) {
1188 backref_node = compiler->zone()->New<BackReferenceNode>(
1193 return backref_node;
1197 RegExpNode* on_success) {
1205 ModifiersScope(RegExpCompiler* compiler, RegExpFlags flags)
1207 compiler->set_flags(flags);
1219 RegExpNode* on_success) {
1221 if (
flags() == compiler->flags()) {
1222 return body_->ToNode(compiler, on_success);
1229 ModifiersScope modifiers_scope(compiler,
flags());
1230 RegExpNode* body =
body_->ToNode(compiler, on_success);
1234 return modified_body;
1238 int stack_pointer_register,
1239 int position_register,
1240 int capture_register_count,
1241 int capture_register_start)
1242 : is_positive_(is_positive),
1243 on_success_(on_success),
1244 stack_pointer_register_(stack_pointer_register),
1245 position_register_(position_register) {
1248 stack_pointer_register, position_register, capture_register_count,
1253 stack_pointer_register, position_register, capture_register_count,
1254 capture_register_start, zone);
1262 stack_pointer_register_, position_register_, match, on_match_success);
1264 Zone* zone = on_success_->zone();
1273 position_register_, choice_node);
1279 compiler->ToNodeMaybeCheckForStackOverflow();
1281 int stack_pointer_register = compiler->AllocateRegister();
1282 int position_register = compiler->AllocateRegister();
1284 const int registers_per_capture = 2;
1285 const int register_of_first_capture = 2;
1287 int register_start =
1288 register_of_first_capture +
capture_from_ * registers_per_capture;
1291 bool was_reading_backward = compiler->read_backward();
1294 position_register, register_count, register_start);
1296 result = builder.ForMatch(match);
1297 compiler->set_read_backward(was_reading_backward);
1312 if (compiler->read_backward()) std::swap(start_reg, end_reg);
1320class AssertionSequenceRewriter final {
1324 static void MaybeRewrite(ZoneList<RegExpTree*>* terms,
Zone* zone) {
1325 AssertionSequenceRewriter rewriter(terms, zone);
1327 static constexpr int kNoIndex = -1;
1328 int from = kNoIndex;
1330 for (
int i = 0;
i < terms->
length();
i++) {
1331 RegExpTree* t = terms->at(
i);
1332 if (from == kNoIndex && t->IsAssertion()) {
1334 }
else if (from != kNoIndex && !t->IsAssertion()) {
1336 if (
i - from > 1) rewriter.Rewrite(from,
i);
1341 if (from != kNoIndex && terms->length() - from > 1) {
1342 rewriter.Rewrite(from, terms->length());
1351 void Rewrite(
int from,
int to) {
1355 uint32_t seen_assertions = 0;
1356 static_assert(
static_cast<int>(RegExpAssertion::Type::LAST_ASSERTION_TYPE) <
1359 for (
int i = from;
i <
to;
i++) {
1360 RegExpAssertion* t =
terms_->at(
i)->AsAssertion();
1361 const uint32_t bit = 1 <<
static_cast<int>(t->assertion_type());
1363 if (seen_assertions & bit) {
1368 seen_assertions |= bit;
1372 const uint32_t always_fails_mask =
1373 1 <<
static_cast<int>(RegExpAssertion::Type::BOUNDARY) |
1374 1 <<
static_cast<int>(RegExpAssertion::Type::NON_BOUNDARY);
1375 if ((seen_assertions & always_fails_mask) == always_fails_mask) {
1376 ReplaceSequenceWithFailure(from, to);
1380 void ReplaceSequenceWithFailure(
int from,
int to) {
1384 ZoneList<CharacterRange>* ranges =
1385 zone_->New<ZoneList<CharacterRange>>(0,
zone_);
1386 RegExpClassRanges*
cc =
zone_->New<RegExpClassRanges>(
zone_, ranges);
1390 RegExpEmpty* empty =
zone_->New<RegExpEmpty>();
1391 for (
int i = from + 1;
i <
to;
i++)
terms_->Set(
i, empty);
1395 AssertionSequenceRewriter(ZoneList<RegExpTree*>* terms,
Zone* zone)
1405 RegExpNode* on_success) {
1406 compiler->ToNodeMaybeCheckForStackOverflow();
1408 ZoneList<RegExpTree*>*
children = nodes();
1410 AssertionSequenceRewriter::MaybeRewrite(
children, compiler->zone());
1412 RegExpNode* current = on_success;
1413 if (compiler->read_backward()) {
1415 current =
children->at(
i)->ToNode(compiler, current);
1419 current =
children->at(
i)->ToNode(compiler, current);
1427void AddClass(
const int* elmv,
int elmc, ZoneList<CharacterRange>* ranges,
1431 for (
int i = 0;
i < elmc;
i += 2) {
1437void AddClassNegated(
const int* elmv,
int elmc,
1438 ZoneList<CharacterRange>* ranges,
Zone* zone) {
1444 for (
int i = 0;
i < elmc;
i += 2) {
1457 bool add_unicode_case_equivalents,
1459 if (add_unicode_case_equivalents &&
1468 AddUnicodeCaseEquivalents(new_ranges, zone);
1473 new_ranges = negated;
1475 ranges->AddAll(*new_ranges, zone);
1479 switch (standard_character_set) {
1522 int range_count = ranges->length();
1523#ifdef V8_INTL_SUPPORT
1524 icu::UnicodeSet others;
1525 for (
int i = 0;
i < range_count;
i++) {
1536 others.add(from, to);
1551 icu::UnicodeSet already_added(others);
1552 others.removeAll(RegExpCaseFolding::IgnoreSet());
1553 others.closeOver(USET_CASE_INSENSITIVE);
1554 others.removeAll(RegExpCaseFolding::IgnoreSet());
1555 others.removeAll(already_added);
1558 for (int32_t
i = 0;
i < others.getRangeCount();
i++) {
1559 UChar32 from = others.getRangeStart(
i);
1560 UChar32 to = others.getRangeEnd(
i);
1568 for (
int i = 0;
i < range_count;
i++) {
1580 if (top == bottom) {
1582 int length = isolate->jsregexp_uncanonicalize()->get(bottom,
'\0', chars);
1583 for (
int j = 0; j <
length; j++) {
1585 if (chr != bottom) {
1608 while (
pos <= top) {
1610 isolate->jsregexp_canonrange()->get(
pos,
'\0', equivalents);
1616 block_end = equivalents[0];
1618 int end = (block_end >
top) ? top : block_end;
1619 length = isolate->jsregexp_uncanonicalize()->get(block_end,
'\0',
1621 for (
int j = 0; j <
length; j++) {
1625 if (!(bottom <= range_from && range_to <= top)) {
1638 int n = ranges->length();
1639 if (n <= 1)
return true;
1641 for (
int i = 1;
i <
n;
i++) {
1643 if (next_range.
from() <= max + 1)
return false;
1644 max = next_range.
to();
1650 if (ranges_ ==
nullptr) {
1665 for (
int i =
count - 1;
i >= 0;
i--) {
1666 list->
at(to +
i) = list->
at(from +
i);
1670 list->
at(to +
i) = list->
at(from +
i);
1675int InsertRangeInCanonicalList(ZoneList<CharacterRange>* list,
int count,
1676 CharacterRange insert) {
1685 int end_pos =
count;
1686 for (
int i = count - 1;
i >= 0;
i--) {
1687 CharacterRange current = list->at(
i);
1688 if (current.from() > to + 1) {
1690 }
else if (current.to() + 1 < from) {
1703 if (start_pos == end_pos) {
1705 if (start_pos < count) {
1706 MoveRanges(list, start_pos, start_pos + 1, count - start_pos);
1708 list->at(start_pos) = insert;
1711 if (start_pos + 1 == end_pos) {
1713 CharacterRange to_replace = list->at(start_pos);
1714 int new_from = std::min(to_replace.from(), from);
1715 int new_to = std::max(to_replace.to(), to);
1722 int new_from = std::min(list->at(start_pos).from(), from);
1723 int new_to = std::max(list->at(end_pos - 1).to(), to);
1724 if (end_pos < count) {
1725 MoveRanges(list, end_pos, start_pos + 1, count - end_pos);
1728 return count - (end_pos - start_pos) + 1;
1736 if (ranges_ ==
nullptr)
return;
1742 if (character_ranges->
length() <= 1)
return;
1745 int n = character_ranges->
length();
1750 if (current.from() <= max + 1) {
1765 int num_canonical =
i;
1767 num_canonical = InsertRangeInCanonicalList(character_ranges, num_canonical,
1768 character_ranges->
at(read));
1771 character_ranges->
Rewind(num_canonical);
1782 int range_count = ranges->length();
1785 if (range_count > 0 && ranges->at(0).from() == 0) {
1786 from = ranges->at(0).to() + 1;
1789 while (
i < range_count) {
1792 from = range.to() + 1;
1810 while (lhs_index < lhs->
length() && rhs_index < rhs->
length()) {
1812 if (lhs->
at(lhs_index).to() < rhs->
at(rhs_index).from()) {
1816 if (rhs->
at(rhs_index).to() < lhs->
at(lhs_index).from()) {
1822 std::max(lhs->
at(lhs_index).from(), rhs->
at(rhs_index).from());
1823 base::uc32 to = std::min(lhs->
at(lhs_index).to(), rhs->
at(rhs_index).to());
1825 if (to == lhs->
at(lhs_index).to()) {
1832 DCHECK(IsCanonical(intersection));
1844 *from = range->at(*index).from();
1845 *to = range->at(*index).to();
1861 if (src->is_empty())
return;
1864 int to_remove_index = 0;
1867 while (src_index < src->
length() && to_remove_index < to_remove->
length()) {
1869 if (remove_range.
to() < from) {
1874 }
else if (to < remove_range.
from()) {
1879 SafeAdvanceRange(src, &src_index, &from, &to);
1880 }
else if (from >= remove_range.
from() && to <= remove_range.
to()) {
1884 SafeAdvanceRange(src, &src_index, &from, &to);
1885 }
else if (from < remove_range.
from() && to > remove_range.
to()) {
1890 from = remove_range.
to() + 1;
1892 }
else if (from < remove_range.
from()) {
1896 to = remove_range.
from() - 1;
1898 SafeAdvanceRange(src, &src_index, &from, &to);
1899 }
else if (to > remove_range.
to()) {
1903 from = remove_range.
to() + 1;
1918 for (; src_index < src->length(); src_index++) {
1919 result->Add(src->at(src_index), zone);
1927 DCHECK(IsCanonical(ranges));
1934 int n = ranges->length();
1935 for (; n > 0; n--) {
1937 if (
r.from() <= max_char) {
1938 r.to_ = std::min(
r.to_, max_char);
1949 DCHECK(IsCanonical(lhs));
1950 DCHECK(IsCanonical(rhs));
1953 for (
int i = 0;
i < lhs->
length();
i++) {
1954 if (lhs->
at(
i) != rhs->
at(
i))
return false;
1964class RegExpExpansionLimiter {
1967 RegExpExpansionLimiter(RegExpCompiler* compiler,
int factor)
1980 compiler->set_current_expansion_factor(new_factor);
1985 ~RegExpExpansionLimiter() {
2002 RegExpTree*
body, RegExpCompiler* compiler,
2003 RegExpNode* on_success,
2004 bool not_at_start) {
2025 static const int kMaxUnrolledMinMatches = 3;
2026 static const int kMaxUnrolledMaxMatches = 3;
2027 if (max == 0)
return on_success;
2031 bool needs_capture_clearing = !capture_registers.is_empty();
2032 Zone* zone = compiler->zone();
2034 if (body_can_be_empty) {
2035 body_start_reg = compiler->AllocateRegister();
2036 }
else if (compiler->optimize() && !needs_capture_clearing) {
2040 RegExpExpansionLimiter limiter(compiler, min + ((max != min) ? 1 : 0));
2041 if (min > 0 && min <= kMaxUnrolledMinMatches && limiter.ok_to_expand()) {
2042 int new_max = (max ==
kInfinity) ? max : max - min;
2045 RegExpNode* answer =
2046 ToNode(0, new_max, is_greedy,
body, compiler, on_success,
true);
2050 for (
int i = 0;
i < min;
i++) {
2056 if (max <= kMaxUnrolledMaxMatches && min == 0) {
2058 RegExpExpansionLimiter limiter(compiler, max);
2059 if (limiter.ok_to_expand()) {
2061 RegExpNode* answer = on_success;
2062 for (
int i = 0;
i < max;
i++) {
2063 ChoiceNode* alternation = zone->New<ChoiceNode>(2, zone);
2065 alternation->AddAlternative(
2066 GuardedAlternative(
body->
ToNode(compiler, answer)));
2067 alternation->AddAlternative(GuardedAlternative(on_success));
2069 alternation->AddAlternative(GuardedAlternative(on_success));
2070 alternation->AddAlternative(
2071 GuardedAlternative(
body->
ToNode(compiler, answer)));
2073 answer = alternation;
2074 if (not_at_start && !compiler->read_backward()) {
2075 alternation->set_not_at_start();
2082 bool has_min = min > 0;
2084 bool needs_counter = has_min || has_max;
2085 int reg_ctr = needs_counter ? compiler->AllocateRegister()
2087 LoopChoiceNode* center = zone->New<LoopChoiceNode>(
2088 body->
min_match() == 0, compiler->read_backward(), min, zone);
2089 if (not_at_start && !compiler->read_backward()) center->set_not_at_start();
2090 RegExpNode* loop_return =
2091 needs_counter ?
static_cast<RegExpNode*
>(
2093 : static_cast<RegExpNode*>(center);
2094 if (body_can_be_empty) {
2100 RegExpNode* body_node =
body->
ToNode(compiler, loop_return);
2101 if (body_can_be_empty) {
2106 if (needs_capture_clearing) {
2110 GuardedAlternative body_alt(body_node);
2112 Guard* body_guard = zone->New<Guard>(reg_ctr,
Guard::LT, max);
2113 body_alt.AddGuard(body_guard, zone);
2115 GuardedAlternative rest_alt(on_success);
2117 Guard* rest_guard = compiler->zone()->New<Guard>(reg_ctr,
Guard::GEQ, min);
2118 rest_alt.AddGuard(rest_guard, zone);
2121 center->AddLoopAlternative(body_alt);
2122 center->AddContinueAlternative(rest_alt);
2124 center->AddContinueAlternative(rest_alt);
2125 center->AddLoopAlternative(body_alt);
2127 if (needs_counter) {
int get(uchar c, uchar n, uchar *result)
static uint16_t LeadSurrogate(uint32_t char_code)
static uint16_t TrailSurrogate(uint32_t char_code)
static bool IsTrailSurrogate(int code)
static bool IsLeadSurrogate(int code)
void emplace_back(Args &&... args)
const T & at(size_t index) const
ActionNode * AsActionNode() override
static ActionNode * EmptyMatchCheck(int start_register, int repetition_register, int repetition_limit, RegExpNode *on_success)
static ActionNode * ModifyFlags(RegExpFlags flags, RegExpNode *on_success)
static ActionNode * StorePosition(int reg, bool is_capture, RegExpNode *on_success)
static ActionNode * PositiveSubmatchSuccess(int stack_pointer_reg, int restore_reg, int clear_capture_count, int clear_capture_from, RegExpNode *on_success)
static ActionNode * BeginNegativeSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *on_success)
static ActionNode * SetRegisterForLoop(int reg, int val, RegExpNode *on_success)
static ActionNode * IncrementRegister(int reg, RegExpNode *on_success)
static ActionNode * BeginPositiveSubmatch(int stack_pointer_reg, int position_reg, RegExpNode *body, ActionNode *success_node)
static ActionNode * ClearCaptures(Interval range, RegExpNode *on_success)
static AssertionNode * AtStart(RegExpNode *on_success)
static AssertionNode * AtEnd(RegExpNode *on_success)
static AssertionNode * AfterNewline(RegExpNode *on_success)
static bool Equals(const ZoneList< CharacterRange > *lhs, const ZoneList< CharacterRange > *rhs)
static void Canonicalize(ZoneList< CharacterRange > *ranges)
static void AddUnicodeCaseEquivalents(ZoneList< CharacterRange > *ranges, Zone *zone)
static void Subtract(const ZoneList< CharacterRange > *src, const ZoneList< CharacterRange > *to_remove, ZoneList< CharacterRange > *dst, Zone *zone)
static void Negate(const ZoneList< CharacterRange > *src, ZoneList< CharacterRange > *dst, Zone *zone)
static V8_EXPORT_PRIVATE void AddCaseEquivalents(Isolate *isolate, Zone *zone, ZoneList< CharacterRange > *ranges, bool is_one_byte)
static void ClampToOneByte(ZoneList< CharacterRange > *ranges)
static CharacterRange Singleton(base::uc32 value)
static V8_EXPORT_PRIVATE bool IsCanonical(const ZoneList< CharacterRange > *ranges)
static ZoneList< CharacterRange > * List(Zone *zone, CharacterRange range)
static void Intersect(const ZoneList< CharacterRange > *lhs, const ZoneList< CharacterRange > *rhs, ZoneList< CharacterRange > *dst, Zone *zone)
static CharacterRange Range(base::uc32 from, base::uc32 to)
static CharacterRange Everything()
static V8_EXPORT_PRIVATE void AddClassEscape(StandardCharacterSet standard_character_set, ZoneList< CharacterRange > *ranges, bool add_unicode_case_equivalents, Zone *zone)
void set_standard_set_type(StandardCharacterSet standard_set_type)
V8_EXPORT_PRIVATE void Canonicalize()
ZoneList< CharacterRange > * ranges(Zone *zone)
base::Vector< const base::uc16 > data() const
static int StartRegister(int index)
static RegExpNode * ToNode(RegExpTree *body, int index, RegExpCompiler *compiler, RegExpNode *on_success)
static int EndRegister(int index)
bool contains_split_surrogate() const
StandardCharacterSet standard_type() const
bool is_standard(Zone *zone)
@ CONTAINS_SPLIT_SURROGATE
ZoneList< CharacterRange > * ranges(Zone *zone)
bool is_case_folded() const
RegExpClassRanges(Zone *zone, ZoneList< CharacterRange > *ranges, ClassRangesFlags class_ranges_flags=ClassRangesFlags())
static RegExpClassSetOperand * ComputeExpression(RegExpTree *root, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
CharacterClassStrings * strings_
ZoneList< CharacterRange > * ranges()
void Intersect(RegExpClassSetOperand *other, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
CharacterClassStrings * strings()
void Union(RegExpClassSetOperand *other, Zone *zone)
void Subtract(RegExpClassSetOperand *other, ZoneList< CharacterRange > *temp_ranges, Zone *zone)
static const int kNoRegister
ZoneList< RegExpTree * > * alternatives() const
void RationalizeConsecutiveAtoms(RegExpCompiler *compiler)
void FixSingleCharacterDisjunctions(RegExpCompiler *compiler)
bool SortConsecutiveAtoms(RegExpCompiler *compiler)
RegExpNode * on_match_success_
RegExpNode * ForMatch(RegExpNode *match)
Builder(bool is_positive, RegExpNode *on_success, int stack_pointer_register, int position_register, int capture_register_count=0, int capture_register_start=0)
RegExpTree * body() const
static RegExpNode * ToNode(int min, int max, bool is_greedy, RegExpTree *body, RegExpCompiler *compiler, RegExpNode *on_success, bool not_at_start=false)
ZoneList< TextElement > * elements()
virtual RegExpNode * ToNode(RegExpCompiler *compiler, RegExpNode *on_success)=0
virtual int min_match()=0
static const int kInfinity
virtual Interval CaptureRegisters()
static const uint32_t kMaxOneByteCharCodeU
static const int32_t kMaxOneByteCharCode
static TextElement Atom(RegExpAtom *atom)
static TextNode * CreateForSurrogatePair(Zone *zone, CharacterRange lead, ZoneList< CharacterRange > *trail_ranges, bool read_backward, RegExpNode *on_success)
static TextNode * CreateForCharacterRanges(Zone *zone, ZoneList< CharacterRange > *ranges, bool read_backward, RegExpNode *on_success)
CharacterRangeVector lead_surrogates_
CharacterRangeVector trail_surrogates_
CharacterRangeVector non_bmp_
void AddRange(CharacterRange range)
V8_EXPORT_PRIVATE UnicodeRangeSplitter(ZoneList< CharacterRange > *base)
CharacterRangeVector bmp_
V8_INLINE int length() const
V8_INLINE void Rewind(int pos)
V8_INLINE bool is_empty() const
void Add(const T &element, Zone *zone)
Handle< SharedFunctionInfo > info
std::vector< std::unique_ptr< InstanceTypeTree > > children
std::optional< TNode< JSArray > > a
ZoneVector< RpoNumber > & result
constexpr int kWordRangeCount
constexpr int kDigitRanges[]
constexpr int kLineTerminatorRangeCount
constexpr base::uc32 kRangeEndMarker
constexpr int kDigitRangeCount
constexpr int kSpaceRangeCount
constexpr int kWordRanges[]
constexpr int kLineTerminatorRanges[]
constexpr int kSpaceRanges[]
static const base::uc32 kNonBmpEnd
static const base::uc32 kNonBmpStart
constexpr int kBitsPerByte
static const base::uc32 kTrailSurrogateStart
constexpr bool IsEitherUnicode(RegExpFlags f)
bool NeedsUnicodeCaseEquivalents(RegExpFlags flags)
constexpr uint32_t kMaxUtf16CodeUnitU
constexpr base::uc32 kMaxCodePoint
base::Flags< RegExpFlag > RegExpFlags
static const base::uc32 kTrailSurrogateEnd
constexpr int kUInt32Size
static const base::uc32 kLeadSurrogateStart
static const base::uc32 kLeadSurrogateEnd
constexpr int kMaxUtf16CodeUnit
bool RangeContainsLatin1Equivalents(CharacterRange range)
const RegExpFlags previous_flags_
ZoneList< RegExpTree * > * terms_
RegExpCompiler * compiler_
static const int kMaxExpansionFactor
int saved_expansion_factor_
#define DCHECK_LE(v1, v2)
#define DCHECK_NOT_NULL(val)
#define DCHECK_IMPLIES(v1, v2)
#define DCHECK_NE(v1, v2)
#define DCHECK(condition)
#define DCHECK_LT(v1, v2)
#define DCHECK_EQ(v1, v2)
#define DCHECK_GT(v1, v2)
#define DISALLOW_IMPLICIT_CONSTRUCTORS(TypeName)
static const int kMaxWidth
static const int kMaxWidth
const wasm::FunctionBody & body_