19#include "unicode/uniset.h"
20#include "unicode/unistr.h"
21#include "unicode/usetiter.h"
22#include "unicode/utf16.h"
32enum class InClassEscapeState {
38enum class ClassSetOperandType {
40 kClassStringDisjunction,
42 kCharacterClassEscape,
47class RegExpTextBuilder {
49 using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;
51 RegExpTextBuilder(
Zone* zone, SmallRegExpTreeVector* terms_storage,
57 void AddAtom(RegExpTree* atom);
58 void AddTerm(RegExpTree* term);
59 void AddClassRanges(RegExpClassRanges*
cc);
60 void FlushPendingSurrogate();
62 RegExpTree* PopLastAtom();
63 RegExpTree* ToRegExp();
68 void AddLeadSurrogate(
base::uc16 lead_surrogate);
69 void AddTrailSurrogate(
base::uc16 trail_surrogate);
70 void FlushCharacters();
71 bool NeedsDesugaringForUnicode(RegExpClassRanges*
cc);
72 bool NeedsDesugaringForIgnoreCase(
base::uc32 c);
73 void AddClassRangesForDesugaring(
base::uc32 c);
// Convenience accessor: passes this builder's flags_ to IsIgnoreCase() and
// returns the result. Does not modify the builder (const).
74 bool ignore_case()
const {
return IsIgnoreCase(flags_); }
75 bool IsUnicodeMode()
const {
78 return IsUnicode(flags_) || IsUnicodeSets(flags_);
90void RegExpTextBuilder::AddLeadSurrogate(
base::uc16 lead_surrogate) {
92 FlushPendingSurrogate();
97void RegExpTextBuilder::AddTrailSurrogate(
base::uc16 trail_surrogate) {
103 base::uc32 combined =
105 if (NeedsDesugaringForIgnoreCase(combined)) {
106 AddClassRangesForDesugaring(combined);
108 ZoneList<base::uc16> surrogate_pair(2, zone());
109 surrogate_pair.Add(lead_surrogate, zone());
110 surrogate_pair.Add(trail_surrogate, zone());
112 zone()->New<RegExpAtom>(surrogate_pair.ToConstVector());
117 FlushPendingSurrogate();
121void RegExpTextBuilder::FlushPendingSurrogate() {
126 AddClassRangesForDesugaring(c);
130void RegExpTextBuilder::FlushCharacters() {
131 FlushPendingSurrogate();
133 RegExpTree* atom = zone()->New<RegExpAtom>(
characters_->ToConstVector());
135 text_.emplace_back(atom);
139void RegExpTextBuilder::FlushText() {
141 size_t num_text =
text_.size();
144 }
else if (num_text == 1) {
147 RegExpText* text = zone()->New<RegExpText>(zone());
148 for (
size_t i = 0;
i < num_text;
i++) {
149 text_[
i]->AppendToText(text, zone());
151 terms_->emplace_back(text);
156void RegExpTextBuilder::AddCharacter(
base::uc16 c) {
157 FlushPendingSurrogate();
159 characters_ = zone()->New<ZoneList<base::uc16>>(4, zone());
164void RegExpTextBuilder::AddUnicodeCharacter(
base::uc32 c) {
172 AddTrailSurrogate(c);
174 AddCharacter(
static_cast<base::uc16
>(c));
181 FlushPendingSurrogate();
182 AddUnicodeCharacter(character);
183 FlushPendingSurrogate();
186void RegExpTextBuilder::AddClassRanges(RegExpClassRanges* cr) {
187 if (NeedsDesugaringForUnicode(cr)) {
196void RegExpTextBuilder::AddClassRangesForDesugaring(
base::uc32 c) {
197 AddTerm(zone()->New<RegExpClassRanges>(
198 zone(), CharacterRange::List(zone(), CharacterRange::Singleton(c))));
201void RegExpTextBuilder::AddAtom(RegExpTree* atom) {
202 DCHECK(atom->IsTextElement());
204 text_.emplace_back(atom);
207void RegExpTextBuilder::AddTerm(RegExpTree* term) {
208 DCHECK(term->IsTextElement());
210 terms_->emplace_back(term);
213bool RegExpTextBuilder::NeedsDesugaringForUnicode(RegExpClassRanges*
cc) {
214 if (!IsUnicodeMode())
return false;
218 if (ignore_case())
return true;
219 ZoneList<CharacterRange>* ranges =
cc->ranges(zone());
220 CharacterRange::Canonicalize(ranges);
222 if (
cc->is_negated()) {
223 ZoneList<CharacterRange>* negated_ranges =
224 zone()->New<ZoneList<CharacterRange>>(ranges->length(), zone());
225 CharacterRange::Negate(ranges, negated_ranges, zone());
226 ranges = negated_ranges;
229 for (
int i = ranges->
length() - 1;
i >= 0;
i--) {
230 base::uc32 from = ranges->at(
i).from();
231 base::uc32 to = ranges->at(
i).to();
233 if (to >= kNonBmpStart)
return true;
243bool RegExpTextBuilder::NeedsDesugaringForIgnoreCase(
base::uc32 c) {
244#ifdef V8_INTL_SUPPORT
245 if (IsUnicodeMode() && ignore_case()) {
246 icu::UnicodeSet set(c, c);
247 set.closeOver(USET_CASE_INSENSITIVE);
248 set.removeAllStrings();
249 return set.size() > 1;
257RegExpTree* RegExpTextBuilder::PopLastAtom() {
258 FlushPendingSurrogate();
261 base::Vector<const base::uc16> char_vector =
characters_->ToConstVector();
262 int num_chars = char_vector.length();
264 base::Vector<const base::uc16> prefix =
265 char_vector.SubVector(0, num_chars - 1);
266 text_.emplace_back(zone()->New<RegExpAtom>(prefix));
267 char_vector = char_vector.SubVector(num_chars - 1, num_chars);
270 atom = zone()->New<RegExpAtom>(char_vector);
272 }
else if (!
text_.empty()) {
280RegExpTree* RegExpTextBuilder::ToRegExp() {
282 size_t num_alternatives =
terms_->size();
283 if (num_alternatives == 0)
return zone()->New<RegExpEmpty>();
284 if (num_alternatives == 1)
return terms_->back();
285 return zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
286 base::VectorOf(
terms_->begin(),
terms_->size()), zone()));
292 RegExpBuilder(
Zone* zone, RegExpFlags flags)
298 void AddCharacter(base::uc16 character);
299 void AddUnicodeCharacter(base::uc32 character);
300 void AddEscapedUnicodeCharacter(base::uc32 character);
304 void AddClassRanges(RegExpClassRanges* cc);
305 void AddAtom(RegExpTree* tree);
306 void AddTerm(RegExpTree* tree);
307 void AddAssertion(RegExpTree* tree);
308 void NewAlternative();
309 bool AddQuantifierToAtom(
int min,
int max,
int index,
310 RegExpQuantifier::QuantifierType type);
312 RegExpTree* ToRegExp();
// Flag accessors. Each one passes flags_ to the matching predicate and
// returns that predicate's result; none of them modifies the builder.
// Result of IsIgnoreCase(flags_).
315 bool ignore_case()
const {
return IsIgnoreCase(
flags_); }
// Result of IsMultiline(flags_).
316 bool multiline()
const {
return IsMultiline(
flags_); }
// Result of IsDotAll(flags_).
317 bool dotall()
const {
return IsDotAll(
flags_); }
321 bool IsUnicodeMode()
const {
333 using SmallRegExpTreeVector = SmallZoneVector<RegExpTree*, 8>;
334 SmallRegExpTreeVector
terms_;
339enum SubexpressionType {
347class RegExpParserState :
public ZoneObject {
350 RegExpParserState(RegExpParserState* previous_state,
351 SubexpressionType group_type,
352 RegExpLookaround::Type lookaround_type,
353 int disjunction_capture_index,
354 const ZoneVector<base::uc16>* capture_name,
355 RegExpFlags flags,
Zone* zone)
362 if (previous_state !=
nullptr) {
364 previous_state->non_participating_capture_group_interval();
// Returns a pointer to the builder_ member stored in this state object.
371 RegExpBuilder* builder() {
return &
builder_; }
// Returns the stored group_type_ value.
373 SubexpressionType group_type()
const {
return group_type_; }
// Returns the stored capture_name_ pointer. It may be nullptr, which
// IsNamedCapture() treats as "this state has no capture name".
382 const ZoneVector<base::uc16>* capture_name()
const {
return capture_name_; }
383 std::pair<int, int> non_participating_capture_group_interval()
const {
// True exactly when capture_name_ is non-null.
387 bool IsNamedCapture()
const {
return capture_name_ !=
nullptr; }
390 bool IsInsideCaptureGroup(
int index)
const {
391 for (
const RegExpParserState* s =
this; s !=
nullptr;
392 s = s->previous_state()) {
393 if (s->group_type() != CAPTURE)
continue;
395 if (index == s->capture_index())
return true;
397 if (index > s->capture_index())
return false;
403 bool IsInsideCaptureGroup(
const ZoneVector<base::uc16>* name)
const {
405 for (
const RegExpParserState* s =
this; s !=
nullptr;
406 s = s->previous_state()) {
407 if (s->capture_name() ==
nullptr)
continue;
408 if (*s->capture_name() == *name)
return true;
413 void NewAlternative(
int captures_started) {
414 if (non_participating_capture_group_interval().
second != 0) {
421 std::make_pair(capture_index(), captures_started);
445template <
class CharT>
446class RegExpParserImpl final {
448 RegExpParserImpl(
const CharT* input,
int input_length,
RegExpFlags flags,
449 uintptr_t stack_limit,
Zone* zone,
452 bool Parse(RegExpCompileData*
result);
454 RegExpTree* ParsePattern();
455 RegExpTree* ParseDisjunction();
456 RegExpTree* ParseGroup();
460 bool ParseIntervalQuantifier(
int* min_out,
int* max_out);
464 bool ParseHexEscape(
int length, base::uc32* value);
465 bool ParseUnicodeEscape(base::uc32* value);
466 bool ParseUnlimitedLengthHexNumber(
int max_value, base::uc32* value);
468 bool ParsePropertyClassName(ZoneVector<char>* name_1,
469 ZoneVector<char>* name_2);
470 bool AddPropertyClassRange(ZoneList<CharacterRange>* add_to_range,
472 const ZoneVector<char>& name_1,
473 const ZoneVector<char>& name_2);
475 RegExpTree* ParseClassRanges(ZoneList<CharacterRange>* ranges,
476 bool add_unicode_case_equivalents);
479 void ParseClassEscape(ZoneList<CharacterRange>* ranges,
Zone* zone,
480 bool add_unicode_case_equivalents, base::uc32* char_out,
481 bool* is_class_escape);
483 bool TryParseCharacterClassEscape(base::uc32 next,
484 InClassEscapeState in_class_escape_state,
485 ZoneList<CharacterRange>* ranges,
487 bool add_unicode_case_equivalents);
488 RegExpTree* ParseClassStringDisjunction(ZoneList<CharacterRange>* ranges,
490 RegExpTree* ParseClassSetOperand(
const RegExpBuilder* builder,
491 ClassSetOperandType* type_out);
492 RegExpTree* ParseClassSetOperand(
const RegExpBuilder* builder,
493 ClassSetOperandType* type_out,
494 ZoneList<CharacterRange>* ranges,
497 base::uc32 ParseClassSetCharacter();
499 base::uc32 ParseCharacterEscape(InClassEscapeState in_class_escape_state,
500 bool* is_escaped_unicode_character);
502 void AddMaybeSimpleCaseFoldedRange(ZoneList<CharacterRange>* ranges,
503 CharacterRange new_range);
505 RegExpTree* ParseClassUnion(
const RegExpBuilder* builder,
bool is_negated,
506 RegExpTree* first_operand,
507 ClassSetOperandType first_operand_type,
508 ZoneList<CharacterRange>* ranges,
510 base::uc32 first_character);
511 RegExpTree* ParseClassIntersection(
const RegExpBuilder* builder,
512 bool is_negated, RegExpTree* first_operand,
513 ClassSetOperandType first_operand_type);
514 RegExpTree* ParseClassSubtraction(
const RegExpBuilder* builder,
515 bool is_negated, RegExpTree* first_operand,
516 ClassSetOperandType first_operand_type);
517 RegExpTree* ParseCharacterClass(
const RegExpBuilder* state);
519 base::uc32 ParseOctalLiteral();
525 bool ParseBackReferenceIndex(
int* index_out);
529 void Advance(
int dist);
530 void RewindByOneCodepoint();
// Returns the current value of the simple_ member.
// NOTE(review): where simple_ is set is not visible in this excerpt.
535 bool simple()
const {
return simple_; }
540 const bool current_is_surrogate =
543 const int rewind_bytes = current_is_surrogate ? 2 : 1;
// Returns the current value of the failed_ member. The CHECK_FAILED macro
// and ParseDisjunction() in this file test the same state to bail out with
// nullptr.
546 bool failed()
const {
return failed_; }
548 bool IsUnicodeMode()
const {
// Result of IsUnicodeSets() applied to the parser's current flags().
553 bool unicode_sets()
const {
return IsUnicodeSets(
flags()); }
// Result of IsIgnoreCase() applied to the parser's current flags().
554 bool ignore_case()
const {
return IsIgnoreCase(
flags()); }
556 static bool IsSyntaxCharacterOrSlash(base::uc32 c);
557 static bool IsClassSetSyntaxCharacter(base::uc32 c);
558 static bool IsClassSetReservedPunctuator(base::uc32 c);
559 bool IsClassSetReservedDoublePunctuator(base::uc32 c);
565 RegExpCapture* GetCapture(
int index);
570 bool CreateNamedCaptureAtIndex(
const RegExpParserState* state,
int index);
574 const ZoneVector<base::uc16>* ParseCaptureGroupName();
576 bool ParseNamedBackReference(RegExpBuilder* builder,
577 RegExpParserState* state);
578 RegExpParserState* ParseOpenParenthesis(RegExpParserState* state);
583 void PatchNamedBackReferences();
585 ZoneVector<RegExpCapture*>* GetNamedCaptures();
589 bool HasNamedCaptures(InClassEscapeState in_class_escape_state);
// Returns the current value of the has_more_ member.
// NOTE(review): presumably cleared once the input is exhausted; the code
// that updates it is not visible in this excerpt.
594 bool has_more()
const {
return has_more_; }
// True while next_pos_ is still strictly less than input_length().
595 bool has_next()
const {
return next_pos_ < input_length(); }
597 template <
bool update_position>
598 base::uc32 ReadNext();
599 CharT InputAt(
int index)
const {
600 DCHECK(0 <= index && index < input_length());
604 void ScanForCaptures(InClassEscapeState in_class_escape_state);
606 struct RegExpCaptureNameLess {
607 bool operator()(
const RegExpCapture* lhs,
const RegExpCapture* rhs)
const {
610 return *lhs->name() < *rhs->name();
614 class ForceUnicodeScope final {
616 explicit ForceUnicodeScope(RegExpParserImpl<CharT>* parser)
619 parser_->force_unicode_ =
true;
621 ~ForceUnicodeScope() {
623 parser_->force_unicode_ =
false;
636 ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>*
660template <
class CharT>
661RegExpParserImpl<CharT>::RegExpParserImpl(
662 const CharT* input,
int input_length,
RegExpFlags flags,
688template <
bool update_position>
689inline base::uc32 RegExpParserImpl<uint8_t>::ReadNext() {
699template <
bool update_position>
700inline base::uc32 RegExpParserImpl<base::uc16>::ReadNext() {
706 if (IsUnicodeMode() &&
position < input_length() &&
718template <
class CharT>
721 return ReadNext<false>();
727template <
class CharT>
728void RegExpParserImpl<CharT>::Advance() {
731 if (
v8_flags.correctness_fuzzer_suppressions) {
732 FATAL(
"Aborting on stack overflow");
747template <
class CharT>
748void RegExpParserImpl<CharT>::RewindByOneCodepoint() {
749 if (!has_more())
return;
753 const int rewind_by =
758template <
class CharT>
759void RegExpParserImpl<CharT>::Reset(
int pos) {
765template <
class CharT>
766void RegExpParserImpl<CharT>::Advance(
int dist) {
772template <
class CharT>
773bool RegExpParserImpl<CharT>::IsSyntaxCharacterOrSlash(
base::uc32 c) {
798template <
class CharT>
799bool RegExpParserImpl<CharT>::IsClassSetSyntaxCharacter(
base::uc32 c) {
819template <
class CharT>
820bool RegExpParserImpl<CharT>::IsClassSetReservedPunctuator(
base::uc32 c) {
843template <
class CharT>
844bool RegExpParserImpl<CharT>::IsClassSetReservedDoublePunctuator(
base::uc32 c) {
845#define DOUBLE_PUNCTUATOR_CASE(Char) \
847 return Next() == Char
872#undef DOUBLE_PUNCTUATOR_CASE
877template <
class CharT>
878RegExpTree* RegExpParserImpl<CharT>::ReportError(
RegExpError error) {
890#define CHECK_FAILED ); \
891 if (failed_) return nullptr; \
896template <
class CharT>
897RegExpTree* RegExpParserImpl<CharT>::ParsePattern() {
903 if (
result->IsAtom() &&
result->AsAtom()->length() == input_length()) {
919template <
class CharT>
920RegExpTree* RegExpParserImpl<CharT>::ParseDisjunction() {
923 0,
nullptr,
flags(), zone());
924 RegExpParserState* state = &initial_state;
926 RegExpBuilder* builder = initial_state.builder();
930 if (failed())
return nullptr;
931 if (state->IsSubexpression()) {
933 return ReportError(RegExpError::kUnterminatedGroup);
937 return builder->ToRegExp();
939 if (!state->IsSubexpression()) {
947 RegExpTree* body = builder->ToRegExp();
949 int end_capture_index = captures_started();
951 int capture_index = state->capture_index();
952 SubexpressionType group_type = state->group_type();
955 if (group_type == CAPTURE) {
956 if (state->IsNamedCapture()) {
957 CreateNamedCaptureAtIndex(state, capture_index
CHECK_FAILED);
959 RegExpCapture* capture = GetCapture(capture_index);
960 capture->set_body(body);
962 }
else if (group_type == GROUPING) {
963 body = zone()->template New<RegExpGroup>(body, builder->flags());
965 DCHECK(group_type == POSITIVE_LOOKAROUND ||
966 group_type == NEGATIVE_LOOKAROUND);
967 bool is_positive = (group_type == POSITIVE_LOOKAROUND);
968 body = zone()->template New<RegExpLookaround>(
969 body, is_positive, end_capture_index - capture_index,
975 state = state->previous_state();
976 builder = state->builder();
978 builder->AddAtom(body);
985 state->NewAlternative(captures_started());
986 builder->NewAlternative();
995 builder->AddAssertion(zone()->
template New<RegExpAssertion>(
998 set_contains_anchor();
1006 builder->AddAssertion(
1007 zone()->
template New<RegExpAssertion>(assertion_type));
1012 ZoneList<CharacterRange>* ranges =
1013 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1015 if (builder->dotall()) {
1018 ranges,
false, zone());
1025 RegExpClassRanges*
cc =
1026 zone()->template New<RegExpClassRanges>(zone(), ranges);
1027 builder->AddClassRanges(
cc);
1032 builder = state->builder();
1033 flags_ = builder->flags();
1038 if (
cc->IsClassRanges()) {
1039 builder->AddClassRanges(
cc->AsClassRanges());
1041 DCHECK(
cc->IsClassSetExpression());
1042 builder->AddTerm(
cc);
1051 return ReportError(RegExpError::kEscapeAtEndOfPattern);
1075 const bool is_backref =
1078 if (state->IsInsideCaptureGroup(index)) {
1084 builder->AddEmpty();
1086 RegExpCapture* capture = GetCapture(index);
1088 zone()->template New<RegExpBackReference>(capture, zone());
1089 builder->AddAtom(atom);
1095 if (IsUnicodeMode()) {
1099 if (first_digit ==
'8' || first_digit ==
'9') {
1100 builder->AddCharacter(first_digit);
1108 if (IsUnicodeMode() && Next() >=
'0' && Next() <=
'9') {
1110 return ReportError(RegExpError::kInvalidDecimalEscape);
1113 builder->AddCharacter(octal);
1118 builder->AddAssertion(zone()->
template New<RegExpAssertion>(
1123 builder->AddAssertion(zone()->
template New<RegExpAssertion>(
1135 ZoneList<CharacterRange>* ranges =
1136 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1137 bool add_unicode_case_equivalents =
1138 IsUnicodeMode() && ignore_case();
1139 bool parsed_character_class_escape = TryParseCharacterClassEscape(
1140 next, InClassEscapeState::kNotInClass, ranges,
nullptr, zone(),
1143 if (parsed_character_class_escape) {
1144 RegExpClassRanges*
cc =
1145 zone()->template New<RegExpClassRanges>(zone(), ranges);
1146 builder->AddClassRanges(
cc);
1148 CHECK(!IsUnicodeMode());
1150 builder->AddCharacter(next);
1157 ZoneList<CharacterRange>* ranges =
1158 zone()->template New<ZoneList<CharacterRange>>(2, zone());
1160 if (unicode_sets()) {
1161 strings = zone()->template New<CharacterClassStrings>(zone());
1163 bool add_unicode_case_equivalents = ignore_case();
1164 bool parsed_character_class_escape = TryParseCharacterClassEscape(
1165 next, InClassEscapeState::kNotInClass, ranges, strings, zone(),
1168 if (parsed_character_class_escape) {
1169 if (unicode_sets()) {
1170 RegExpClassSetOperand* op =
1171 zone()->template New<RegExpClassSetOperand>(ranges,
1173 builder->AddTerm(op);
1175 RegExpClassRanges*
cc =
1176 zone()->template New<RegExpClassRanges>(zone(), ranges);
1177 builder->AddClassRanges(
cc);
1180 CHECK(!IsUnicodeMode());
1182 builder->AddCharacter(next);
1194 const bool has_named_captures =
1195 HasNamedCaptures(InClassEscapeState::kNotInClass
CHECK_FAILED);
1196 if (IsUnicodeMode() || has_named_captures) {
1206 bool is_escaped_unicode_character =
false;
1208 InClassEscapeState::kNotInClass,
1210 if (is_escaped_unicode_character) {
1211 builder->AddEscapedUnicodeCharacter(c);
1213 builder->AddCharacter(c);
1221 bool parsed = ParseIntervalQuantifier(&dummy, &dummy
CHECK_FAILED);
1222 if (parsed)
return ReportError(RegExpError::kNothingToRepeat);
1227 if (IsUnicodeMode()) {
1228 return ReportError(RegExpError::kLoneQuantifierBrackets);
1232 builder->AddUnicodeCharacter(
current());
1261 if (ParseIntervalQuantifier(&min, &max)) {
1263 return ReportError(RegExpError::kRangeOutOfOrder);
1266 }
else if (IsUnicodeMode()) {
1268 return ReportError(RegExpError::kIncompleteQuantifier);
1278 }
else if (
v8_flags.regexp_possessive_quantifier && current() ==
'+') {
1285 return ReportError(RegExpError::kInvalidQuantifier);
1291template <
class CharT>
1292RegExpParserState* RegExpParserImpl<CharT>::ParseOpenParenthesis(
1293 RegExpParserState* state) {
1295 bool is_named_capture =
false;
1296 const ZoneVector<base::uc16>* capture_name =
nullptr;
1297 SubexpressionType subexpr_type = CAPTURE;
1299 bool parsing_modifiers =
false;
1300 bool modifiers_polarity =
true;
1308 if (!
v8_flags.js_regexp_modifiers) {
1313 parsing_modifiers =
true;
1314 if (modifiers_polarity ==
false) {
1318 modifiers_polarity =
false;
1323 if (!
v8_flags.js_regexp_modifiers) {
1328 parsing_modifiers =
true;
1330 if ((modifiers & flag) != 0) {
1335 flags.set(flag, modifiers_polarity);
1340 parsing_modifiers =
false;
1341 subexpr_type = GROUPING;
1345 if (parsing_modifiers) {
1351 subexpr_type = POSITIVE_LOOKAROUND;
1355 if (parsing_modifiers) {
1361 subexpr_type = NEGATIVE_LOOKAROUND;
1365 if (parsing_modifiers) {
1370 if (Next() ==
'=') {
1373 subexpr_type = POSITIVE_LOOKAROUND;
1375 }
else if (Next() ==
'!') {
1378 subexpr_type = NEGATIVE_LOOKAROUND;
1381 is_named_capture =
true;
1389 }
while (parsing_modifiers);
1391 if (modifiers_polarity ==
false) {
1393 if (modifiers == 0) {
1398 if (subexpr_type == CAPTURE) {
1405 if (is_named_capture) {
1410 return zone()->template New<RegExpParserState>(
1425template <
class CharT>
1426void RegExpParserImpl<CharT>::ScanForCaptures(
1427 InClassEscapeState in_class_escape_state) {
1429 const int saved_position =
position();
1431 int capture_count = captures_started();
1433 if (in_class_escape_state == InClassEscapeState::kInClass) {
1436 DCHECK(!IsUnicodeMode());
1443 if (c ==
']')
break;
1456 int class_nest_level = 0;
1462 }
else if (c ==
'[') {
1465 if (unicode_sets()) class_nest_level++;
1466 }
else if (c ==
']') {
1467 if (class_nest_level == 0)
break;
1499 Reset(saved_position);
1502template <
class CharT>
1503bool RegExpParserImpl<CharT>::ParseBackReferenceIndex(
int* index_out) {
1505 DCHECK(
'1' <= Next() && Next() <=
'9');
1509 int value = Next() -
'0';
1514 value = 10 * value + (c -
'0');
1524 if (value > captures_started()) {
1526 ScanForCaptures(InClassEscapeState::kNotInClass);
1539void push_code_unit(ZoneVector<base::uc16>* v, uint32_t code_unit) {
1541 v->push_back(code_unit);
1550template <
class CharT>
1551const ZoneVector<base::uc16>* RegExpParserImpl<CharT>::ParseCaptureGroupName() {
1560 RewindByOneCodepoint();
1562 ZoneVector<base::uc16>* name =
1563 zone()->template New<ZoneVector<base::uc16>>(zone());
1572 ForceUnicodeScope force_unicode(
this);
1574 bool at_start =
true;
1580 if (c ==
'\\' && Next() ==
'u') {
1582 if (!ParseUnicodeEscape(&c)) {
1586 RewindByOneCodepoint();
1591 ReportError(RegExpError::kInvalidCaptureGroupName);
1597 ReportError(RegExpError::kInvalidCaptureGroupName);
1600 push_code_unit(name, c);
1606 push_code_unit(name, c);
1608 ReportError(RegExpError::kInvalidCaptureGroupName);
1622template <
class CharT>
1623bool RegExpParserImpl<CharT>::CreateNamedCaptureAtIndex(
1624 const RegExpParserState* state,
int index) {
1625 const ZoneVector<base::uc16>* name = state->capture_name();
1626 const std::pair<int, int> non_participating_capture_group_interval =
1627 state->non_participating_capture_group_interval();
1631 RegExpCapture* capture = GetCapture(index);
1634 capture->set_name(name);
1638 ZoneMap<RegExpCapture*, ZoneList<int>*, RegExpCaptureNameLess>>(zone());
1643 if (
v8_flags.js_regexp_duplicate_named_groups) {
1644 ZoneList<int>* named_capture_indices = named_capture_it->second;
1646 DCHECK(!named_capture_indices->is_empty());
1647 for (
int named_index : *named_capture_indices) {
1648 if (named_index < non_participating_capture_group_interval.first ||
1649 named_index > non_participating_capture_group_interval.second) {
1650 ReportError(RegExpError::kDuplicateCaptureGroupName);
1655 ReportError(RegExpError::kDuplicateCaptureGroupName);
1662 capture, zone()->
template New<ZoneList<int>>(1, zone()));
1663 entry.first->second->Add(index, zone());
1667template <
class CharT>
1668bool RegExpParserImpl<CharT>::ParseNamedBackReference(
1669 RegExpBuilder* builder, RegExpParserState* state) {
1677 const ZoneVector<base::uc16>* name = ParseCaptureGroupName();
1678 if (name ==
nullptr) {
1682 if (state->IsInsideCaptureGroup(name)) {
1683 builder->AddEmpty();
1685 RegExpBackReference* atom =
1686 zone()->template New<RegExpBackReference>(zone());
1687 atom->set_name(name);
1689 builder->AddAtom(atom);
1693 zone()->template New<ZoneList<RegExpBackReference*>>(1, zone());
1701template <
class CharT>
1702void RegExpParserImpl<CharT>::PatchNamedBackReferences() {
1706 ReportError(RegExpError::kInvalidNamedCaptureReference);
1718 RegExpCapture* search_capture =
1719 zone()->template New<RegExpCapture>(kInvalidIndex);
1721 search_capture->set_name(ref->name());
1725 ReportError(RegExpError::kInvalidNamedCaptureReference);
1730 capture_it->second->length() == 1);
1731 for (
int index : *capture_it->second) {
1732 ref->add_capture(GetCapture(index), zone());
1737template <
class CharT>
1738RegExpCapture* RegExpParserImpl<CharT>::GetCapture(
int index) {
1741 const int known_captures =
1743 SBXCHECK(index >= 1 && index <= known_captures);
1746 zone()->template New<ZoneList<RegExpCapture*>>(known_captures, zone());
1748 while (
captures_->length() < known_captures) {
1755template <
class CharT>
1756ZoneVector<RegExpCapture*>* RegExpParserImpl<CharT>::GetNamedCaptures() {
1762 ZoneVector<RegExpCapture*>* flattened_named_captures =
1763 zone()->template New<ZoneVector<RegExpCapture*>>(zone());
1766 capture.second->length() == 1);
1767 for (
int index : *capture.second) {
1768 flattened_named_captures->push_back(GetCapture(index));
1771 return flattened_named_captures;
1774template <
class CharT>
1775bool RegExpParserImpl<CharT>::HasNamedCaptures(
1776 InClassEscapeState in_class_escape_state) {
1781 ScanForCaptures(in_class_escape_state);
1793template <
class CharT>
1794bool RegExpParserImpl<CharT>::ParseIntervalQuantifier(
int* min_out,
1806 if (min > (RegExpTree::kInfinity - next) / 10) {
1811 min = RegExpTree::kInfinity;
1814 min = 10 * min + next;
1821 }
else if (
current() ==
',') {
1824 max = RegExpTree::kInfinity;
1829 if (max > (RegExpTree::kInfinity - next) / 10) {
1833 max = RegExpTree::kInfinity;
1836 max = 10 * max + next;
1854template <
class CharT>
1855base::uc32 RegExpParserImpl<CharT>::ParseOctalLiteral() {
1860 base::uc32 value =
current() -
'0';
1863 value = value * 8 +
current() -
'0';
1866 value = value * 8 +
current() -
'0';
1873template <
class CharT>
1874bool RegExpParserImpl<CharT>::ParseHexEscape(
int length, base::uc32* value) {
1879 int d = base::HexValue(c);
1892template <
class CharT>
1893bool RegExpParserImpl<CharT>::ParseUnicodeEscape(base::uc32* value) {
1897 if (
current() ==
'{' && IsUnicodeMode()) {
1900 if (ParseUnlimitedLengthHexNumber(0x10FFFF, value)) {
1910 bool result = ParseHexEscape(4, value);
1915 if (Next() ==
'u') {
1918 if (ParseHexEscape(4, &trail) &&
1921 static_cast<base::uc16
>(*value),
static_cast<base::uc16
>(trail));
1930#ifdef V8_INTL_SUPPORT
1934bool IsExactPropertyAlias(
const char* property_name, UProperty property) {
1935 const char* short_name = u_getPropertyName(property, U_SHORT_PROPERTY_NAME);
1936 if (short_name !=
nullptr && strcmp(property_name, short_name) == 0)
1938 for (
int i = 0;;
i++) {
1939 const char* long_name = u_getPropertyName(
1940 property,
static_cast<UPropertyNameChoice
>(U_LONG_PROPERTY_NAME +
i));
1941 if (long_name ==
nullptr)
break;
1942 if (strcmp(property_name, long_name) == 0)
return true;
1947bool IsExactPropertyValueAlias(
const char* property_value_name,
1948 UProperty property, int32_t property_value) {
1949 const char* short_name =
1950 u_getPropertyValueName(property, property_value, U_SHORT_PROPERTY_NAME);
1951 if (short_name !=
nullptr && strcmp(property_value_name, short_name) == 0) {
1954 for (
int i = 0;;
i++) {
1955 const char* long_name = u_getPropertyValueName(
1956 property, property_value,
1957 static_cast<UPropertyNameChoice
>(U_LONG_PROPERTY_NAME +
i));
1958 if (long_name ==
nullptr)
break;
1959 if (strcmp(property_value_name, long_name) == 0)
return true;
1964void ExtractStringsFromUnicodeSet(
const icu::UnicodeSet& set,
1965 CharacterClassStrings* strings,
1966 RegExpFlags flags,
Zone* zone) {
1967 DCHECK(set.hasStrings());
1968 DCHECK(IsUnicodeSets(flags));
1971 RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone);
1972 RegExpTextBuilder string_builder(zone, &string_storage, flags);
1973 const bool needs_case_folding = IsIgnoreCase(flags);
1974 icu::UnicodeSetIterator iter(set);
1975 iter.skipToStrings();
1976 while (iter.next()) {
1977 const icu::UnicodeString& s = iter.getString();
1978 const char16_t* p = s.getBuffer();
1980 ZoneList<base::uc32>*
string =
1981 zone->template New<ZoneList<base::uc32>>(
length, zone);
1984 U16_NEXT(p,
i, length, c);
1985 string_builder.AddUnicodeCharacter(c);
1986 if (needs_case_folding) {
1987 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
1989 string->Add(c, zone);
1991 strings->emplace(string->ToVector(), string_builder.ToRegExp());
1992 string_storage.clear();
1996bool LookupPropertyValueName(UProperty property,
1997 const char* property_value_name,
bool negate,
1998 ZoneList<CharacterRange>* result_ranges,
1999 CharacterClassStrings* result_strings,
2000 RegExpFlags flags,
Zone* zone) {
2001 UProperty property_for_lookup =
property;
2002 if (property_for_lookup == UCHAR_SCRIPT_EXTENSIONS) {
2005 property_for_lookup = UCHAR_SCRIPT;
2008 u_getPropertyValueEnum(property_for_lookup, property_value_name);
2009 if (property_value == UCHAR_INVALID_CODE)
return false;
2013 if (!IsExactPropertyValueAlias(property_value_name, property_for_lookup,
2018 UErrorCode ec = U_ZERO_ERROR;
2019 icu::UnicodeSet set;
2020 set.applyIntPropertyValue(property, property_value, ec);
2021 bool success = ec == U_ZERO_ERROR && !set.isEmpty();
2024 if (set.hasStrings()) {
2025 ExtractStringsFromUnicodeSet(set, result_strings, flags, zone);
2027 const bool needs_case_folding = IsUnicodeSets(flags) && IsIgnoreCase(flags);
2028 if (needs_case_folding) set.closeOver(USET_SIMPLE_CASE_INSENSITIVE);
2029 set.removeAllStrings();
2030 if (negate) set.complement();
2031 for (
int i = 0;
i < set.getRangeCount();
i++) {
2033 CharacterRange::Range(set.getRangeStart(
i), set.getRangeEnd(
i)),
2041inline bool NameEquals(
const char* name,
const char (&
literal)[N]) {
2042 return strncmp(name,
literal, N + 1) == 0;
2045bool LookupSpecialPropertyValueName(
const char* name,
2046 ZoneList<CharacterRange>*
result,
2047 bool negate, RegExpFlags flags,
2049 if (NameEquals(name,
"Any")) {
2054 result->Add(CharacterRange::Everything(), zone);
2056 }
else if (NameEquals(name,
"ASCII")) {
2057 result->Add(negate ? CharacterRange::Range(0x80, String::kMaxCodePoint)
2058 : CharacterRange::Range(0x0, 0x7F),
2060 }
else if (NameEquals(name,
"Assigned")) {
2061 return LookupPropertyValueName(UCHAR_GENERAL_CATEGORY,
"Unassigned",
2062 !negate,
result,
nullptr, flags, zone);
2071bool IsSupportedBinaryProperty(UProperty property,
bool unicode_sets) {
2073 case UCHAR_ALPHABETIC:
2076 case UCHAR_ASCII_HEX_DIGIT:
2078 case UCHAR_BIDI_CONTROL:
2079 case UCHAR_BIDI_MIRRORED:
2080 case UCHAR_CASE_IGNORABLE:
2082 case UCHAR_CHANGES_WHEN_CASEFOLDED:
2083 case UCHAR_CHANGES_WHEN_CASEMAPPED:
2084 case UCHAR_CHANGES_WHEN_LOWERCASED:
2085 case UCHAR_CHANGES_WHEN_NFKC_CASEFOLDED:
2086 case UCHAR_CHANGES_WHEN_TITLECASED:
2087 case UCHAR_CHANGES_WHEN_UPPERCASED:
2089 case UCHAR_DEFAULT_IGNORABLE_CODE_POINT:
2090 case UCHAR_DEPRECATED:
2091 case UCHAR_DIACRITIC:
2093 case UCHAR_EMOJI_COMPONENT:
2094 case UCHAR_EMOJI_MODIFIER_BASE:
2095 case UCHAR_EMOJI_MODIFIER:
2096 case UCHAR_EMOJI_PRESENTATION:
2097 case UCHAR_EXTENDED_PICTOGRAPHIC:
2098 case UCHAR_EXTENDER:
2099 case UCHAR_GRAPHEME_BASE:
2100 case UCHAR_GRAPHEME_EXTEND:
2101 case UCHAR_HEX_DIGIT:
2102 case UCHAR_ID_CONTINUE:
2103 case UCHAR_ID_START:
2104 case UCHAR_IDEOGRAPHIC:
2105 case UCHAR_IDS_BINARY_OPERATOR:
2106 case UCHAR_IDS_TRINARY_OPERATOR:
2107 case UCHAR_JOIN_CONTROL:
2108 case UCHAR_LOGICAL_ORDER_EXCEPTION:
2109 case UCHAR_LOWERCASE:
2111 case UCHAR_NONCHARACTER_CODE_POINT:
2112 case UCHAR_PATTERN_SYNTAX:
2113 case UCHAR_PATTERN_WHITE_SPACE:
2114 case UCHAR_QUOTATION_MARK:
2116 case UCHAR_REGIONAL_INDICATOR:
2118 case UCHAR_SOFT_DOTTED:
2119 case UCHAR_TERMINAL_PUNCTUATION:
2120 case UCHAR_UNIFIED_IDEOGRAPH:
2121 case UCHAR_UPPERCASE:
2122 case UCHAR_VARIATION_SELECTOR:
2123 case UCHAR_WHITE_SPACE:
2124 case UCHAR_XID_CONTINUE:
2125 case UCHAR_XID_START:
2127 case UCHAR_BASIC_EMOJI:
2128 case UCHAR_EMOJI_KEYCAP_SEQUENCE:
2129 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
2130 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
2131 case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
2132 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
2133 case UCHAR_RGI_EMOJI:
2134 return unicode_sets;
2141bool IsBinaryPropertyOfStrings(UProperty property) {
2143 case UCHAR_BASIC_EMOJI:
2144 case UCHAR_EMOJI_KEYCAP_SEQUENCE:
2145 case UCHAR_RGI_EMOJI_MODIFIER_SEQUENCE:
2146 case UCHAR_RGI_EMOJI_FLAG_SEQUENCE:
2147 case UCHAR_RGI_EMOJI_TAG_SEQUENCE:
2148 case UCHAR_RGI_EMOJI_ZWJ_SEQUENCE:
2149 case UCHAR_RGI_EMOJI:
2157bool IsUnicodePropertyValueCharacter(
char c) {
2164 if (
'a' <= c && c <=
'z')
return true;
2165 if (
'A' <= c && c <=
'Z')
return true;
2166 if (
'0' <= c && c <=
'9')
return true;
2172template <
class CharT>
2173bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
2174 ZoneVector<char>* name_2) {
2187 for (Advance();
current() !=
'}' &&
current() !=
'='; Advance()) {
2188 if (!IsUnicodePropertyValueCharacter(
current()))
return false;
2189 if (!has_next())
return false;
2190 name_1->push_back(
static_cast<char>(
current()));
2193 for (Advance();
current() !=
'}'; Advance()) {
2194 if (!IsUnicodePropertyValueCharacter(
current()))
return false;
2195 if (!has_next())
return false;
2196 name_2->push_back(
static_cast<char>(
current()));
2198 name_2->push_back(0);
2204 name_1->push_back(0);
2206 DCHECK(name_1->size() - 1 == std::strlen(name_1->data()));
2207 DCHECK(name_2->empty() || name_2->size() - 1 == std::strlen(name_2->data()));
2211template <
class CharT>
2212bool RegExpParserImpl<CharT>::AddPropertyClassRange(
2213 ZoneList<CharacterRange>* add_to_ranges,
2214 CharacterClassStrings* add_to_strings,
bool negate,
2215 const ZoneVector<char>& name_1,
const ZoneVector<char>& name_2) {
2216 if (name_2.empty()) {
2218 const char* name = name_1.data();
2219 if (LookupPropertyValueName(UCHAR_GENERAL_CATEGORY_MASK, name, negate,
2220 add_to_ranges, add_to_strings,
flags(),
2225 if (LookupSpecialPropertyValueName(name, add_to_ranges, negate,
flags(),
2230 UProperty
property = u_getPropertyEnum(name);
2231 if (!IsSupportedBinaryProperty(property, unicode_sets()))
return false;
2232 if (!IsExactPropertyAlias(name, property))
return false;
2236 if (negate && IsBinaryPropertyOfStrings(property))
return false;
2237 if (unicode_sets()) {
2243 return LookupPropertyValueName(property,
"Y", negate, add_to_ranges,
2244 add_to_strings,
flags(), zone());
2246 return LookupPropertyValueName(property, negate ?
"N" :
"Y",
false,
2247 add_to_ranges, add_to_strings,
flags(),
2253 const char* property_name = name_1.data();
2254 const char* value_name = name_2.data();
2255 UProperty
property = u_getPropertyEnum(property_name);
2256 if (!IsExactPropertyAlias(property_name, property))
return false;
2257 if (property == UCHAR_GENERAL_CATEGORY) {
2259 property = UCHAR_GENERAL_CATEGORY_MASK;
2260 }
else if (property != UCHAR_SCRIPT &&
2261 property != UCHAR_SCRIPT_EXTENSIONS) {
2264 return LookupPropertyValueName(property, value_name, negate, add_to_ranges,
2265 add_to_strings,
flags(), zone());
2271template <
class CharT>
2272bool RegExpParserImpl<CharT>::ParsePropertyClassName(ZoneVector<char>* name_1,
2273 ZoneVector<char>* name_2) {
2277template <
class CharT>
2278bool RegExpParserImpl<CharT>::AddPropertyClassRange(
2279 ZoneList<CharacterRange>* add_to_ranges,
2280 CharacterClassStrings* add_to_strings,
bool negate,
2281 const ZoneVector<char>& name_1,
const ZoneVector<char>& name_2) {
2287template <
class CharT>
2288bool RegExpParserImpl<CharT>::ParseUnlimitedLengthHexNumber(
int max_value,
2289 base::uc32* value) {
2291 int d = base::HexValue(
current());
2297 if (
x >
static_cast<base::uc32
>(max_value)) {
2301 d = base::HexValue(
current());
2308template <
class CharT>
2309base::uc32 RegExpParserImpl<CharT>::ParseCharacterEscape(
2310 InClassEscapeState in_class_escape_state,
2311 bool* is_escaped_unicode_character) {
2317 const base::uc32 c =
current();
2340 base::uc32 controlLetter = Next();
2341 base::uc32 letter = controlLetter & ~(
'A' ^
'a');
2342 if (letter >=
'A' && letter <=
'Z') {
2346 return controlLetter & 0x1F;
2348 if (IsUnicodeMode()) {
2353 if (in_class_escape_state == InClassEscapeState::kInClass) {
2357 if ((controlLetter >=
'0' && controlLetter <=
'9') ||
2358 controlLetter ==
'_') {
2360 return controlLetter & 0x1F;
2372 if (Next() <
'0' || Next() >
'9') {
2388 if (IsUnicodeMode()) {
2394 return ParseOctalLiteral();
2400 if (ParseHexEscape(2, &value))
return value;
2401 if (IsUnicodeMode()) {
2415 if (ParseUnicodeEscape(&value)) {
2416 *is_escaped_unicode_character =
true;
2419 if (IsUnicodeMode()) {
2443 if (unicode_sets() && in_class_escape_state == InClassEscapeState::kInClass) {
2444 if (IsClassSetReservedPunctuator(c)) {
2449 if (IsUnicodeMode()) {
2450 if (!IsSyntaxCharacterOrSlash(c)) {
2457 DCHECK(!IsUnicodeMode());
2465 if (c ==
'k' && HasNamedCaptures(in_class_escape_state)) {
2473template <
class CharT>
2474RegExpTree* RegExpParserImpl<CharT>::ParseClassRanges(
2475 ZoneList<CharacterRange>* ranges,
bool add_unicode_case_equivalents) {
2476 base::uc32 char_1, char_2;
2477 bool is_class_1, is_class_2;
2478 while (has_more() &&
current() !=
']') {
2479 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_1,
2488 }
else if (
current() ==
']') {
2489 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2490 ranges->Add(CharacterRange::Singleton(
'-'), zone());
2493 ParseClassEscape(ranges, zone(), add_unicode_case_equivalents, &char_2,
2495 if (is_class_1 || is_class_2) {
2497 if (IsUnicodeMode()) {
2499 return ReportError(RegExpError::kInvalidCharacterClass);
2501 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2502 ranges->Add(CharacterRange::Singleton(
'-'), zone());
2503 if (!is_class_2) ranges->Add(CharacterRange::Singleton(char_2), zone());
2507 if (char_1 > char_2) {
2508 return ReportError(RegExpError::kOutOfOrderCharacterClass);
2510 ranges->Add(CharacterRange::Range(char_1, char_2), zone());
2512 if (!is_class_1) ranges->Add(CharacterRange::Singleton(char_1), zone());
2519template <
class CharT>
2520void RegExpParserImpl<CharT>::ParseClassEscape(
2521 ZoneList<CharacterRange>* ranges,
Zone* zone,
2522 bool add_unicode_case_equivalents, base::uc32* char_out,
2523 bool* is_class_escape) {
2524 *is_class_escape =
false;
2533 const base::uc32 next = Next();
2540 if (IsUnicodeMode()) {
2553 static constexpr InClassEscapeState kInClassEscape =
2554 InClassEscapeState::kInClass;
2556 TryParseCharacterClassEscape(next, kInClassEscape, ranges,
nullptr, zone,
2557 add_unicode_case_equivalents);
2558 if (*is_class_escape)
return;
2561 *char_out = ParseCharacterEscape(kInClassEscape, &dummy);
2565template <
class CharT>
2566bool RegExpParserImpl<CharT>::TryParseCharacterClassEscape(
2567 base::uc32 next, InClassEscapeState in_class_escape_state,
2568 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
2569 Zone* zone,
bool add_unicode_case_equivalents) {
2581 ranges, add_unicode_case_equivalents,
2587 if (!IsUnicodeMode())
return false;
2588 bool negate = next ==
'P';
2590 ZoneVector<char> name_1(zone);
2591 ZoneVector<char> name_2(zone);
2592 if (!ParsePropertyClassName(&name_1, &name_2) ||
2593 !AddPropertyClassRange(ranges, strings, negate, name_1, name_2)) {
2594 ReportError(in_class_escape_state == InClassEscapeState::kInClass
2595 ? RegExpError::kInvalidClassPropertyName
2596 : RegExpError::kInvalidPropertyName);
2609void AddClassString(ZoneList<base::uc32>* normalized_string,
2610 RegExpTree* regexp_string, ZoneList<CharacterRange>* ranges,
2611 CharacterClassStrings* strings,
Zone* zone) {
2612 if (normalized_string->length() == 1) {
2613 ranges->Add(CharacterRange::Singleton(normalized_string->at(0)), zone);
2615 strings->emplace(normalized_string->ToVector(), regexp_string);
2622template <
class CharT>
2623RegExpTree* RegExpParserImpl<CharT>::ParseClassStringDisjunction(
2624 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings) {
2635 ZoneList<base::uc32>*
string =
2636 zone()->template New<ZoneList<base::uc32>>(4, zone());
2637 RegExpTextBuilder::SmallRegExpTreeVector string_storage(zone());
2638 RegExpTextBuilder string_builder(zone(), &string_storage,
flags());
2640 while (has_more() &&
current() !=
'}') {
2642 AddClassString(
string, string_builder.ToRegExp(), ranges, strings,
2644 string = zone()->template New<ZoneList<base::uc32>>(4, zone());
2645 string_storage.clear();
2649 if (ignore_case()) {
2650#ifdef V8_INTL_SUPPORT
2651 c = u_foldCase(c, U_FOLD_CASE_DEFAULT);
2656 string->Add(c, zone());
2657 string_builder.AddUnicodeCharacter(c);
2661 AddClassString(
string, string_builder.ToRegExp(), ranges, strings, zone());
2662 CharacterRange::Canonicalize(ranges);
2675template <
class CharT>
2676RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
2677 const RegExpBuilder* builder, ClassSetOperandType* type_out) {
2678 ZoneList<CharacterRange>* ranges =
2679 zone()->template New<ZoneList<CharacterRange>>(1, zone());
2681 zone()->template New<CharacterClassStrings>(zone());
2683 RegExpTree* tree = ParseClassSetOperand(builder, type_out, ranges, strings,
2687 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
2688 ranges->is_empty());
2689 DCHECK_IMPLIES(*type_out == ClassSetOperandType::kClassSetCharacter,
2692 ranges->is_empty());
2696 tree->IsClassSetExpression());
2698 DCHECK_NE(*type_out, ClassSetOperandType::kClassSetRange);
2702 if (tree ==
nullptr) {
2703 if (*type_out == ClassSetOperandType::kClassSetCharacter) {
2704 AddMaybeSimpleCaseFoldedRange(ranges,
2705 CharacterRange::Singleton(character));
2707 tree = zone()->template New<RegExpClassSetOperand>(ranges, strings);
2719template <
class CharT>
2720RegExpTree* RegExpParserImpl<CharT>::ParseClassSetOperand(
2721 const RegExpBuilder* builder, ClassSetOperandType* type_out,
2722 ZoneList<CharacterRange>* ranges, CharacterClassStrings* strings,
2723 base::uc32* character) {
2727 const base::uc32 next = Next();
2729 *type_out = ClassSetOperandType::kClassStringDisjunction;
2730 ParseClassStringDisjunction(ranges, strings
CHECK_FAILED);
2733 static constexpr InClassEscapeState kInClassEscape =
2734 InClassEscapeState::kInClass;
2735 const bool add_unicode_case_equivalents = ignore_case();
2736 if (TryParseCharacterClassEscape(next, kInClassEscape, ranges, strings,
2737 zone(), add_unicode_case_equivalents)) {
2738 *type_out = ClassSetOperandType::kCharacterClassEscape;
2744 *type_out = ClassSetOperandType::kNestedClass;
2745 return ParseCharacterClass(builder);
2748 *type_out = ClassSetOperandType::kClassSetCharacter;
2754template <
class CharT>
2755base::uc32 RegExpParserImpl<CharT>::ParseClassSetCharacter() {
2757 const base::uc32 c =
current();
2759 const base::uc32 next = Next();
2768 static constexpr InClassEscapeState kInClassEscape =
2769 InClassEscapeState::kInClass;
2772 return ParseCharacterEscape(kInClassEscape, &dummy);
2774 if (IsClassSetSyntaxCharacter(c)) {
2775 ReportError(RegExpError::kInvalidCharacterInClass);
2778 if (IsClassSetReservedDoublePunctuator(c)) {
2779 ReportError(RegExpError::kInvalidClassSetOperation);
2788bool MayContainStrings(ClassSetOperandType type, RegExpTree* operand) {
2790 case ClassSetOperandType::kClassSetCharacter:
2791 case ClassSetOperandType::kClassSetRange:
2793 case ClassSetOperandType::kCharacterClassEscape:
2794 case ClassSetOperandType::kClassStringDisjunction:
2795 return operand->AsClassSetOperand()->has_strings();
2796 case ClassSetOperandType::kNestedClass:
2797 if (operand->IsClassRanges())
return false;
2798 return operand->AsClassSetExpression()->may_contain_strings();
2804template <
class CharT>
2805void RegExpParserImpl<CharT>::AddMaybeSimpleCaseFoldedRange(
2806 ZoneList<CharacterRange>* ranges, CharacterRange new_range) {
2808 if (ignore_case()) {
2809 ZoneList<CharacterRange>* new_ranges =
2810 zone()->template New<ZoneList<CharacterRange>>(2, zone());
2811 new_ranges->Add(new_range, zone());
2812 CharacterRange::AddUnicodeCaseEquivalents(new_ranges, zone());
2813 ranges->AddAll(*new_ranges, zone());
2815 ranges->Add(new_range, zone());
2817 CharacterRange::Canonicalize(ranges);
2821template <
class CharT>
2822RegExpTree* RegExpParserImpl<CharT>::ParseClassUnion(
2823 const RegExpBuilder* builder,
bool is_negated, RegExpTree* first_operand,
2824 ClassSetOperandType first_operand_type, ZoneList<CharacterRange>* ranges,
2825 CharacterClassStrings* strings, base::uc32 character) {
2827 ZoneList<RegExpTree*>* operands =
2828 zone()->template New<ZoneList<RegExpTree*>>(2, zone());
2829 bool may_contain_strings =
false;
2834 if (first_operand !=
nullptr) {
2835 may_contain_strings = MayContainStrings(first_operand_type, first_operand);
2836 operands->Add(first_operand, zone());
2838 ClassSetOperandType last_type = first_operand_type;
2839 while (has_more() &&
current() !=
']') {
2842 if (Next() ==
'-') {
2843 return ReportError(RegExpError::kInvalidClassSetOperation);
2857 if (last_type != ClassSetOperandType::kClassSetCharacter) {
2858 return ReportError(RegExpError::kInvalidCharacterClass);
2861 ParseClassSetOperand(builder, &last_type, ranges, strings,
2863 if (last_type != ClassSetOperandType::kClassSetCharacter) {
2864 return ReportError(RegExpError::kInvalidCharacterClass);
2866 if (from > character) {
2867 return ReportError(RegExpError::kOutOfOrderCharacterClass);
2869 AddMaybeSimpleCaseFoldedRange(ranges,
2870 CharacterRange::Range(from, character));
2871 last_type = ClassSetOperandType::kClassSetRange;
2874 if (last_type == ClassSetOperandType::kClassSetCharacter) {
2875 AddMaybeSimpleCaseFoldedRange(ranges,
2876 CharacterRange::Singleton(character));
2878 RegExpTree* operand = ParseClassSetOperand(
2879 builder, &last_type, ranges, strings, &character
CHECK_FAILED);
2880 if (operand !=
nullptr) {
2881 may_contain_strings |= MayContainStrings(last_type, operand);
2884 if (!ranges->is_empty() || !strings->empty()) {
2885 may_contain_strings |= !strings->empty();
2887 zone()->
template New<RegExpClassSetOperand>(ranges, strings),
2889 ranges = zone()->template New<ZoneList<CharacterRange>>(2, zone());
2890 strings = zone()->template New<CharacterClassStrings>(zone());
2892 operands->Add(operand, zone());
2898 return ReportError(RegExpError::kUnterminatedCharacterClass);
2901 if (last_type == ClassSetOperandType::kClassSetCharacter) {
2902 AddMaybeSimpleCaseFoldedRange(ranges, CharacterRange::Singleton(character));
2906 if (!ranges->is_empty() || !strings->empty()) {
2907 may_contain_strings |= !strings->empty();
2908 operands->Add(zone()->
template New<RegExpClassSetOperand>(ranges, strings),
2915 if (is_negated && may_contain_strings) {
2916 return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
2919 if (operands->is_empty()) {
2922 DCHECK(ranges->is_empty());
2923 DCHECK(strings->empty());
2924 return RegExpClassSetExpression::Empty(zone(), is_negated);
2927 return zone()->template New<RegExpClassSetExpression>(
2928 RegExpClassSetExpression::OperationType::kUnion, is_negated,
2929 may_contain_strings, operands);
2933template <
class CharT>
2934RegExpTree* RegExpParserImpl<CharT>::ParseClassIntersection(
2935 const RegExpBuilder* builder,
bool is_negated, RegExpTree* first_operand,
2936 ClassSetOperandType first_operand_type) {
2939 bool may_contain_strings =
2940 MayContainStrings(first_operand_type, first_operand);
2941 ZoneList<RegExpTree*>* operands =
2942 zone()->template New<ZoneList<RegExpTree*>>(2, zone());
2943 operands->Add(first_operand, zone());
2944 while (has_more() &&
current() !=
']') {
2945 if (
current() !=
'&' || Next() !=
'&') {
2946 return ReportError(RegExpError::kInvalidClassSetOperation);
2951 return ReportError(RegExpError::kInvalidCharacterInClass);
2954 ClassSetOperandType operand_type;
2955 RegExpTree* operand =
2956 ParseClassSetOperand(builder, &operand_type
CHECK_FAILED);
2957 may_contain_strings &= MayContainStrings(operand_type, operand);
2958 operands->Add(operand, zone());
2961 return ReportError(RegExpError::kUnterminatedCharacterClass);
2963 if (is_negated && may_contain_strings) {
2964 return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
2968 return zone()->template New<RegExpClassSetExpression>(
2969 RegExpClassSetExpression::OperationType::kIntersection, is_negated,
2970 may_contain_strings, operands);
2974template <
class CharT>
2975RegExpTree* RegExpParserImpl<CharT>::ParseClassSubtraction(
2976 const RegExpBuilder* builder,
bool is_negated, RegExpTree* first_operand,
2977 ClassSetOperandType first_operand_type) {
2980 const bool may_contain_strings =
2981 MayContainStrings(first_operand_type, first_operand);
2982 if (is_negated && may_contain_strings) {
2983 return ReportError(RegExpError::kNegatedCharacterClassWithStrings);
2985 ZoneList<RegExpTree*>* operands =
2986 zone()->template New<ZoneList<RegExpTree*>>(2, zone());
2987 operands->Add(first_operand, zone());
2988 while (has_more() &&
current() !=
']') {
2989 if (
current() !=
'-' || Next() !=
'-') {
2990 return ReportError(RegExpError::kInvalidClassSetOperation);
2993 ClassSetOperandType dummy;
2994 RegExpTree* operand = ParseClassSetOperand(builder, &dummy
CHECK_FAILED);
2995 operands->Add(operand, zone());
2998 return ReportError(RegExpError::kUnterminatedCharacterClass);
3002 return zone()->template New<RegExpClassSetExpression>(
3003 RegExpClassSetExpression::OperationType::kSubtraction, is_negated,
3004 may_contain_strings, operands);
3008template <
class CharT>
3009RegExpTree* RegExpParserImpl<CharT>::ParseCharacterClass(
3010 const RegExpBuilder* builder) {
3013 bool is_negated =
false;
3018 ZoneList<CharacterRange>* ranges =
3019 zone()->template New<ZoneList<CharacterRange>>(2, zone());
3022 if (unicode_sets()) {
3023 return RegExpClassSetExpression::Empty(zone(), is_negated);
3025 RegExpClassRanges::ClassRangesFlags class_ranges_flags;
3026 if (is_negated) class_ranges_flags = RegExpClassRanges::NEGATED;
3027 return zone()->template New<RegExpClassRanges>(zone(), ranges,
3028 class_ranges_flags);
3032 if (!unicode_sets()) {
3033 bool add_unicode_case_equivalents = IsUnicodeMode() && ignore_case();
3034 ParseClassRanges(ranges, add_unicode_case_equivalents
CHECK_FAILED);
3036 return ReportError(RegExpError::kUnterminatedCharacterClass);
3040 RegExpClassRanges::ClassRangesFlags character_class_flags;
3041 if (is_negated) character_class_flags = RegExpClassRanges::NEGATED;
3042 return zone()->template New<RegExpClassRanges>(zone(), ranges,
3043 character_class_flags);
3045 ClassSetOperandType operand_type;
3047 zone()->template New<CharacterClassStrings>(zone());
3049 RegExpTree* operand = ParseClassSetOperand(
3050 builder, &operand_type, ranges, strings, &character
CHECK_FAILED);
3053 if (Next() ==
'-') {
3054 if (operand ==
nullptr) {
3055 if (operand_type == ClassSetOperandType::kClassSetCharacter) {
3056 AddMaybeSimpleCaseFoldedRange(
3057 ranges, CharacterRange::Singleton(character));
3060 zone()->template New<RegExpClassSetOperand>(ranges, strings);
3062 return ParseClassSubtraction(builder, is_negated, operand,
3068 if (Next() ==
'&') {
3069 if (operand ==
nullptr) {
3070 if (operand_type == ClassSetOperandType::kClassSetCharacter) {
3071 AddMaybeSimpleCaseFoldedRange(
3072 ranges, CharacterRange::Singleton(character));
3075 zone()->template New<RegExpClassSetOperand>(ranges, strings);
3077 return ParseClassIntersection(builder, is_negated, operand,
3081 return ParseClassUnion(builder, is_negated, operand, operand_type, ranges,
3082 strings, character);
3088template <
class CharT>
3089bool RegExpParserImpl<CharT>::Parse(RegExpCompileData*
result) {
3091 RegExpTree* tree = ParsePattern();
3103 if (
v8_flags.trace_regexp_parser) {
3105 tree->Print(os, zone());
3110 const int capture_count = captures_started();
3111 result->simple = tree->IsAtom() && simple() && capture_count == 0;
3112 result->contains_anchor = contains_anchor();
3113 result->capture_count = capture_count;
3114 result->named_captures = GetNamedCaptures();
3118void RegExpBuilder::FlushText() { text_builder().FlushText(); }
3120void RegExpBuilder::AddCharacter(base::uc16 c) {
3122 text_builder().AddCharacter(c);
3125void RegExpBuilder::AddUnicodeCharacter(base::uc32 c) {
3127 text_builder().AddUnicodeCharacter(c);
3130void RegExpBuilder::AddEscapedUnicodeCharacter(base::uc32 character) {
3132 text_builder().AddEscapedUnicodeCharacter(character);
3135void RegExpBuilder::AddEmpty() {
3136 text_builder().FlushPendingSurrogate();
3140void RegExpBuilder::AddClassRanges(RegExpClassRanges* cc) {
3142 text_builder().AddClassRanges(cc);
3145void RegExpBuilder::AddAtom(RegExpTree* term) {
3146 if (term->IsEmpty()) {
3151 if (term->IsTextElement()) {
3152 text_builder().AddAtom(term);
3155 terms_.emplace_back(term);
3159void RegExpBuilder::AddTerm(RegExpTree* term) {
3160 DCHECK(!term->IsEmpty());
3162 if (term->IsTextElement()) {
3163 text_builder().AddTerm(term);
3166 terms_.emplace_back(term);
3170void RegExpBuilder::AddAssertion(RegExpTree* assert) {
3173 terms_.emplace_back(assert);
3176void RegExpBuilder::NewAlternative() { FlushTerms(); }
3178void RegExpBuilder::FlushTerms() {
3180 size_t num_terms =
terms_.size();
3181 RegExpTree* alternative;
3182 if (num_terms == 0) {
3183 alternative = zone()->New<RegExpEmpty>();
3184 }
else if (num_terms == 1) {
3185 alternative =
terms_.back();
3188 zone()->New<RegExpAlternative>(zone()->New<ZoneList<RegExpTree*>>(
3189 base::VectorOf(
terms_.begin(),
terms_.size()), zone()));
3195RegExpTree* RegExpBuilder::ToRegExp() {
3198 if (num_alternatives == 0)
return zone()->New<RegExpEmpty>();
3200 return zone()->New<RegExpDisjunction>(zone()->New<ZoneList<RegExpTree*>>(
3204bool RegExpBuilder::AddQuantifierToAtom(
3205 int min,
int max,
int index,
3206 RegExpQuantifier::QuantifierType quantifier_type) {
3211 RegExpTree* atom = text_builder().PopLastAtom();
3212 if (atom !=
nullptr) {
3214 }
else if (!
terms_.empty()) {
3217 if (atom->IsLookaround()) {
3219 if (IsUnicodeMode())
return false;
3221 if (atom->AsLookaround()->type() == RegExpLookaround::LOOKBEHIND) {
3225 if (atom->max_match() == 0) {
3230 terms_.emplace_back(atom);
3238 zone()->New<RegExpQuantifier>(min, max, quantifier_type, index, atom));
3242template class RegExpParserImpl<uint8_t>;
3243template class RegExpParserImpl<base::uc16>;
3248bool RegExpParser::ParseRegExpFromHeapString(
Isolate* isolate,
Zone* zone,
3253 uintptr_t stack_limit = isolate->stack_guard()->real_climit();
3258 stack_limit, zone, no_gc}
3263 stack_limit, zone, no_gc}
3269template <
class CharT>
3270bool RegExpParser::VerifyRegExpSyntax(
Zone* zone, uintptr_t stack_limit,
3271 const CharT* input,
int input_length,
3275 return RegExpParserImpl<CharT>{input, input_length,
flags,
3276 stack_limit, zone, no_gc}
3280template bool RegExpParser::VerifyRegExpSyntax<uint8_t>(
3283template bool RegExpParser::VerifyRegExpSyntax<base::uc16>(
#define SBXCHECK(condition)
static uint16_t LeadSurrogate(uint32_t char_code)
static const uchar kMaxNonSurrogateCharCode
static uint16_t TrailSurrogate(uint32_t char_code)
static int CombineSurrogatePair(uchar lead, uchar trail)
static bool IsTrailSurrogate(int code)
static bool IsLeadSurrogate(int code)
constexpr T * begin() const
static V8_EXPORT_PRIVATE void AddClassEscape(StandardCharacterSet standard_character_set, ZoneList< CharacterRange > *ranges, bool add_unicode_case_equivalents, Zone *zone)
static constexpr int kMaxCaptures
static const int kInfinity
base::Vector< const uint8_t > ToOneByteVector() const
base::Vector< const base::uc16 > ToUC16Vector() const
DisallowGarbageCollection no_gc_
other heap size flags(e.g. initial_heap_size) take precedence") DEFINE_SIZE_T( max_shared_heap_size
ZoneVector< RpoNumber > & result
FunctionLiteral * literal
void ReportError(Args &&... args)
constexpr std::optional< RegExpFlag > TryRegExpFlagFromChar(char c)
bool IsIdentifierStart(base::uc32 c)
PerThreadAssertScopeDebugOnly< false, SAFEPOINTS_ASSERT, HEAP_ALLOCATION_ASSERT > DisallowGarbageCollection
BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL BUILTIN_FP_CALL int character
base::Flags< RegExpFlag > RegExpFlags
uintptr_t GetCurrentStackPosition()
ZoneMap< base::Vector< const base::uc32 >, RegExpTree *, CharacterClassStringLess > CharacterClassStrings
constexpr bool IsDecimalDigit(base::uc32 c)
V8_EXPORT_PRIVATE FlagValues v8_flags
constexpr int AsciiAlphaToLower(base::uc32 c)
bool IsIdentifierPart(base::uc32 c)
ZoneList< RegExpTree * > * terms_
#define DOUBLE_PUNCTUATOR_CASE(Char)
RegExpParserState *const previous_state_
RegExpTextBuilder text_builder_
ZoneList< RegExpBackReference * > * named_back_references_
ZoneMap< RegExpCapture *, ZoneList< int > *, RegExpCaptureNameLess > * named_captures_
ZoneList< base::uc16 > * characters_
std::pair< int, int > non_participating_capture_group_interval_
const ZoneVector< base::uc16 > *const capture_name_
static const base::uc16 kNoPendingSurrogate
base::uc16 pending_surrogate_
RegExpParserImpl< CharT > *const parser_
SmallRegExpTreeVector text_
ZoneList< RegExpCapture * > * captures_
const int disjunction_capture_index_
SmallRegExpTreeVector alternatives_
const SubexpressionType group_type_
SmallRegExpTreeVector * terms_
const RegExpLookaround::Type lookaround_type_
const uintptr_t stack_limit_
bool is_scanned_for_captures_
static const base::uc32 kEndMarker
#define DCHECK_NOT_NULL(val)
#define DCHECK_IMPLIES(v1, v2)
#define DCHECK_NE(v1, v2)
#define DCHECK(condition)
#define DCHECK_EQ(v1, v2)
std::unique_ptr< ValueMirror > value